# Concise Implementation of Dropout

Using PyTorch, all we need to do is add a ``Dropout`` layer (also in the nn package) after each fully-connected layer,
passing in the dropout probability as the only argument to its constructor. During training, the ``Dropout``
layer will randomly drop out outputs of the previous layer (or equivalently, the inputs to the subsequent
layer) according to the specified dropout probability. When the model is not in training mode (i.e., after
calling ``eval()``), the ``Dropout`` layer simply passes the data through unchanged.

In [None]:
# import packages
import torch
import torchvision
from torch import nn, optim
import numpy as np
import d2l
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from IPython import display
import utils

In [None]:
class ThreeLayerNet(torch.nn.Module):
    """Multilayer perceptron with two hidden ReLU layers and optional dropout.

    The network is ``Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout
    -> Linear``. The input is flattened to ``(-1, num_inputs)`` before the
    first layer, so any input whose trailing dimensions multiply to
    ``num_inputs`` is accepted.
    """

    def __init__(self, num_inputs, num_hiddens1,
                 num_hiddens2, num_outputs,
                 drop_prob1=0.0, drop_prob2=0.0):
        """
        In the constructor we instantiate three nn.Linear modules, the ReLU
        non-linearity and two nn.Dropout modules, and assign them as member
        variables.

        Args:
            num_inputs: Number of features per example after flattening.
            num_hiddens1: Width of the first hidden layer.
            num_hiddens2: Width of the second hidden layer.
            num_outputs: Number of output units (one logit per class).
            drop_prob1: Dropout probability applied to the output of the
                first hidden layer. The default of 0.0 disables dropout.
            drop_prob2: Dropout probability applied to the output of the
                second hidden layer. The default of 0.0 disables dropout.
        """
        super(ThreeLayerNet, self).__init__()
        self.num_inputs = num_inputs
        self.linear1 = torch.nn.Linear(num_inputs, num_hiddens1)
        self.linear2 = torch.nn.Linear(num_hiddens1, num_hiddens2)
        self.linear3 = torch.nn.Linear(num_hiddens2, num_outputs)
        self.nonlinear_func = torch.nn.ReLU()
        # Dropout modules hold no parameters; they are only active while the
        # module is in training mode and act as the identity after eval().
        self.dropout1 = torch.nn.Dropout(drop_prob1)
        self.dropout2 = torch.nn.Dropout(drop_prob2)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary (differentiable) operations on Tensors.

        Returns:
            A Tensor with ``num_outputs`` values per flattened example.
        """
        h_relu1 = self.nonlinear_func(self.linear1(x.reshape(-1, self.num_inputs)))
        h_relu1 = self.dropout1(h_relu1)
        h_relu2 = self.nonlinear_func(self.linear2(h_relu1))
        h_relu2 = self.dropout2(h_relu2)
        # The output layer must be linear3 (num_hiddens2 -> num_outputs);
        # reusing linear2 here would produce num_hiddens2-sized outputs or
        # fail outright when the two hidden widths differ.
        y_pred = self.linear3(h_relu2)
        return y_pred

In [None]:
# Dropout probabilities (presumably for the first and second hidden layers;
# note that they are not passed to the network constructor below).
drop_prob1 = 0.2
drop_prob2 = 0.5

# Layer widths: assumes flattened 28x28 inputs and 10 classes -- confirm
# against the dataset returned by utils.load_data_fashion_mnist.
net = ThreeLayerNet(
    num_inputs=784,
    num_hiddens1=256,
    num_hiddens2=256,
    num_outputs=10,
)

In [None]:
# Training hyperparameters.
batch_size = 256
num_epochs = 10
lr = 0.5

# Data iterators come from the project's utils helper.
train_iter, test_iter = utils.load_data_fashion_mnist(batch_size)

# Plain SGD over all network parameters, with cross-entropy as the loss.
optimizer = optim.SGD(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()

utils.train(net, train_iter, test_iter, loss, num_epochs, optimizer)

In [None]:
# Run the project's prediction helper with the network and the test iterator.
# NOTE(review): utils.predict is defined outside this file; presumably it
# displays predictions for a sample of test examples -- confirm.
utils.predict(net, test_iter)