In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

### Create Model

In [2]:
class Net(nn.Module):                           #Make a class to create a Model
    def __init__(self):                         #Init the class with the layer we want out model to have.
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)     #Conv layer with 1 i/p channel,20 o/p channel, 5 Kernel Size, 1 stride
        self.conv2 = nn.Conv2d(20, 50, 5, 1)    #Conv layer with 20 i/p channel,50 o/p channel, 5 Kernel Size, 1 stride
        self.fc1 = nn.Linear(4*4*50, 500)       #Linear layer with 400 fan_in and 500 fan_out
        self.fc2 = nn.Linear(500, 10)           #Linear layer with 400 fan_in and 10 fan_out(i.e. number of output)
    def forward(self, x):
        x = F.relu(self.conv1(x))               #Applying relu on conv layer.
        x = F.max_pool2d(x, 2, 2)               #Applying pooling of size3 on output of conv.
        x = F.relu(self.conv2(x))               #Applying conv and relu on the output of maxpool.
        x = F.max_pool2d(x, 2, 2)               #Applying maxpool on the result of 2 conv layer.
        x = x.view(-1, 4*4*50)                  #Changing the dimension of image in 1d so that we can apply Linear layer. 
        x = F.relu(self.fc1(x))                 #Relu on first FC layer
        x = self.fc2(x)                         #Second FC Layer
        return F.log_softmax(x, dim=1)          #Applying softmax on the output to get the result which class output belong to.

# Train the network

In [3]:
def train(model, device, train_loader, optimizer, epoch, log_interval):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


# Test the network

In [4]:
def test(model, device, test_loader,forTrain=False):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    if forTrain:
        print('\nTrain set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
    else:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))

# Define main

In [5]:
def main(model,lr = 0.01,epochs = 10):
    # Training settings
    
    momentum = 0.5
    batch_size = 64
    test_batch_size = 1000
    log_interval = 10
    
    
    use_cuda = torch.cuda.is_available()                  # Check for CUDA.
    device = torch.device("cuda" if use_cuda else "cpu")  # Set which device to use.
    """
    
    """
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}  
    
    
    """
    In order to make computations deterministic on your specific problem 
    on one specific platform
    """
    torch.manual_seed(1)  
    


    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=test_batch_size, shuffle=True, **kwargs)


    model = model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer, epoch, log_interval)
        test(model,device,train_loader,True)
        test(model, device, test_loader)


In [6]:
if __name__ == '__main__':
    model = Net()
    main(model)


Train set: Average loss: 0.1306, Accuracy: 57635/60000 (96%)


Test set: Average loss: 0.1237, Accuracy: 9615/10000 (96%)




Train set: Average loss: 0.0706, Accuracy: 58722/60000 (98%)


Test set: Average loss: 0.0641, Accuracy: 9809/10000 (98%)


Train set: Average loss: 0.0513, Accuracy: 59071/60000 (98%)


Test set: Average loss: 0.0470, Accuracy: 9857/10000 (99%)




Train set: Average loss: 0.0621, Accuracy: 58764/60000 (98%)


Test set: Average loss: 0.0640, Accuracy: 9783/10000 (98%)


Train set: Average loss: 0.0334, Accuracy: 59385/60000 (99%)


Test set: Average loss: 0.0370, Accuracy: 9874/10000 (99%)




Train set: Average loss: 0.0302, Accuracy: 59469/60000 (99%)


Test set: Average loss: 0.0355, Accuracy: 9881/10000 (99%)




Train set: Average loss: 0.0222, Accuracy: 59607/60000 (99%)


Test set: Average loss: 0.0293, Accuracy: 9895/10000 (99%)


Train set: Average loss: 0.0224, Accuracy: 59586/60000 (99%)


Test set: Average loss: 0.0304, Accuracy: 9899/10000 (99%)




Train set: Average loss: 0.0185, Accuracy: 59669/60000 (99%)


Test set: Average loss: 0.0282, Accuracy: 9907/10000 (99%)


Train set: Average loss: 0.0197, Accuracy: 59630/60000 (99%)


Test set: Average loss: 0.0319, Accuracy: 9883/10000 (99%)



# 4a Replace the ReLUs in the code with sigmoid units

### Create Model

In [7]:
class Net(nn.Module):                           #Make a class to create a Model
    def __init__(self):                         #Init the class with the layer we want out model to have.
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)     #Conv layer with 1 i/p channel,20 o/p channel, 5 Kernel Size, 1 stride
        self.conv2 = nn.Conv2d(20, 50, 5, 1)    #Conv layer with 20 i/p channel,50 o/p channel, 5 Kernel Size, 1 stride
        self.fc1 = nn.Linear(4*4*50, 500)       #Linear layer with 400 fan_in and 500 fan_out
        self.fc2 = nn.Linear(500, 10)           #Linear layer with 400 fan_in and 10 fan_out(i.e. number of output)
    def forward(self, x):
        x = F.sigmoid(self.conv1(x))               #Applying relu on conv layer.
        x = F.max_pool2d(x, 2, 2)               #Applying pooling of size3 on output of conv.
        x = F.sigmoid(self.conv2(x))               #Applying conv and relu on the output of maxpool.
        x = F.max_pool2d(x, 2, 2)               #Applying maxpool on the result of 2 conv layer.
        x = x.view(-1, 4*4*50)                  #Changing the dimension of image in 1d so that we can apply Linear layer. 
        x = F.relu(self.fc1(x))                 #Relu on first FC layer
        x = self.fc2(x)                         #Second FC Layer
        return F.log_softmax(x, dim=1)          #Applying softmax on the output to get the result which class output belong to.

In [8]:
if __name__ == '__main__':
    model = Net()
    main(model)




Train set: Average loss: 1.3263, Accuracy: 41292/60000 (69%)


Test set: Average loss: 1.3119, Accuracy: 6963/10000 (70%)




Train set: Average loss: 0.4236, Accuracy: 52504/60000 (88%)


Test set: Average loss: 0.4077, Accuracy: 8791/10000 (88%)


Train set: Average loss: 0.2888, Accuracy: 54791/60000 (91%)


Test set: Average loss: 0.2726, Accuracy: 9186/10000 (92%)




Train set: Average loss: 0.2361, Accuracy: 55655/60000 (93%)


Test set: Average loss: 0.2247, Accuracy: 9297/10000 (93%)


Train set: Average loss: 0.1872, Accuracy: 56686/60000 (94%)


Test set: Average loss: 0.1766, Accuracy: 9458/10000 (95%)




Train set: Average loss: 0.1604, Accuracy: 57133/60000 (95%)


Test set: Average loss: 0.1504, Accuracy: 9534/10000 (95%)




Train set: Average loss: 0.1377, Accuracy: 57507/60000 (96%)


Test set: Average loss: 0.1263, Accuracy: 9610/10000 (96%)


Train set: Average loss: 0.1255, Accuracy: 57782/60000 (96%)


Test set: Average loss: 0.1134, Accuracy: 9655/10000 (97%)




Train set: Average loss: 0.1105, Accuracy: 58041/60000 (97%)


Test set: Average loss: 0.1007, Accuracy: 9690/10000 (97%)


Train set: Average loss: 0.1025, Accuracy: 58147/60000 (97%)


Test set: Average loss: 0.0937, Accuracy: 9708/10000 (97%)



### Discuss the results (i.e., explain why one type of unit performs better than the other).
Sparse representations seem to be more beneficial than dense representations.

ReLu, gradient will be 0 because of which the weights will not get adjusted during descent. 
ReLu is less computationally expensive than sigmoid(as it computes exponential) because it involves simpler mathematical operations.

Major benefits of ReLUs are sparsity and a reduced likelihood of vanishing gradient.

The gradient of sigmoids becomes increasingly small as the absolute value of x increases. Vanishing Gradient problem.

The constant gradient of ReLUs results in faster learning.

Due to above reason ReLu works better than sigmoid in terms of accuracy and computation.

#### Train set: Average loss: 0.1025, Accuracy: 58147/60000 (97%)


#### Test set: Average loss: 0.0937, Accuracy: 9708/10000 (97%)

## 4b Compare the accuracy achieved when varying the level of dropout in a CNN.

In [11]:
class Net(nn.Module):                           #Make a class to create a Model
    def __init__(self,prob):                         #Init the class with the layer we want out model to have.
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)     #Conv layer with 1 i/p channel,20 o/p channel, 5 Kernel Size, 1 stride
        self.dropout1 = nn.Dropout2d(p=prob)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)    #Conv layer with 20 i/p channel,50 o/p channel, 5 Kernel Size, 1 stride
        self.dropout2 = nn.Dropout2d(p=prob)

        self.fc1 = nn.Linear(4*4*50, 500)       #Linear layer with 400 fan_in and 500 fan_out
        self.fc2 = nn.Linear(500, 10)           #Linear layer with 400 fan_in and 10 fan_out(i.e. number of output)
        
        
    def forward(self, x):
        x = torch.sigmoid(self.conv1(x))               #Applying relu on conv layer.
        x = F.max_pool2d(x, 2, 2)               #Applying pooling of size3 on output of conv.
        x = self.dropout1(x)
        x = torch.sigmoid(self.conv2(x))               #Applying conv and relu on the output of maxpool.
        x = F.max_pool2d(x, 2, 2)               #Applying maxpool on the result of 2 conv layer.
        x = x.view(-1, 4*4*50)                  #Changing the dimension of image in 1d so that we can apply Linear layer. 
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc1(x))                 #Relu on first FC layer
        x = self.fc2(x)                         #Second FC Layer
        return F.log_softmax(x, dim=1)          #Applying softmax on the output to get the result which class output belong to.

In [12]:
#For p=0.25 dropout setting
if __name__ == '__main__':
    model = Net(0.25)
    print(model)
    main(model)

Net(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (dropout1): Dropout2d(p=0.25)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (dropout2): Dropout2d(p=0.25)
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
)

Train set: Average loss: 2.1085, Accuracy: 36038/60000 (60%)


Test set: Average loss: 2.1049, Accuracy: 6034/10000 (60%)




Train set: Average loss: 0.4894, Accuracy: 51661/60000 (86%)


Test set: Average loss: 0.4686, Accuracy: 8684/10000 (87%)


Train set: Average loss: 0.3166, Accuracy: 54461/60000 (91%)


Test set: Average loss: 0.2991, Accuracy: 9122/10000 (91%)




Train set: Average loss: 0.2459, Accuracy: 55462/60000 (92%)


Test set: Average loss: 0.2319, Accuracy: 9285/10000 (93%)


Train set: Average loss: 0.1970, Accuracy: 56495/60000 (94%)


Test set: Average loss: 0.1848, Accuracy: 9428/10000 (94%)




Train set: Average loss: 0.1699, Accuracy: 56965/60000 (95%)


Test set: Average loss: 0.1591, Accuracy: 9500/10000 (95%)




Train set: Average loss: 0.1483, Accuracy: 57292/60000 (95%)


Test set: Average loss: 0.1365, Accuracy: 9567/10000 (96%)


Train set: Average loss: 0.1325, Accuracy: 57630/60000 (96%)


Test set: Average loss: 0.1206, Accuracy: 9613/10000 (96%)




Train set: Average loss: 0.1227, Accuracy: 57784/60000 (96%)


Test set: Average loss: 0.1121, Accuracy: 9641/10000 (96%)


Train set: Average loss: 0.1126, Accuracy: 57977/60000 (97%)


Test set: Average loss: 0.1039, Accuracy: 9665/10000 (97%)



In [13]:
#For p=0.5 dropout setting
if __name__ == '__main__':
    model = Net(0.5)
    print(model)
    main(model)

Net(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (dropout1): Dropout2d(p=0.5)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (dropout2): Dropout2d(p=0.5)
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
)

Train set: Average loss: 2.1628, Accuracy: 34898/60000 (58%)


Test set: Average loss: 2.1601, Accuracy: 5865/10000 (59%)




Train set: Average loss: 0.5193, Accuracy: 51169/60000 (85%)


Test set: Average loss: 0.4974, Accuracy: 8621/10000 (86%)


Train set: Average loss: 0.3281, Accuracy: 54259/60000 (90%)


Test set: Average loss: 0.3100, Accuracy: 9101/10000 (91%)




Train set: Average loss: 0.2552, Accuracy: 55194/60000 (92%)


Test set: Average loss: 0.2405, Accuracy: 9256/10000 (93%)


Train set: Average loss: 0.2035, Accuracy: 56357/60000 (94%)


Test set: Average loss: 0.1904, Accuracy: 9416/10000 (94%)




Train set: Average loss: 0.1744, Accuracy: 56846/60000 (95%)


Test set: Average loss: 0.1630, Accuracy: 9482/10000 (95%)




Train set: Average loss: 0.1537, Accuracy: 57196/60000 (95%)


Test set: Average loss: 0.1416, Accuracy: 9554/10000 (96%)


Train set: Average loss: 0.1379, Accuracy: 57527/60000 (96%)


Test set: Average loss: 0.1260, Accuracy: 9596/10000 (96%)




Train set: Average loss: 0.1279, Accuracy: 57691/60000 (96%)


Test set: Average loss: 0.1170, Accuracy: 9615/10000 (96%)


Train set: Average loss: 0.1174, Accuracy: 57852/60000 (96%)


Test set: Average loss: 0.1080, Accuracy: 9652/10000 (97%)



In [14]:
# For 0.75
if __name__ == '__main__':
    model = Net(0.75)
    print(model)
    main(model)

Net(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (dropout1): Dropout2d(p=0.75)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (dropout2): Dropout2d(p=0.75)
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
)

Train set: Average loss: 2.2235, Accuracy: 29795/60000 (50%)


Test set: Average loss: 2.2218, Accuracy: 5042/10000 (50%)




Train set: Average loss: 0.5994, Accuracy: 50000/60000 (83%)


Test set: Average loss: 0.5766, Accuracy: 8449/10000 (84%)


Train set: Average loss: 0.3580, Accuracy: 53708/60000 (90%)


Test set: Average loss: 0.3392, Accuracy: 9017/10000 (90%)




Train set: Average loss: 0.2771, Accuracy: 54765/60000 (91%)


Test set: Average loss: 0.2627, Accuracy: 9198/10000 (92%)


Train set: Average loss: 0.2220, Accuracy: 56042/60000 (93%)


Test set: Average loss: 0.2088, Accuracy: 9364/10000 (94%)




Train set: Average loss: 0.1914, Accuracy: 56567/60000 (94%)


Test set: Average loss: 0.1785, Accuracy: 9430/10000 (94%)




Train set: Average loss: 0.1679, Accuracy: 56995/60000 (95%)


Test set: Average loss: 0.1561, Accuracy: 9505/10000 (95%)


Train set: Average loss: 0.1518, Accuracy: 57255/60000 (95%)


Test set: Average loss: 0.1402, Accuracy: 9548/10000 (95%)




Train set: Average loss: 0.1415, Accuracy: 57410/60000 (96%)


Test set: Average loss: 0.1309, Accuracy: 9572/10000 (96%)


Train set: Average loss: 0.1294, Accuracy: 57639/60000 (96%)


Test set: Average loss: 0.1194, Accuracy: 9612/10000 (96%)



In [15]:
# For p=1 dropout 
if __name__ == '__main__':
    model = Net(1)
    print(model)
    main(model)

Net(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (dropout1): Dropout2d(p=1)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (dropout2): Dropout2d(p=1)
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
)

Train set: Average loss: 2.3030, Accuracy: 5923/60000 (10%)


Test set: Average loss: 2.3029, Accuracy: 980/10000 (10%)




Train set: Average loss: 2.3017, Accuracy: 5459/60000 (9%)


Test set: Average loss: 2.3017, Accuracy: 854/10000 (9%)


Train set: Average loss: 2.3019, Accuracy: 6742/60000 (11%)


Test set: Average loss: 2.3018, Accuracy: 1135/10000 (11%)




Train set: Average loss: 2.3020, Accuracy: 6864/60000 (11%)


Test set: Average loss: 2.3018, Accuracy: 1152/10000 (12%)


Train set: Average loss: 2.3020, Accuracy: 6742/60000 (11%)


Test set: Average loss: 2.3018, Accuracy: 1135/10000 (11%)




Train set: Average loss: 2.3019, Accuracy: 6742/60000 (11%)


Test set: Average loss: 2.3017, Accuracy: 1135/10000 (11%)




Train set: Average loss: 2.3017, Accuracy: 6742/60000 (11%)


Test set: Average loss: 2.3015, Accuracy: 1135/10000 (11%)


Train set: Average loss: 2.3017, Accuracy: 6742/60000 (11%)


Test set: Average loss: 2.3016, Accuracy: 1135/10000 (11%)




Train set: Average loss: 2.3015, Accuracy: 6742/60000 (11%)


Test set: Average loss: 2.3013, Accuracy: 1135/10000 (11%)


Train set: Average loss: 2.3014, Accuracy: 6742/60000 (11%)


Test set: Average loss: 2.3012, Accuracy: 1135/10000 (11%)



### Dropouts
Regularization is way to prevent over-fitting. Regularization reduces over-fitting by adding a penalty to the loss function. Dropout is an approach to regularization in neural networks which helps reducing interdependent learning amongst the neurons. Like in some layer only particular neurons are getting activated not others thats why we use dropout so that other neurons also gets activated and helps in generalization. Dropout has the effect of making the training process noisy, forcing nodes within a layer to probabilistically take on more or less responsibility for the inputs.

### For p = 0.25 This one is the best dropout for the above setting as it gives better performance over others because it is taking 75% of neuron into consideration.

Train set: Average loss: 0.1126, Accuracy: 57977/60000 (97%)

Test set: Average loss: 0.1039, Accuracy: 9665/10000 (97%)


### For p = .5 This one is comparable to 0.25 not of a much difference.

Train set: Average loss: 0.1174, Accuracy: 57852/60000 (96%)

Test set: Average loss: 0.1080, Accuracy: 9652/10000 (97%)

### For p = .75 This one is also comparable as there is not of a much difference in accuracy.

Train set: Average loss: 0.1294, Accuracy: 57639/60000 (96%)

Test set: Average loss: 0.1194, Accuracy: 9612/10000 (96%)



### For p = 1 This one is bad of above as it is giving 11% accuracy due to dropout(p=1) as all the neurons are deactivated.

Train set: Average loss: 2.3014, Accuracy: 6742/60000 (11%)


Test set: Average loss: 2.3012, Accuracy: 1135/10000 (11%)


## 4c Include batch normalization in your ReLU model with the best DropOut setting that you achieved.

### Model`

In [11]:
class Net(nn.Module):                           #Make a class to create a Model
    def __init__(self,prob):                         #Init the class with the layer we want out model to have.
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)     #Conv layer with 1 i/p channel,20 o/p channel, 5 Kernel Size, 1 stride
        self.dropout1 = nn.Dropout2d(p=prob)    #DropOut with probability p
        self.bn1 = nn.BatchNorm2d(20)

        self.conv2 = nn.Conv2d(20, 50, 5, 1)    #Conv layer with 20 i/p channel,50 o/p channel, 5 Kernel Size, 1 stride
        self.dropout2 = nn.Dropout(p=prob)    #DropOut with probability p
        
        self.fc1 = nn.Linear(4*4*50, 500)       #Linear layer with 400 fan_in and 500 fan_out
        self.bn2 = nn.BatchNorm1d(num_features=500)
        self.fc2 = nn.Linear(500, 10)           #Linear layer with 400 fan_in and 10 fan_out(i.e. number of output)
    def forward(self, x):
        x = F.relu(self.conv1(x))               #Applying relu on conv layer.
        x = F.max_pool2d(x, 2, 2)               #Applying pooling of size3 on output of conv.
        #x = self.dropout1(x, training=self.training)
        x = self.dropout1(x)
        x = self.bn1(x)
        x = F.relu(self.conv2(x))               #Applying conv and relu on the output of maxpool.
        x = F.max_pool2d(x, 2, 2)               #Applying maxpool on the result of 2 conv layer.
        x = x.view(-1, 4*4*50)                  #Changing the dimension of image in 1d so that we can apply Linear layer. 
        x = self.dropout2(x)                    #Dropout on FC
        x = F.relu(self.fc1(x))                 #Relu on first FC layer
        x = self.bn2(x)
        x = self.fc2(x)                         #Second FC Layer
        return F.log_softmax(x, dim=1)          #Applying softmax on the output to get the result which class output belong to.

In [12]:
if __name__ == '__main__':
    model = Net(0.25)
    print(model)
    main(model)

Net(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (dropout1): Dropout2d(p=0.25)
  (bn1): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (dropout2): Dropout(p=0.25)
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (bn2): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
)

Train set: Average loss: 0.0869, Accuracy: 58536/60000 (98%)


Test set: Average loss: 0.0778, Accuracy: 9770/10000 (98%)




Train set: Average loss: 0.0501, Accuracy: 59191/60000 (99%)


Test set: Average loss: 0.0462, Accuracy: 9866/10000 (99%)


Train set: Average loss: 0.0383, Accuracy: 59379/60000 (99%)


Test set: Average loss: 0.0374, Accuracy: 9887/10000 (99%)




Train set: Average loss: 0.0306, Accuracy: 59478/60000 (99%)


Test set: Average loss: 0.0316, Accuracy: 9893/10000 (99%)


Train set: Average loss: 0.0259, Accuracy: 59568/60000 (99%)


Test set: Average loss: 0.0271, Accuracy: 9909/10000 (99%)




Train set: Average loss: 0.0225, Accuracy: 59619/60000 (99%)


Test set: Average loss: 0.0255, Accuracy: 9911/10000 (99%)




Train set: Average loss: 0.0197, Accuracy: 59670/60000 (99%)


Test set: Average loss: 0.0246, Accuracy: 9919/10000 (99%)


Train set: Average loss: 0.0176, Accuracy: 59719/60000 (100%)


Test set: Average loss: 0.0229, Accuracy: 9931/10000 (99%)




Train set: Average loss: 0.0163, Accuracy: 59751/60000 (100%)


Test set: Average loss: 0.0227, Accuracy: 9929/10000 (99%)


Train set: Average loss: 0.0143, Accuracy: 59754/60000 (100%)


Test set: Average loss: 0.0207, Accuracy: 9933/10000 (99%)



### Report
Due to the above Regularization method both are giving really good results around 99.33% on test and approx 100% on train. BatchNorm help in not overshoot of weights due to gradient.

Batch normalization reduces the amount by what the hidden unit values shift around (covariance shift, as distribution changes).

Batch normalization allows each layer of a network to learn by itself a little bit more independently of other layers.

Train set: Average loss: 0.0143, Accuracy: 59754/60000 (100%)


Test set: Average loss: 0.0207, Accuracy: 9933/10000 (99%)


### With Removed Dropout from above setting

In [13]:
class Net(nn.Module):                           #Make a class to create a Model
    def __init__(self):                         #Init the class with the layer we want out model to have.
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)     #Conv layer with 1 i/p channel,20 o/p channel, 5 Kernel Size, 1 stride
        self.bn1 = nn.BatchNorm2d(20)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)    #Conv layer with 20 i/p channel,50 o/p channel, 5 Kernel Size, 1 stride
        self.fc1 = nn.Linear(4*4*50, 500)       #Linear layer with 400 fan_in and 500 fan_out
        self.bn2 = nn.BatchNorm1d(num_features=500)
        self.fc2 = nn.Linear(500, 10)           #Linear layer with 400 fan_in and 10 fan_out(i.e. number of output)
        
        
    def forward(self, x):
        x = F.relu(self.conv1(x))               #Applying relu on conv layer.
        x = F.max_pool2d(x, 2, 2)               #Applying pooling of size3 on output of conv.
        x = self.bn1(x)
        x = F.relu(self.conv2(x))               #Applying conv and relu on the output of maxpool.
        x = F.max_pool2d(x, 2, 2)               #Applying maxpool on the result of 2 conv layer.
        x = x.view(-1, 4*4*50)                  #Changing the dimension of image in 1d so that we can apply Linear layer. 
        x = F.relu(self.fc1(x))                 #Relu on first FC layer
        x = self.bn2(x)
        x = self.fc2(x)                         #Second FC Layer
        return F.log_softmax(x, dim=1)          #Applying softmax on the output to get the result which class output belong to.

In [14]:
if __name__ == '__main__':
    model = Net()
    print(model)
    main(model)

Net(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (bn1): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (bn2): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
)

Train set: Average loss: 0.0590, Accuracy: 59139/60000 (99%)


Test set: Average loss: 0.0605, Accuracy: 9832/10000 (98%)




Train set: Average loss: 0.0349, Accuracy: 59488/60000 (99%)


Test set: Average loss: 0.0406, Accuracy: 9876/10000 (99%)


Train set: Average loss: 0.0269, Accuracy: 59615/60000 (99%)


Test set: Average loss: 0.0355, Accuracy: 9886/10000 (99%)




Train set: Average loss: 0.0221, Accuracy: 59677/60000 (99%)


Test set: Average loss: 0.0324, Accuracy: 9902/10000 (99%)


Train set: Average loss: 0.0161, Accuracy: 59826/60000 (100%)


Test set: Average loss: 0.0287, Accuracy: 9905/10000 (99%)




Train set: Average loss: 0.0114, Accuracy: 59868/60000 (100%)


Test set: Average loss: 0.0239, Accuracy: 9919/10000 (99%)




Train set: Average loss: 0.0086, Accuracy: 59921/60000 (100%)


Test set: Average loss: 0.0240, Accuracy: 9918/10000 (99%)


Train set: Average loss: 0.0072, Accuracy: 59949/60000 (100%)


Test set: Average loss: 0.0232, Accuracy: 9926/10000 (99%)




Train set: Average loss: 0.0064, Accuracy: 59950/60000 (100%)


Test set: Average loss: 0.0227, Accuracy: 9923/10000 (99%)


Train set: Average loss: 0.0057, Accuracy: 59959/60000 (100%)


Test set: Average loss: 0.0229, Accuracy: 9928/10000 (99%)



### Report
Similar to dropout, it adds some noise to each hidden layer’s activations. Therefore, if we use batch normalization, we will use less dropout, which is a good thing because we are not going to lose a lot of information. Since we are using two regularizer in above setting we are getting better result than above setting.

Train set: Average loss: 0.0057, Accuracy: 59959/60000 (100%)


Test set: Average loss: 0.0229, Accuracy: 9928/10000 (99%)

## 4d With ReLU units and batch normalization (no DropOut).

### Init Weights

In [8]:
def init_weights(m):
#     print(m)
    if isinstance(m,nn.Linear):
        #m.weight.data.fill_(1.0)
        torch.nn.init.xavier_uniform_(m.weight)
    if isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)
#        m.weight.data.fill_(1.0)
#         if m.bias:
#             torch.nn.init.xavier_uniform_(m.bias)

### Model

In [9]:
class Net(nn.Module):                           #Make a class to create a Model
    def __init__(self,prob):                         #Init the class with the layer we want out model to have.
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)     #Conv layer with 1 i/p channel,20 o/p channel, 5 Kernel Size, 1 stride
        self.bn1 = nn.BatchNorm2d(20)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)    #Conv layer with 20 i/p channel,50 o/p channel, 5 Kernel Size, 1 stride        
        self.fc1 = nn.Linear(4*4*50, 500)       #Linear layer with 400 fan_in and 500 fan_out
        self.bn2 = nn.BatchNorm1d(num_features=500)
        self.fc2 = nn.Linear(500, 10)           #Linear layer with 400 fan_in and 10 fan_out(i.e. number of output)
    def forward(self, x):
        x = F.relu(self.conv1(x))               #Applying relu on conv layer.
        x = F.max_pool2d(x, 2, 2)               #Applying pooling of size3 on output of conv.
        x = self.bn1(x)
        x = F.relu(self.conv2(x))               #Applying conv and relu on the output of maxpool.
        x = F.max_pool2d(x, 2, 2)               #Applying maxpool on the result of 2 conv layer.
        x = x.view(-1, 4*4*50)                  #Changing the dimension of image in 1d so that we can apply Linear layer. 
        x = F.relu(self.fc1(x))                 #Relu on first FC layer
        x = self.bn2(x)
        x = self.fc2(x)                         #Second FC Layer
        return F.log_softmax(x, dim=1)          #Applying softmax on the output to get the result which class output belong to.

In [10]:
if __name__ == '__main__':
    model = Net(0.5)
    model.apply(init_weights)
    print(model)
    main(model)

Net(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (bn1): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (bn2): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
)

Train set: Average loss: 0.0617, Accuracy: 59066/60000 (98%)


Test set: Average loss: 0.0618, Accuracy: 9835/10000 (98%)




Train set: Average loss: 0.0384, Accuracy: 59414/60000 (99%)


Test set: Average loss: 0.0452, Accuracy: 9865/10000 (99%)


Train set: Average loss: 0.0311, Accuracy: 59550/60000 (99%)


Test set: Average loss: 0.0380, Accuracy: 9878/10000 (99%)




Train set: Average loss: 0.0249, Accuracy: 59616/60000 (99%)


Test set: Average loss: 0.0345, Accuracy: 9894/10000 (99%)


Train set: Average loss: 0.0193, Accuracy: 59753/60000 (100%)


Test set: Average loss: 0.0313, Accuracy: 9908/10000 (99%)




Train set: Average loss: 0.0160, Accuracy: 59782/60000 (100%)


Test set: Average loss: 0.0283, Accuracy: 9917/10000 (99%)




Train set: Average loss: 0.0125, Accuracy: 59860/60000 (100%)


Test set: Average loss: 0.0275, Accuracy: 9915/10000 (99%)


Train set: Average loss: 0.0267, Accuracy: 59532/60000 (99%)


Test set: Average loss: 0.0420, Accuracy: 9853/10000 (99%)




Train set: Average loss: 0.0092, Accuracy: 59916/60000 (100%)


Test set: Average loss: 0.0256, Accuracy: 9916/10000 (99%)


Train set: Average loss: 0.0087, Accuracy: 59917/60000 (100%)


Test set: Average loss: 0.0268, Accuracy: 9913/10000 (99%)



### Report
I did xavier_uniform on the weight which are already there. I also tried weight =1 in FC which gave really bad result like 66% with batchnorm. With uniform on all the layer accuracy is really good.
So it doesn't matter of batchnorm for different weight initialization.