In [0]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [0]:
class NetSigmoid(nn.Module):
    def __init__(self):
        super(NetSigmoid, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        x = F.sigmoid(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.sigmoid(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.sigmoid(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [0]:
class NetRelu(nn.Module):
    def __init__(self, d=0):
        super(NetRelu, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)
        self.drop = d

    def forward(self, x):
        x = F.dropout(F.relu(self.conv1(x)), p=self.drop)
        x = F.max_pool2d(x, 2, 2)
        x = F.dropout(F.relu(self.conv2(x)), p=self.drop)
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.dropout(F.relu(self.fc1(x)), p=self.drop)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [0]:
class NetReluNorm(nn.Module):
    def __init__(self, d=0):
        super(NetReluNorm, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.bn1 = nn.BatchNorm2d(20)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.bn2 = nn.BatchNorm2d(50)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.fc2 = nn.Linear(500, 10)
        self.drop = d

    def forward(self, x):
        x = F.dropout(self.bn1(F.relu(self.conv1(x))), p=self.drop)
        x = F.max_pool2d(x, 2, 2)
        x = F.dropout(self.bn2(F.relu(self.conv2(x))), p=self.drop)
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.dropout(F.relu(self.fc1(x)), p=self.drop)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [0]:
class NetReluXavier(nn.Module):
    def __init__(self, d=0):
        super(NetReluXavier, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        nn.init.xavier_normal_(self.conv1.weight)
        self.bn1 = nn.BatchNorm2d(20)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        nn.init.xavier_normal_(self.conv2.weight)
        self.bn2 = nn.BatchNorm2d(50)
        self.fc1 = nn.Linear(4*4*50, 500)
        nn.init.xavier_normal_(self.fc1.weight)
        self.fc2 = nn.Linear(500, 10)
        nn.init.xavier_normal_(self.fc2.weight)
        self.drop = d

    def forward(self, x):
        x = F.dropout(self.bn1(F.relu(self.conv1(x))), p=self.drop)
        x = F.max_pool2d(x, 2, 2)
        x = F.dropout(self.bn2(F.relu(self.conv2(x))), p=self.drop)
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.dropout(F.relu(self.fc1(x)), p=self.drop)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [0]:
class NetReluKaiming(nn.Module):
    def __init__(self, d=0):
        super(NetReluKaiming, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        nn.init.kaiming_normal_(self.conv1.weight)
        self.bn1 = nn.BatchNorm2d(20)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        nn.init.kaiming_normal_(self.conv2.weight)
        self.bn2 = nn.BatchNorm2d(50)
        self.fc1 = nn.Linear(4*4*50, 500)
        nn.init.kaiming_normal_(self.fc1.weight)
        self.fc2 = nn.Linear(500, 10)
        nn.init.kaiming_normal_(self.fc2.weight)
        self.drop = d

    def forward(self, x):
        x = F.dropout(self.bn1(F.relu(self.conv1(x))), p=self.drop)
        x = F.max_pool2d(x, 2, 2)
        x = F.dropout(self.bn2(F.relu(self.conv2(x))), p=self.drop)
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4*4*50)
        x = F.dropout(F.relu(self.fc1(x)), p=self.drop)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [0]:
def train(log_interval, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [0]:
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [0]:
def main(flag, drop=0):
    # Training settings
    batchSize = 64
    testBatchSize = 1000
    epochs= 10
    learningRate = 0.01
    momentum = 0.5
    use_cuda = False
    seed = 1
    saveModel = False
    logInterval = 1000

    torch.manual_seed(seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batchSize, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batchSize, shuffle=True, **kwargs)


    if flag == "sig":
        model = NetSigmoid().to(device)
    elif flag == "relu":
        model = NetRelu(drop).to(device)
    elif flag == "normRelu":
        model = NetReluNorm(drop).to(device)
    elif flag == "NetReluXavier":
       model =  NetReluXavier().to(device)
    elif flag == "NetReluKaiming":
        model =  NetReluKaiming().to(device)
        
    optimizer = optim.SGD(model.parameters(), lr=learningRate, momentum=momentum)
    
    for epoch in range(1, epochs + 1):
        train(logInterval, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        
    print("Train Accuracy")
    test(model, device, train_loader)
    print("Test Accuracy", end="")
    test(model, device, test_loader)

#     if (args.save_model):
#         torch.save(model.state_dict(),"mnist_cnn.pt")

<h4>a) We observe that relu performs better sigmoid on both testing and training. Sigmoid provides healthy gradients only in a very small range as the value of function saturates towards extremes. There is no such issue with Relu thereforce we observe a better performance.</h4>

In [19]:
# if __name__ == '__main__':
main("sig")




Test set: Average loss: 2.3035, Accuracy: 1287/10000 (13%)


Test set: Average loss: 2.2758, Accuracy: 1491/10000 (15%)


Test set: Average loss: 1.8694, Accuracy: 4889/10000 (49%)


Test set: Average loss: 0.8458, Accuracy: 7832/10000 (78%)


Test set: Average loss: 0.5315, Accuracy: 8470/10000 (85%)


Test set: Average loss: 0.4108, Accuracy: 8797/10000 (88%)


Test set: Average loss: 0.3411, Accuracy: 9007/10000 (90%)


Test set: Average loss: 0.2958, Accuracy: 9123/10000 (91%)


Test set: Average loss: 0.2570, Accuracy: 9219/10000 (92%)


Test set: Average loss: 0.2308, Accuracy: 9317/10000 (93%)

Train Accuracy

Test set: Average loss: 0.2448, Accuracy: 55753/60000 (93%)

Test Accuracy
Test set: Average loss: 0.2308, Accuracy: 9317/10000 (93%)



In [31]:
main("relu")


Test set: Average loss: 0.1017, Accuracy: 9661/10000 (97%)


Test set: Average loss: 0.0611, Accuracy: 9826/10000 (98%)


Test set: Average loss: 0.0563, Accuracy: 9807/10000 (98%)


Test set: Average loss: 0.0405, Accuracy: 9864/10000 (99%)


Test set: Average loss: 0.0387, Accuracy: 9870/10000 (99%)


Test set: Average loss: 0.0337, Accuracy: 9892/10000 (99%)


Test set: Average loss: 0.0342, Accuracy: 9873/10000 (99%)


Test set: Average loss: 0.0393, Accuracy: 9874/10000 (99%)


Test set: Average loss: 0.0295, Accuracy: 9907/10000 (99%)


Test set: Average loss: 0.0320, Accuracy: 9892/10000 (99%)

Train Accuracy

Test set: Average loss: 0.0181, Accuracy: 59677/60000 (99%)

Test Accuracy
Test set: Average loss: 0.0320, Accuracy: 9892/10000 (99%)



<h4>b) It was observed that a dropout of 0.25 gives optimal performance. The accuracy is 98.97%.<br>
Following are accuracy of various settings given as a tuple of (training, testing) accuracy:<br>
0.25: (59458/60000, 9897/10000)<br>
0.5: (59174/60000, 9860/10000)<br>
0.75: (57948/60000, 9693/10000)<br>
1: (5923/60000, 980/10000)<br><br>

The performance with 100% dropout is justified as whole network is wasted and network don't learn at all. It's peroformance should be equivalent to random guessing(0.1). It is observed that accuracy decreases monotonically as we increase dropout from 0.25 to 1.</h4>

In [32]:
dropout = [0.25, 0.5, 0.75, 0.99]
for d in dropout:
    print("Dropout =",d)
    main("relu", d)

Dropout = 0.25

Test set: Average loss: 0.1045, Accuracy: 9657/10000 (97%)


Test set: Average loss: 0.0743, Accuracy: 9751/10000 (98%)


Test set: Average loss: 0.0543, Accuracy: 9826/10000 (98%)


Test set: Average loss: 0.0480, Accuracy: 9841/10000 (98%)


Test set: Average loss: 0.0532, Accuracy: 9823/10000 (98%)


Test set: Average loss: 0.0394, Accuracy: 9869/10000 (99%)


Test set: Average loss: 0.0371, Accuracy: 9888/10000 (99%)


Test set: Average loss: 0.0358, Accuracy: 9880/10000 (99%)


Test set: Average loss: 0.0320, Accuracy: 9890/10000 (99%)


Test set: Average loss: 0.0354, Accuracy: 9887/10000 (99%)

Train Accuracy

Test set: Average loss: 0.0287, Accuracy: 59458/60000 (99%)

Test Accuracy
Test set: Average loss: 0.0331, Accuracy: 9897/10000 (99%)

Dropout = 0.5

Test set: Average loss: 0.1451, Accuracy: 9534/10000 (95%)


Test set: Average loss: 0.1030, Accuracy: 9689/10000 (97%)


Test set: Average loss: 0.0764, Accuracy: 9760/10000 (98%)


Test set: Average loss: 0.

<h4>c) Relu+BatchNorm+Dropout of 0.25: 9919/10000<br>
Relu+BatchNorm: 9892/10000<br><br>
We observe that batch normalisation with dropout performs slightly better then model without dropout setting. Both batch normalisation and dropout to used to perform regularisation. When we use both of them together they complement each other and we get better results.</h4>

In [33]:
main("normRelu", 0.25)


Test set: Average loss: 0.0636, Accuracy: 9823/10000 (98%)


Test set: Average loss: 0.0522, Accuracy: 9829/10000 (98%)


Test set: Average loss: 0.0379, Accuracy: 9870/10000 (99%)


Test set: Average loss: 0.0373, Accuracy: 9876/10000 (99%)


Test set: Average loss: 0.0385, Accuracy: 9875/10000 (99%)


Test set: Average loss: 0.0273, Accuracy: 9909/10000 (99%)


Test set: Average loss: 0.0299, Accuracy: 9901/10000 (99%)


Test set: Average loss: 0.0274, Accuracy: 9904/10000 (99%)


Test set: Average loss: 0.0234, Accuracy: 9926/10000 (99%)


Test set: Average loss: 0.0284, Accuracy: 9910/10000 (99%)

Train Accuracy

Test set: Average loss: 0.0157, Accuracy: 59731/60000 (100%)

Test Accuracy
Test set: Average loss: 0.0268, Accuracy: 9919/10000 (99%)



In [10]:
main("normRelu")

  0%|          | 16384/9912422 [00:00<01:12, 136204.94it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


9920512it [00:02, 3759110.86it/s]                             


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw


32768it [00:00, 442426.65it/s]
  1%|          | 16384/1648877 [00:00<00:11, 147024.67it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz
Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


1654784it [00:00, 2161377.45it/s]                            
8192it [00:00, 184644.40it/s]


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz
Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw
Processing...
Done!

Test set: Average loss: 0.0505, Accuracy: 9852/10000 (99%)


Test set: Average loss: 0.0393, Accuracy: 9878/10000 (99%)


Test set: Average loss: 0.0320, Accuracy: 9889/10000 (99%)


Test set: Average loss: 0.0319, Accuracy: 9883/10000 (99%)


Test set: Average loss: 0.0280, Accuracy: 9909/10000 (99%)


Test set: Average loss: 0.0276, Accuracy: 9905/10000 (99%)


Test set: Average loss: 0.0273, Accuracy: 9905/10000 (99%)


Test set: Average loss: 0.0327, Accuracy: 9894/10000 (99%)


Test set: Average loss: 0.0296, Accuracy: 9904/10000 (99%)


Test set: Average loss: 0.0312, Accuracy: 9892/10000 (99%)

Train Accuracy

Test set: Average loss: 0.0028, Accuracy: 59990/60000 (100%)

Test Accuracy
Test set

<h4>d) (training accuracy, testing accuracy)<br>
Xavier+Relu+BatchNorm: (59993/60000, 9914/10000)<br>
Kaiming+Relu+BatchNorm: (59989/60000, 9896/10000)<br><br>
It is  observed that Xavier performs slightly better than Kaiming. This is probably because Xavier gives a better generalisation performance.</h4>

In [17]:
main("NetReluXavier")


Test set: Average loss: 0.0507, Accuracy: 9833/10000 (98%)


Test set: Average loss: 0.0394, Accuracy: 9876/10000 (99%)


Test set: Average loss: 0.0339, Accuracy: 9892/10000 (99%)


Test set: Average loss: 0.0353, Accuracy: 9878/10000 (99%)


Test set: Average loss: 0.0311, Accuracy: 9898/10000 (99%)


Test set: Average loss: 0.0307, Accuracy: 9897/10000 (99%)


Test set: Average loss: 0.0284, Accuracy: 9909/10000 (99%)


Test set: Average loss: 0.0286, Accuracy: 9907/10000 (99%)


Test set: Average loss: 0.0276, Accuracy: 9911/10000 (99%)


Test set: Average loss: 0.0278, Accuracy: 9914/10000 (99%)

Train Accuracy

Test set: Average loss: 0.0025, Accuracy: 59993/60000 (100%)

Test Accuracy
Test set: Average loss: 0.0278, Accuracy: 9914/10000 (99%)



In [18]:
main("NetReluKaiming")


Test set: Average loss: 0.0629, Accuracy: 9799/10000 (98%)


Test set: Average loss: 0.0491, Accuracy: 9835/10000 (98%)


Test set: Average loss: 0.0402, Accuracy: 9868/10000 (99%)


Test set: Average loss: 0.0363, Accuracy: 9875/10000 (99%)


Test set: Average loss: 0.0339, Accuracy: 9895/10000 (99%)


Test set: Average loss: 0.0335, Accuracy: 9893/10000 (99%)


Test set: Average loss: 0.0320, Accuracy: 9897/10000 (99%)


Test set: Average loss: 0.0321, Accuracy: 9884/10000 (99%)


Test set: Average loss: 0.0309, Accuracy: 9895/10000 (99%)


Test set: Average loss: 0.0313, Accuracy: 9896/10000 (99%)

Train Accuracy

Test set: Average loss: 0.0045, Accuracy: 59989/60000 (100%)

Test Accuracy
Test set: Average loss: 0.0313, Accuracy: 9896/10000 (99%)

