In [0]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [0]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.drop5 = nn.Dropout(0.5)                  # Dropout - 50%
        self.drop2 = nn.Dropout(0.2)                  # Dropout - 20%
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)   
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bnm2d1 = nn.BatchNorm2d(64)              # Batch normalization
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.bnm2d2 = nn.BatchNorm2d(256)             # Batch normalization
        self.conv5 = nn.Conv2d(256, 512, 3)
        self.conv6 = nn.Conv2d(512, 1024, 3)
        self.conv7 = nn.Conv2d(1024, 10, 3)

# Dropout layer added after each convolution layer till conv4. Since dropout should not be used near to output layer,
# conv5, conv6 and conv7 didnot have dropout
# ReLU activation function was removed after conv7 - as it clips off negative values, such loss of information shouldn't be there when we are close to o/p layer

    def forward(self, x):
        x = self.bnm2d1(self.pool1(F.relu(self.drop2(self.conv2(F.relu(self.drop2(self.conv1(x))))))))    
        x = self.bnm2d2(self.pool2(F.relu(self.drop2(self.conv4(F.relu(self.drop2(self.conv3(x))))))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = self.conv7(x)
        x = x.view(-1, 10)
        return F.log_softmax(x)

In [0]:
# Reduced the number of channels in each layer, so that the number of parameters is less than 20k (17930 parameters)
# Every layer has only 16 channels
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.drop5 = nn.Dropout(0.5)
        self.drop2 = nn.Dropout(0.2)
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1) #input -? OUtput? RF
        self.conv2 = nn.Conv2d(16, 16, 3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bnm2d1 = nn.BatchNorm2d(16) 
        self.conv3 = nn.Conv2d(16, 16, 3, padding=1)
        self.conv4 = nn.Conv2d(16, 16, 3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.bnm2d2 = nn.BatchNorm2d(16) 
        self.conv5 = nn.Conv2d(16, 16, 3)
        self.conv6 = nn.Conv2d(16, 16, 3)
        self.conv7 = nn.Conv2d(16, 10, 3)

    def forward(self, x):
        x = self.bnm2d1(self.pool1(F.relu(self.drop2(self.conv2(F.relu(self.drop2(self.conv1(x))))))))
        x = self.bnm2d2(self.pool2(F.relu(self.drop2(self.conv4(F.relu(self.drop2(self.conv3(x))))))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = self.conv7(x)
        x = x.view(-1, 10)
        return F.log_softmax(x)

# Model summary
# ----------------------------------------------------------------
#         Layer (type)               Output Shape         Param #
# ================================================================
#             Conv2d-1           [-1, 16, 28, 28]             160
#            Dropout-2           [-1, 16, 28, 28]               0
#             Conv2d-3           [-1, 32, 28, 28]           4,640
#            Dropout-4           [-1, 32, 28, 28]               0
#          MaxPool2d-5           [-1, 32, 14, 14]               0
#        BatchNorm2d-6           [-1, 32, 14, 14]              64
#             Conv2d-7           [-1, 16, 14, 14]           4,624
#            Dropout-8           [-1, 16, 14, 14]               0
#             Conv2d-9           [-1, 16, 14, 14]           2,320
#           Dropout-10           [-1, 16, 14, 14]               0
#         MaxPool2d-11             [-1, 16, 7, 7]               0
#       BatchNorm2d-12             [-1, 16, 7, 7]              32
#            Conv2d-13             [-1, 16, 5, 5]           2,320
#            Conv2d-14             [-1, 16, 3, 3]           2,320
#            Conv2d-15             [-1, 10, 1, 1]           1,450
# ================================================================
# Total params: 17,930
# Trainable params: 17,930
# Non-trainable params: 0
# ----------------------------------------------------------------

In [0]:
# Few layers have 32 channels and few layers 16, to keep the number of parameters below 20k.
# Got 99.47% accuracy
# Link to cell having result log
# https://colab.research.google.com/drive/1bt70G53grT-_bE8YUFue9VA98KnlQw69#scrollTo=5kPk664rXC4o&line=52&uniqifier=1

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.drop5 = nn.Dropout(0.5)
        self.drop2 = nn.Dropout(0.2)
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1) #input -? OUtput? RF
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bnm2d1 = nn.BatchNorm2d(32) 
        self.conv3 = nn.Conv2d(32, 16, 3, padding=1)
        self.conv4 = nn.Conv2d(16, 16, 3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.bnm2d2 = nn.BatchNorm2d(16) 
        self.conv5 = nn.Conv2d(16, 16, 3)
        self.conv6 = nn.Conv2d(16, 16, 3)
        self.conv7 = nn.Conv2d(16, 10, 3)

    def forward(self, x):
        x = self.bnm2d1(self.pool1(F.relu(self.drop2(self.conv2(F.relu(self.drop2(self.conv1(x))))))))
        x = self.bnm2d2(self.pool2(F.relu(self.drop2(self.conv4(F.relu(self.drop2(self.conv3(x))))))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = self.conv7(x)
        x = x.view(-1, 10)
        return F.log_softmax(x)

In [0]:
# Trying the accuracy with Dropout = 50% with earlier layers and dropout = 20% with conv3 and conv4
# Drop out 50% reduced the performance to 98.15%
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.drop5 = nn.Dropout(0.5)
        self.drop2 = nn.Dropout(0.2)
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1) 
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bnm2d1 = nn.BatchNorm2d(32) 
        self.conv3 = nn.Conv2d(32, 16, 3, padding=1)
        self.conv4 = nn.Conv2d(16, 16, 3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.bnm2d2 = nn.BatchNorm2d(16) 
        self.conv5 = nn.Conv2d(16, 16, 3)
        self.conv6 = nn.Conv2d(16, 16, 3)
        self.conv7 = nn.Conv2d(16, 10, 3)

    def forward(self, x):
        x = self.bnm2d1(self.pool1(F.relu(self.drop5(self.conv2(F.relu(self.drop5(self.conv1(x))))))))
        x = self.bnm2d2(self.pool2(F.relu(self.drop2(self.conv4(F.relu(self.drop2(self.conv3(x))))))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = self.conv7(x)
        x = x.view(-1, 10)
        return F.log_softmax(x)

In [0]:
# Initial layers have 32 channels, later only 16, to keep the parameters less. This iteration has 22k parameters
# Since we have lot of info in the initial layers, it would be better to have more channels there and reduce gradually
# Accuracy = 99.2
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.drop5 = nn.Dropout(0.5)
        self.drop2 = nn.Dropout(0.2)
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1) 
        self.conv2 = nn.Conv2d(32, 32, 3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bnm2d1 = nn.BatchNorm2d(32) 
        self.conv3 = nn.Conv2d(32, 16, 3, padding=1)
        self.conv4 = nn.Conv2d(16, 16, 3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.bnm2d2 = nn.BatchNorm2d(16) 
        self.conv5 = nn.Conv2d(16, 16, 3)
        self.conv6 = nn.Conv2d(16, 16, 3)
        self.conv7 = nn.Conv2d(16, 10, 3)

    def forward(self, x):
        x = self.bnm2d1(self.pool1(F.relu(self.drop2(self.conv2(F.relu(self.drop2(self.conv1(x))))))))
        x = self.bnm2d2(self.pool2(F.relu(self.drop2(self.conv4(F.relu(self.drop2(self.conv3(x))))))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = self.conv7(x)
        x = x.view(-1, 10)
        return F.log_softmax(x)

In [0]:
# Another iteration choosing combination of channels to have 19498 parameters
# Accuracy = 99.45%
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.drop5 = nn.Dropout(0.5)
        self.drop2 = nn.Dropout(0.2)
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1) 
        self.conv2 = nn.Conv2d(32, 16, 3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bnm2d1 = nn.BatchNorm2d(16) 
        self.conv3 = nn.Conv2d(16, 16, 3, padding=1)
        self.conv4 = nn.Conv2d(16, 16, 3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.bnm2d2 = nn.BatchNorm2d(16) 
        self.conv5 = nn.Conv2d(16, 16, 3)
        self.conv6 = nn.Conv2d(16, 32, 3)
        self.conv7 = nn.Conv2d(32, 10, 3)

    def forward(self, x):
        x = self.bnm2d1(self.pool1(F.relu(self.drop2(self.conv2(F.relu(self.drop2(self.conv1(x))))))))
        x = self.bnm2d2(self.pool2(F.relu(self.drop2(self.conv4(F.relu(self.drop2(self.conv3(x))))))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = self.conv7(x)
        x = x.view(-1, 10)
        return F.log_softmax(x)

In [0]:
# Another iteration with Batch normalization after every initial convolution and pooling layers - 19658 parameters
# This will reduce the accuracy to 82%. Also accuracy of each epoch varies between 78 to 86%. 

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.drop5 = nn.Dropout(0.5)
        self.drop2 = nn.Dropout(0.2)
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1) 
        self.bnm2d32 = nn.BatchNorm2d(32) 
        self.conv2 = nn.Conv2d(32, 16, 3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.bnm2d16 = nn.BatchNorm2d(16) 
        self.conv3 = nn.Conv2d(16, 16, 3, padding=1)
        self.conv4 = nn.Conv2d(16, 16, 3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        # self.bnm2d2 = nn.BatchNorm2d(16) 
        self.conv5 = nn.Conv2d(16, 16, 3)
        self.conv6 = nn.Conv2d(16, 32, 3)
        self.conv7 = nn.Conv2d(32, 10, 3)

    def forward(self, x):
        x = self.bnm2d16(self.pool1(F.relu(self.drop2(self.bnm2d16(self.conv2(F.relu(self.drop2(self.bnm2d32(self.conv1(x))))))))))
        x = self.bnm2d16(self.pool2(F.relu(self.drop2(self.bnm2d16(self.conv4(F.relu(self.drop2(self.bnm2d16(self.conv3(x))))))))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = self.conv7(x)
        x = x.view(-1, 10)
        return F.log_softmax(x)

In [66]:
# !pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = Net().to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 28, 28]             160
           Dropout-2           [-1, 16, 28, 28]               0
            Conv2d-3           [-1, 32, 28, 28]           4,640
           Dropout-4           [-1, 32, 28, 28]               0
         MaxPool2d-5           [-1, 32, 14, 14]               0
       BatchNorm2d-6           [-1, 32, 14, 14]              64
            Conv2d-7           [-1, 16, 14, 14]           4,624
           Dropout-8           [-1, 16, 14, 14]               0
            Conv2d-9           [-1, 16, 14, 14]           2,320
          Dropout-10           [-1, 16, 14, 14]               0
        MaxPool2d-11             [-1, 16, 7, 7]               0
      BatchNorm2d-12             [-1, 16, 7, 7]              32
           Conv2d-13             [-1, 16, 5, 5]           2,320
           Conv2d-14             [-1, 1



In [0]:


torch.manual_seed(1)
batch_size = 128

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)


In [0]:
from tqdm import tqdm
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    pbar = tqdm(train_loader)
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}')


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [6]:
#original run
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 2):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=1.9518176317214966 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.52it/s]



Test set: Average loss: 1.8771, Accuracy: 2876/10000 (29%)



In [11]:
#Run with 20 epochs
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 20):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=1.9518245458602905 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.62it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 1.8772, Accuracy: 2873/10000 (28.73%)



loss=1.3488942384719849 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.58it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 1.4432, Accuracy: 4867/10000 (48.67%)



loss=1.4429212808609009 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.81it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 1.4210, Accuracy: 4907/10000 (49.07%)



loss=1.3823367357254028 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.67it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 1.4152, Accuracy: 4928/10000 (49.28%)



loss=1.4422550201416016 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.75it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 1.4114, Accuracy: 4941/10000 (49.41%)



loss=1.4391508102416992 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.63it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 1.4173, Accuracy: 4926/10000 (49.26%)



loss=1.0829514265060425 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.74it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9664, Accuracy: 6880/10000 (68.80%)



loss=1.0823291540145874 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.77it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9627, Accuracy: 6893/10000 (68.93%)



loss=0.7476081848144531 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.73it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9578, Accuracy: 6897/10000 (68.97%)



loss=0.8245716691017151 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.72it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9501, Accuracy: 6949/10000 (69.49%)



loss=1.055822491645813 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.83it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9494, Accuracy: 6942/10000 (69.42%)



loss=1.007554054260254 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.74it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9488, Accuracy: 6954/10000 (69.54%)



loss=0.9596124291419983 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.79it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9495, Accuracy: 6943/10000 (69.43%)



loss=0.8884830474853516 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.65it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9554, Accuracy: 6936/10000 (69.36%)



loss=0.9154046177864075 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.67it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9539, Accuracy: 6941/10000 (69.41%)



loss=1.0554593801498413 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.68it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9519, Accuracy: 6958/10000 (69.58%)



loss=1.1754626035690308 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.60it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9505, Accuracy: 6952/10000 (69.52%)



loss=0.8664731979370117 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.59it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.7334, Accuracy: 7917/10000 (79.17%)



loss=0.6963953375816345 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.71it/s]



Test set: Average loss: 0.7231, Accuracy: 7935/10000 (79.35%)



In [18]:
# def forward(self, x):
#     x = self.bnm2d(self.pool1(F.relu(self.drop2(self.conv2(F.relu(self.drop2(self.conv1(x))))))))
#     x = self.bnm2d(self.pool2(F.relu(self.drop2(self.conv4(F.relu(self.drop2(self.conv3(x))))))))
#     x = F.relu(self.conv6(F.relu(self.conv5(x))))
#     x = self.conv7(x)
#     x = x.view(-1, 10)
#     return F.log_softmax(x)

#Run with 10 epochs
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 10):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=0.022708410397171974 batch_id=468: 100%|██████████| 469/469 [00:23<00:00, 20.31it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0454, Accuracy: 9877/10000 (98.77%)



loss=0.01673244498670101 batch_id=468: 100%|██████████| 469/469 [00:23<00:00, 20.24it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0300, Accuracy: 9910/10000 (99.10%)



loss=0.0038116872310638428 batch_id=468: 100%|██████████| 469/469 [00:23<00:00, 20.19it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0244, Accuracy: 9929/10000 (99.29%)



loss=0.0325201153755188 batch_id=468: 100%|██████████| 469/469 [00:23<00:00, 20.38it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0199, Accuracy: 9940/10000 (99.40%)



loss=0.0009327034349553287 batch_id=468: 100%|██████████| 469/469 [00:23<00:00, 20.22it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0191, Accuracy: 9936/10000 (99.36%)



loss=0.03523152694106102 batch_id=468: 100%|██████████| 469/469 [00:23<00:00, 20.15it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0175, Accuracy: 9945/10000 (99.45%)



loss=0.03151605650782585 batch_id=468: 100%|██████████| 469/469 [00:23<00:00, 20.19it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0172, Accuracy: 9945/10000 (99.45%)



loss=0.0004445711674634367 batch_id=468: 100%|██████████| 469/469 [00:23<00:00, 19.90it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0203, Accuracy: 9937/10000 (99.37%)



loss=0.0011947700986638665 batch_id=468: 100%|██████████| 469/469 [00:23<00:00, 19.90it/s]



Test set: Average loss: 0.0170, Accuracy: 9944/10000 (99.44%)



In [23]:
# Every layer has only 16 channels
# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
#         self.drop5 = nn.Dropout(0.5)
#         self.drop2 = nn.Dropout(0.2)
#         self.conv1 = nn.Conv2d(1, 16, 3, padding=1) #input -? OUtput? RF
#         self.conv2 = nn.Conv2d(16, 16, 3, padding=1)
#         self.pool1 = nn.MaxPool2d(2, 2)
#         self.bnm2d1 = nn.BatchNorm2d(16) 
#         self.conv3 = nn.Conv2d(16, 16, 3, padding=1)
#         self.conv4 = nn.Conv2d(16, 16, 3, padding=1)
#         self.pool2 = nn.MaxPool2d(2, 2)
#         self.bnm2d2 = nn.BatchNorm2d(16) 
#         self.conv5 = nn.Conv2d(16, 16, 3)
#         self.conv6 = nn.Conv2d(16, 16, 3)
#         self.conv7 = nn.Conv2d(16, 10, 3)

#     def forward(self, x):
#         x = self.bnm2d1(self.pool1(F.relu(self.drop2(self.conv2(F.relu(self.drop2(self.conv1(x))))))))
#         x = self.bnm2d2(self.pool2(F.relu(self.drop2(self.conv4(F.relu(self.drop2(self.conv3(x))))))))
#         x = F.relu(self.conv6(F.relu(self.conv5(x))))
#         x = self.conv7(x)
#         x = x.view(-1, 10)
#         return F.log_softmax(x)

# Model summary:
# ----------------------------------------------------------------
#         Layer (type)               Output Shape         Param #
# ================================================================
#             Conv2d-1           [-1, 16, 28, 28]             160
#            Dropout-2           [-1, 16, 28, 28]               0
#             Conv2d-3           [-1, 16, 28, 28]           2,320
#            Dropout-4           [-1, 16, 28, 28]               0
#          MaxPool2d-5           [-1, 16, 14, 14]               0
#        BatchNorm2d-6           [-1, 16, 14, 14]              32
#             Conv2d-7           [-1, 16, 14, 14]           2,320
#            Dropout-8           [-1, 16, 14, 14]               0
#             Conv2d-9           [-1, 16, 14, 14]           2,320
#           Dropout-10           [-1, 16, 14, 14]               0
#         MaxPool2d-11             [-1, 16, 7, 7]               0
#       BatchNorm2d-12             [-1, 16, 7, 7]              32
#            Conv2d-13             [-1, 16, 5, 5]           2,320
#            Conv2d-14             [-1, 16, 3, 3]           2,320
#            Conv2d-15             [-1, 10, 1, 1]           1,450
# ================================================================
# Total params: 13,274
# Trainable params: 13,274
# Non-trainable params: 0
# ----------------------------------------------------------------
# Input size (MB): 0.00
# Forward/backward pass size (MB): 0.54
# Params size (MB): 0.05
# Estimated Total Size (MB): 0.60

#Run with 20 epochs
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 20):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=0.05805510655045509 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.01it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0783, Accuracy: 9790/10000 (97.90%)



loss=0.07486288994550705 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.00it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0543, Accuracy: 9838/10000 (98.38%)



loss=0.012703180313110352 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.60it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0465, Accuracy: 9858/10000 (98.58%)



loss=0.04207509383559227 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.47it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0354, Accuracy: 9893/10000 (98.93%)



loss=0.023821985349059105 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.52it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0334, Accuracy: 9897/10000 (98.97%)



loss=0.05374038219451904 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.35it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0264, Accuracy: 9917/10000 (99.17%)



loss=0.006659810896962881 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.61it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0290, Accuracy: 9905/10000 (99.05%)



loss=0.0080045061185956 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.26it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0292, Accuracy: 9913/10000 (99.13%)



loss=0.016972729936242104 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.59it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0281, Accuracy: 9917/10000 (99.17%)



loss=0.01912788487970829 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.67it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0275, Accuracy: 9920/10000 (99.20%)



loss=0.03635774552822113 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.78it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0256, Accuracy: 9925/10000 (99.25%)



loss=0.02600800432264805 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.60it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0282, Accuracy: 9910/10000 (99.10%)



loss=0.017928967252373695 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.38it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0266, Accuracy: 9913/10000 (99.13%)



loss=0.0074200681410729885 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.36it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0248, Accuracy: 9925/10000 (99.25%)



loss=0.0030089218635112047 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.70it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0241, Accuracy: 9930/10000 (99.30%)



loss=0.03418828919529915 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.57it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0308, Accuracy: 9905/10000 (99.05%)



loss=0.026800105348229408 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.58it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0215, Accuracy: 9935/10000 (99.35%)



loss=0.012669798918068409 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.41it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0213, Accuracy: 9940/10000 (99.40%)



loss=0.008372788317501545 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.40it/s]



Test set: Average loss: 0.0311, Accuracy: 9912/10000 (99.12%)



In [38]:
# Few layers have 32 channels
# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
#         self.drop5 = nn.Dropout(0.5)
#         self.drop2 = nn.Dropout(0.2)
#         self.conv1 = nn.Conv2d(1, 16, 3, padding=1) #input -? OUtput? RF
#         self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
#         self.pool1 = nn.MaxPool2d(2, 2)
#         self.bnm2d1 = nn.BatchNorm2d(32) 
#         self.conv3 = nn.Conv2d(32, 16, 3, padding=1)
#         self.conv4 = nn.Conv2d(16, 16, 3, padding=1)
#         self.pool2 = nn.MaxPool2d(2, 2)
#         self.bnm2d2 = nn.BatchNorm2d(16) 
#         self.conv5 = nn.Conv2d(16, 16, 3)
#         self.conv6 = nn.Conv2d(16, 16, 3)
#         self.conv7 = nn.Conv2d(16, 10, 3)

#     def forward(self, x):
#         x = self.bnm2d1(self.pool1(F.relu(self.drop2(self.conv2(F.relu(self.drop2(self.conv1(x))))))))
#         x = self.bnm2d2(self.pool2(F.relu(self.drop2(self.conv4(F.relu(self.drop2(self.conv3(x))))))))
#         x = F.relu(self.conv6(F.relu(self.conv5(x))))
#         x = self.conv7(x)
#         x = x.view(-1, 10)
#         return F.log_softmax(x)


# Model summary
# ----------------------------------------------------------------
#         Layer (type)               Output Shape         Param #
# ================================================================
#             Conv2d-1           [-1, 16, 28, 28]             160
#            Dropout-2           [-1, 16, 28, 28]               0
#             Conv2d-3           [-1, 32, 28, 28]           4,640
#            Dropout-4           [-1, 32, 28, 28]               0
#          MaxPool2d-5           [-1, 32, 14, 14]               0
#        BatchNorm2d-6           [-1, 32, 14, 14]              64
#             Conv2d-7           [-1, 16, 14, 14]           4,624
#            Dropout-8           [-1, 16, 14, 14]               0
#             Conv2d-9           [-1, 16, 14, 14]           2,320
#           Dropout-10           [-1, 16, 14, 14]               0
#         MaxPool2d-11             [-1, 16, 7, 7]               0
#       BatchNorm2d-12             [-1, 16, 7, 7]              32
#            Conv2d-13             [-1, 16, 5, 5]           2,320
#            Conv2d-14             [-1, 16, 3, 3]           2,320
#            Conv2d-15             [-1, 10, 1, 1]           1,450
# ================================================================
# Total params: 17,930
# Trainable params: 17,930
# Non-trainable params: 0
# ----------------------------------------------------------------
# Input size (MB): 0.00
# Forward/backward pass size (MB): 0.78
# Params size (MB): 0.07
# Estimated Total Size (MB): 0.85

#Run on 20 epochs
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 20):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=0.12399271875619888 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.61it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0989, Accuracy: 9766/10000 (97.66%)



loss=0.2528211772441864 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.95it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0611, Accuracy: 9846/10000 (98.46%)



loss=0.054707471281290054 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.51it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0544, Accuracy: 9852/10000 (98.52%)



loss=0.03220691159367561 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.37it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0423, Accuracy: 9899/10000 (98.99%)



loss=0.025242725387215614 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.62it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0329, Accuracy: 9909/10000 (99.09%)



loss=0.03200244903564453 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.82it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0259, Accuracy: 9933/10000 (99.33%)



loss=0.05244546756148338 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.07it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0280, Accuracy: 9919/10000 (99.19%)



loss=0.01254651416093111 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.37it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0256, Accuracy: 9923/10000 (99.23%)



loss=0.004881918430328369 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.11it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0229, Accuracy: 9936/10000 (99.36%)



loss=0.04011973366141319 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.18it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0232, Accuracy: 9931/10000 (99.31%)



loss=0.03027212619781494 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.08it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0251, Accuracy: 9928/10000 (99.28%)



loss=0.012233312241733074 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.36it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0282, Accuracy: 9909/10000 (99.09%)



loss=0.024521028622984886 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.33it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0236, Accuracy: 9922/10000 (99.22%)



loss=0.03932979702949524 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.52it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0214, Accuracy: 9934/10000 (99.34%)



loss=0.00937757920473814 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.51it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0235, Accuracy: 9928/10000 (99.28%)



loss=0.02239391766488552 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.39it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0204, Accuracy: 9937/10000 (99.37%)



loss=0.014169435016810894 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.42it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0191, Accuracy: 9945/10000 (99.45%)



loss=0.011173729784786701 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.70it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0228, Accuracy: 9926/10000 (99.26%)



loss=0.013916552066802979 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.73it/s]



Test set: Average loss: 0.0192, Accuracy: 9947/10000 (99.47%)



In [43]:
# Extra trial with different dropout values

#Run on 20 epochs
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 20):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=0.1705964058637619 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.70it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.4001, Accuracy: 8849/10000 (88.49%)



loss=0.06826430559158325 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.28it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.1225, Accuracy: 9661/10000 (96.61%)



loss=0.006736104842275381 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.41it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.1226, Accuracy: 9623/10000 (96.23%)



loss=0.021738365292549133 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.30it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0743, Accuracy: 9787/10000 (97.87%)



loss=0.02650623954832554 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.38it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.1124, Accuracy: 9656/10000 (96.56%)



loss=0.009350140579044819 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.67it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0622, Accuracy: 9802/10000 (98.02%)



loss=0.017443561926484108 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.42it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0587, Accuracy: 9815/10000 (98.15%)



loss=0.05367700755596161 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.27it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0758, Accuracy: 9765/10000 (97.65%)



loss=0.02917124330997467 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.58it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0583, Accuracy: 9809/10000 (98.09%)



loss=0.04064783826470375 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.54it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0591, Accuracy: 9807/10000 (98.07%)



loss=0.005118578672409058 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.50it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0493, Accuracy: 9843/10000 (98.43%)



loss=0.03270355984568596 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.43it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0926, Accuracy: 9686/10000 (96.86%)



loss=0.014696896076202393 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.35it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0565, Accuracy: 9809/10000 (98.09%)



loss=0.02245255373418331 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.41it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0581, Accuracy: 9798/10000 (97.98%)



loss=0.04608713090419769 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.16it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0538, Accuracy: 9806/10000 (98.06%)



loss=0.01052247453480959 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.38it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0435, Accuracy: 9855/10000 (98.55%)



loss=0.07749127596616745 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.61it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0471, Accuracy: 9843/10000 (98.43%)



loss=0.04132729396224022 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.60it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0755, Accuracy: 9747/10000 (97.47%)



loss=0.010548372752964497 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.15it/s]



Test set: Average loss: 0.0551, Accuracy: 9815/10000 (98.15%)



In [46]:
# Model summary
# ----------------------------------------------------------------
#         Layer (type)               Output Shape         Param #
# ================================================================
#             Conv2d-1           [-1, 32, 28, 28]             320
#            Dropout-2           [-1, 32, 28, 28]               0
#             Conv2d-3           [-1, 32, 28, 28]           9,248
#            Dropout-4           [-1, 32, 28, 28]               0
#          MaxPool2d-5           [-1, 32, 14, 14]               0
#        BatchNorm2d-6           [-1, 32, 14, 14]              64
#             Conv2d-7           [-1, 16, 14, 14]           4,624
#            Dropout-8           [-1, 16, 14, 14]               0
#             Conv2d-9           [-1, 16, 14, 14]           2,320
#           Dropout-10           [-1, 16, 14, 14]               0
#         MaxPool2d-11             [-1, 16, 7, 7]               0
#       BatchNorm2d-12             [-1, 16, 7, 7]              32
#            Conv2d-13             [-1, 16, 5, 5]           2,320
#            Conv2d-14             [-1, 16, 3, 3]           2,320
#            Conv2d-15             [-1, 10, 1, 1]           1,450
# ================================================================
# Total params: 22,698
# Trainable params: 22,698
# Non-trainable params: 0
# ----------------------------------------------------------------
# Input size (MB): 0.00
# Forward/backward pass size (MB): 0.97
# Params size (MB): 0.09
# Estimated Total Size (MB): 1.06
# ----------------------------------------------------------------

#Run on 20 epochs
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 20):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=0.13682545721530914 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.43it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.1022, Accuracy: 9744/10000 (97.44%)



loss=0.06964701414108276 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.32it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0605, Accuracy: 9852/10000 (98.52%)



loss=0.04869457706809044 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.26it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0430, Accuracy: 9874/10000 (98.74%)



loss=0.05754116177558899 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.05it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0468, Accuracy: 9874/10000 (98.74%)



loss=0.017422517761588097 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.73it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0366, Accuracy: 9890/10000 (98.90%)



loss=0.07107679545879364 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.17it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0427, Accuracy: 9874/10000 (98.74%)



loss=0.0611858069896698 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.35it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0408, Accuracy: 9881/10000 (98.81%)



loss=0.008027260191738605 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.11it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0243, Accuracy: 9927/10000 (99.27%)



loss=0.0032284557819366455 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.34it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0266, Accuracy: 9926/10000 (99.26%)



loss=0.010616849176585674 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.19it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0292, Accuracy: 9917/10000 (99.17%)



loss=0.02522732876241207 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.15it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0220, Accuracy: 9927/10000 (99.27%)



loss=0.02081177569925785 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.18it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0263, Accuracy: 9916/10000 (99.16%)



loss=0.0660279169678688 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.94it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0294, Accuracy: 9900/10000 (99.00%)



loss=0.0016837219009175897 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.08it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0278, Accuracy: 9904/10000 (99.04%)



loss=0.00106082356069237 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.96it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0222, Accuracy: 9925/10000 (99.25%)



loss=0.015425602905452251 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.89it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0228, Accuracy: 9923/10000 (99.23%)



loss=0.025559991598129272 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.66it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0227, Accuracy: 9932/10000 (99.32%)



loss=0.009056180715560913 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.01it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0270, Accuracy: 9910/10000 (99.10%)



loss=0.009181295521557331 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.16it/s]



Test set: Average loss: 0.0216, Accuracy: 9924/10000 (99.24%)



In [57]:
# Model summary
# ----------------------------------------------------------------
#         Layer (type)               Output Shape         Param #
# ================================================================
#             Conv2d-1           [-1, 32, 28, 28]             320
#            Dropout-2           [-1, 32, 28, 28]               0
#             Conv2d-3           [-1, 16, 28, 28]           4,624
#            Dropout-4           [-1, 16, 28, 28]               0
#          MaxPool2d-5           [-1, 16, 14, 14]               0
#        BatchNorm2d-6           [-1, 16, 14, 14]              32
#             Conv2d-7           [-1, 16, 14, 14]           2,320
#            Dropout-8           [-1, 16, 14, 14]               0
#             Conv2d-9           [-1, 16, 14, 14]           2,320
#           Dropout-10           [-1, 16, 14, 14]               0
#         MaxPool2d-11             [-1, 16, 7, 7]               0
#       BatchNorm2d-12             [-1, 16, 7, 7]              32
#            Conv2d-13             [-1, 16, 5, 5]           2,320
#            Conv2d-14             [-1, 32, 3, 3]           4,640
#            Conv2d-15             [-1, 10, 1, 1]           2,890
# ================================================================
# Total params: 19,498
# Trainable params: 19,498
# Non-trainable params: 0
# ----------------------------------------------------------------
# Input size (MB): 0.00
# Forward/backward pass size (MB): 0.74
# Params size (MB): 0.07
# Estimated Total Size (MB): 0.81
# ----------------------------------------------------------------

#Run on 20 epochs
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 20):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=0.058711741119623184 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.95it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0734, Accuracy: 9844/10000 (98.44%)



loss=0.12986265122890472 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.03it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0535, Accuracy: 9888/10000 (98.88%)



loss=0.019692251458764076 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.38it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0378, Accuracy: 9914/10000 (99.14%)



loss=0.13556668162345886 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.31it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0356, Accuracy: 9910/10000 (99.10%)



loss=0.00846536923199892 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.59it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0323, Accuracy: 9921/10000 (99.21%)



loss=0.02578524686396122 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.23it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0300, Accuracy: 9922/10000 (99.22%)



loss=0.03594869002699852 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.99it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0301, Accuracy: 9923/10000 (99.23%)



loss=0.011368860490620136 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.55it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0245, Accuracy: 9936/10000 (99.36%)



loss=0.015428234823048115 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.39it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0214, Accuracy: 9937/10000 (99.37%)



loss=0.01822889782488346 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.29it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0257, Accuracy: 9930/10000 (99.30%)



loss=0.028771663084626198 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.34it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0218, Accuracy: 9935/10000 (99.35%)



loss=0.017092352733016014 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.40it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0223, Accuracy: 9936/10000 (99.36%)



loss=0.011721868999302387 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.18it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0195, Accuracy: 9949/10000 (99.49%)



loss=0.005151242017745972 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.61it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0177, Accuracy: 9949/10000 (99.49%)



loss=0.004404544830322266 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.35it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0208, Accuracy: 9940/10000 (99.40%)



loss=0.002698143245652318 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.43it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0192, Accuracy: 9943/10000 (99.43%)



loss=0.025732412934303284 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.40it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0246, Accuracy: 9930/10000 (99.30%)



loss=0.023253699764609337 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.44it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0177, Accuracy: 9945/10000 (99.45%)



loss=0.02134968340396881 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 37.17it/s]



Test set: Average loss: 0.0183, Accuracy: 9945/10000 (99.45%)



In [64]:
# ----------------------------------------------------------------
#         Layer (type)               Output Shape         Param #
# ================================================================
#             Conv2d-1           [-1, 32, 28, 28]             320
#        BatchNorm2d-2           [-1, 32, 28, 28]              64
#            Dropout-3           [-1, 32, 28, 28]               0
#             Conv2d-4           [-1, 16, 28, 28]           4,624
#        BatchNorm2d-5           [-1, 16, 28, 28]              32
#            Dropout-6           [-1, 16, 28, 28]               0
#          MaxPool2d-7           [-1, 16, 14, 14]               0
#        BatchNorm2d-8           [-1, 16, 14, 14]              32
#             Conv2d-9           [-1, 16, 14, 14]           2,320
#       BatchNorm2d-10           [-1, 16, 14, 14]              32
#           Dropout-11           [-1, 16, 14, 14]               0
#            Conv2d-12           [-1, 16, 14, 14]           2,320
#       BatchNorm2d-13           [-1, 16, 14, 14]              32
#           Dropout-14           [-1, 16, 14, 14]               0
#         MaxPool2d-15             [-1, 16, 7, 7]               0
#       BatchNorm2d-16             [-1, 16, 7, 7]              32
#            Conv2d-17             [-1, 16, 5, 5]           2,320
#            Conv2d-18             [-1, 32, 3, 3]           4,640
#            Conv2d-19             [-1, 10, 1, 1]           2,890
# ================================================================
# Total params: 19,658
# Trainable params: 19,658
# Non-trainable params: 0
# ----------------------------------------------------------------
# Input size (MB): 0.00
# Forward/backward pass size (MB): 1.07
# Params size (MB): 0.07
# Estimated Total Size (MB): 1.15
# ----------------------------------------------------------------

#Run on 20 epochs
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 20):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=0.09150230884552002 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.66it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 1.3097, Accuracy: 7980/10000 (79.80%)



loss=0.07592587918043137 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.45it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 1.1212, Accuracy: 7447/10000 (74.47%)



loss=0.05018043518066406 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.57it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.8996, Accuracy: 8865/10000 (88.65%)



loss=0.012876520864665508 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.48it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.9117, Accuracy: 7876/10000 (78.76%)



loss=0.09149856120347977 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.47it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.7791, Accuracy: 8502/10000 (85.02%)



loss=0.09828982502222061 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.14it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.8007, Accuracy: 8448/10000 (84.48%)



loss=0.00414624810218811 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.43it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.7949, Accuracy: 8240/10000 (82.40%)



loss=0.002258519409224391 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.37it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.8163, Accuracy: 8442/10000 (84.42%)



loss=0.045278340578079224 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.47it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.8979, Accuracy: 7491/10000 (74.91%)



loss=0.00286758947186172 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.29it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.8486, Accuracy: 8015/10000 (80.15%)



loss=0.010775461792945862 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.30it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.7540, Accuracy: 8637/10000 (86.37%)



loss=0.0056432634592056274 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.29it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.8726, Accuracy: 8194/10000 (81.94%)



loss=0.013707722537219524 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.29it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.8243, Accuracy: 8030/10000 (80.30%)



loss=0.010399550199508667 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.29it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.7483, Accuracy: 8271/10000 (82.71%)



loss=0.02255135215818882 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.43it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.7491, Accuracy: 8706/10000 (87.06%)



loss=0.018836280331015587 batch_id=468: 100%|██████████| 469/469 [00:13<00:00, 35.84it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.8905, Accuracy: 8095/10000 (80.95%)



loss=0.002731218934059143 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.59it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.7878, Accuracy: 8414/10000 (84.14%)



loss=0.004798064474016428 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.49it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.7801, Accuracy: 8661/10000 (86.61%)



loss=0.0005885760183446109 batch_id=468: 100%|██████████| 469/469 [00:12<00:00, 36.75it/s]



Test set: Average loss: 0.7973, Accuracy: 8294/10000 (82.94%)

