Targets: Above 99% test_acc with very less number of params  
Results:  
    best test_acc = 99.1% on 12th epoch  

    total parameters = 6016  
Analysis:  It has Batch normalization, Regularization,  slightly increased capacity and removed GAP
File Link:  

# 1. Import external libraries

In [1]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
%matplotlib inline
import matplotlib.pyplot as plt


#### Basically working model
#### Img Aug RandomRotation
#### StepLR Scheduler
#### Train & Test Graphs

# 2. Convolutional Neural Network (model) architecture

In [2]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        set1 = 8 #channels
        set2 = 8 #channels
        out = 10 #channels
        avg = 7 #channels
        self.conv1 = nn.Conv2d(1, set1, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=set1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv2 = nn.Conv2d(set1, set1, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=set1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(set1, set2, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(num_features=set2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv4 = nn.Conv2d(set2, set2, 3, padding=1)
        self.bn4 = nn.BatchNorm2d(num_features=set2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.pool2 = nn.MaxPool2d(2, 2)
        self.conv5 = nn.Conv2d(set2, set2, 3, padding=1)
        self.bn5 = nn.BatchNorm2d(num_features=set2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv6 = nn.Conv2d(set2, out, 3, padding=1)
        self.bn6 = nn.BatchNorm2d(num_features=out, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.pool3 = nn.MaxPool2d(2, 2)
        self.conv7 = nn.Conv2d(out, out, 3, padding=1)
        self.bn7 = nn.BatchNorm2d(num_features=out, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv8 = nn.Conv2d(out, out, 3, padding=1)
        self.bn8 = nn.BatchNorm2d(num_features=out, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        self.conv9 = nn.Conv2d(out, out, 3)
        self.bn9 = nn.BatchNorm2d(num_features=out, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)


        self.drop = nn.Dropout(0.25)
        self.gap = nn.AvgPool2d(kernel_size=[avg,avg], stride=[avg,avg], padding=0, ceil_mode=False, count_include_pad=False)

    def forward(self, x):
        x = self.drop(self.pool1(self.bn2(F.relu(self.conv2(self.bn1(F.relu(self.conv1(x))))))))
        x = self.drop(self.pool2(self.bn4(F.relu(self.conv4(self.bn3(F.relu(self.conv3(x))))))))
        x = self.drop(self.pool3(self.bn6(F.relu(self.conv6(self.bn5(F.relu(self.conv5(x)))))))) # ToDo Try adding MP here
        x = self.drop(self.bn8(F.relu(self.conv8(self.bn7(F.relu(self.conv7(x))))))) # ToDo Try adding MP here
        x = self.conv9(x)
        #x = self.gap(x) # Raja ToDo Try printing shape here
        #print(x.shape)
        x = x.view(-1, 10) # Raja ToDo Try printing shape here
        return F.log_softmax(x)

#### RuntimeError: running_mean should contain 8 elements not 4
Implies check the bn channels, whether it matches with the ouput channel of its previous layer.

# 3. Display summary of model

In [3]:
#!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available() #bool
str_gpu_cpu = "cuda" if use_cuda else "cpu" #string
device = torch.device(str_gpu_cpu)
print("device is " + str_gpu_cpu)
model = Net().to(device)
summary(model, input_size=(1, 28, 28))

device is cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              80
       BatchNorm2d-2            [-1, 8, 28, 28]              16
            Conv2d-3            [-1, 8, 28, 28]             584
       BatchNorm2d-4            [-1, 8, 28, 28]              16
         MaxPool2d-5            [-1, 8, 14, 14]               0
           Dropout-6            [-1, 8, 14, 14]               0
            Conv2d-7            [-1, 8, 14, 14]             584
       BatchNorm2d-8            [-1, 8, 14, 14]              16
            Conv2d-9            [-1, 8, 14, 14]             584
      BatchNorm2d-10            [-1, 8, 14, 14]              16
        MaxPool2d-11              [-1, 8, 7, 7]               0
          Dropout-12              [-1, 8, 7, 7]               0
           Conv2d-13              [-1, 8, 7, 7]             584
      BatchNorm2d-14    

  return F.log_softmax(x)


# 4. Preparation of dataset  

In [4]:

SEED = 1
torch.manual_seed(SEED)
batch_size = 128

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 92919906.41it/s]


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 61930313.82it/s]


Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 32444602.16it/s]


Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 19223540.63it/s]

Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw






# 5. Train and Test Functions

In [5]:
from tqdm import tqdm

train_losses = []
test_losses = []
train_acc = []
test_acc = []

def train(model, device, train_loader, optimizer, epoch):
    model.train()
    pbar = tqdm(train_loader)
    correct = 0
    processed = 0
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        train_losses.append(loss)
        loss.backward()
        optimizer.step()
        pbar.set_description(desc= f'epoch={epoch} loss={loss.item()} batch_id={batch_idx}')
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        processed += len(data)
        train_acc.append(100*correct/processed)

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            test_losses.append(test_loss)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\n  Test set: Average loss: {:.4f}, Test Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    test_acc.append(100. * correct / len(test_loader.dataset))


# 6. Run the model with a device and an optimizer

In [6]:

model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
ep = 21
for epoch in range(1, ep):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

#With params = 2158, on 7th epoch, test accuracy = 94.15%
#With params = 4018, on 8th epoch, test accuracy = 95.9% with imgaug
#With params = 4906, on 8th epoch, test accuracy = 98.2% without imgaug
#With params = 6016, on 8th epoch, test accuracy = 99.02% without imgaug

  return F.log_softmax(x)
epoch=1 loss=0.074665367603302 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 26.67it/s]



  Test set: Average loss: 0.0757, Test Accuracy: 9756/10000 (97.56%)



epoch=2 loss=0.11504825204610825 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.61it/s]



  Test set: Average loss: 0.0584, Test Accuracy: 9807/10000 (98.07%)



epoch=3 loss=0.09645208716392517 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.65it/s]



  Test set: Average loss: 0.0500, Test Accuracy: 9836/10000 (98.36%)



epoch=4 loss=0.17671720683574677 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.56it/s]



  Test set: Average loss: 0.0411, Test Accuracy: 9868/10000 (98.68%)



epoch=5 loss=0.05189940705895424 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.66it/s]



  Test set: Average loss: 0.0385, Test Accuracy: 9874/10000 (98.74%)



epoch=6 loss=0.0948757603764534 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.40it/s]



  Test set: Average loss: 0.0413, Test Accuracy: 9871/10000 (98.71%)



epoch=7 loss=0.054727163165807724 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 26.59it/s]



  Test set: Average loss: 0.0360, Test Accuracy: 9883/10000 (98.83%)



epoch=8 loss=0.05284030735492706 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 26.31it/s]



  Test set: Average loss: 0.0372, Test Accuracy: 9888/10000 (98.88%)



epoch=9 loss=0.15050342679023743 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 24.73it/s]



  Test set: Average loss: 0.0324, Test Accuracy: 9899/10000 (98.99%)



epoch=10 loss=0.040694840252399445 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 26.29it/s]



  Test set: Average loss: 0.0365, Test Accuracy: 9890/10000 (98.90%)



epoch=11 loss=0.04004470631480217 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 26.40it/s]



  Test set: Average loss: 0.0324, Test Accuracy: 9899/10000 (98.99%)



epoch=12 loss=0.03488033264875412 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.53it/s]



  Test set: Average loss: 0.0279, Test Accuracy: 9910/10000 (99.10%)



epoch=13 loss=0.1315692961215973 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.77it/s]



  Test set: Average loss: 0.0288, Test Accuracy: 9911/10000 (99.11%)



epoch=14 loss=0.042361900210380554 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 26.28it/s]



  Test set: Average loss: 0.0295, Test Accuracy: 9903/10000 (99.03%)



epoch=15 loss=0.026655392721295357 batch_id=468: 100%|██████████| 469/469 [00:20<00:00, 22.90it/s]



  Test set: Average loss: 0.0285, Test Accuracy: 9907/10000 (99.07%)



epoch=16 loss=0.10933852940797806 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.62it/s]



  Test set: Average loss: 0.0276, Test Accuracy: 9914/10000 (99.14%)



epoch=17 loss=0.0691598579287529 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 26.32it/s]



  Test set: Average loss: 0.0334, Test Accuracy: 9888/10000 (98.88%)



epoch=18 loss=0.08772187680006027 batch_id=468: 100%|██████████| 469/469 [00:17<00:00, 26.52it/s]



  Test set: Average loss: 0.0294, Test Accuracy: 9909/10000 (99.09%)



epoch=19 loss=0.0071752858348190784 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 24.75it/s]



  Test set: Average loss: 0.0273, Test Accuracy: 9914/10000 (99.14%)



epoch=20 loss=0.010388033464550972 batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.08it/s]



  Test set: Average loss: 0.0269, Test Accuracy: 9909/10000 (99.09%)



Raja ToDo :
Try below ::
1. BatchNormalization
2. Dropout
3. LR scheduler
4. GAP