## Base Model
 **[Target, Result, Analysis] at the end**



In [19]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from prettytable import PrettyTable

In [20]:
# Preprocessing applied to every training image: convert the PIL image to a
# tensor, then standardise it with a single-channel mean / std.
# Normalize expects sequences, hence the one-element tuples: note the
# difference between (0.1307) -- a plain float -- and (0.1307,) -- a tuple.
# The constants are presumably the MNIST pixel statistics -- TODO confirm.
train_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# Preprocessing applied to every test image. It is the same pipeline as the
# training one, so both splits are normalised identically.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])



In [21]:
# MNIST splits, downloaded into ./data on first use. Each split gets its own
# transform pipeline.
# NOTE(review): the names `train` and `test` are rebound to functions in a
# later cell, so this cell must be run before the DataLoaders are built.
train = datasets.MNIST(root='./data', train=True, download=True, transform=train_transforms)
test = datasets.MNIST(root='./data', train=False, download=True, transform=test_transforms)

In [22]:
SEED = 1

# Pick the compute device: Apple MPS first, then CUDA, otherwise CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
cuda = torch.cuda.is_available()
print("GPU Available?", device)

# For reproducibility
torch.manual_seed(SEED)

# A torch.device never compares equal to a plain string, so the device kind
# has to be read from `.type`; comparing `device == "cuda"` is always False.
if device.type == "cuda":
    torch.cuda.manual_seed(SEED)

# DataLoader arguments (these would normally come from the command line).
# `device` is always truthy, so the choice is made on the device type:
# larger batches and worker processes on an accelerator, defaults on CPU.
# Pinned host memory is only requested for CUDA.
if device.type != "cpu":
    dataloader_args = dict(shuffle=True, batch_size=128, num_workers=4, pin_memory=(device.type == "cuda"))
else:
    dataloader_args = dict(shuffle=True, batch_size=64)

# train dataloader
train_loader = torch.utils.data.DataLoader(train, **dataloader_args)

# test dataloader (shares the training loader's arguments, including shuffle)
test_loader = torch.utils.data.DataLoader(test, **dataloader_args)

# Pretty table for collecting all the accuracy and loss parameters in a table
log_table = PrettyTable()

GPU Available? mps


In [23]:
class Net(nn.Module):
    """Small all-convolutional classifier for 1x28x28 inputs.

    Every convolution is bias-free. Each one is followed by ReLU, except the
    final 1x1 convolution that produces the 10 class scores. ``forward``
    returns log-probabilities of shape ``(batch, 10)``.
    """

    def __init__(self):
        super().__init__()

        def conv_relu(c_in, c_out, k, pad=0):
            # Bias-free k x k convolution followed by ReLU.
            return nn.Sequential(
                nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=(k, k), padding=pad, bias=False),
                nn.ReLU(),
            )

        # Input block: 28 -> 26
        self.convblock1 = conv_relu(1, 8, 3)

        # Convolution block 1: 26 -> 24
        self.convblock2 = conv_relu(8, 16, 3)

        # Transition block 1: max-pool 24 -> 12, then 1x1 conv squeezes 16 -> 8 channels
        self.pool1 = nn.MaxPool2d(2, 2)
        self.convblock3 = conv_relu(16, 8, 1)

        # Convolution block 2: 12 -> 10 -> 8 -> 6
        self.convblock4 = conv_relu(8, 12, 3)
        self.convblock5 = conv_relu(12, 12, 3)
        self.convblock6 = conv_relu(12, 16, 3)

        # Output block: padded 3x3 conv keeps the 6x6 map
        self.convblock7 = conv_relu(16, 16, 3, pad=1)

        # Average over the whole 6x6 map: 6 -> 1
        self.gap = nn.Sequential(
            nn.AvgPool2d(kernel_size=6),
        )

        # Final 1x1 conv to the 10 class scores. Deliberately no BatchNorm
        # and no ReLU here: this feeds log_softmax directly.
        self.convblock8 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=10, kernel_size=(1, 1), padding=0, bias=False),
        )

    def forward(self, x):
        """Return per-class log-probabilities, shape ``(batch, 10)``."""
        stages = (
            self.convblock1,
            self.convblock2,
            self.pool1,
            self.convblock3,  # transition
            self.convblock4,
            self.convblock5,
            self.convblock6,
            self.convblock7,  # padding=1
            self.gap,
            self.convblock8,
        )
        for stage in stages:
            x = stage(x)
        logits = x.view(-1, 10)
        return F.log_softmax(logits, dim=-1)

In [24]:
from torchsummary import summary

# Print a layer-by-layer summary (output shapes and parameter counts) for a
# 1x28x28 input. The device here is chosen between CUDA and CPU only; on the
# recorded run it printed "cpu" while the training cell reported "mps".
use_cuda = torch.cuda.is_available()
cuda = torch.device("cuda") if use_cuda else torch.device("cpu")
print(cuda)
model = Net()
model = model.to(cuda)
summary(model, input_size=(1, 28, 28))

cpu
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 26, 26]              72
              ReLU-2            [-1, 8, 26, 26]               0
            Conv2d-3           [-1, 16, 24, 24]           1,152
              ReLU-4           [-1, 16, 24, 24]               0
         MaxPool2d-5           [-1, 16, 12, 12]               0
            Conv2d-6            [-1, 8, 12, 12]             128
              ReLU-7            [-1, 8, 12, 12]               0
            Conv2d-8           [-1, 12, 10, 10]             864
              ReLU-9           [-1, 12, 10, 10]               0
           Conv2d-10             [-1, 12, 8, 8]           1,296
             ReLU-11             [-1, 12, 8, 8]               0
           Conv2d-12             [-1, 16, 6, 6]           1,728
             ReLU-13             [-1, 16, 6, 6]               0
           Conv2d-14             [-

In [25]:
from tqdm import tqdm

# Metric histories filled in by train() and test().
# train_losses / train_acc receive one entry per training batch;
# test_losses / test_acc receive one entry per call to test().
train_losses, test_losses = [], []
train_acc, test_acc = [], []

def train(model, device, train_loader, optimizer, epoch):
  """Train `model` for one pass over `train_loader`.

  For every batch the NLL loss is appended to the module-level `train_losses`
  list as a plain Python float, and the running accuracy (in percent, over
  the samples seen so far in this call) is appended to `train_acc`.

  Args:
    model: network whose output is fed to `F.nll_loss` (log-probabilities).
    device: device the batches are moved to before the forward pass.
    train_loader: iterable yielding `(data, target)` batches.
    optimizer: optimizer stepped once per batch.
    epoch: current epoch index; accepted for the caller's convenience, not
      used inside the function.
  """
  model.train()
  pbar = tqdm(train_loader)
  correct = 0
  processed = 0
  for batch_idx, (data, target) in enumerate(pbar):
    # get samples
    data, target = data.to(device), target.to(device)

    # PyTorch accumulates gradients across backward passes, so they must be
    # cleared before each batch for the update to be correct.
    optimizer.zero_grad()

    # Predict
    y_pred = model(data)

    # Calculate loss
    loss = F.nll_loss(y_pred, target)

    # Backpropagation
    loss.backward()
    optimizer.step()

    # Store a detached Python float, not the tensor: keeping the tensor would
    # keep every batch's autograd graph (and its device memory) alive for as
    # long as the history list exists.
    batch_loss = loss.item()
    train_losses.append(batch_loss)

    # Update pbar-tqdm
    pred = y_pred.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
    correct += pred.eq(target.view_as(pred)).sum().item()
    processed += len(data)

    pbar.set_description(desc= f'Loss={batch_loss} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}')
    train_acc.append(100*correct/processed)

def test(model, device, test_loader):
    """Evaluate `model` on `test_loader` without tracking gradients.

    Prints the average NLL loss and the accuracy over the whole dataset, and
    appends them to the module-level `test_losses` and `test_acc` lists
    (accuracy in percent).

    Args:
        model: network whose output is fed to `F.nll_loss` (log-probabilities).
        device: device the batches are moved to before the forward pass.
        test_loader: loader yielding `(data, target)` batches; its `.dataset`
            length is used as the sample count.
    """
    model.eval()
    n_samples = len(test_loader.dataset)
    summed_loss = 0
    n_correct = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            log_probs = model(images)
            # Sum (rather than average) per batch so the final division by the
            # dataset size gives a true per-sample mean.
            summed_loss += F.nll_loss(log_probs, labels, reduction='sum').item()
            # Predicted class = index of the largest log-probability.
            predicted = log_probs.argmax(dim=1, keepdim=True)
            n_correct += predicted.eq(labels.view_as(predicted)).sum().item()

    mean_loss = summed_loss / n_samples
    test_losses.append(mean_loss)

    accuracy = 100. * n_correct / n_samples
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        mean_loss, n_correct, n_samples, accuracy))

    test_acc.append(accuracy)

In [26]:
print("model running on: ", device)

# Fresh results table: one row per epoch.
log_table = PrettyTable()
log_table.field_names = ["Epoch", "Training Accuracy", "Test Accuracy", "Diff", "Training Loss", "Test Loss"]

# New model on the selected device, trained with SGD + momentum.
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

EPOCHS = 15
for epoch in range(EPOCHS):
    print("EPOCH:", epoch)
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

    # The last entries of the histories are the values at the end of this
    # epoch: the running training accuracy and the loss of the final batch,
    # and the test metrics just computed.
    last_train_acc = train_acc[-1]
    last_test_acc = test_acc[-1]
    log_table.add_row([
        epoch,
        f"{last_train_acc:.2f}%",
        f"{last_test_acc:.2f}%",
        f"{float(last_train_acc) - float(last_test_acc):.2f}",
        f"{train_losses[-1]:.4f}",
        f"{test_losses[-1]:.4f}",
    ])
print(log_table)

model running on:  mps
EPOCH: 0


Loss=2.291754961013794 Batch_id=468 Accuracy=9.71: 100%|██████████| 469/469 [00:07<00:00, 59.11it/s]  



Test set: Average loss: 2.2901, Accuracy: 974/10000 (9.74%)

EPOCH: 1


Loss=0.18167071044445038 Batch_id=468 Accuracy=65.16: 100%|██████████| 469/469 [00:06<00:00, 70.44it/s] 



Test set: Average loss: 0.2051, Accuracy: 9385/10000 (93.85%)

EPOCH: 2


Loss=0.11839846521615982 Batch_id=468 Accuracy=94.34: 100%|██████████| 469/469 [00:06<00:00, 69.79it/s]  



Test set: Average loss: 0.1330, Accuracy: 9597/10000 (95.97%)

EPOCH: 3


Loss=0.058226268738508224 Batch_id=468 Accuracy=96.14: 100%|██████████| 469/469 [00:06<00:00, 69.81it/s] 



Test set: Average loss: 0.1092, Accuracy: 9663/10000 (96.63%)

EPOCH: 4


Loss=0.13024090230464935 Batch_id=468 Accuracy=96.79: 100%|██████████| 469/469 [00:06<00:00, 69.80it/s]  



Test set: Average loss: 0.1190, Accuracy: 9635/10000 (96.35%)

EPOCH: 5


Loss=0.02751939743757248 Batch_id=468 Accuracy=97.34: 100%|██████████| 469/469 [00:06<00:00, 69.57it/s]  



Test set: Average loss: 0.0746, Accuracy: 9778/10000 (97.78%)

EPOCH: 6


Loss=0.10898806899785995 Batch_id=468 Accuracy=97.57: 100%|██████████| 469/469 [00:06<00:00, 69.71it/s]  



Test set: Average loss: 0.0845, Accuracy: 9752/10000 (97.52%)

EPOCH: 7


Loss=0.013457237742841244 Batch_id=468 Accuracy=97.78: 100%|██████████| 469/469 [00:06<00:00, 70.09it/s] 



Test set: Average loss: 0.0623, Accuracy: 9806/10000 (98.06%)

EPOCH: 8


Loss=0.09019219875335693 Batch_id=468 Accuracy=97.96: 100%|██████████| 469/469 [00:06<00:00, 69.93it/s]  



Test set: Average loss: 0.0685, Accuracy: 9810/10000 (98.10%)

EPOCH: 9


Loss=0.030781090259552002 Batch_id=468 Accuracy=98.25: 100%|██████████| 469/469 [00:06<00:00, 68.31it/s] 



Test set: Average loss: 0.0620, Accuracy: 9801/10000 (98.01%)

EPOCH: 10


Loss=0.06561276316642761 Batch_id=468 Accuracy=98.31: 100%|██████████| 469/469 [00:06<00:00, 67.21it/s]   



Test set: Average loss: 0.0478, Accuracy: 9849/10000 (98.49%)

EPOCH: 11


Loss=0.03385394811630249 Batch_id=468 Accuracy=98.39: 100%|██████████| 469/469 [00:06<00:00, 68.36it/s]  



Test set: Average loss: 0.0603, Accuracy: 9830/10000 (98.30%)

EPOCH: 12


Loss=0.015782849863171577 Batch_id=468 Accuracy=98.55: 100%|██████████| 469/469 [00:06<00:00, 68.48it/s] 



Test set: Average loss: 0.0587, Accuracy: 9828/10000 (98.28%)

EPOCH: 13


Loss=0.028418252244591713 Batch_id=468 Accuracy=98.60: 100%|██████████| 469/469 [00:06<00:00, 67.18it/s]  



Test set: Average loss: 0.0500, Accuracy: 9859/10000 (98.59%)

EPOCH: 14


Loss=0.14184729754924774 Batch_id=468 Accuracy=98.63: 100%|██████████| 469/469 [00:07<00:00, 64.49it/s]   



Test set: Average loss: 0.0483, Accuracy: 9861/10000 (98.61%)

+-------+-------------------+---------------+--------+---------------+-----------+
| Epoch | Training Accuracy | Test Accuracy |  Diff  | Training Loss | Test Loss |
+-------+-------------------+---------------+--------+---------------+-----------+
|   0   |       9.71%       |     9.74%     | -0.03  |     2.2918    |   2.2901  |
|   1   |       65.16%      |     93.85%    | -28.69 |     0.1817    |   0.2051  |
|   2   |       94.34%      |     95.97%    | -1.63  |     0.1184    |   0.1330  |
|   3   |       96.14%      |     96.63%    | -0.49  |     0.0582    |   0.1092  |
|   4   |       96.79%      |     96.35%    |  0.44  |     0.1302    |   0.1190  |
|   5   |       97.34%      |     97.78%    | -0.44  |     0.0275    |   0.0746  |
|   6   |       97.57%      |     97.52%    |  0.05  |     0.1090    |   0.0845  |
|   7   |       97.78%      |     98.06%    | -0.28  |     0.0135    |   0.0623  |
|   8   |       97.96% 

### TARGET
- Carefully think and plan the Model architecture

- The requirement is to achieve high accuracy with fewer parameters within 15 epochs.

- Approach: after two convolution layers, when the receptive field (RF) is 5, we apply max pooling. This is the MNIST dataset, so edges and gradients can already be found at an RF of 5.

- A transition block of 1x1 convolution follows the max pooling to summarize the channels along the depth dimension and to reduce the number of channels.

- Then we continue with 4 convolution layers of 3x3 kernels until we reach the GAP layer.

- The GAP layer summarizes each feature channel as 1x1.

- A final 1x1 convolution layer maps the GAP output channels to the 10 output classes.

### RESULT
- Parameters            : 7704

- Best Train Accuracy   : 98.63%

- Best Test Accuracy    : 98.61%

### Analysis
- **Model Architecture**
   
    - **Channels** :  1 → 8 → 16 → MaxPool → 16 → (Transition) → 8 → 12 → 12 → 16 → 16 → GAP → 16 → Transition → 10 

    - **Receptive Field**

        | Block               | Layer      | Input Size | Kernel x Stride | Padding | Input Channel | Output Channel | Output Size | Receptive Field |
        | ------------------- | ---------- | ---------- | --------------- | ------- | ------------- | -------------- | ----------- | --------------- |
        | Input Block         | Conv1      | 28         | 3 x 1           | 0       | 1             | 8              | 26          | 3               |
        | Convolution Block 1 | Conv2      | 26         | 3 x 1           | 0       | 8             | 16             | 24          | 5               |
        | Transition Block 1  | Pool1      | 24         | 2 x 2           | 0       | 16            | 16             | 12          | 6               |
        | Transition Block 1  | Conv3      | 12         | 1 x 1           | 0       | 16            | 8              | 12          | 6               |
        | Convolution Block 2 | Conv4      | 12         | 3 x 1           | 0       | 8             | 12             | 10          | 10              |
        | Convolution Block 2 | Conv5      | 10         | 3 x 1           | 0       | 12            | 12             | 8           | 14              |
        | Convolution Block 2 | Conv6      | 8          | 3 x 1           | 0       | 12            | 16             | 6           | 18              |
        | Convolution Block 2 | Conv7      | 6          | 3 x 1           | 1       | 16            | 16             | 6           | 22              |
        | Output Block        | GAP        | 6          | 6 x 6           | 0       | 16            | 16             | 1           | 28              |
        | Output Block        | Conv8      | 1          | 1 x 1           | 0       | 16            | 10             | 1           | 28              |

- **Observation**
    - The model reaches a maximum training accuracy of 98.63%. It can still be trained further.
    - We will not change the architecture for now; instead, we will add batch normalization, hoping for faster training and higher accuracy.