<a href="https://colab.research.google.com/github/satyasundar/ERAv3/blob/colab/era_s6_base_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Different techniques used in writing new model
1. How many layers,
2. MaxPooling,
3. 1x1 Convolutions,
4. 3x3 Convolutions,
5. Receptive Field,
6. SoftMax,
7. Learning Rate,
8. Kernels and how do we decide the number of kernels?
9. Batch Normalization,
10. Image Normalization,
11. Position of MaxPooling,
12. Concept of Transition Layers,
13. Position of Transition Layer,
14. DropOut
15. When do we introduce DropOut, or when do we know we have some overfitting
16. The distance of MaxPooling from Prediction,
17. The distance of Batch Normalization from Prediction,
18. When do we stop convolutions and go ahead with a larger kernel or some other alternative (which we have not yet covered)
19. How do we know our network is not going well, comparatively, very early
20. Batch Size, and Effects of Batch Size
21. etc (you can add more if we missed it here)




## WRITE THE MODEL AGAIN SUCH THAT IT ACHIEVES
1. 99.4% validation/test accuracy (50/10k split, basically we are calling Validation dataset as test dataset itself)
2. Less than 20k Parameters
3. You can use anything from above you want.
4. Less than 20 Epochs
5. Have used BN, Dropout,
6. (Optional): a Fully connected layer or, have used GAP.
7. To learn how to add different things we covered in this session, you can refer to this code: https://www.kaggle.com/enwei26/mnist-digits-pytorch-cnn-99 DONT COPY ARCHITECTURE, JUST LEARN HOW TO INTEGRATE THINGS LIKE DROPOUT, BATCHNORM, ETC.
8. **Someone has achieved 99.6% in 3400 parameters in 6 epochs**


In [4]:
!pip install torchsummary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [23]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from tqdm import tqdm
from torchsummary import summary

In [None]:
# My MNIST model training

class Network(nn.Module):
    def __init__(self):
        super().__init__()

        #FC model
        self.conv1   = nn.Conv2d(1, 8, 3, padding=1)
        self.conv2   = nn.Conv2d(8, 8, 3, padding=1)
        self.fc1    = nn.Linear(8*7*7, 60)
        self.fc2    = nn.Linear(60, 10)

        # GAP model
        # self.conv3 = nn.Conv2d(1, 16, 3, padding=1)
        # self.conv4 = nn.Conv2d(16, 32, 3, padding=1)
        # self.gap = nn.AdaptiveAvgPool2d(1)
        # self.fc_gap = nn.Linear(32, 10)

        self.pool = nn.MaxPool2d(2, 2)
        self.bn8 = nn.BatchNorm2d(8)
        self.bn16 = nn.BatchNorm2d(16)
        self.bn32 = nn.BatchNorm2d(32)
        

        self.dropout = nn.Dropout(0.2)



    def forward(self, x):
        x = self.conv1(x)
        #x = self.bn8(x)
        x = F.relu(x)
        x = self.pool(x)
        #x = self.dropout(x)


        x = self.conv2(x)
        #x = self.bn8(x)
        x = F.relu(x)
        x = self.pool(x)

        #x = self.dropout(x)

        # x = self.pool(self.relu(self.conv1(x)))
        # x = self.pool(self.relu(self.conv2(x)))
        # x = x.view(-1, 8*7*7)

        #x = self.gap(x)
        x = x.view(-1, 8*7*7)
        #x = self.fc_gap(x)


        x = self.fc1(x)
        #x = self.bn2(x)
        x = F.relu(x)
        #x = self.dropout(x)

        x = self.fc2(x)
        return x

use_cuda = torch.cuda.is_available()
#device = torch.device("cuda" if use_cuda else "cpu")
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = Network().to(device)
#summary(model, input_size=(1, 28,28))

network = Network()
print(network)

total_parameters = sum(p.numel() for p in network.parameters())
print(f"Total Parameters : {total_parameters}")
#network.fc2.weight.shape

# for name, param in network.named_parameters():
#     print(name, '\t\t', param.shape)


mps
Network(
  (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(8, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=392, out_features=60, bias=True)
  (fc2): Linear(in_features=60, out_features=10, bias=True)
  (conv3): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv4): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (gap): AdaptiveAvgPool2d(output_size=1)
  (fc_gap): Linear(in_features=32, out_features=10, bias=True)
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (bn8): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn16): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn32): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Total Parameters : 30096


In [48]:
torch.manual_seed(1)
batch_size = 128

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        # transforms.RandomRotation(15),
                        # transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)


def train(model, device, train_loader, optimizer, epoch):
    model.train()
    pbar = tqdm(train_loader)
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        #loss = F.nll_loss(output, target)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}')


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.1f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

model = Network().to(device)
#optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(0, 10):
    print(epoch)
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

0


loss=0.09432337433099747 batch_id=468: 100%|██████████| 469/469 [00:07<00:00, 64.21it/s] 



Test set: Average loss: -8.5752, Accuracy: 9541/10000 (95.4%)

1


loss=0.1088179424405098 batch_id=468: 100%|██████████| 469/469 [00:06<00:00, 68.28it/s]  



Test set: Average loss: -10.0071, Accuracy: 9693/10000 (96.9%)

2


loss=0.03112841583788395 batch_id=468: 100%|██████████| 469/469 [00:06<00:00, 69.96it/s] 



Test set: Average loss: -10.1077, Accuracy: 9763/10000 (97.6%)

3


loss=0.023765726014971733 batch_id=468: 100%|██████████| 469/469 [00:06<00:00, 70.22it/s]



Test set: Average loss: -10.7318, Accuracy: 9816/10000 (98.2%)

4


loss=0.044282834976911545 batch_id=468: 100%|██████████| 469/469 [00:06<00:00, 70.66it/s]



Test set: Average loss: -11.6408, Accuracy: 9824/10000 (98.2%)

5


loss=0.05270056799054146 batch_id=468: 100%|██████████| 469/469 [00:06<00:00, 71.76it/s] 



Test set: Average loss: -11.8088, Accuracy: 9850/10000 (98.5%)

6


loss=0.06171127036213875 batch_id=468: 100%|██████████| 469/469 [00:06<00:00, 71.92it/s]  



Test set: Average loss: -12.0259, Accuracy: 9868/10000 (98.7%)

7


loss=0.030854828655719757 batch_id=468: 100%|██████████| 469/469 [00:06<00:00, 71.69it/s] 



Test set: Average loss: -12.2427, Accuracy: 9855/10000 (98.5%)

8


loss=0.03992094472050667 batch_id=468: 100%|██████████| 469/469 [00:06<00:00, 71.97it/s]  



Test set: Average loss: -13.2757, Accuracy: 9815/10000 (98.2%)

9


loss=0.006953431759029627 batch_id=468: 100%|██████████| 469/469 [00:06<00:00, 72.38it/s] 



Test set: Average loss: -13.8007, Accuracy: 9869/10000 (98.7%)



In [None]:
# The Base file provided

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 8, 3, padding=1)      #input:28x28 Output:28x28 RF:3      #input: Output: RF:
        self.conv2 = nn.Conv2d(8, 16, 3, padding=1)     #input:28x28x8 Output:28x28x16 RF:5
        self.pool1 = nn.MaxPool2d(2, 2)                 #input:28x28x16 Output:14x14x16 RF: 10
        self.conv3 = nn.Conv2d(16, 32, 3, padding=1)   #input: Output: RF:
        self.conv4 = nn.Conv2d(32, 64, 3, padding=1)  #input: Output: RF:
        self.pool2 = nn.MaxPool2d(2, 2)                 #input: Output: RF:
        self.conv5 = nn.Conv2d(256, 512, 3)             #input: Output: RF:
        self.conv6 = nn.Conv2d(512, 1024, 3)            #input: Output: RF:
        self.conv7 = nn.Conv2d(1024, 10, 3)             #input: Output: RF:

        self.fc1 = nn.Linear(16*7*7, 10)
        #self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = F.relu(self.conv7(x))
        x = x.view(-1, 10)
        return F.log_softmax(x)

In [None]:
!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = Net().to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              80
         MaxPool2d-2            [-1, 8, 14, 14]               0
            Conv2d-3           [-1, 16, 14, 14]           1,168
         MaxPool2d-4             [-1, 16, 7, 7]               0
            Linear-5                   [-1, 10]           7,850
Total params: 9,098
Trainable params: 9,098
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.09
Params size (MB): 0.03
Estimated Total Size (MB): 0.13
----------------------------------------------------------------


In [None]:


torch.manual_seed(1)
batch_size = 128

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)


In [None]:
from tqdm import tqdm
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    pbar = tqdm(train_loader)
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}')


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [None]:

model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 2):
    print(epoch)
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=nan batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.17it/s]



Test set: Average loss: nan, Accuracy: 980/10000 (10%)

