<a href="https://colab.research.google.com/github/satyasundar/ERAv3/blob/colab/era_s6_base_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Different techniques used in writing new model
1. How many layers,
2. MaxPooling,
3. 1x1 Convolutions,
4. 3x3 Convolutions,
5. Receptive Field,
6. SoftMax,
7. Learning Rate,
8. Kernels and how do we decide the number of kernels?
9. Batch Normalization,
10. Image Normalization,
11. Position of MaxPooling,
12. Concept of Transition Layers,
13. Position of Transition Layer,
14. DropOut
15. When do we introduce DropOut, or when do we know we have some overfitting
16. The distance of MaxPooling from Prediction,
17. The distance of Batch Normalization from Prediction,
18. When do we stop convolutions and go ahead with a larger kernel or some other alternative (which we have not yet covered)
19. How do we know our network is not going well, comparatively, very early
20. Batch Size, and Effects of Batch Size
21. etc (you can add more if we missed it here)




## WRITE THE MODEL AGAIN SUCH THAT IT ACHIEVES
1. 99.4% validation/test accuracy (50/10k split, basically we are calling Validation dataset as test dataset itself)
2. Less than 20k Parameters
3. You can use anything from above you want.
4. Less than 20 Epochs
5. Have used BN, Dropout,
6. (Optional): a Fully connected layer or, have used GAP.
7. To learn how to add different things we covered in this session, you can refer to this code: https://www.kaggle.com/enwei26/mnist-digits-pytorch-cnn-99 DONT COPY ARCHITECTURE, JUST LEARN HOW TO INTEGRATE THINGS LIKE DROPOUT, BATCHNORM, ETC.
8. **Someone has achieved 99.6% in 3400 parameters in 6 epochs**


In [None]:
!pip install torchsummary

In [7]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from tqdm import tqdm
from torchsummary import summary

In [44]:
# My MNIST model training

class Network(nn.Module):
    def __init__(self):
        super().__init__()


        self.conv1 = nn.Conv2d(1, 8, 3, padding=1)
        self.conv2 = nn.Conv2d(8, 16, 3, padding=1)

        self.gap = nn.AdaptiveAvgPool2d(1)
        self.fc_gap = nn.Linear(16, 10)

        # self.conv3 = nn.Conv2d(32, 64, 3, padding=1)

        self.fc1 = nn.Linear(8*7*7, 50)
        self.fc2 = nn.Linear(50, 10)

        self.pool = nn.MaxPool2d(2, 2)
        self.bn1 = nn.BatchNorm2d(8)
        self.bn2 = nn.BatchNorm2d(16)
        self.bn3 = nn.BatchNorm1d(50)

        self.dropout = nn.Dropout(0.25)



    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.pool(x)
        #x = self.dropout(x)


        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.pool(x)
        #x = self.dropout(x)

        # x = self.pool(self.relu(self.conv1(x)))
        # x = self.pool(self.relu(self.conv2(x)))
        # x = x.view(-1, 8*7*7)

        x = self.gap(x)
        x = x.view(-1, 16)
        x = self.fc_gap(x)


        #x = self.fc1(x)
        #x = self.bn2(x)
        #x = self.relu(x)
        #x = self.dropout(x)

        #x = self.fc2(x)
        return x

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)
model = Network().to(device)
summary(model, input_size=(1, 28,28))
network = Network()
print(network)
network.fc2.weight.shape

for name, param in network.named_parameters():
    print(name, '\t\t', param.shape)


cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              80
       BatchNorm2d-2            [-1, 8, 28, 28]              16
         MaxPool2d-3            [-1, 8, 14, 14]               0
            Conv2d-4           [-1, 16, 14, 14]           1,168
       BatchNorm2d-5           [-1, 16, 14, 14]              32
         MaxPool2d-6             [-1, 16, 7, 7]               0
 AdaptiveAvgPool2d-7             [-1, 16, 1, 1]               0
            Linear-8                   [-1, 10]             170
Total params: 1,466
Trainable params: 1,466
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.16
Params size (MB): 0.01
Estimated Total Size (MB): 0.17
----------------------------------------------------------------
Network(
  (conv1): Conv2d(1, 8, kernel_s

In [46]:
torch.manual_seed(1)
batch_size = 128

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)


def train(model, device, train_loader, optimizer, epoch):
    model.train()
    pbar = tqdm(train_loader)
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        #loss = F.nll_loss(output, target)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}')


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.1f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

model = Network().to(device)
#optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(0, 20):
    print(epoch)
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

0


loss=1.6147176027297974 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.31it/s]



Test set: Average loss: -1.1008, Accuracy: 5062/10000 (50.6%)

1


loss=1.164997935295105 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.82it/s]



Test set: Average loss: -1.6387, Accuracy: 6547/10000 (65.5%)

2


loss=1.074528694152832 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.54it/s]



Test set: Average loss: -1.9531, Accuracy: 7171/10000 (71.7%)

3


loss=0.8416366577148438 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.67it/s]



Test set: Average loss: -2.1820, Accuracy: 7651/10000 (76.5%)

4


loss=0.6681423187255859 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.33it/s]



Test set: Average loss: -2.2537, Accuracy: 8088/10000 (80.9%)

5


loss=0.5396824479103088 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.79it/s]



Test set: Average loss: -2.3470, Accuracy: 8584/10000 (85.8%)

6


loss=0.46318402886390686 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 29.13it/s]



Test set: Average loss: -2.3556, Accuracy: 8517/10000 (85.2%)

7


loss=0.4312988519668579 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.05it/s]



Test set: Average loss: -2.5443, Accuracy: 8426/10000 (84.3%)

8


loss=0.37665653228759766 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.10it/s]



Test set: Average loss: -2.6050, Accuracy: 8773/10000 (87.7%)

9


loss=0.5190789103507996 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.64it/s]



Test set: Average loss: -2.5867, Accuracy: 9019/10000 (90.2%)

10


loss=0.364950567483902 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.20it/s]



Test set: Average loss: -2.7249, Accuracy: 9104/10000 (91.0%)

11


loss=0.38213708996772766 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 29.06it/s]



Test set: Average loss: -2.7691, Accuracy: 9144/10000 (91.4%)

12


loss=0.31533968448638916 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.67it/s]



Test set: Average loss: -2.7589, Accuracy: 9150/10000 (91.5%)

13


loss=0.43233048915863037 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.38it/s]



Test set: Average loss: -2.7514, Accuracy: 9006/10000 (90.1%)

14


loss=0.25698861479759216 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.93it/s]



Test set: Average loss: -2.7058, Accuracy: 9276/10000 (92.8%)

15


loss=0.3465283215045929 batch_id=468: 100%|██████████| 469/469 [00:16<00:00, 28.49it/s]



Test set: Average loss: -2.9220, Accuracy: 9236/10000 (92.4%)

16


loss=0.20077134668827057 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.69it/s]



Test set: Average loss: -2.7388, Accuracy: 9266/10000 (92.7%)

17


loss=0.26054638624191284 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.58it/s]



Test set: Average loss: -2.8578, Accuracy: 9268/10000 (92.7%)

18


loss=0.4773117005825043 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 29.87it/s]



Test set: Average loss: -2.9197, Accuracy: 9121/10000 (91.2%)

19


loss=0.39057254791259766 batch_id=468: 100%|██████████| 469/469 [00:15<00:00, 30.01it/s]



Test set: Average loss: -2.8473, Accuracy: 9174/10000 (91.7%)



In [None]:
# The Base file provided

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 8, 3, padding=1)      #input:28x28 Output:28x28 RF:3      #input: Output: RF:
        self.conv2 = nn.Conv2d(8, 16, 3, padding=1)     #input:28x28x8 Output:28x28x16 RF:5
        self.pool1 = nn.MaxPool2d(2, 2)                 #input:28x28x16 Output:14x14x16 RF: 10
        self.conv3 = nn.Conv2d(16, 32, 3, padding=1)   #input: Output: RF:
        self.conv4 = nn.Conv2d(32, 64, 3, padding=1)  #input: Output: RF:
        self.pool2 = nn.MaxPool2d(2, 2)                 #input: Output: RF:
        self.conv5 = nn.Conv2d(256, 512, 3)             #input: Output: RF:
        self.conv6 = nn.Conv2d(512, 1024, 3)            #input: Output: RF:
        self.conv7 = nn.Conv2d(1024, 10, 3)             #input: Output: RF:

        self.fc1 = nn.Linear(16*7*7, 10)
        #self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = F.relu(self.conv7(x))
        x = x.view(-1, 10)
        return F.log_softmax(x)

In [None]:
!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = Net().to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 28, 28]              80
         MaxPool2d-2            [-1, 8, 14, 14]               0
            Conv2d-3           [-1, 16, 14, 14]           1,168
         MaxPool2d-4             [-1, 16, 7, 7]               0
            Linear-5                   [-1, 10]           7,850
Total params: 9,098
Trainable params: 9,098
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.09
Params size (MB): 0.03
Estimated Total Size (MB): 0.13
----------------------------------------------------------------


In [None]:


torch.manual_seed(1)
batch_size = 128

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
    batch_size=batch_size, shuffle=True, **kwargs)


In [None]:
from tqdm import tqdm
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    pbar = tqdm(train_loader)
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        pbar.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}')


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [None]:

model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(1, 2):
    print(epoch)
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

loss=nan batch_id=468: 100%|██████████| 469/469 [00:18<00:00, 25.17it/s]



Test set: Average loss: nan, Accuracy: 980/10000 (10%)

