In [0]:
from __future__ import print_function
import torch                   ## importing pytorch   
import torch.nn as nn  ##PyTorch provides the torch.nn module to help us in creating and  training of the neural network 

import torch.nn.functional as F ## The torch.nn.functional area specifically gives us access to some handy functions that we might not want to write ourselves. We will be using the relu or "rectified linear" activation function for our neurons. Instead of writing all of the code for these things, we can just import them, since these are things everyone will be needing in their deep learning code. 
import torch.optim as optim  ###torch.optim is a package implementing various optimization algorithms   
from torchvision import datasets, transforms  ###The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision. 

In [0]:
class Net(nn.Module):
    """Fully convolutional classifier for single-channel 28x28 images.

    Two padded conv/conv/max-pool stages shrink the feature map
    28x28 -> 14x14 -> 7x7; three unpadded 3x3 convolutions then reduce it
    7x7 -> 5x5 -> 3x3 -> 1x1 while mapping to 10 output channels, one per
    class.  ``forward`` returns log-probabilities of shape (N, 10), suitable
    for ``F.nll_loss``.

    The "RF" notes give the receptive field (in input pixels) after each
    layer: a 3x3 stride-1 convolution adds 2 * jump, a 2x2 stride-2 max-pool
    adds 1 * jump and then doubles the jump.
    """

    def __init__(self):
        super(Net, self).__init__()                      # run nn.Module's constructor first
        # Stage 1 (padding=1 keeps the spatial size)
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)      # in 1    -> out 32    RF 3
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)     # in 32   -> out 64    RF 5
        self.pool1 = nn.MaxPool2d(2, 2)                  # halves H and W       RF 6
        # Stage 2
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)    # in 64   -> out 128   RF 10
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)   # in 128  -> out 256   RF 14
        self.pool2 = nn.MaxPool2d(2, 2)                  # halves H and W       RF 16
        # Head: no padding, so each layer trims 2 pixels from H and W
        self.conv5 = nn.Conv2d(256, 512, 3)              # in 256  -> out 512   RF 24
        self.conv6 = nn.Conv2d(512, 1024, 3)             # in 512  -> out 1024  RF 32
        self.conv7 = nn.Conv2d(1024, 10, 3)              # in 1024 -> out 10    RF 40

    def forward(self, x):
        """Compute class log-probabilities for a batch of images.

        Args:
            x: float tensor of shape (N, 1, 28, 28).

        Returns:
            Tensor of shape (N, 10) whose rows are log-softmax scores.
        """
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))
        # NOTE(review): conv6 feeds conv7 with no activation in between, so
        # those two layers compose into a single linear map; kept as in the
        # original design.
        x = self.conv7(self.conv6(F.relu(self.conv5(x))))
        # No ReLU after conv7: the raw scores go straight into log_softmax.
        x = x.view(-1, 10)                               # (N, 10, 1, 1) -> (N, 10)
        # dim=1 normalises over the 10 classes; omitting it relies on the
        # deprecated implicit-dimension behaviour and raises a warning.
        return F.log_softmax(x, dim=1)

In [0]:
!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = Net().to(device)
summary(model, input_size=(1, 28, 28))

#### Summary of the network defined above: 32, 64, 128, … is the number of output channels of each layer.

## 28×28, 14×14, … is the spatial size of each layer's output feature map (padding=1 keeps the size through a 3×3 convolution; each max-pool halves it).

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
            Conv2d-2           [-1, 64, 28, 28]          18,496
         MaxPool2d-3           [-1, 64, 14, 14]               0
            Conv2d-4          [-1, 128, 14, 14]          73,856
            Conv2d-5          [-1, 256, 14, 14]         295,168
         MaxPool2d-6            [-1, 256, 7, 7]               0
            Conv2d-7            [-1, 512, 5, 5]       1,180,160
            Conv2d-8           [-1, 1024, 3, 3]       4,719,616
            Conv2d-9             [-1, 10, 1, 1]          92,170
Total params: 6,379,786
Trainable params: 6,379,786
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 1.51
Params size (MB): 24.34
Estimated Total Size (MB): 25.85
-------------------------------------



In [0]:


torch.manual_seed(1)  # seed PyTorch's global RNG so runs are repeatable
batch_size = 128

# Extra DataLoader options are only passed when batches will be moved to a GPU.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

# One preprocessing pipeline shared by both splits: convert the image to a
# float tensor, then normalise with the given mean and std
# (presumably the MNIST training-set statistics; verify if the data changes).
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Training split: downloaded on first use and reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=mnist_transform),
    batch_size=batch_size, shuffle=True, **kwargs)

# Test split: read from the files fetched above. Evaluation sums over the
# whole set, so order is irrelevant; shuffle=False keeps it deterministic and
# avoids drawing from the RNG.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=mnist_transform),
    batch_size=batch_size, shuffle=False, **kwargs)
'''Load the MNIST train and test sets; each image is converted to a tensor and then normalised. 

num_workers (int, optional): how many subprocesses to use for data loading. 

pin_memory (bool, optional): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them.  

transforms.Compose: composes several transforms together, applying them in the order listed. 

 

torchvision.transforms.Normalize(mean, std, inplace=False) 

Normalize a tensor image with mean and standard deviation. Given mean: (M1,...,Mn) and std: (S1,..,Sn) for n channels, this transform will normalize each channel of the input, i.e. output[channel] = (input[channel] - mean[channel]) / std[channel]. 

 

torchvision.transforms.ToTensor 

Convert a PIL Image or numpy.ndarray to tensor. 

Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] 
'''
 

'loading data – train ,test . Doing transformation by converting to tensor and normalisation. \n\nnum_workers (int, optional): how many subprocesses to use for data loading. \n\npin_memory (bool, optional): If ``True``, the data loader will copy Tensors into CUDA pinned memory before returning them.  \n\nTransforms.Compose :- Composes several transforms together. \n\n \n\ntorchvision.transforms.Normalize(mean, std, inplace=False)[SOURCE] \n\nNormalize a tensor image with mean and standard deviation. Given mean: (M1,...,Mn) and std: (S1,..,Sn) for n channels, this transform will normalize each channel of the input  \n\n \n\ntorchvision.transforms.ToTensor[SOURCE] \n\nConvert a PIL Image or numpy.ndarray to tensor. \n\nConverts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] \n'

In [0]:
from tqdm import tqdm                  
def train(model, device, train_loader, optimizer, epoch):
    """Run one pass over ``train_loader``, updating ``model`` in place.

    Args:
        model: network to optimise; switched to training mode here.
        device: device each batch is moved to before the forward pass.
        train_loader: iterable yielding ``(data, target)`` batches.
        optimizer: optimiser bound to ``model``'s parameters.
        epoch: accepted for the caller's convenience; not used in the body.

    Returns:
        None.  The running loss is shown in the tqdm progress-bar label.
    """
    model.train()
    progress = tqdm(train_loader)
    for step, (inputs, labels) in enumerate(progress):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()

        progress.set_description(desc= f'loss={batch_loss.item()} batch_id={step}')


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    

''' 

model.train() tells your model that you are training the model. So effectively layers like dropout, batchnorm etc. which behave different on the train and test procedures know what is going on and hence can behave accordingly. 

More details: It sets the mode to train (see source code). You can call either model.eval() or model.train(mode=False) to tell that you are testing. One might expect a method named train() to actually train the model, but it does not; it only sets the mode. 

      optimizer.zero_grad() 

Since the backward() function accumulates gradients, and you don’t want to mix up gradients between minibatches, you have to zero them out at the start of a new minibatch. This is exactly like how a general (additive) accumulator variable is initialized to 0 in code. 

By the way, the best practice is to use the zero_grad() function on the optimizer. 

 

optimizer.step() performs a parameter update based on the current gradient (stored in the .grad attribute of a parameter) and the update rule. As an example, the update rule for SGD is defined here: 
https://github.com/pytorch/pytorch/blob/cd9b27231b51633e76e28b6a34002ab83b0660fc/torch/optim/sgd.py#L63 

Calling .backward() multiple times accumulates the gradient (by addition) for each parameter. This is why you should call optimizer.zero_grad() after each .step() call. Note that following the first .backward() call, a second call is only possible after you have performed another forward pass. 
'''

' \n\nmodel.train() tells your model that you are training the model. So effectively layers like dropout, batchnorm etc. which behave different on the train and test procedures know what is going on and hence can behave accordingly. \n\nMore details: It sets the mode to train (see source code). You can call either model.eval() or model.train(mode=False) to tell that you are testing. It is somewhat intuitive to expect train function to train model but it does not do that. It just sets the mode. \n\n      optimizer.zero_grad() \n\nSince the backward() function accumulates gradients, and you don’t want to mix up gradients between minibatches, you have to zero them out at the start of a new minibatch. This is exactly like how a general (additive) accumulator variable is initialized to 0 in code. \n\nBy the way, the best practice is to use the zero_grad() 2.8k function on the optimizer. \n\n \n\noptimizer.step is performs a parameter update based on the current gradient (stored in .grad att

In [0]:

# Hyper-parameters for this run, named so they can be tuned in one place.
LEARNING_RATE = 0.01  # lr: SGD step size
MOMENTUM = 0.9        # SGD momentum factor
NUM_EPOCHS = 1        # number of full passes over the training set

# Start from a freshly initialised network so re-running the cell retrains from scratch.
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

# Epochs are numbered from 1; evaluate on the test set after each one.
for epoch in range(1, NUM_EPOCHS + 1):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)



loss=2.3033065795898438 batch_id=0:   0%|          | 0/469 [00:00<?, ?it/s][A
loss=2.3033065795898438 batch_id=0:   0%|          | 1/469 [00:00<00:49,  9.50it/s][A
loss=2.300729274749756 batch_id=1:   0%|          | 1/469 [00:00<00:49,  9.50it/s] [A
loss=2.3013687133789062 batch_id=2:   0%|          | 1/469 [00:00<00:49,  9.50it/s][A
loss=2.299576759338379 batch_id=3:   0%|          | 1/469 [00:00<00:49,  9.50it/s] [A
loss=2.299576759338379 batch_id=3:   1%|          | 4/469 [00:00<00:40, 11.50it/s][A
loss=2.302168369293213 batch_id=4:   1%|          | 4/469 [00:00<00:40, 11.50it/s][A
loss=2.2996318340301514 batch_id=5:   1%|          | 4/469 [00:00<00:40, 11.50it/s][A
loss=2.2996318340301514 batch_id=5:   1%|▏         | 6/469 [00:00<00:35, 13.14it/s][A
loss=2.298245429992676 batch_id=6:   1%|▏         | 6/469 [00:00<00:35, 13.14it/s] [A
loss=2.2952821254730225 batch_id=7:   1%|▏         | 6/469 [00:00<00:35, 13.14it/s][A
loss=2.2946512699127197 batch_id=8:   1%|▏         


Test set: Average loss: 0.0661, Accuracy: 9780/10000 (98%)

