In [60]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self):
        super().__init__()# super().__init__()  is equal to  super(Net,self).__init__() where self is the object of 
        #child class which is used to access proxy objectof Parent class created by super method .
        
        
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        #Default value of stride is (1,1)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120,bias=True)
        self.fc2 = nn.Linear(in_features=120, out_features=84,bias=True)
        self.fc3 = nn.Linear(in_features=84, out_features=10,bias=True)

In [61]:
net=Net()
print(net)#this is printed because we have imported Module class of nn package and in this class 
# __repr__() method is overwritten in this class

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [62]:
print(type(net.conv1))
print(type(net))

#this shows how we have accessed conv1 object using net object of Net class
#conv1 is object we defined in our __init__ function is instance of torch.nn.modules.conv.Conv2d class

<class 'torch.nn.modules.conv.Conv2d'>
<class '__main__.Net'>


In [63]:
net.conv2

Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))

In [64]:
net.fc1

Linear(in_features=400, out_features=120, bias=True)

In [65]:
net.fc2

Linear(in_features=120, out_features=84, bias=True)

In [66]:
net.conv1.weight
#this shows how we can access weight object inside conv1 object 

#In output Below Parameter containing: means 
#the tensor printed is a special tensor which will learn the parameter values as we train our model
#adn will try to achieve value in suchway that loss function is minimum

Parameter containing:
tensor([[[[-0.0777, -0.1945,  0.1544, -0.1183,  0.0964],
          [ 0.0532,  0.0028, -0.1555,  0.1976,  0.1119],
          [ 0.0048,  0.0737,  0.1175, -0.1454,  0.0146],
          [ 0.1663, -0.0418, -0.1756, -0.1280, -0.1954],
          [ 0.0870,  0.1991, -0.0780,  0.1761, -0.1773]]],


        [[[ 0.1472, -0.0942, -0.0782, -0.0236,  0.1453],
          [ 0.0703, -0.1771, -0.1546,  0.0647,  0.0745],
          [ 0.1441, -0.0775, -0.0960,  0.0543, -0.1330],
          [ 0.1436,  0.0533, -0.1132, -0.1508,  0.0667],
          [-0.0324,  0.0884, -0.0381,  0.1769, -0.0810]]],


        [[[-0.1967,  0.0737,  0.0730,  0.1705, -0.1281],
          [-0.0587, -0.0366,  0.1587,  0.0676, -0.1368],
          [-0.0545,  0.1578, -0.1564, -0.1864, -0.0431],
          [ 0.1308, -0.1159,  0.1129,  0.1110,  0.0157],
          [ 0.1385,  0.1508,  0.1869, -0.0656,  0.0457]]],


        [[[-0.0945, -0.0274, -0.0213, -0.0197,  0.1984],
          [ 0.0754, -0.0368,  0.1723,  0.1376,  0.1327

In [67]:
print(net.conv1.weight.shape,'\n')
for i in range(net.conv1.weight.shape[0]):
    print(net.conv1.weight[i].shape)#accessing a specific filter from six filters
type(net.conv1.weight.shape)
#output shows number of output channels,no of input channels,height of kernel,width of kernel

torch.Size([6, 1, 5, 5]) 

torch.Size([1, 5, 5])
torch.Size([1, 5, 5])
torch.Size([1, 5, 5])
torch.Size([1, 5, 5])
torch.Size([1, 5, 5])
torch.Size([1, 5, 5])


torch.Size

In [68]:
type(net.conv1.weight)
#this shows that weight attribute belongs to Parameter class which is accessed by conv1 object

torch.nn.parameter.Parameter

In [69]:
param=list(net.parameters())
print(len(param))
for param in net.parameters():
    print(param.shape)

10
torch.Size([6, 1, 5, 5])
torch.Size([6])
torch.Size([16, 6, 5, 5])
torch.Size([16])
torch.Size([120, 400])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


## This shows that although we have defined :-
## 5 layers(2 convolution layers,3 fully connected layers) 
## We have got 10 parameters because 
## By default for each layer we have Weight tensor and Bias Tensor

In [70]:
for name , param in net.named_parameters():
    print(name ," SHAPE ---> ", param.shape,'\n')

conv1.weight  SHAPE --->  torch.Size([6, 1, 5, 5]) 

conv1.bias  SHAPE --->  torch.Size([6]) 

conv2.weight  SHAPE --->  torch.Size([16, 6, 5, 5]) 

conv2.bias  SHAPE --->  torch.Size([16]) 

fc1.weight  SHAPE --->  torch.Size([120, 400]) 

fc1.bias  SHAPE --->  torch.Size([120]) 

fc2.weight  SHAPE --->  torch.Size([84, 120]) 

fc2.bias  SHAPE --->  torch.Size([84]) 

fc3.weight  SHAPE --->  torch.Size([10, 84]) 

fc3.bias  SHAPE --->  torch.Size([10]) 



## Showing use of RELU Function

In [99]:
input=torch.randn(3)
print(input)
m = nn.functional.relu(input)
print(m)

#Rectifying Negative Input
#This also shows Relu Does not Reduce the size of Input  tensor given to it

tensor([-0.9573, -0.4994, -1.2536])
tensor([0., 0., 0.])


In [100]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        print(x.size(),'\n')
        
        #Activation Functions do not reduce the size of Filter Map Given to Them as Input
        relu_layer_1=F.relu(self.conv1(x))# Activation Relu is applied on First Convolutional Layer(conv1)
        print(relu_layer_1.size(),'\n')#Size has reduced from 32,32 in input to 28,28 after this RELU operation 
        #beacuse Input is First passed from convolution Layer then Relu Layer
        


        # Max pooling over a (2, 2) window
        x = F.max_pool2d(relu_layer_1, (2, 2))# Max Pooling is applied on output of Previous Relu Layer using 2,2 Max Pooling
        # If the size is a square you can only specify a single number
        print(x.size(),'\n')
        
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        print(x.size(),'\n')
        
        x = x.view(-1, self.num_flat_features(x))
        print(x.size(),'\n')
        
        x = F.relu(self.fc1(x))
        print(x.size(),'\n')
        
        x = F.relu(self.fc2(x))
        print(x.size(),'\n')
        
        x = self.fc3(x)
        print(x.size(),'\n')
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()
print(net)
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)
torch.Size([1, 1, 32, 32]) 

torch.Size([1, 6, 28, 28]) 

torch.Size([1, 6, 14, 14]) 

torch.Size([1, 16, 5, 5]) 

torch.Size([1, 400]) 

torch.Size([1, 120]) 

torch.Size([1, 84]) 

torch.Size([1, 10]) 

tensor([[ 0.0592, -0.0786,  0.0614, -0.0773, -0.0693, -0.1795,  0.1136,  0.0597,
          0.1579, -0.0359]], grad_fn=<AddmmBackward>)


In [101]:
net.zero_grad()#Setting Gradient to Zero to Deal with Problem of Accumulating Gradient
out.backward(torch.randn(1, 10))

In [102]:
#output = net(input)

target = torch.randn(10)  # a dummy target, for example
print(target)

target = target.view(1, -1)  # make it the same shape as output
print(target)
criterion = nn.MSELoss()
print(type(criterion))
loss = criterion(output, target)
print(loss)


tensor([-0.1848, -0.7626,  2.0601, -0.3574, -0.2599, -0.2325,  0.7191, -0.1894,
         0.6062, -0.1902])
tensor([[-0.1848, -0.7626,  2.0601, -0.3574, -0.2599, -0.2325,  0.7191, -0.1894,
          0.6062, -0.1902]])
<class 'torch.nn.modules.loss.MSELoss'>
tensor(0.6049, grad_fn=<MseLossBackward>)


## So, when we call loss.backward(), the whole graph is differentiated w.r.t. the loss, and all Tensors in the graph that has requires_grad=True will have their .grad Tensor accumulated with the gradient.

In [103]:
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU

<MseLossBackward object at 0x7f5f8a414b00>
<AddmmBackward object at 0x7f5f8a4142b0>
<AccumulateGrad object at 0x7f5f8a414b00>


# To Print whole Graph

In [104]:
def print_graph(g, level=0):
    if g == None: return
    print('*'*level*4, g,'\n')
    for subg in g.next_functions:
        print_graph(subg[0], level+1)

print_graph(loss.grad_fn, 0)

 <MseLossBackward object at 0x7f5f8a414a20> 

**** <AddmmBackward object at 0x7f5f88380b00> 

******** <AccumulateGrad object at 0x7f5f88380588> 

******** <ReluBackward0 object at 0x7f5f883809e8> 

************ <AddmmBackward object at 0x7f5f88380780> 

**************** <AccumulateGrad object at 0x7f5f88380908> 

**************** <ReluBackward0 object at 0x7f5f88380cc0> 

******************** <AddmmBackward object at 0x7f5f88380dd8> 

************************ <AccumulateGrad object at 0x7f5f88380e48> 

************************ <ViewBackward object at 0x7f5f88380160> 

**************************** <MaxPool2DWithIndicesBackward object at 0x7f5f88380978> 

******************************** <ReluBackward0 object at 0x7f5fc4259668> 

************************************ <MkldnnConvolutionBackward object at 0x7f5fc42599e8> 

**************************************** <MaxPool2DWithIndicesBackward object at 0x7f5fc4259c18> 

******************************************** <ReluBackward0 object at 

## Complete example is Explained Below

In [134]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()
print(net)


input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)


net.zero_grad()
out.backward(torch.randn(1, 10))

output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss,'\n')


Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)
tensor([[-0.0272,  0.0164, -0.1269,  0.0983,  0.0991, -0.0832, -0.0999, -0.0658,
          0.0328, -0.0606]], grad_fn=<AddmmBackward>)
tensor(1.7300, grad_fn=<MseLossBackward>) 

When we print the net.conv1.weight.grad we get 6 arrays of size (5,5) this clears that we have given 
one input image and (5,5) kernel and (5,5) kernel means this kernel of height 5 and width 5 
will traverse the image in 25 steps and we want 6 of them so we are getting 6 arrays of (5,5) as output
 

conv1.weight.grad before backward
tensor([[[[ 4.3141e-02,  3.1216e-02,  1.9751e-02,  3.5670e-02,  1.1059e-02],
          [ 4.8656e-05, -6.7129e-03, -2.8760e-02, -2.9703e-02, -2.1257e-02],
          [ 7.0798e-02

## When we print the net.conv1.weight.grad we get 6 arrays of size (5,5) this clears that we have given one input image and (5,5) kernel and (5,5) kernel means this kernel of height 5 and width 5 will traverse the image in 25 steps and we want 6 of them so we are getting 6 arrays of (5,5) as output


In [135]:
print('conv1.weight.grad before backward')
print(net.conv1.weight.grad,'\n')

print(net.conv1.weight.shape,'\n')
loss.backward(retain_graph=True)

print('conv1.weight.grad after backward')
print(net.conv1.weight.grad)

conv1.weight.grad before backward
tensor([[[[ 0.0544,  0.0368,  0.0284,  0.0433,  0.0144],
          [-0.0032, -0.0185, -0.0275, -0.0048, -0.0072],
          [ 0.0741, -0.0431,  0.0497,  0.0019,  0.0232],
          [ 0.0540,  0.0136,  0.0619,  0.0720,  0.0349],
          [ 0.0563,  0.0059, -0.0006,  0.0101, -0.0184]]],


        [[[-0.0026, -0.0591,  0.0008, -0.0285,  0.1127],
          [ 0.0595,  0.0312, -0.0284, -0.0480,  0.0202],
          [ 0.0357,  0.0656,  0.0037, -0.0018, -0.1155],
          [ 0.0070, -0.0572,  0.0386,  0.0759,  0.0893],
          [ 0.0107,  0.0403,  0.0410, -0.0261,  0.0412]]],


        [[[ 0.0008,  0.0561, -0.1083, -0.0312, -0.0524],
          [ 0.0085, -0.0596,  0.0425, -0.0999,  0.0429],
          [ 0.0691, -0.0310, -0.0580,  0.0124,  0.0751],
          [-0.0535,  0.0048, -0.0064,  0.0049,  0.0062],
          [ 0.0263, -0.0630,  0.0477, -0.0914,  0.0361]]],


        [[[ 0.0289, -0.1104,  0.0237, -0.0938, -0.0322],
          [-0.0858,  0.0042,  0.0538,  0.0

## In Stochastic Gradient Descent (SGD) update rule used is 

# weight = weight - learning_rate * gradient



In [193]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward(retain_graph=1)
optimizer.step()    # Does the update

print(optimizer)

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    momentum: 0
    nesterov: False
    weight_decay: 0
)
