# A typical training procedure for a neural network is as follows:

Define the neural network that has some learnable parameters (or weights)
Iterate over a dataset of inputs
Process input through the network
Compute the loss (how far is the output from being correct)
Propagate gradients back into the network’s parameters
Update the weights of the network, typically using a simple update rule: weight = weight - learning_rate * gradient

# Defining the Neural Network

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [18]:
import torch.optim as optim

In [2]:
class model(nn.Module):
    """Small convolutional network: two conv stages followed by three linear layers.

    Each conv stage is a 5x5 convolution, a ReLU and a 2x2 max pool. The
    result is flattened per sample and passed through fc1 -> fc2 -> fc3,
    with ReLU after the first two linear layers only. fc1 expects
    16*5*5 = 400 flattened features and fc3 produces 10 output values.
    """

    def __init__(self):
        super(model, self).__init__()
        # nn.Conv2d(in_channels, out_channels, kernel_size)
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # nn.Linear(in_features, out_features)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run ``x`` through the conv stages and linear layers; return fc3's output."""
        # Convolution -> ReLU -> 2x2 max pooling, once per conv layer.
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), 2)

        # Collapse every non-batch dimension into one feature axis.
        x = x.view(-1, self.num_flat_features(x))

        # Hidden linear layers are followed by ReLU; the final layer is not.
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

    def num_flat_features(self, x):
        """Return the product of all of ``x``'s dimensions except the first (batch) one."""
        count = 1
        for dim in x.size()[1:]:
            count *= dim
        return count

In [3]:
# Build an instance of the network class defined above.
ffnn = model()
print(ffnn) # print the module's layer-by-layer summary

model(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [None]:
# Get the learnable parameters. `parameters` is a method and has to be called:
# it yields the module's parameter tensors, which list() collects. Passing the
# bound method itself to list() raises TypeError because it is not iterable.
l = list(ffnn.parameters())

In [4]:
# Random input tensor of shape (1, 1, 32, 32) with values drawn by torch.rand.
t = torch.rand(1,1,32,32)
# Forward pass: calling the module runs its forward() on the input.
out = ffnn(t)
print(out)

tensor([[-0.0670, -0.0426,  0.0892,  0.0375,  0.1500,  0.0769,  0.0445, -0.0166,
         -0.1227,  0.1012]], grad_fn=<ThAddmmBackward>)


In [5]:
# Dummy target: a 1-D tensor holding 10 random values.
target = torch.rand((10,))

In [9]:
# Reshape the target into a single row (1, N); -1 lets view() infer N.
target = target.view(1,-1)
print(target)

tensor([[0.7606, 0.1782, 0.6753, 0.9203, 0.8952, 0.8738, 0.9695, 0.4205, 0.5126,
         0.3479]])


In [11]:
# Mean-squared-error criterion.
costFunction = nn.MSELoss()
# Compare the network output with the target; the result is the loss tensor
# that backward() is later called on.
loss = costFunction(out, target)

In [12]:
# Show the loss value computed by the criterion.
print(loss)

tensor(0.4558, grad_fn=<MseLossBackward>)


# Computation graph
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
      -> view -> linear -> relu -> linear -> relu -> linear
      -> MSELoss
      -> loss

In [14]:
# Clear the existing gradients before the backward pass; otherwise the new
# gradients would be accumulated into the existing ones.
ffnn.zero_grad()

In [16]:
# Gradient of the conv1 bias before the backward pass.
print(ffnn.conv1.bias.grad)

# Backpropagate the loss, then print the conv1 bias gradient again to see
# the values filled in by backward().
loss.backward()
print(ffnn.conv1.bias.grad)

None
tensor([-0.0067,  0.0010,  0.0004, -0.0032,  0.0073,  0.0007])


In [19]:
# Update the parameters with one step of plain gradient descent:
#     weight = weight - learning_rate * gradient
learning_rate = 0.001
# torch.no_grad() keeps the in-place update from being recorded by autograd
# without going through `.data`, which hides in-place changes from autograd.
with torch.no_grad():
    for f in ffnn.parameters():
        # A parameter that received no gradient has nothing to apply.
        if f.grad is None:
            continue
        f.sub_(learning_rate * f.grad)

In [21]:
# Alternative to the manual update: let a torch.optim optimizer apply the step.

# Create the optimizer over the network's parameters (SGD, learning rate 0.01).
optimizer = optim.SGD(ffnn.parameters(), lr=0.01)
# Reset gradients so this step does not add to previously computed ones.
optimizer.zero_grad()
# Recompute the forward pass and the loss, then backpropagate.
out = ffnn(t)
loss = costFunction(out, target)
loss.backward()
optimizer.step() # apply the update to the parameters

In [22]:
# List all parameter tensors of the network (the notebook displays the value).
list(ffnn.parameters())

[Parameter containing:
 tensor([[[[-0.0546, -0.1184,  0.0429, -0.1313, -0.0570],
           [ 0.0446,  0.1544,  0.1698, -0.0569,  0.1383],
           [-0.0213,  0.1659, -0.1411, -0.1192, -0.0773],
           [ 0.0457,  0.1573, -0.0105,  0.1066, -0.0191],
           [-0.0864, -0.0465, -0.0838, -0.1686, -0.0662]]],
 
 
         [[[ 0.0696, -0.1745, -0.1891, -0.0568, -0.1134],
           [ 0.0048,  0.0789,  0.1468,  0.1652,  0.0167],
           [-0.1206, -0.1288,  0.0023,  0.1276,  0.1561],
           [-0.1229, -0.1944,  0.0661,  0.1063,  0.0510],
           [ 0.0936,  0.1287, -0.1423, -0.1465, -0.0178]]],
 
 
         [[[ 0.1786, -0.1462, -0.1645,  0.1352,  0.0487],
           [ 0.0683, -0.1894, -0.1746, -0.0006,  0.0321],
           [-0.0340,  0.0078, -0.1830,  0.1288, -0.0723],
           [-0.0759, -0.1581,  0.1124, -0.0458, -0.1766],
           [ 0.0570, -0.0219, -0.1381,  0.1339, -0.0287]]],
 
 
         [[[ 0.0170, -0.0462,  0.0200, -0.1292, -0.0942],
           [ 0.0621, -0.1699, -