- [Source_blog](https://mattmazur.com/2015/03/17/a-step-by-step-backpropagation-example/)

In [1]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy
from sklearn import datasets

## Simple Perceptron

![image](https://matthewmazur.files.wordpress.com/2018/03/neural_network-9.png)

In [2]:
def init_weight(m):
    """Overwrite the parameters of a Linear/Conv2d module with fixed values.

    Intended to be passed a single module ``m``. Modules whose exact type is
    neither ``nn.Conv2d`` nor ``nn.Linear`` are left untouched. For a matching
    module the weight data becomes the 2x2 matrix
    ``[[0.15, 0.20], [0.25, 0.30]]`` and the bias data becomes the one-element
    tensor ``[0.35]``, regardless of the shapes the module was created with.
    """
    handled_types = (nn.Conv2d, nn.Linear)
    # Exact type match on purpose: subclasses of these layers are skipped.
    if type(m) not in handled_types:
        return

    fixed_weight = torch.Tensor([[0.15, 0.20], [0.25, 0.30]])
    fixed_bias = torch.Tensor([0.35])
    m.weight.data = fixed_weight
    m.bias.data = fixed_bias

In [3]:
## Model Set Up
class Perceptron(nn.Module):
    """Small two-layer network: Linear -> sigmoid -> Linear -> sigmoid.

    Both layers are created from the constructor arguments and are then given
    fixed parameters: a 2x2 weight matrix and a one-element bias each. The
    effective network is therefore always 2 inputs -> 2 hidden units ->
    2 outputs, whatever sizes are passed in.

    The one-element bias broadcasts over both units of its layer, so each
    layer has a single shared bias value that is trained as one parameter.
    """

    def __init__(self,input_size,H1,output_size):
        """Build both layers and replace their parameters with fixed values.

        Args:
            input_size: in-features used to construct the first layer.
            H1: hidden width used to construct both layers.
            output_size: out-features used to construct the second layer.

        The sizes only affect how the layers are constructed; the fixed
        parameters assigned below determine the shapes actually used.
        """
        super().__init__()

        # Manual weight initialization, input -> hidden nodes [refer above image].
        # Assigning a fresh nn.Parameter keeps the tensor registered and
        # trainable without mutating `.data` behind autograd's back.
        self.linear=nn.Linear(input_size,H1)
        self.linear.weight=nn.Parameter(torch.tensor([[0.15,0.20],[0.25,0.30]]))
        self.linear.bias=nn.Parameter(torch.tensor([0.35]))

        # Manual weight initialization, hidden -> output nodes [refer above image].
        self.linear2=nn.Linear(H1,output_size)
        self.linear2.weight=nn.Parameter(torch.tensor([[0.40,0.45],[0.50,0.55]]))
        self.linear2.bias=nn.Parameter(torch.tensor([0.60]))

    def forward(self,x, print_values=False):
        """Run the forward pass and return the sigmoid-activated outputs.

        Args:
            x: input tensor whose last dimension has 2 features; it may be a
                single sample of shape ``(2,)`` or a batch ``(..., 2)``.
            print_values: when True, print the pre-activation and activation
                of each hidden and output unit.

        Returns:
            Tensor of output activations with the same leading shape as ``x``
            and 2 values on the last axis.
        """
        net_h = self.linear(x)            # hidden pre-activations
        out_h = torch.sigmoid(net_h)      # hidden activations
        net_O = self.linear2(out_h)       # output pre-activations
        out_O = torch.sigmoid(net_O)      # output activations

        if print_values:
            # Select units on the last (feature) axis. Indexing axis 0 would
            # pick batch rows for batched input and fail for a batch of one.
            print("h1: {}, h2: {}".format(net_h[..., 0], net_h[..., 1]))
            print("out_h1: {}, out_h2: {}".format(out_h[..., 0], out_h[..., 1]))
            print("net_O1: {}, net_O2: {}".format(net_O[..., 0], net_O[..., 1]))
            print("out_O1: {}, out_O2: {}".format(out_O[..., 0], out_O[..., 1]))

        return out_O

## Unfold each epoch and check intermediate values


### Initial Weight of the model

In [4]:
# Build the network and show its initial parameters. The constructor replaces
# both layers with fixed 2x2 weights, so the sizes passed here are 2-2-2 to
# agree with the two-output network that is actually printed and trained.
model=Perceptron(2,2,2)
print (list(model.parameters()))

[Parameter containing:
tensor([[0.1500, 0.2000],
        [0.2500, 0.3000]], requires_grad=True), Parameter containing:
tensor([0.3500], requires_grad=True), Parameter containing:
tensor([[0.4000, 0.4500],
        [0.5000, 0.5500]], requires_grad=True), Parameter containing:
tensor([0.6000], requires_grad=True)]


### After one forward pass, check the hidden activations, outputs, and loss

![image](blog_1.png)

In [5]:
#model.forward(torch.Tensor([0.05,0.10]))

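$$\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad l_n = \left( x_n - y_n \right)^2, \qquad \text{and with the default } \texttt{reduction='mean'}: \quad \ell(x, y) = \operatorname{mean}(L) = \frac{1}{N}\sum_{n=1}^{N} l_n$$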

In [6]:
##??nn.MSELoss

In [7]:
# Fresh model, loss function and optimizer for one hand-checked training step.
model=Perceptron(2,2,2)
criterion=nn.MSELoss()
optimizer=torch.optim.SGD(model.parameters(),lr=0.5)

# Call the module itself rather than .forward(): nn.Module.__call__ runs the
# same forward pass and also dispatches any registered hooks.
output=model(torch.tensor([0.05,0.10]))
target=torch.tensor([0.01,0.99])

# MSELoss averages the squared errors over the two outputs by default.
loss=criterion(output,target)
print("total MSEerror: {}".format(loss.item()))

total MSEerror: 0.2983711063861847


### Backward pass

In [8]:
# One SGD update: clear stale gradients, backpropagate the loss computed in
# the previous cell, then let the optimizer adjust the parameters.
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Parameters after the first update.
print([*model.parameters()])

[Parameter containing:
tensor([[0.1498, 0.1996],
        [0.2498, 0.2995]], requires_grad=True), Parameter containing:
tensor([0.3406], requires_grad=True), Parameter containing:
tensor([[0.3589, 0.4087],
        [0.5113, 0.5614]], requires_grad=True), Parameter containing:
tensor([0.5498], requires_grad=True)]


### 2nd Forward Pass

In [9]:
# Second forward pass on the same input, using the updated parameters.
# The module is called directly (not .forward()) so nn.Module.__call__ runs.
y_output = model(torch.tensor([0.05,0.10]))

### 2nd Backward pass

In [10]:
# Loss of the second forward pass against the same target.
loss = criterion(y_output, target)

# Second SGD update: reset gradients, backpropagate, step.
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Parameters after the second update.
print([param for param in model.parameters()])

[Parameter containing:
tensor([[0.1496, 0.1992],
        [0.2495, 0.2991]], requires_grad=True), Parameter containing:
tensor([0.3327], requires_grad=True), Parameter containing:
tensor([[0.3171, 0.3666],
        [0.5232, 0.5733]], requires_grad=True), Parameter containing:
tensor([0.4991], requires_grad=True)]


## Reinitializing the model and looping over epochs

In [16]:
model=Perceptron(2,2,2)
criterion=nn.MSELoss()
optimizer=torch.optim.SGD(model.parameters(),lr=0.5)

# Training configuration: total number of updates and how often to log.
NUM_EPOCHS = 100
LOG_EVERY = 10

input_x = torch.tensor([0.05,0.10])
# Defined here as well so this cell does not silently depend on `target`
# left over from an earlier cell.
target = torch.tensor([0.01,0.99])

for i in range(NUM_EPOCHS):
    # Call the module itself (not .forward()) so nn.Module.__call__ runs.
    y_output=model(input_x)

    loss=criterion(y_output,target)
    # Gradients accumulate by default, so reset them before each backward pass.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if i%LOG_EVERY == 0:
        # Loss and output are the values computed before this epoch's update.
        print(f'Epoch: {i}, loss: {loss.item()}, output: {y_output}')
        print("="*60)

Epoch: 0, loss: 0.2983711063861847, output: tensor([0.7514, 0.7729], grad_fn=<SigmoidBackward>)
Epoch: 10, loss: 0.17523802816867828, output: tensor([0.5335, 0.7135], grad_fn=<SigmoidBackward>)
Epoch: 20, loss: 0.10270833224058151, output: tensor([0.3705, 0.7153], grad_fn=<SigmoidBackward>)
Epoch: 30, loss: 0.06461693346500397, output: tensor([0.2801, 0.7528], grad_fn=<SigmoidBackward>)
Epoch: 40, loss: 0.04308116436004639, output: tensor([0.2253, 0.7905], grad_fn=<SigmoidBackward>)
Epoch: 50, loss: 0.03048398159444332, output: tensor([0.1893, 0.8202], grad_fn=<SigmoidBackward>)
Epoch: 60, loss: 0.022741930559277534, output: tensor([0.1640, 0.8425], grad_fn=<SigmoidBackward>)
Epoch: 70, loss: 0.017712488770484924, output: tensor([0.1455, 0.8594], grad_fn=<SigmoidBackward>)
Epoch: 80, loss: 0.014273736625909805, output: tensor([0.1314, 0.8725], grad_fn=<SigmoidBackward>)
Epoch: 90, loss: 0.01181731652468443, output: tensor([0.1203, 0.8829], grad_fn=<SigmoidBackward>)


**Note:** at line 3 of the cell above, if we use the `Adam` optimizer instead of `SGD`, the model learns faster and needs fewer epochs. For example, with Adam the model learns the optimum weights in only 10 epochs.

In [12]:
# Show the model's current parameters (the trained values when the cells are
# run top to bottom).
print([*model.parameters()])

[Parameter containing:
tensor([[0.1731, 0.2462],
        [0.2726, 0.3452]], requires_grad=True), Parameter containing:
tensor([1.2642], requires_grad=True), Parameter containing:
tensor([[-1.1495, -1.1076],
        [ 1.5021,  1.5570]], requires_grad=True), Parameter containing:
tensor([-0.3011], requires_grad=True)]


## Prediction

In [13]:
# Predict for an input close to the training sample. The module is called
# directly (not .forward()) so nn.Module.__call__ runs.
print (model(torch.tensor([0.06,0.12])))

tensor([0.1111, 0.8918], grad_fn=<SigmoidBackward>)


## Checking gradients of the particular layer

In [14]:
# Gradient currently stored on the second layer's weight; when the cells are
# run in order this is what the last loss.backward() call left behind.
print (model.linear2.weight.grad)

tensor([[ 0.0080,  0.0080],
        [-0.0076, -0.0076]])


In [15]:
# End-of-notebook marker: confirms every cell above ran to completion.
print("DONE")

DONE
