# Analysis of Backpropagation with Examples in PyTorch
Author: Pierre Nugues

In [1]:
import torch
import torch.nn as nn

## The architectures

The network architecture from the book

In [2]:
class NetBook(nn.Module):
    """Bias-free feed-forward network 3 -> 4 -> 2 -> 1 with ReLU hidden layers.

    The output layer applies a sigmoid. Every intermediate value of the
    forward pass is stored on the instance (``z1``, ``h1``, ``z2``, ``h2``,
    ``z3``, ``h3``) so that it can be inspected after the call.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the order fc1, fc2, fc3
        self.fc1 = nn.Linear(in_features=3, out_features=4, bias=False)
        self.fc2 = nn.Linear(in_features=4, out_features=2, bias=False)
        self.fc3 = nn.Linear(in_features=2, out_features=1, bias=False)

    def forward(self, x):
        """Propagate ``x`` through the three layers and return ``h3``.

        Each call overwrites the cached ``z*`` / ``h*`` attributes with the
        values computed for this ``x``.
        """
        # Hidden layer 1: linear map, then ReLU
        self.z1 = self.fc1(x)
        self.h1 = torch.relu(self.z1)

        # Hidden layer 2: linear map, then ReLU
        self.z2 = self.fc2(self.h1)
        self.h2 = torch.relu(self.z2)

        # Output layer: linear map, then sigmoid
        self.z3 = self.fc3(self.h2)
        self.h3 = torch.sigmoid(self.z3)
        return self.h3

The simplified network architecture

In [3]:
class Net(nn.Module):
    """Simplified bias-free network 3 -> 4 -> 2 -> 1 with linear hidden layers.

    Only the output layer has an activation (a sigmoid). The intermediate
    values ``z1``, ``z2``, ``z3`` and the output ``h3`` are stored on the
    instance so that they can be inspected after the call.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the order fc1, fc2, fc3
        self.fc1 = nn.Linear(in_features=3, out_features=4, bias=False)
        self.fc2 = nn.Linear(in_features=4, out_features=2, bias=False)
        self.fc3 = nn.Linear(in_features=2, out_features=1, bias=False)

    def forward(self, x):
        """Propagate ``x`` through the three linear layers and return ``h3``.

        Each call overwrites the cached ``z1``, ``z2``, ``z3`` and ``h3``
        attributes with the values computed for this ``x``.
        """
        # Two purely linear hidden layers (no activation in between)
        self.z1 = self.fc1(x)
        self.z2 = self.fc2(self.z1)

        # Output layer: linear map, then sigmoid
        self.z3 = self.fc3(self.z2)
        self.h3 = torch.sigmoid(self.z3)
        return self.h3

In [4]:
net_book = NetBook()
net_book

NetBook(
  (fc1): Linear(in_features=3, out_features=4, bias=False)
  (fc2): Linear(in_features=4, out_features=2, bias=False)
  (fc3): Linear(in_features=2, out_features=1, bias=False)
)

In [5]:
net = Net()
net

Net(
  (fc1): Linear(in_features=3, out_features=4, bias=False)
  (fc2): Linear(in_features=4, out_features=2, bias=False)
  (fc3): Linear(in_features=2, out_features=1, bias=False)
)

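## The network weights

PyTorch initializes the weights randomly, so the values shown below change each time the networks are re-created.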

In [6]:
# Print the name, size and current values of every parameter of the simplified network
for name, param in net.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param} \n")

Layer: fc1.weight | Size: torch.Size([4, 3]) | Values : Parameter containing:
tensor([[ 0.2369,  0.4875, -0.0785],
        [-0.1420,  0.5180, -0.4875],
        [-0.0190, -0.2783,  0.3500],
        [ 0.2950, -0.3992,  0.5065]], requires_grad=True) 

Layer: fc2.weight | Size: torch.Size([2, 4]) | Values : Parameter containing:
tensor([[-0.0172, -0.1255,  0.3816,  0.4249],
        [-0.0027, -0.1248,  0.4886,  0.1373]], requires_grad=True) 

Layer: fc3.weight | Size: torch.Size([1, 2]) | Values : Parameter containing:
tensor([[-0.2785,  0.4220]], requires_grad=True) 



In [7]:
net.fc1.weight

Parameter containing:
tensor([[ 0.2369,  0.4875, -0.0785],
        [-0.1420,  0.5180, -0.4875],
        [-0.0190, -0.2783,  0.3500],
        [ 0.2950, -0.3992,  0.5065]], requires_grad=True)

## The dataset

An input vector. We create a row vector to ease the computation.

In [8]:
x = torch.tensor([[1.0, 2.0, 3.0]])

The output with the simplified network

In [9]:
y_pred = net.forward(x)
y_pred

tensor([[0.4999]], grad_fn=<SigmoidBackward0>)

A dataset

In [10]:
X = torch.tensor([[1.0, 2.0, 3.0],
                  [2.0, 3.0, 4.0],
                  [3.0, 4.0, 5.0],
                  [4.0, 5.0, 6.0]])

In [11]:
y_v = torch.tensor([[1.0, 0.0, 1.0, 0.0]]).T
y_v

tensor([[1.],
        [0.],
        [1.],
        [0.]])

The output with the book network

In [12]:
# Keep the book network's predictions under their own name: `y_v_pred` is
# reserved for the simplified network's predictions, which the loss cells use.
# Calling the module (rather than .forward) is the recommended PyTorch idiom.
y_v_pred_book = net_book(X)
y_v_pred_book

tensor([[0.4899],
        [0.4848],
        [0.4798],
        [0.4747]], grad_fn=<SigmoidBackward0>)

With the simplified network

In [13]:
# Predictions of the simplified network for the whole dataset.
# Calling the module (rather than .forward) is the recommended PyTorch idiom.
y_v_pred = net(X)
y_v_pred

tensor([[0.4999],
        [0.4962],
        [0.4926],
        [0.4889]], grad_fn=<SigmoidBackward0>)

## Examining the Forward Pass

### First as a matrix product

We compute the transpose of $\mathbf{x}$ to show the importance of the format

In [14]:
x.T

tensor([[1.],
        [2.],
        [3.]])

The matrix product accepts both one-dimensional vectors and column vectors.

In [15]:
x

tensor([[1., 2., 3.]])

In [16]:
net.fc1.weight@x[0]

tensor([ 0.9764, -0.5687,  0.4745,  1.0161], grad_fn=<MvBackward0>)

In [17]:
net.fc1.weight@x.T

tensor([[ 0.9764],
        [-0.5687],
        [ 0.4745],
        [ 1.0161]], grad_fn=<MmBackward0>)

Going through the whole network

In [18]:
net.fc3.weight@net.fc2.weight@net.fc1.weight@x[0]

tensor([-0.0003], grad_fn=<MvBackward0>)

or

In [19]:
net.fc3.weight@net.fc2.weight@net.fc1.weight@x.T

tensor([[-0.0003]], grad_fn=<MmBackward0>)

### As a PyTorch function
Here we need to have observations arranged by rows

In [20]:
net.forward(X)

tensor([[0.4999],
        [0.4962],
        [0.4926],
        [0.4889]], grad_fn=<SigmoidBackward0>)

In [21]:
net.forward(x)

tensor([[0.4999]], grad_fn=<SigmoidBackward0>)

The format matters: the first axis is the batch axis, so the classical matrix-vector product with a column vector fails.

In [22]:
# x.T is a column of shape (3, 1), but fc1 expects 3 features on the last axis,
# so this call raises. Catch and display the error so that "Run All" keeps going.
try:
    net.forward(x.T)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x1 and 3x4)

## The Loss

### The simplified network

In [23]:
loss_fn = nn.BCELoss()

Computing the loss: $- y\ln\hat{y} - (1-y)\ln(1 -\hat{y})$

For the dataset, per sample

In [24]:
-y_v * torch.log(y_v_pred) - (1 - y_v) * torch.log(1 - y_v_pred)

tensor([[0.6933],
        [0.6857],
        [0.7081],
        [0.6712]], grad_fn=<SubBackward0>)

For the whole dataset

In [25]:
torch.sum(-y_v * torch.log(y_v_pred) - (1 - y_v) * torch.log(1 - y_v_pred))/len(y_v)

tensor(0.6896, grad_fn=<DivBackward0>)

We use PyTorch to compute it. For the dataset

In [26]:
net.forward(X)

tensor([[0.4999],
        [0.4962],
        [0.4926],
        [0.4889]], grad_fn=<SigmoidBackward0>)

In [27]:
loss_fn(net.forward(X), y_v)

tensor(0.6896, grad_fn=<BinaryCrossEntropyBackward0>)

For one observation

In [28]:
net.forward(x)

tensor([[0.4999]], grad_fn=<SigmoidBackward0>)

In [29]:
# Loss for the single observation x and its target y_v[:1]; this is the value
# that is backpropagated later. Calling the module (rather than .forward) is
# the recommended PyTorch idiom.
loss = loss_fn(net(x), y_v[:1])
loss

tensor(0.6933, grad_fn=<BinaryCrossEntropyBackward0>)

In [30]:
loss.item()

0.6933102607727051

In [31]:
# .detach() returns the value without its autograd history; it is the
# recommended replacement for the legacy .data attribute.
loss.detach()

tensor(0.6933)

### The book network

In [32]:
loss_book_fn = nn.BCELoss()

In [33]:
net_book(X)

tensor([[0.4899],
        [0.4848],
        [0.4798],
        [0.4747]], grad_fn=<SigmoidBackward0>)

In [34]:
loss_book = loss_book_fn(net_book(X), y_v)
loss_book

tensor(0.6888, grad_fn=<BinaryCrossEntropyBackward0>)

## Going backward

Now, we backpropagate the gradients. We use just one observation, the loss computed above for $\mathbf{x}$ with the simplified network, so that we can check the backpropagated gradient values by hand.

In [35]:
loss.backward()

In [36]:
loss

tensor(0.6933, grad_fn=<BinaryCrossEntropyBackward0>)

## The gradients

Obtained from PyTorch with respect to the weights

In [37]:
net.fc1.weight.grad

tensor([[-0.0018, -0.0037, -0.0055],
        [ 0.0089,  0.0177,  0.0266],
        [-0.0500, -0.0999, -0.1499],
        [ 0.0302,  0.0604,  0.0906]])

In [38]:
net.fc2.weight.grad

tensor([[ 0.1360, -0.0792,  0.0661,  0.1415],
        [-0.2061,  0.1200, -0.1001, -0.2144]])

In [39]:
net.fc3.weight.grad

tensor([[-0.3337, -0.2199]])

## Gradients computed by hand with respect to the inputs and hidden layers

### Layer 3
The gradient of the loss with respect to the third and last layer: $\mathbf{z}^{(3)}$: $\nabla_{\mathbf{z}^{(3)}}Loss(y, \hat{y})$.

A few useful identities:
$$
\begin{array}{lcl}
Loss(y, \hat{y}) & = &- y\ln\hat{y} - (1-y)\ln(1 -\hat{y})\\\\
\nabla_{\hat{y}} & = & -\frac{y}{\hat{y}} + \frac{1 - y}{1 - \hat{y}}\\
& = & -\frac{y - \hat{y}}{\hat{y}(1 -\hat{y})}\\\\
lr'& =& \frac{e^{-z^{(3)}}}{(1 + e^{-z^{(3)}})^2}\\
& =& \frac{1}{1 + e^{-z^{(3)}}}\left(1 - \frac{1}{1 + e^{-z^{(3)}}}\right)\\
& =&\hat{y}(1 - \hat{y})\\\\
\delta_3 &=& \nabla_{\hat{y}} \cdot lr'\\
 &=& \hat{y} - y
\end{array}
$$

In [40]:
# net.h3 is the output cached by the last forward pass of `net`, which was run on x;
# y_v[:1] is the matching target, so delta3 is y_hat - y for this single observation.
delta3 = net.h3 - y_v[:1]
delta3

tensor([[-0.5001]], grad_fn=<SubBackward0>)

This would update $\mathbf{z}^{(3)}$

In [41]:
net.z3

tensor([[-0.0003]], grad_fn=<MmBackward0>)

### Layer 2

The gradient of the loss with respect to layer 2: $\mathbf{z}^{(2)}$: $\nabla_{\mathbf{z}^{(2)}}Loss(y, \hat{y})$

$$(\hat{y} - y)\mathbf{W}^{(3)}$$

In [42]:
delta2 = delta3@net.fc3.weight
delta2

tensor([[ 0.1393, -0.2110]], grad_fn=<MmBackward0>)

This would update $\mathbf{z}^{(2)}$

In [43]:
net.z2

tensor([[0.6674, 0.4397]], grad_fn=<MmBackward0>)

### Layer 1

The gradient of the loss with respect to layer 1: $\mathbf{z}^{(1)}$: $\nabla_{\mathbf{z}^{(1)}}Loss(y, \hat{y})$
$$(\hat{y} - y) \mathbf{W}^{(3)} \mathbf{W}^{(2)}$$

In [44]:
delta1 = delta2@net.fc2.weight
delta1

tensor([[-0.0018,  0.0089, -0.0500,  0.0302]], grad_fn=<MmBackward0>)

This would update $\mathbf{z}^{(1)}$

In [45]:
net.z1

tensor([[ 0.9764, -0.5687,  0.4745,  1.0161]], grad_fn=<MmBackward0>)

### Input Layer

The gradient of the loss with respect to the input layer: $\mathbf{x}$: $\nabla_{\mathbf{x}}Loss(y, \hat{y})$

$$(\hat{y} - y) \mathbf{W}^{(3)} \mathbf{W}^{(2)} \mathbf{W}^{(1)}$$



In [46]:
delta0 = delta1@net.fc1.weight
delta0

tensor([[ 0.0082,  0.0055, -0.0064]], grad_fn=<MmBackward0>)

We check the value

In [47]:
# `*` and `@` have the same precedence and group left to right: the error term
# (y_hat - y) first scales fc3.weight, and the result is then multiplied by fc2 and fc1.
(torch.sigmoid(net.fc3.weight@net.fc2.weight@net.fc1.weight@x.T) - y_v[0])*net.fc3.weight@net.fc2.weight@net.fc1.weight

tensor([[ 0.0082,  0.0055, -0.0064]], grad_fn=<MmBackward0>)

This would update $\mathbf{x}$

In [48]:
x

tensor([[1., 2., 3.]])

## Checking the forward pass, once again

As $\mathbf{x}$ is a row vector, we transpose it to compute the product by hand

In [49]:
x.T

tensor([[1.],
        [2.],
        [3.]])

In [50]:
net.fc3.weight@net.fc2.weight@net.fc1.weight@x.T

tensor([[-0.0003]], grad_fn=<MmBackward0>)

In [51]:
net.z3

tensor([[-0.0003]], grad_fn=<MmBackward0>)

In [52]:
torch.sigmoid(net.z3)

tensor([[0.4999]], grad_fn=<SigmoidBackward0>)

In [53]:
net.h3

tensor([[0.4999]], grad_fn=<SigmoidBackward0>)

We check the value. Note that we do not transpose $\mathbf{x}$.

In [54]:
net.forward(x)

tensor([[0.4999]], grad_fn=<SigmoidBackward0>)

## Computing the gradient with respect to the weights

### One step backward

$$
\begin{array}{lcl}
\mathbf{\nabla}_{\mathbf{W}^{(3)}} Loss(y, \hat{y}) &= & (f^{(3)}(\mathbf{W}^{(3)} \mathbf{W}^{(2)} \mathbf{W}^{(1)} \mathbf{x}) - y) (\mathbf{W}^{(2)} \mathbf{W}^{(1)} \mathbf{x})^{\intercal}\\
&=&\mathbf{\delta}_3 {\mathbf{z}_2}^{\intercal}
\end{array}
$$

In the computation, we do not need to transpose $\mathbf{W}^{(2)} \mathbf{W}^{(1)} \mathbf{x}$, as PyTorch already stores it in `net.z2` as a row vector.

The derivative of the loss

In [55]:
delta3

tensor([[-0.5001]], grad_fn=<SubBackward0>)

In [56]:
net.z2

tensor([[0.6674, 0.4397]], grad_fn=<MmBackward0>)

Now we check the equality with the gradient

In [57]:
delta3 @ net.z2

tensor([[-0.3337, -0.2199]], grad_fn=<MmBackward0>)

In [58]:
net.fc3.weight.grad

tensor([[-0.3337, -0.2199]])

### Two steps backward

$$
\begin{array}{lcl}
\mathbf{\nabla}_{\mathbf{W}^{(2)}} Loss(y, \hat{y}) & = & (f^{(3)}(\mathbf{W}^{(3)} \mathbf{W}^{(2)} \mathbf{W}^{(1)} \mathbf{x}) - y)\mathbf{W}^{(3)} ( \mathbf{W}^{(1)} \mathbf{x})^{\intercal}\\
& = & \mathbf{\delta}_2 {\mathbf{z}_1}^\intercal
\end{array}
$$

In this formula, the gradient with respect to the hidden layer, $\delta_2$, must be a column vector and the hidden layer output, $\mathbf{z}_1$, a row vector. We adjust them with some transpose operations.

We transpose $\delta_2$ so that it is a column

In [59]:
delta2.T

tensor([[ 0.1393],
        [-0.2110]], grad_fn=<PermuteBackward0>)

And we do not transpose $\mathbf{z}_1$ as it is already a row.

In [63]:
net.z1

tensor([[ 0.9764, -0.5687,  0.4745,  1.0161]], grad_fn=<MmBackward0>)

We check the equality with the gradient

In [64]:
delta2.T@net.z1

tensor([[ 0.1360, -0.0792,  0.0661,  0.1415],
        [-0.2061,  0.1200, -0.1001, -0.2144]], grad_fn=<MmBackward0>)

In [65]:
net.fc2.weight.grad

tensor([[ 0.1360, -0.0792,  0.0661,  0.1415],
        [-0.2061,  0.1200, -0.1001, -0.2144]])

### Three steps backward

$$
\begin{array}{lcl}
\mathbf{\nabla}_{\mathbf{W}^{(1)}} Loss(y, \hat{y}) & = & (f^{(3)}(\mathbf{W}^{(3)} \mathbf{W}^{(2)} \mathbf{W}^{(1)} \mathbf{x}) - y)\mathbf{W}^{(3)} \mathbf{W}^{(2)} (  \mathbf{x})^{\intercal}\\
& = & \mathbf{\delta}_1 \mathbf{x}^\intercal
\end{array}
$$

In [66]:
delta1.T

tensor([[-0.0018],
        [ 0.0089],
        [-0.0500],
        [ 0.0302]], grad_fn=<PermuteBackward0>)

In [67]:
x

tensor([[1., 2., 3.]])

In [68]:
delta1.T@x

tensor([[-0.0018, -0.0037, -0.0055],
        [ 0.0089,  0.0177,  0.0266],
        [-0.0500, -0.0999, -0.1499],
        [ 0.0302,  0.0604,  0.0906]], grad_fn=<MmBackward0>)

In [69]:
net.fc1.weight.grad

tensor([[-0.0018, -0.0037, -0.0055],
        [ 0.0089,  0.0177,  0.0266],
        [-0.0500, -0.0999, -0.1499],
        [ 0.0302,  0.0604,  0.0906]])

We have the same gradients!

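## Test

A quick check that a column vector multiplied by a row vector yields their outer product (a matrix).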

In [70]:
a = torch.tensor([[1.0, 2.0, 3.0]]).T
a

tensor([[1.],
        [2.],
        [3.]])

In [71]:
b  = torch.tensor([[2.0, 3.0, 4.0]])
b

tensor([[2., 3., 4.]])

In [72]:
a @ b

tensor([[ 2.,  3.,  4.],
        [ 4.,  6.,  8.],
        [ 6.,  9., 12.]])