# (Self Study) The backpropagation algorithm

The notebook includes the following steps:
1. Generate data and compute gradients with PyTorch
1. Implement (dense linear) layers and activation functions (including local gradients)
1. Implement forward pass
1. Implement backward pass

In [1]:
import copy
import math

import numpy as np
import sklearn.metrics
import torch
import torch.nn

In [2]:
def t2a(x):
    """Convert a torch tensor to a NumPy array.

    The tensor is detached from the autograd graph and moved to host memory
    first, because ``Tensor.numpy()`` refuses tensors that require grad or
    that live on a non-CPU device. ``.cpu()`` is a no-op for CPU tensors.
    """
    return x.detach().cpu().numpy()

def test(arr, tens, eps=1e-6):
    """Assert that a NumPy value and a torch tensor agree element-wise.

    The largest absolute difference between ``arr`` and the NumPy view of
    ``tens`` must be strictly below ``eps``. Operands are combined with
    NumPy broadcasting, so shapes only need to be broadcast-compatible.
    """
    deviation = np.abs(arr - t2a(tens))
    assert np.max(deviation) < eps

## Data

In [3]:
# Seed NumPy's global RNG so the random data generated below is reproducible.
np.random.seed(501)

In [4]:
# Inputs: an (8, 2) array drawn uniformly from [-1, 1), plus a float32 torch
# copy of the same values for the PyTorch network.
x_in = np.random.uniform(low=-1, high=1, size=(8, 2))
xt_in = torch.tensor(x_in, dtype=torch.float32)

In [5]:
# Targets: an (8, 1) array of labels drawn from {0, 1}, plus a float32 torch
# copy (the PyTorch loss functions used below take floating-point targets).
y = np.random.choice([0, 1], size=(8, 1))
yt = torch.tensor(y, dtype=torch.float32)

In [6]:
# Sanity check: the input tensor has 8 rows and 2 columns.
xt_in.shape

torch.Size([8, 2])

In [7]:
# Sanity check: one target value per input row, kept as a column.
yt.shape

torch.Size([8, 1])

## PyTorch

### Set seeds for reproducibility

In [8]:
# Seed PyTorch's RNG and re-seed NumPy's before any network is built, so the
# randomly initialized parameters are the same on every run.
torch.manual_seed(501)
np.random.seed(501)

### Build net in PyTorch

In [9]:
class Net(torch.nn.Module):
    """Small fully connected net: 2 inputs -> 4 ReLU hidden units -> 1 output.

    The output is a raw score (logit); no sigmoid is applied inside the
    module.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in this order on purpose: each Linear draws its
        # initial parameters from PyTorch's RNG at construction time.
        self.h0 = torch.nn.Linear(in_features=2, out_features=4)
        self.h1 = torch.nn.Linear(in_features=4, out_features=1)

    def forward(self, xb):
        """Return the pre-sigmoid output for each row of ``xb``."""
        hidden = torch.nn.functional.relu(self.h0(xb))
        return self.h1(hidden)

In [10]:
# One reference net plus two independent deep copies: both loss variants
# below start from identical parameters but accumulate their own gradients.
raw_net = Net()
net1, net2 = copy.deepcopy(raw_net), copy.deepcopy(raw_net)

In [11]:
# Linear(2, 4) stores its weight as (out_features, in_features), i.e. (4, 2).
net1.h0.weight.shape

torch.Size([4, 2])

In [12]:
# Inspect the initial weight and bias of the first (hidden) layer.
net1.h0.weight, net1.h0.bias

(Parameter containing:
 tensor([[-0.5283, -0.5109],
         [-0.2843,  0.6360],
         [-0.6374,  0.5668],
         [-0.3771,  0.6981]], requires_grad=True),
 Parameter containing:
 tensor([-0.3359, -0.2572,  0.5822,  0.3144], requires_grad=True))

In [13]:
# Inspect the initial weight and bias of the second (output) layer.
net1.h1.weight, net1.h1.bias

(Parameter containing:
 tensor([[ 0.3399, -0.3500, -0.2764, -0.0239]], requires_grad=True),
 Parameter containing:
 tensor([0.4749], requires_grad=True))

### Forward pass

In [14]:
# Forward pass through net1: raw, pre-sigmoid scores for every input row.
logits1 = net1(xt_in)

In [15]:
# Squash the logits into (0, 1) so they can be read as probabilities.
probs1 = torch.sigmoid(logits1)

In [16]:
# Binary cross-entropy between the predicted probabilities and the targets
# (averaged over the batch under PyTorch's default reduction).
loss1 = torch.nn.functional.binary_cross_entropy(probs1, yt)
loss1

tensor(0.6287, grad_fn=<BinaryCrossEntropyBackward0>)

### Backward pass

In [17]:
# Backpropagate from the scalar loss; this fills .grad on net1's parameters.
loss1.backward()

In [18]:
# Gradient of the loss w.r.t. the hidden layer's weight (same layout as the weight).
net1.h0.weight.grad

tensor([[ 0.0140,  0.0153],
        [-0.0114,  0.0145],
        [ 0.0051,  0.0153],
        [ 0.0014,  0.0024]])

In [19]:
# Gradient of the loss w.r.t. the hidden layer's bias.
net1.h0.bias.grad

tensor([-0.0158,  0.0220,  0.0482,  0.0031])

In [20]:
# Gradient of the loss w.r.t. the output layer's weight.
net1.h1.weight.grad

tensor([[-0.0291, -0.0194, -0.1212, -0.0878]])

In [21]:
# Gradient of the loss w.r.t. the output layer's bias.
net1.h1.bias.grad

tensor([-0.1745])

### Using `binary_cross_entropy_with_logits` yields the same result

... and is actually preferred over `binary_cross_entropy`. Care to guess why?

In [22]:
# Forward pass through the second, identically initialized copy of the net.
logits2 = net2(xt_in)
# logits2 is an intermediate (non-leaf) tensor; ask autograd to keep its
# .grad after backward() so it can be inspected and compared as well.
logits2.retain_grad()
logits2

tensor([[ 0.1936],
        [ 0.5221],
        [ 0.3482],
        [ 0.3536],
        [ 0.3307],
        [-0.0119],
        [ 0.3099],
        [ 0.3991]], grad_fn=<AddmmBackward0>)

In [23]:
# No backward pass has been run for net2 yet, so there is no gradient to show.
logits2.grad

In [24]:
# Same loss, computed directly from the logits: the sigmoid is applied
# inside the loss function instead of as a separate step.
loss2 = torch.nn.functional.binary_cross_entropy_with_logits(logits2, yt)
# Keep .grad on the (non-leaf) loss tensor as well.
loss2.retain_grad()
loss2

tensor(0.6287, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [25]:
# backward() has not been called on loss2 yet, so nothing is displayed here.
loss2.grad

In [26]:
# Backpropagate through net2; fills .grad on its parameters.
loss2.backward()

In [27]:
# Side-by-side comparison: hidden-layer weight gradients from both loss variants.
net2.h0.weight.grad, net1.h0.weight.grad

(tensor([[ 0.0140,  0.0153],
         [-0.0114,  0.0145],
         [ 0.0051,  0.0153],
         [ 0.0014,  0.0024]]),
 tensor([[ 0.0140,  0.0153],
         [-0.0114,  0.0145],
         [ 0.0051,  0.0153],
         [ 0.0014,  0.0024]]))

In [28]:
# Side-by-side comparison: hidden-layer bias gradients from both loss variants.
net2.h0.bias.grad, net1.h0.bias.grad

(tensor([-0.0158,  0.0220,  0.0482,  0.0031]),
 tensor([-0.0158,  0.0220,  0.0482,  0.0031]))

In [29]:
# Side-by-side comparison: output-layer weight gradients from both loss variants.
net2.h1.weight.grad, net1.h1.weight.grad

(tensor([[-0.0291, -0.0194, -0.1212, -0.0878]]),
 tensor([[-0.0291, -0.0194, -0.1212, -0.0878]]))

In [30]:
# Side-by-side comparison: output-layer bias gradients from both loss variants.
net2.h1.bias.grad, net1.h1.bias.grad

(tensor([-0.1745]), tensor([-0.1745]))

## Manual implementation

<img src="https://raw.githubusercontent.com/sbstn-gbl/learning-from-big-data/master/source/_static/img/backpropagation.png" align="left" width="1000">

In [31]:
class LinearLayer:
    """Dense affine layer ``x.dot(W) + b`` with hand-written gradients.

    ``shape`` is ``(n_inputs, n_outputs)``: the weight matrix has exactly
    that shape and the bias has ``n_outputs`` entries. ``activation`` only
    selects the gain used for the random weight initialization.
    """

    def __init__(self, shape, activation="relu"):
        self.shape = shape
        fan_in = self.shape[0]

        # He (Kaiming) uniform initialization: a uniform distribution on
        # [-sqrt(3) * std, sqrt(3) * std] has standard deviation `std`.
        if activation == "relu":
            gain = math.sqrt(2)
        else:
            gain = 1
        std = gain / math.sqrt(fan_in)
        weight_limit = math.sqrt(3.0) * std
        self.weights = np.random.uniform(-weight_limit, weight_limit, self.shape)

        # The bias is drawn from a uniform range that depends only on fan-in.
        bias_limit = 1 / math.sqrt(fan_in)
        self.bias = np.random.uniform(-bias_limit, bias_limit, self.shape[1])

        # Cached by forward_pass; needed to compute the weight gradient.
        self.input = None
        # Filled in by backward_pass.
        self.grad_weights = None
        self.grad_bias = None

    def forward_pass(self, x, training=True):
        """Remember ``x`` and return the affine map of it.

        ``training`` is accepted but not used by this layer.
        """
        self.input = x
        projected = x.dot(self.weights)
        return projected + self.bias

    def backward_pass(self, cum_grad):
        """Store parameter gradients and return the gradient w.r.t. the input.

        ``cum_grad`` is the gradient of the loss w.r.t. this layer's output.
        The bias gradient keeps a leading axis of length 1 (``keepdims``).
        """
        # Gradient w.r.t. the input, taken through the current weights.
        # We only want to understand backpropagation here, so the weights
        # themselves are never updated.
        grad_input = cum_grad.dot(self.weights.T)
        self.grad_weights = self.input.T.dot(cum_grad)
        self.grad_bias = cum_grad.sum(axis=0, keepdims=True)
        return grad_input

In [32]:
class Sigmoid:
    """Logistic sigmoid activation with a hand-written local gradient."""

    def __init__(self):
        # Input cached by forward_pass; the local gradient is evaluated at it.
        self.input = None
        # Local gradient d(sigmoid)/d(input), stored by backward_pass.
        self.grad = None

    def forward_pass(self, x):
        """Remember ``x`` and return ``sigmoid(x)`` element-wise."""
        self.input = x
        return self._sigmoid(x)

    def backward_pass(self, cum_grad):
        """Chain rule: scale the incoming gradient by the local gradient."""
        self.grad = self.gradient(self.input)
        return cum_grad * self.grad

    def gradient(self, x):
        """Return the derivative ``sigmoid(x) * (1 - sigmoid(x))``."""
        # Evaluate the sigmoid once and reuse it for both factors.
        s = self._sigmoid(x)
        return s * (1 - s)

    def _sigmoid(self, x):
        """Numerically stable logistic function.

        Uses ``sigmoid(x) = exp(-log(1 + exp(-x)))``. ``np.logaddexp``
        evaluates ``log(exp(0) + exp(-x))`` without overflowing, whereas the
        textbook form ``1 / (1 + exp(-x))`` overflows in ``exp`` for large
        negative ``x``.
        """
        return np.exp(-np.logaddexp(0, -x))

In [33]:
class ReLU:
    """Rectified linear unit activation with a hand-written local gradient."""

    def __init__(self):
        # Input cached by forward_pass; the local gradient is evaluated at it.
        self.input = None
        # Local gradient (0/1 mask), stored by backward_pass.
        self.grad = None

    def forward_pass(self, x):
        """Remember ``x`` and return ``max(x, 0)`` element-wise."""
        self.input = x
        # np.maximum instead of the mask product `(x > 0) * x`: the product
        # turns -inf into NaN (0 * -inf) and negative inputs into -0.0.
        return np.maximum(x, 0)

    def backward_pass(self, cum_grad):
        """Chain rule: let the gradient pass only where the input was positive."""
        self.grad = self.gradient(self.input)
        return cum_grad * self.grad

    def gradient(self, x):
        """Return an integer mask: 1 where ``x > 0``, otherwise 0 (also at 0)."""
        return (x > 0).astype(int)

In [34]:
# Build the manual layers, then overwrite their random parameters with the
# PyTorch net's initialization so both implementations can be compared.
# torch stores Linear weights as (out, in) while LinearLayer uses (in, out),
# hence the transpose.
l0 = LinearLayer((2, 4))
l0.weights = t2a(raw_net.h0.weight).T
l0.bias = t2a(raw_net.h0.bias)

l1 = LinearLayer((4, 1))
l1.weights = t2a(raw_net.h1.weight).T
l1.bias = t2a(raw_net.h1.bias)

# Activations: ReLU after the hidden layer, sigmoid on the output.
a0, a1 = ReLU(), Sigmoid()

In [35]:
# Assemble the full network in forward order:
# linear -> ReLU -> linear -> sigmoid.
layers = [l0, a0, l1, a1]

### Forward pass

In [36]:
# Feed a copy of the inputs through every layer in order; the output of the
# final (sigmoid) layer is the predicted probability for each row.
out = x_in.copy()
for layer in layers:
    out = layer.forward_pass(out)
probs_np = out
# Binary cross-entropy (log loss) of the manual predictions.
loss_np = sklearn.metrics.log_loss(y_true=y, y_pred=probs_np)

In [37]:
# Manual probabilities must match PyTorch's to within 1e-6 (the helper's default).
test(probs_np, probs1)

In [38]:
# Manual loss must match PyTorch's to within 1e-6 (the helper's default).
test(loss_np, loss1)

### Backward pass

In [39]:
# Derivative of the binary cross-entropy w.r.t. each predicted probability,
# -y/p + (1-y)/(1-p), divided by the number of rows because the loss is a
# mean over the batch.
grad = (-(y / probs_np) + (1 - y) / (1 - probs_np)) / len(probs_np)
grad

array([[ 0.27670447],
       [-0.1991569 ],
       [-0.21324057],
       [-0.21276838],
       [-0.21479973],
       [-0.25149278],
       [-0.2166928 ],
       [ 0.31130692]])

In [40]:
# Walk the network back to front: each layer stores its own gradients and
# returns the gradient w.r.t. its input for the layer before it.
for layer in reversed(layers):
    grad = layer.backward_pass(grad)

In [41]:
# Manually computed gradient of the loss w.r.t. the output layer's weights.
l1.grad_weights

array([[-0.02906101],
       [-0.01942071],
       [-0.12120559],
       [-0.08780599]])

In [42]:
# PyTorch's value for the same gradient, transposed to the manual (in, out) layout.
net2.h1.weight.grad.T

tensor([[-0.0291],
        [-0.0194],
        [-0.1212],
        [-0.0878]])

In [43]:
# Output-layer weight gradient: manual vs. PyTorch (transposed to match layouts).
test(l1.grad_weights, net2.h1.weight.grad.T)

In [44]:
# Output-layer bias gradient: manual vs. PyTorch. The manual value keeps a
# leading axis of length 1; broadcasting aligns it with PyTorch's 1-D gradient.
test(l1.grad_bias, net2.h1.bias.grad)

In [45]:
# Hidden-layer weight gradient: manual vs. PyTorch (transposed to match layouts).
test(l0.grad_weights, net2.h0.weight.grad.T)

In [46]:
# Hidden-layer bias gradient: manual vs. PyTorch. The manual value keeps a
# leading axis of length 1; broadcasting aligns it with PyTorch's 1-D gradient.
test(l0.grad_bias, net2.h0.bias.grad)

<br>
<br>
<b>Learning from Big Data</b> <br>
Sebastian Gabel <br>