In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## The forward and backward passes

In [2]:
#export
from exp.nb_01 import *

def get_data():
    """Download the gzipped MNIST pickle and return its train/valid splits.

    Returns:
        A lazy `map` iterator that yields, in order, `x_train`, `y_train`,
        `x_valid`, `y_valid`, each passed through `tensor`. The third entry
        stored in the pickle is discarded.
    """
    archive_path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(archive_path, 'rb') as fh:
        # NOTE(review): pickle.load can execute arbitrary code; only use this
        # with an archive from a trusted source.
        train_split, valid_split, _ = pickle.load(fh, encoding='latin-1')
    x_train, y_train = train_split
    x_valid, y_valid = valid_split
    return map(tensor, (x_train, y_train, x_valid, y_valid))

def normalize(x, m, s):
    """Standardize `x`: subtract the mean `m`, then divide by the std `s`."""
    centered = x - m
    return centered / s

In [3]:
x_train,y_train,x_valid,y_valid = get_data()

In [4]:
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(0.1304), tensor(0.3073))

In [5]:
x_train = normalize(x_train, train_mean, train_std)
# NB: Use training, not validation mean for validation set
x_valid = normalize(x_valid, train_mean, train_std)

In [6]:
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(-9.3644e-07), tensor(1.))

In [7]:
#export
def test_near_zero(a, tol=1e-3): 
    assert a.abs() < tol 
    print(f"Near zero: {a}")

In [8]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

Near zero: -9.364412107970566e-07
Near zero: 0.0


In [9]:
# training set has 50000 images, every image is 28x28=784 pixels, labels are the digits 0 to 9 (10 classes)
n, m = x_train.shape
c = y_train.max() + 1
n, m, c

(50000, 784, tensor(10))

## Foundations version

### Basic architecture

In [10]:
# num hidden. so input layer=784, hidden layer=50, output layer=1
nh = 50

In [11]:
# simplified kaiming init / he init: scale standard-normal draws by 1/sqrt(fan_in)
# (fan_in = number of inputs feeding the layer), so that with roughly
# unit-variance inputs the linear layer's outputs also have std close to 1.
# NOTE(review): no torch.manual_seed call is visible in this notebook, so the
# values drawn here presumably differ from run to run -- confirm.
w1 = torch.randn(m,nh)/math.sqrt(m)  # layer-1 weights, created as (m, nh)
b1 = torch.zeros(nh)  # layer-1 bias starts at zero
w2 = torch.randn(nh,1)/math.sqrt(nh)  # layer-2 weights, created as (nh, 1)
b2 = torch.zeros(1)  # layer-2 bias starts at zero

In [12]:
# torch.randn(m, n) gives an m x n matrix of samples drawn with mean 0 and standard deviation 1

In [13]:
w1.shape, w1[0]
b1.shape, b1
w2.shape, w2[:5]
b2.shape, b2

(torch.Size([784, 50]),
 tensor([ 0.0253,  0.0170,  0.0105,  0.0187,  0.0569,  0.0011,  0.0092,  0.0129,
         -0.0029, -0.0409, -0.0534,  0.0125,  0.0562,  0.0256,  0.0336, -0.0440,
         -0.0107, -0.0218,  0.0119, -0.0160, -0.0483, -0.0029, -0.0078, -0.0485,
          0.0270, -0.0603,  0.0510, -0.0326,  0.0234, -0.0461, -0.0156, -0.0357,
          0.0077, -0.0252, -0.0508, -0.0418,  0.0008, -0.0621,  0.0015, -0.0278,
         -0.0282,  0.0300,  0.0224, -0.0311,  0.0218, -0.1446, -0.0224,  0.0326,
         -0.0193, -0.0085]))

(torch.Size([50]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]))

(torch.Size([50, 1]), tensor([[-0.1539],
         [-0.1221],
         [ 0.0577],
         [ 0.0694],
         [-0.0139]]))

(torch.Size([1]), tensor([0.]))

### Why do we want the weights to have mean 0 and standard deviation $\frac{1}{\sqrt{m}}$? Because we used (simplified) Kaiming initialization: dividing the random numbers by the square root of $m$.
In fact, we have an activation function as well, so what we really want is for the std of the activations to be about 1. For a ReLU layer that means initializing with standard deviation $\sqrt{\frac{2}{m}}$.

In [14]:
test_near_zero(w1.mean())
test_near_zero(w1.std()-1/math.sqrt(m))


Near zero: 5.4643245675833896e-05
Near zero: -0.00024520978331565857


In [15]:
# This should be ~ (0,1) (mean,std)... because we normalized x_valid with train_mean and train_std
x_valid.mean(),x_valid.std()

(tensor(-0.0059), tensor(0.9924))

In [16]:
def lin(x, w, b):
    """Affine layer: matrix-multiply `x` by `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [17]:
x_valid.shape

t = lin(x_valid, w1, b1)

t.shape

torch.Size([10000, 784])

torch.Size([10000, 50])

In [18]:
#...so should this, because we used kaiming init, which is designed to do this
t.mean(),t.std()

(tensor(0.0171), tensor(0.9935))

In [19]:
def relu(x):
    """Rectified linear unit: clamp every element of `x` to be at least 0."""
    floor = 0.
    return x.clamp_min(floor)

In [20]:
t = relu(lin(x_valid, w1, b1))

In [21]:
#...actually it really should be this! After ReLU all negative values are replaced by 0, so the mean is no longer 0 and the std is roughly halved.
t.mean(),t.std()

(tensor(0.3981), tensor(0.5880))

From pytorch docs: `a: the negative slope of the rectifier used after this layer (0 for ReLU by default)`

$$\text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan\_in}}}$$

This was introduced in the paper that described the Imagenet-winning approach from *He et al*: [Delving Deep into Rectifiers](https://arxiv.org/abs/1502.01852), which was also the first paper that claimed "super-human performance" on Imagenet (the same authors introduced resnets in a separate follow-up paper, [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)).

In [22]:
# kaiming init / he init for relu
w1 = torch.randn(m,nh)*math.sqrt(2/m)

In [23]:
# std of w1 is now sqrt(2/m) ~= 0.0505 instead of the earlier 1/sqrt(m) ~= 0.0357
w1.mean(),w1.std()

(tensor(-0.0001), tensor(0.0504))

In [24]:
t = relu(lin(x_valid, w1, b1))
t.mean(),t.std()

(tensor(0.6028), tensor(0.8429))

In [25]:
#export
from torch.nn import init

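`mode='fan_out'` preserves the magnitude of the variance in the backward pass (`'fan_in'`, the default, preserves it in the forward pass). We pass `'fan_out'` here because our `w1` is stored as `(m, nh)`, the transpose of the `(nh, m)` layout used by `nn.Linear`, so what PyTorch counts as fan-out for this tensor is our number of inputs `m`.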

In [26]:
w1 = torch.zeros(m,nh)
init.kaiming_normal_(w1, mode='fan_out')
t = relu(lin(x_valid, w1, b1))

tensor([[ 0.0841, -0.0757,  0.0372,  ..., -0.0235,  0.0200,  0.0039],
        [ 0.0251,  0.0842,  0.0565,  ..., -0.0455, -0.0196, -0.0092],
        [-0.0546, -0.0227, -0.0484,  ...,  0.0280,  0.0124,  0.0271],
        ...,
        [-0.0276, -0.0668, -0.0096,  ...,  0.0115,  0.0279, -0.0433],
        [-0.0592,  0.0216,  0.0495,  ...,  0.0600, -0.0462, -0.0582],
        [ 0.0022, -0.0403,  0.0101,  ..., -0.0109, -0.0835, -0.0089]])

In [27]:
init.kaiming_normal_??

$   \text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan\_in}}}$

Also known as He initialization.

In [28]:
w1.mean(),w1.std()

(tensor(-0.0002), tensor(0.0507))

In [29]:
t.shape, t.mean(),t.std()

(torch.Size([10000, 50]), tensor(0.6473), tensor(0.8692))

In [30]:
import torch.nn

In [31]:
torch.nn.Linear(m,nh).weight.shape
torch.nn.Linear(m,nh).weight

torch.Size([50, 784])

Parameter containing:
tensor([[-0.0299,  0.0229, -0.0029,  ..., -0.0112,  0.0027, -0.0341],
        [ 0.0230, -0.0118,  0.0102,  ...,  0.0350, -0.0088,  0.0315],
        [-0.0318, -0.0131, -0.0161,  ..., -0.0303, -0.0017, -0.0127],
        ...,
        [ 0.0005, -0.0251,  0.0146,  ...,  0.0030, -0.0246,  0.0308],
        [ 0.0059, -0.0092, -0.0206,  ...,  0.0073, -0.0171, -0.0215],
        [ 0.0339,  0.0340,  0.0113,  ...,  0.0161,  0.0134,  0.0042]],
       requires_grad=True)

In [None]:
torch.nn.Linear.forward??

In [None]:
torch.nn.functional.linear??

In [None]:
torch.nn.Conv2d??

In [None]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [32]:
# what if...?
def relu(x):
    """Shifted ReLU: clamp `x` at 0 from below, then subtract 0.5.

    This redefinition replaces the plain `relu` for every cell that runs after it.
    """
    rectified = x.clamp_min(0.)
    return rectified - 0.5

In [33]:
# kaiming init / he init for relu
w1 = torch.randn(m,nh)*math.sqrt(2./m )
t1 = relu(lin(x_valid, w1, b1))
t1.mean(),t1.std()

(tensor(0.0389), tensor(0.8003))

In [34]:
w1.mean(), w1.std()

(tensor(7.5220e-05), tensor(0.0504))

In [35]:
def model(xb):
    """Forward pass of the two-layer network: linear -> relu -> linear.

    `xb` is the input matrix (e.g. num_images x pixels). The parameters
    `w1`, `b1`, `w2`, `b2` are read from notebook-level variables.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [36]:
model(x_valid).shape

torch.Size([10000, 1])

In [37]:
%timeit -n 10 _ = model(x_valid)

2.13 ms ± 205 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [38]:
assert model(x_valid).shape == torch.Size([x_valid.shape[0],1])

In [179]:
# the above all makes sense

### Loss function: MSE

In [39]:
model(x_valid).shape

torch.Size([10000, 1])

We need `squeeze()` to get rid of that trailing (,1), in order to use `mse`. (Of course, `mse` is not a suitable loss function for multi-class classification; we'll use a better loss function soon. We'll use `mse` for now to keep things simple.)

In [40]:
#export
def mse(output, targ):
    """Mean squared error between `output` and `targ`.

    A trailing size-1 axis on `output` is squeezed away before subtracting.
    """
    residual = output.squeeze(-1) - targ
    return residual.pow(2).mean()

In [41]:
y_train, y_valid = y_train.float(), y_valid.float()

In [50]:
# prediction is the output layer, not the mse
preds = model(x_train)

In [43]:
preds.shape

torch.Size([50000, 1])

In [47]:
preds.shape
preds[:2]
preds.squeeze(-1).shape
preds.squeeze(-1)[:2]

torch.Size([50000, 1])

tensor([[0.4767],
        [0.0870]])

torch.Size([50000])

tensor([0.4767, 0.0870])

In [48]:
mse(preds, y_train)

tensor(29.8744)

In [52]:
preds.shape, preds
y_train

(torch.Size([50000, 1]), tensor([[ 0.4767],
         [ 0.0870],
         [-0.2929],
         ...,
         [ 0.2838],
         [-1.0049],
         [-0.0289]]))

tensor([5., 0., 4.,  ..., 8., 4., 8.])

### Gradients and backward pass

$$\operatorname{MSE}=\frac{1}{n}\sum_{i=1}^n(Y_i-\hat{Y_i})^2$$

In [54]:
def mse_grad(inp, targ):
    """Store the gradient of the MSE loss w.r.t. the predictions in `inp.g`.

    For each prediction the gradient is 2 * (prediction - target) / n, where
    n is `inp.shape[0]`. Nothing is returned; the result is attached to `inp`
    as the attribute `g` and then printed together with the inputs.
    """
    n_samples = inp.shape[0]
    residual = inp.squeeze() - targ
    # unsqueeze(-1) appends a trailing axis to the squeezed residual
    inp.g = 2. * residual.unsqueeze(-1) / n_samples
    print('mse grad', inp.g.shape, inp.g, inp.shape, inp, targ.shape, targ)

#### Nothing magical here: the gradient is just the prediction error times 2, divided by the total number of predictions. There is one gradient per prediction, so `mse_grad` stores 50000 gradients (in `inp.g`).

In [59]:
mse_grad(preds, y_train)
# the gradients for the first two predictions are: 
2 * (0.4767 - 5)/50000
2 * (0.0870 - 0)/50000

mse grad torch.Size([50000, 1]) tensor([[-1.8093e-04],
        [ 3.4793e-06],
        [-1.7172e-04],
        ...,
        [-3.0865e-04],
        [-2.0020e-04],
        [-3.2116e-04]]) torch.Size([50000, 1]) tensor([[ 0.4767],
        [ 0.0870],
        [-0.2929],
        ...,
        [ 0.2838],
        [-1.0049],
        [-0.0289]]) torch.Size([50000]) tensor([5., 0., 4.,  ..., 8., 4., 8.])


-0.000180932

3.4799999999999997e-06

In [71]:
# the gradient is stored in the g attribute
preds.g

tensor([[-1.8093e-04],
        [ 3.4793e-06],
        [-1.7172e-04],
        ...,
        [-3.0865e-04],
        [-2.0020e-04],
        [-3.2116e-04]])

In [182]:
def relu_grad(inp, out):
    """Store the gradient of relu w.r.t. its input activations in `inp.g`.

    `out.g` is the gradient arriving from the following layer. It is passed
    through unchanged wherever the input was positive and replaced by 0
    everywhere else.
    """
    positive_mask = (inp > 0).float()
    inp.g = positive_mask * out.g

In [None]:
relu_grad

In [144]:
def lin_grad(inp, out, w, b):
    """Backward pass of `out = inp @ w + b`, given the upstream gradient `out.g`.

    Results are attached as `.g` attributes on `inp`, `w` and `b`.
    """
    grad_out = out.g
    b.g = grad_out.sum(0)
    # Per-sample outer product of input and upstream gradient, summed over axis 0.
    per_sample = inp.unsqueeze(-1) * grad_out.unsqueeze(1)
    w.g = per_sample.sum(0)
    inp.g = grad_out @ w.t()

In [155]:
def lin_grad(inp, out, w, b):
    """Backpropagate through the linear layer `out = inp @ w + b`.

    Think of it as a step between two layers: `out.g` holds the gradient
    coming from the next layer (for the last linear layer, that is the
    prediction gradient computed by `mse_grad`). The function stores:

    * `inp.g` -- gradient w.r.t. the layer input,
    * `w.g`   -- gradient w.r.t. the weights,
    * `b.g`   -- gradient w.r.t. the bias.
    """
    upstream = out.g

    # Input gradient: each sample's upstream gradient is sent back through the
    # weights, i.e. a weighted sum of the next layer's gradients.
    inp.g = upstream @ w.t()

    # Weight gradient: give the input a trailing axis and the upstream gradient
    # a middle axis, multiply element-wise (broadcasting), then sum over the
    # samples. Entry (i, j) is the sum over samples of input feature i times
    # the upstream gradient of output unit j.
    inp_col = inp.unsqueeze(-1)
    upstream_row = upstream.unsqueeze(1)
    w.g = (inp_col * upstream_row).sum(0)

    # Bias gradient: the upstream gradients summed over all samples.
    b.g = upstream.sum(0)

### How do we actually backpropagate the loss?
Assume input layer = 1000, hidden layer = 9, output layer = 3, training images = 20000.
* Weight gradient (linear layer): insert (`unsqueeze`) a trailing axis into the input, insert a second axis into the gradients of the next layer, multiply them together (broadcasting), and then sum over all the training examples. For the last two layers: (20000, 9, 1) × (20000, 1, 3) → (20000, 9, 3), summed over the first axis, so the weight gradient is (9, 3). For example, the gradient for the weight connecting the first hidden neuron to the first output is the activation of the first hidden neuron times the first output gradient, summed over all images.
* Bias gradient: the gradients of the next layer summed over all images.
* Previous-layer gradient: gradients of the next layer times the transposed weights, (20000, 3) × (3, 9) = (20000, 9); this is a weighted sum of the next layer's gradients.

In [190]:
def forward_and_backward(inp, targ):
    """Run one forward pass and one manual backward pass of the two-layer net.

    Parameters `w1`, `b1`, `w2`, `b2` are read from notebook-level variables.
    Gradients are not returned: each helper stores them as `.g` attributes on
    the tensors involved (`inp`, the intermediate activations, and the
    parameters). Debug information about the first layer is printed.
    """
    # forward pass
    pre_act = lin(inp, w1, b1)
    act = relu(pre_act)
    out = lin(act, w2, b2)

    # The loss value itself is never used by the backward pass below.
    loss = mse(out, targ)

    # backward pass: visit the layers in reverse order, each step consuming
    # the `.g` written by the step before it
    mse_grad(out, targ)
    lin_grad(act, out, w2, b2)
    relu_grad(pre_act, act)
    print('yyy', pre_act.shape, pre_act.g.shape, pre_act.g.unsqueeze(1).shape, pre_act.g[:2], pre_act.g.unsqueeze(1)[:2])
    print('YYY', inp.shape, inp.unsqueeze(-1).shape, inp.unsqueeze(-1)[:2])
    lin_grad(inp, pre_act, w1, b1)
    print('zzzz', w1.g.shape, pre_act.g)

In [191]:
forward_and_backward(x_train, y_train)

mse grad torch.Size([50000, 1]) tensor([[-1.8093e-04],
        [ 3.4793e-06],
        [-1.7172e-04],
        ...,
        [-3.0865e-04],
        [-2.0020e-04],
        [-3.2116e-04]]) torch.Size([50000, 1]) tensor([[ 0.4767],
        [ 0.0870],
        [-0.2929],
        ...,
        [ 0.2838],
        [-1.0049],
        [-0.0289]]) torch.Size([50000]) tensor([5., 0., 4.,  ..., 8., 4., 8.])
yyy torch.Size([50000, 50]) torch.Size([50000, 50]) torch.Size([50000, 1, 50]) tensor([[ 2.7842e-05,  0.0000e+00, -1.0446e-05, -0.0000e+00,  2.5195e-06,
          0.0000e+00,  2.0967e-06, -2.4934e-06, -4.9826e-06, -0.0000e+00,
         -1.2084e-05, -6.0460e-05, -2.6124e-05, -0.0000e+00, -1.4225e-05,
          8.4178e-06, -1.0181e-05,  0.0000e+00, -0.0000e+00,  3.8999e-06,
          0.0000e+00,  1.3938e-05,  5.5831e-06, -6.0686e-05,  3.1782e-05,
          2.2215e-05,  0.0000e+00,  2.3160e-05, -0.0000e+00, -0.0000e+00,
          0.0000e+00, -9.9402e-06,  0.0000e+00,  0.0000e+00,  2.0387e-05,
         

In [123]:
# element-wise multiplication with broadcasting
# The demo tensors carry a `bc_` prefix so this scratch cell does not rebind
# the notebook-level `c` (number of classes) computed near the top.
bc_a = torch.tensor([[1,2,3], [4,5,6]]).unsqueeze(-1)  # shape (2, 3, 1)
bc_b = torch.tensor([[5], [1]]).unsqueeze(1)  # shape (2, 1, 1)
# for this (2, 1) tensor, unsqueeze(1) and unsqueeze(-1) give the same (2, 1, 1) result
# NOTE(review): the original note said `a @ b` fails with a shape mismatch;
# (2, 3, 1) @ (2, 1, 1) looks shape-compatible for a batched matmul, so the
# failure was presumably dtype-related -- confirm.
bc_a.shape, bc_a
bc_b.shape, bc_b
bc_prod = bc_a * bc_b  # broadcasting: bc_b is expanded along axis 1 to match bc_a
bc_prod.shape, bc_prod
bc_prod.sum(0)   # sum over the first axis
bc_prod.sum(1)   # sum over the middle axis
bc_prod.sum(-1)  # sum over the trailing size-1 axis

tensor([[[1]],

        [[1]]], dtype=torch.uint8)

(torch.Size([2, 3, 1]), tensor([[[1],
          [2],
          [3]],
 
         [[4],
          [5],
          [6]]]))

(torch.Size([2, 1, 1]), tensor([[[5]],
 
         [[1]]]))

(torch.Size([2, 3, 1]), tensor([[[ 5],
          [10],
          [15]],
 
         [[ 4],
          [ 5],
          [ 6]]]))

tensor([[ 9],
        [15],
        [21]])

tensor([[30],
        [15]])

tensor([[ 5, 10, 15],
        [ 4,  5,  6]])

In [79]:
# calculate mse gradient 2/num_of_input_images * (activation of l2 which is the prediction - target which is the handwritten number)
2/50000*(-0.0531 - 5) # gradient from first image in x_train
2/50000*(0.1005 - 0)  # gradient from second image in x_train

-0.000202124

4.0200000000000005e-06

In [76]:
x_train.shape
x_train[:10]
y_train[:10]
forward_and_backward(x_train, y_train)

torch.Size([50000, 784])

tensor([[-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        ...,
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245]])

tensor([5., 0., 4., 1., 9., 2., 1., 3., 1., 4.])

mse grad torch.Size([50000, 1]) tensor([[-2.0213e-04],
        [ 4.0215e-06],
        [-1.8602e-04],
        ...,
        [-3.1210e-04],
        [-1.3147e-04],
        [-2.9066e-04]]) torch.Size([50000, 1]) tensor([[-0.0531],
        [ 0.1005],
        [-0.6504],
        ...,
        [ 0.1976],
        [ 0.7133],
        [ 0.7335]]) torch.Size([50000]) tensor([5., 0., 4.,  ..., 8., 4., 8.])
out is: torch.Size([50000, 1]) tensor([[-0.0531],
        [ 0.1005],
        [-0.6504],
        ...,
        [ 0.1976],
        [ 0.7133],
        [ 0.7335]])
w.g: torch.Size([50, 1]) tensor([[-4.6058],
        [ 1.1992],
        [-4.0051],
        [-0.3161],
        [-7.8630],
        [ 3.4794],
        [ 3.3173],
        [-4.2022],
        [-5.9368],
        [ 2.7553],
        [ 1.3928],
        [-1.4530],
        [-2.9946],
        [-3.2554],
        [-2.1126],
        [ 2.5994],
        [-3.0389],
        [ 1.0889],
        [ 1.6581],
        [ 1.7819],
        [ 3.6775],
        [ 3.2682],
    

In [158]:
w1.g, w1.shape

(tensor([[-0.2419, -0.1247,  0.1506,  ..., -0.0096, -0.1922,  0.0087],
         [-0.2419, -0.1247,  0.1506,  ..., -0.0096, -0.1922,  0.0087],
         [-0.2419, -0.1247,  0.1506,  ..., -0.0096, -0.1922,  0.0087],
         ...,
         [-0.2419, -0.1247,  0.1506,  ..., -0.0096, -0.1922,  0.0087],
         [-0.2419, -0.1247,  0.1506,  ..., -0.0096, -0.1922,  0.0087],
         [-0.2419, -0.1247,  0.1506,  ..., -0.0096, -0.1922,  0.0087]]),
 torch.Size([784, 50]))

In [165]:
# Save for testing against later
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig  = x_train.g.clone()

We cheat a little bit and use PyTorch autograd to check our results.

In [192]:
xt2 = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)

In [193]:
def forward(inp, targ):
    """Forward pass using the autograd-tracked parameter copies; returns the MSE loss.

    Reads `w12`, `b12`, `w22`, `b22` from notebook-level variables. The
    returned loss is what `.backward()` is later called on.
    """
    hidden = relu(inp @ w12 + b12)
    out = hidden @ w22 + b22
    return mse(out, targ)

In [194]:
loss = forward(xt2, y_train)

In [195]:
loss.backward()

In [196]:
test_near(w22.grad, w2g)
test_near(b22.grad, b2g)
test_near(w12.grad, w1g)
test_near(b12.grad, b1g)
test_near(xt2.grad, ig )

## Refactor model

### Layers as classes

In [197]:
class Relu():
    """Shifted ReLU layer (`max(x, 0) - 0.5`) that remembers its input and output."""

    def __call__(self, inp):
        # Keep both tensors so backward() can read the input and the output's `.g`.
        self.inp = inp
        rectified = inp.clamp_min(0.)
        self.out = rectified - 0.5
        return self.out

    def backward(self):
        """Store the input gradient in `self.inp.g`, using `self.out.g` as upstream."""
        gate = (self.inp > 0).float()
        self.inp.g = gate * self.out.g

In [198]:
class Lin():
    """Linear layer `inp @ w + b` with a hand-written backward pass."""

    def __init__(self, w, b):
        self.w, self.b = w, b

    def __call__(self, inp):
        # Remember input and output; backward() needs both.
        self.inp = inp
        weighted = inp @ self.w
        self.out = weighted + self.b
        return self.out

    def backward(self):
        """Store gradients in `self.inp.g`, `self.w.g` and `self.b.g`."""
        upstream = self.out.g
        self.inp.g = upstream @ self.w.t()
        # Building a giant per-sample outer product only to sum it is inefficient!
        per_sample = self.inp.unsqueeze(-1) * upstream.unsqueeze(1)
        self.w.g = per_sample.sum(0)
        self.b.g = upstream.sum(0)

In [199]:
class Mse():
    """Mean-squared-error loss that remembers its inputs for the backward pass."""

    def __call__(self, inp, targ):
        self.inp, self.targ = inp, targ
        residual = inp.squeeze() - targ
        self.out = residual.pow(2).mean()
        return self.out

    def backward(self):
        """Store d(loss)/d(inp) in `self.inp.g`."""
        n_samples = self.targ.shape[0]
        residual = self.inp.squeeze() - self.targ
        self.inp.g = 2. * residual.unsqueeze(-1) / n_samples

In [200]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss on top."""

    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        """Feed `x` through every layer in order and return the loss against `targ`."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation, targ)

    def backward(self):
        """Backpropagate: the loss first, then the layers from last to first."""
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [205]:
w1.g, b1.g, w2.g, b2.g = [None] * 4
model = Model(w1, b1, w2, b2)

In [206]:
w1.g

In [207]:
%time loss = model(x_train, y_train)

CPU times: user 465 ms, sys: 1.15 s, total: 1.61 s
Wall time: 22.8 ms


In [208]:
loss

tensor(29.8744)

In [209]:
%time model.backward()

CPU times: user 16.1 s, sys: 1min 24s, total: 1min 41s
Wall time: 2.66 s


In [214]:
model.layers

[<__main__.Lin at 0x7fc9d3ff94e0>,
 <__main__.Relu at 0x7fc9d3ff9e48>,
 <__main__.Lin at 0x7fc9d3ff9160>]

In [215]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### Module.forward()

In [218]:
class Module():
    """Minimal base class for layers with a hand-written backward pass.

    Calling an instance stores the call arguments in `self.args`, runs
    `forward(*args)`, stores the result in `self.out` and returns it.
    `backward()` then hands the stored output and arguments to `bwd`.
    Subclasses are expected to override `forward` and `bwd`.
    """

    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        # Accept any arguments so that a subclass which forgot to override
        # this method fails with the intended error rather than a TypeError
        # about the argument count.
        raise NotImplementedError('not implemented')

    def bwd(self, out, *args):
        # Same contract as `forward`: subclasses must provide the gradient logic.
        raise NotImplementedError('not implemented')

    def backward(self):
        """Run `bwd` with the output and arguments remembered from the last call."""
        self.bwd(self.out, *self.args)

In [219]:
class Relu(Module):
    """Shifted ReLU layer: `max(x, 0) - 0.5`."""

    def forward(self, inp):
        rectified = inp.clamp_min(0.)
        return rectified - 0.5

    def bwd(self, out, inp):
        # The upstream gradient passes through only where the input was positive.
        gate = (inp > 0).float()
        inp.g = gate * out.g

In [220]:
class Lin(Module):
    """Linear layer `inp @ w + b`; the weight gradient is computed with einsum."""

    def __init__(self, w, b):
        self.w, self.b = w, b

    def forward(self, inp):
        weighted = inp @ self.w
        return weighted + self.b

    def bwd(self, out, inp):
        upstream = out.g
        inp.g = upstream @ self.w.t()
        # "bi,bj->ij": multiply over the shared first index b and sum it away.
        self.w.g = torch.einsum("bi,bj->ij", inp, upstream)
        self.b.g = upstream.sum(0)

In [222]:
class Mse(Module):
    """Mean-squared-error loss built on the `Module` base class."""

    def forward(self, inp, targ):
        residual = inp.squeeze() - targ
        return residual.pow(2).mean()

    def bwd(self, out, inp, targ):
        n_samples = targ.shape[0]
        residual = inp.squeeze() - targ
        inp.g = 2 * residual.unsqueeze(-1) / n_samples

In [223]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss.

    Unlike the earlier version, the parameters `w1`, `b1`, `w2`, `b2` are
    taken from notebook-level variables at construction time.
    """

    def __init__(self):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        """Feed `x` through every layer in order and return the loss against `targ`."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation, targ)

    def backward(self):
        """Backpropagate: the loss first, then the layers from last to first."""
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [224]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [225]:
%time loss = model(x_train, y_train)

CPU times: user 614 ms, sys: 2.25 s, total: 2.86 s
Wall time: 42 ms


In [226]:
%time model.backward()

CPU times: user 4.19 s, sys: 1min 30s, total: 1min 34s
Wall time: 1.31 s


In [227]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### Without einsum

In [229]:
class Lin(Module):
    """Linear layer `inp @ w + b`; the weight gradient is a plain matrix product."""

    def __init__(self, w, b):
        self.w, self.b = w, b

    def forward(self, inp):
        weighted = inp @ self.w
        return weighted + self.b

    def bwd(self, out, inp):
        upstream = out.g
        inp.g = upstream @ self.w.t()
        # Transposed input times upstream gradient replaces the einsum version.
        self.w.g = inp.t() @ upstream
        self.b.g = upstream.sum(0)

In [230]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [231]:
%time loss = model(x_train, y_train)

CPU times: user 707 ms, sys: 2.91 s, total: 3.62 s
Wall time: 50.6 ms


In [232]:
%time model.backward()

CPU times: user 3.33 s, sys: 8.32 s, total: 11.6 s
Wall time: 163 ms


In [233]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### nn.Linear and nn.Module

In [235]:
#export
from torch import nn

In [236]:
class Model(nn.Module):
    """Two-layer fully connected network built from PyTorch layers.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.

    Calling the model with `(x, targ)` runs `x` through the layers and
    returns the `mse` loss against `targ`.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (rather than a plain Python list) registers the layers
        # as submodules, so their weights show up in parameters() and
        # state_dict() and follow .to() / .cuda().
        self.layers = nn.ModuleList([nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)])
        self.loss = mse

    def forward(self, x, targ):
        # Implementing forward() instead of overriding __call__ keeps
        # nn.Module's own __call__ machinery (e.g. hooks) working;
        # model(x, targ) is still the way to invoke it.
        for layer in self.layers:
            x = layer(x)
        return self.loss(x.squeeze(), targ)

In [237]:
model = Model(m, nh, 1)

In [238]:
%time loss = model(x_train, y_train)

CPU times: user 733 ms, sys: 3.73 s, total: 4.46 s
Wall time: 74.2 ms


In [239]:
%time loss.backward()

CPU times: user 3.54 s, sys: 1.89 s, total: 5.43 s
Wall time: 76.9 ms


## Export

In [240]:
!python notebook2script.py 02_fully_connected.ipynb

Converted 02_fully_connected.ipynb to exp/nb_02.py
