In [1]:
#export
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor
import torch.nn



MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'

In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [3]:
# Select GPU 0 only when CUDA is actually available, so the notebook still
# runs top-to-bottom on CPU-only machines (set_device raises without CUDA).
if torch.cuda.is_available():
    torch.cuda.set_device(0)


### Forward Pass

In [32]:
def get_data():
    """Download the pickled MNIST archive and return its train/valid splits as tensors.

    Returns a lazy `map` iterator yielding, in order, `x_train`, `y_train`,
    `x_valid`, `y_valid`, each passed through `torch.tensor`. The third entry
    of the pickle is discarded.
    """
    path = datasets.download_data(MNIST_URL, ext='.gz')
    # NOTE(review): pickle.load can execute arbitrary code from the file, and
    # MNIST_URL is plain http — only use with a source you trust.
    with gzip.open(path, 'rb') as f:
        train_split, valid_split, _ = pickle.load(f, encoding='latin-1')
    x_train, y_train = train_split
    x_valid, y_valid = valid_split
    return map(torch.tensor, (x_train, y_train, x_valid, y_valid))

def normalize(x, m, s):
    "Standardize `x` with mean `m` and standard deviation `s`: `(x - m) / s`."
    centered = x - m
    return centered / s

def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"

In [33]:
# get_data() yields the four tensors in exactly this order.
x_train,y_train,x_valid,y_valid = get_data()

In [34]:
# Mean and standard deviation over every element of the raw training inputs.
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(0.1304), tensor(0.3073))

In [35]:
# NOTE: x_train/x_valid are overwritten here, so this cell is not idempotent —
# running it a second time would normalize already-normalized data.
x_train = normalize(x_train, train_mean, train_std)
# NB: Use training, not validation mean for validation set
x_valid = normalize(x_valid, train_mean, train_std)

In [36]:
# Sanity check: after normalization the training mean/std should be ~0 and ~1.
# NOTE: this rebinds train_mean/train_std to the post-normalization values, so
# the raw-data statistics are no longer available under these names.
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(3.0614e-05), tensor(1.))

In [37]:
# The validation set was normalized with the training statistics, so its
# mean/std are only expected to be close to 0 and 1, not exact.
valid_mean, valid_std = x_valid.mean(), x_valid.std()
valid_mean, valid_std

(tensor(-0.0058), tensor(0.9924))

In [38]:
#export
def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"

In [39]:
# The normalized training inputs should have mean ~0 and std ~1 (within tol).
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())
# No output means both assertions passed.

In [40]:
# n: number of training rows, m: features per row,
# c: largest label value + 1 (kept as a 0-d tensor, not a Python int).
n,m = x_train.shape
c = y_train.max() +1
n,m,c

(50000, 784, tensor(10))

### Foundations
Basic Architecture

In [41]:
# number of hidden units
nh = 50

In [46]:
# Weight matrices and biases for a two-layer net: m inputs -> nh hidden -> 1 output.
# Simplified init: standard-normal weights scaled by 1/sqrt(number of inputs).
# NOTE(review): `dtype` is not referenced anywhere in the visible part of this
# notebook — confirm whether it is still needed.
dtype = torch.cuda.FloatTensor

w1 = torch.randn(m, nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)/math.sqrt(nh)
b2 = torch.zeros(1)


In [47]:
## Valid mean and std
x_valid.mean(), x_valid.std()

(tensor(-0.0058), tensor(0.9924))

In [48]:
def lin(x,w,b):
    "Affine layer: matrix-multiply `x` by `w`, then add the bias `b`."
    projected = x @ w
    return projected + b


In [49]:
t = lin(x_valid,w1,b1)

In [50]:
t.mean(), t.std()
# Without the /math.sqrt(m) scaling of w1 the same statistics were:
#--> (tensor(-0.2946), tensor(27.4285))


(tensor(-0.0105), tensor(0.9690))

#### Note: Initialization of the weight matrices plays a significant role

In [51]:
def relu(x):
    "ReLU shifted down by 0.5: clamp `x` at zero from below, then subtract 0.5."
    rectified = x.clamp_min(0.)
    # the constant shift pulls the mean of the activations back towards zero
    return rectified - 0.5

In [52]:
## First layer
t = relu(lin(x_valid, w1, b1))

In [53]:
t.mean(), t.std()

(tensor(-0.1220), tensor(0.5563))

From pytorch docs: `a: the negative slope of the rectifier used after this layer (0 for ReLU by default)`

$$\text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}}$$

This was introduced in the paper that described the Imagenet-winning approach from *He et al*: [Delving Deep into Rectifiers](https://arxiv.org/abs/1502.01852), which was also the first paper that claimed "super-human performance" on Imagenet. (ResNets were introduced later the same year by the same authors, in [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385).)

In [54]:
## To fix this, use Kaiming init: scale the random weights by sqrt(2/number of inputs)
w1 = torch.randn(m, nh)*math.sqrt(2/m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)*math.sqrt(2/nh)
b2 = torch.zeros(1)


In [55]:
## First layer
t = relu(lin(x_valid, w1, b1))

In [56]:
t.mean(), t.std()
## Better than before

(tensor(0.0551), tensor(0.8327))

In [57]:
#export
from torch.nn import init

In [58]:
# Same initialization through torch.nn.init. w1 is laid out (m, nh), i.e. with
# the input size in dim 0, which kaiming_normal_ treats as fan_out for a 2-D
# tensor — hence mode='fan_out' to scale by the number of inputs.
w1 = torch.zeros(m,nh)
init.kaiming_normal_(w1, mode='fan_out')
t = relu(lin(x_valid, w1, b1))

In [59]:
init.kaiming_normal_??

In [60]:
w1.shape

torch.Size([784, 50])

In [61]:
torch.nn.Linear(m,nh).weight.shape
## nn.Linear stores its weight as (nh, m), the transpose of our (m, nh) w1, which is why mode='fan_out' was used for w1

torch.Size([50, 784])

In [62]:
torch.nn.functional.linear??
## multiplies by weight.T transpose

In [63]:
torch.nn.Conv2d??

In [64]:
torch.nn.modules.conv._ConvNd??

In [65]:
## Basic Linear Forward pass model
def model(xb):
    "Forward pass of the two-layer net: linear (w1, b1) -> relu -> linear (w2, b2)."
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [66]:
%timeit -n 10 _ = model(x_valid)

17.2 ms ± 6.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [67]:
model(x_valid).shape


torch.Size([10000, 1])

In [68]:
# The model should emit exactly one value per input row.
assert model(x_valid).shape == torch.Size([x_valid.shape[0],1])

### Loss Function: MSE

In [69]:
model(x_valid).shape

torch.Size([10000, 1])

In [70]:
## squeeze(-1) drops the trailing size-1 axis (the inverse of unsqueeze(-1))
def mse(output, target):
    "Mean squared error between `output` (trailing unit axis removed) and `target`."
    residual = output.squeeze(-1) - target
    return residual.pow(2).mean()

In [71]:
# Cast the labels to float so they can be subtracted from the float predictions in mse.
y_train, y_valid = y_train.float(), y_valid.float()

In [72]:
preds = model(x_train)

In [73]:
preds.shape

torch.Size([50000, 1])

In [74]:
y_train.shape

torch.Size([50000])

In [75]:
mse(preds, y_train)

tensor(30.5314)

#### The forward pass is implemented; now we need
## Gradients and backward pass

In [76]:
def mse_grad(inp, targ):
    """Store the gradient of the MSE loss w.r.t. `inp` in `inp.g`.

    `inp` is the model output; the result has a trailing unit axis and is
    divided by `inp.shape[0]`, matching the mean taken in the loss.
    """
    batch_size = inp.shape[0]
    diff = inp.squeeze() - targ
    inp.g = 2. * diff.unsqueeze(-1) / batch_size

In [77]:
def relu_grad(inp, out):
    """Store the gradient w.r.t. the ReLU input in `inp.g`.

    `out` is the ReLU output; its `.g` attribute must already hold the
    gradient flowing back from the next layer. The gradient passes through
    only where `inp` is strictly positive.
    """
    positive_mask = (inp > 0).float()
    inp.g = positive_mask * out.g

In [78]:
def lin_grad(inp, out, w, b):
    """Backward pass of the affine layer `out = inp @ w + b`.

    Reads the upstream gradient from `out.g` and stores:
      * `inp.g` — gradient w.r.t. the layer input, `out.g @ w.t()`
      * `w.g`   — gradient w.r.t. the weights, same shape as `w`
      * `b.g`   — gradient w.r.t. the bias, `out.g` summed over dim 0

    `inp` and `out.g` are expected to be 2-D (rows = samples).
    """
    # grad of matmul with respect to input
    inp.g = out.g @ w.t()
    # Equivalent to (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0), but as a
    # single matmul: the broadcast form materializes an (n, in, out) tensor,
    # which is several GB for the full training set.
    w.g = inp.t() @ out.g
    b.g = out.g.sum(0)

In [79]:
def forward_and_backward(inp, targ):
    """Run the two-layer net forward on `inp`, then backpropagate the MSE loss.

    Returns nothing. The gradients are left in the `.g` attribute of `inp`,
    of the intermediate activations, and of the global parameters
    `w1`, `b1`, `w2`, `b2`.
    """
    # forward pass
    pre_act = inp @ w1 + b1
    act = relu(pre_act)
    out = act @ w2 + b2
    # the loss value itself is never read by the backward pass below
    loss = mse(out, targ)

    # backward pass: walk the layers in reverse, each step filling in `.g`
    mse_grad(out, targ)
    lin_grad(act, out, w2, b2)
    relu_grad(pre_act, act)
    lin_grad(inp, pre_act, w1, b1)
    

In [None]:
forward_and_backward(x_train, y_train)