In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## The forward and backward passes

In [3]:
#export
from exp.nb_01 import *

def get_data():
    """Download the gzipped MNIST pickle and load its train/validation splits.

    Returns a lazy `map` that applies `tensor` to
    (x_train, y_train, x_valid, y_valid), in that order; the third split in
    the pickle is discarded.
    """
    mnist_path = datasets.download_data(MNIST_URL, ext='.gz')
    # NOTE: pickle.load can execute arbitrary code; only use with a trusted MNIST_URL.
    with gzip.open(mnist_path, 'rb') as fh:
        train_split, valid_split, _ = pickle.load(fh, encoding='latin1')
    x_train, y_train = train_split
    x_valid, y_valid = valid_split
    return map(tensor, (x_train, y_train, x_valid, y_valid))

def normalize(x,m,s):
    """Standardize `x`: subtract `m`, then divide the result by `s`."""
    centered = x - m
    return centered / s

In [4]:
x_train,y_train,x_valid,y_valid = get_data()

In [5]:
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

(tensor(0.1304), tensor(0.3073))

In [6]:
# Standardize the training inputs with the training-set mean/std computed above.
# This rebinds x_train, so re-running this cell normalizes the data a second time.
x_train = normalize(x_train, train_mean, train_std)

# NB: the validation set is normalized with the *training* mean/std, not its own,
# so that both sets go through exactly the same transformation.
x_valid = normalize(x_valid, train_mean, train_std)

In [7]:
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

(tensor(0.0001), tensor(1.))

In [8]:
# After normalization, the mean should always be close to 0 and the std close to 1.

In [9]:
#export
def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"

In [10]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

In [11]:
# n = number of training examples, m = number of features (columns) per example
n,m = x_train.shape
# c = number of classes, taken as the largest label + 1 (assumes labels start at 0 — TODO confirm)
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

## Foundations version

### Basic architecture

In [12]:
# num hidden
nh = 50

In [17]:
# Simplified Kaiming/He-style initialisation (no ReLU gain factor yet):
# draw standard-normal weights and divide by sqrt(fan_in) of each layer,
# where fan_in is the number of inputs feeding that layer.

# Layer 1 maps m input features -> nh hidden units, so fan_in = m.
w1 = torch.randn(m,nh)/math.sqrt(m)
b1 = torch.zeros(nh)

# Layer 2 maps nh hidden units -> 1 output, so fan_in = nh (not m).
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)

In [23]:
test_near_zero(w1.mean())

AssertionError: Near zero: 0.007236461155116558

In [19]:
x_valid.mean(), x_valid.std()

(tensor(-0.0057), tensor(0.9924))

In [20]:
def lin(x,w,b):
    """Linear (affine) layer: matrix-multiply `x` by `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [21]:
t = lin(x_valid,w1,b1)

In [22]:
t.mean(), t.std()

(tensor(-1.2451), tensor(29.5519))

In [25]:
def relu(x):
    """Rectified linear unit: clamp every element of `x` to a minimum of 0."""
    floor = 0.
    return x.clamp_min(floor)

In [27]:
t = relu(lin(x_valid,w1,b1))

In [28]:
t.mean(), t.std()

(tensor(10.9318), tensor(17.2163))

From pytorch docs: `a: the negative slope of the rectifier used after this layer (0 for ReLU by default)`

$$\text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}}$$

This was introduced in the paper that described the Imagenet-winning approach from *He et al*: [Delving Deep into Rectifiers](https://arxiv.org/abs/1502.01852), which was also the first paper that claimed "super-human performance" on Imagenet (and, most importantly, it introduced resnets!)

In [29]:
# Kaiming (He) init — important: this scaling is meant for layers followed by ReLU.
# Standard-normal weights are scaled by sqrt(2/m), where m is the number of inputs to the layer.
w1 = torch.randn(m,nh)*math.sqrt(2/m)

In [30]:
w1.mean(), w1.std()

(tensor(-0.0002), tensor(0.0504))

In [31]:
t = relu(lin(x_valid,w1,b1))
t.mean(), t.std()

(tensor(0.5434), tensor(0.8058))

In [32]:
# Note the difference in t.mean() after using Kaiming/He initialisation compared with the earlier result above

In [41]:
t = torch.randn(2,2)

In [45]:
t

tensor([[-0.0062, -0.5018],
        [-0.6409, -0.5708]])

In [55]:
t.clamp_min(0)

tensor([[0., 0.],
        [0., 0.]])

In [56]:
#export
from torch.nn import init

In [57]:
# Same idea using PyTorch's built-in initializer: allocate w1, then pass it to init.kaiming_normal_.
w1 = torch.zeros(m,nh)
# NOTE(review): w1 is laid out as (m, nh), the transpose of nn.Linear's (nh, m) weight shape
# shown further down, so mode='fan_out' presumably picks m as the scaling dimension here —
# confirm against the init.kaiming_normal_ source.
init.kaiming_normal_(w1, mode='fan_out')
# Activation statistics of the first layer after ReLU with this init.
t = relu(lin(x_valid, w1, b1))

In [59]:
init.kaiming_normal_??

In [60]:
w1.mean(),w1.std()

(tensor(-0.0002), tensor(0.0503))

In [61]:
t.mean(),t.std()

(tensor(0.6265), tensor(0.8415))

In [62]:
w1.shape

torch.Size([784, 50])

In [63]:
import torch.nn

In [64]:
torch.nn.Linear(m,nh).weight.shape

torch.Size([50, 784])

In [65]:
torch.nn.Linear.forward??

In [66]:
torch.nn.functional.linear??

In [67]:
# Kaiming (He) init for ReLU, done by hand: standard-normal weights scaled by sqrt(2/m).
# This overwrites the w1 produced by init.kaiming_normal_ above.
w1 = torch.randn(m,nh)*math.sqrt(2./m )
# First-layer activations after ReLU, followed by their mean and std.
t1 = relu(lin(x_valid, w1, b1))
t1.mean(),t1.std()

(tensor(0.5931), tensor(0.8232))

In [68]:
def model(xb):
    """Forward pass of the two-layer net: linear (w1, b1) -> ReLU -> linear (w2, b2).

    Reads the module-level parameters w1, b1, w2 and b2.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [69]:
%timeit -n 10 _=model(x_valid)

11.6 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [70]:
assert model(x_valid).shape==torch.Size([x_valid.shape[0],1])

### Loss function: MSE

In [71]:
model(x_valid).shape

torch.Size([10000, 1])