!curl https://course.fast.ai/setup/colab | bash

### Fully connected forward and backward pass

In [14]:
#export
from exp.nb_01 import *

def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

def normalize(x, m, s): return (x-m)/s

In [15]:
x_train,y_train,x_val,y_val = get_data()

In [16]:
train_mean, train_std = x_train.mean(),x_train.std()
train_mean, train_std

(tensor(0.1304), tensor(0.3073))

In [17]:
x_train.shape

torch.Size([50000, 784])

In [18]:
x_train = normalize(x_train,train_mean,train_std)
x_val = normalize(x_val,train_mean,train_std)

In [19]:
train_mean, train_std = x_train.mean(),x_train.std()
train_mean, train_std

(tensor(0.0001), tensor(1.))

In [20]:
#export 
def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"

In [21]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

In [23]:
n,m=x_train.shape
c=y_train.max()+1
n,m,c

(50000, 784, tensor(10))

### Foundations 

#### Basic architecture (1 hidden layer)

In [25]:
#number of hidden
nh = 50

In [101]:
# simplified kaiming init/he init
w1 = torch.randn(m,nh)/math.sqrt(m)
b1 = torch.randn(nh)
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.randn(1)

In [56]:
test_near_zero(w1.mean())
test_near_zero(w1.std()-1/math.sqrt(m))

In [57]:
# export
def lin(x,w,b): return x@w+b

In [58]:
t=lin(x_val,w1,b1)
t.mean(),t.std()

(tensor(0.2490), tensor(1.2965))

In [62]:
def relu(x): return x.clamp_min(0.)

In [63]:
t=relu(lin(x_val,w1,b1))
t.mean(), t.std()

(tensor(0.6494), tensor(0.8022))

In [70]:
w1 = torch.randn(m,nh)*math.sqrt(2/m)
w1.mean(),w1.std()

In [72]:
t=relu(lin(x_val,w1,b1))
t.mean(),t.std()

(tensor(0.7970), tensor(1.0531))

In [73]:
#export
from torch.nn import init

In [82]:
w1 = torch.zeros(m,nh)
init.kaiming_normal_(w1,mode='fan_out')
t = relu(lin(x_val,w1,b1))
t.mean(),t.std()

(tensor(0.8611), tensor(1.1355))

In [83]:
init.kaiming_normal_??

In [84]:
w1.shape

torch.Size([784, 50])

In [85]:
import torch.nn

In [86]:
torch.nn.Linear(m,nh).weight.shape

torch.Size([50, 784])

In [89]:
torch.nn.Linear.forward??

In [90]:
torch.nn.functional.linear??

In [91]:
torch.nn.Conv2d??

In [92]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [93]:
# jeremy test on moving the average back
def relu(x): return x.clamp_min(0.)-0.5

In [102]:
w1 = torch.randn(m,nh)*math.sqrt(2./m)
t1 = relu(lin(x_val,w1,b1))
t1.mean(),t1.std()

(tensor(0.2624), tensor(1.0211))

In [103]:
def model(xb):
    l1 = lin(xb,w1,b1)
    l2 = relu(l1)
    l3 = lin(l2,w2,b2)
    return l3

In [104]:
%timeit -n 10 _=model(x_val)

7.4 ms ± 820 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [105]:
assert model(x_val).shape==torch.Size([x_val.shape[0],1])

#### loss function: MSE (things simplified by using MSE but its useless)

In [106]:
model(x_val).shape

torch.Size([10000, 1])

In [112]:
def mse(output,targ): return (output.squeeze(1)-targ).pow(2).mean()

In [108]:
y_train,y_val = y_train.float(),y_val.float()

In [109]:
preds=model(x_train)

In [111]:
preds.shape

torch.Size([50000, 1])

In [113]:
mse(preds,y_train)

tensor(23.1049)