In [44]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
# Fix the PyTorch RNG seed so the random draws made later are repeatable.
torch.manual_seed(42)

# Print tensors and arrays with 2 decimals on lines up to 125 characters wide;
# sci_mode=False turns off scientific notation for tensors.
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

### Get data

In [45]:
# Remote archive holding the pickled MNIST splits, and where it is cached locally.
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('../data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

from urllib.request import urlretrieve
# Download only when the archive is not already on disk.
if not path_gz.exists():
    urlretrieve(MNIST_URL, path_gz)

# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# this archive is downloaded from the network, so only use a source you trust.
# The third element of the pickled tuple is discarded.
with gzip.open(path_gz, 'rb') as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

# Convert each loaded split to a torch tensor.
x_train, y_train, x_valid, y_valid = (
    tensor(arr) for arr in (x_train, y_train, x_valid, y_valid)
)

In [46]:
# n = number of rows of x_train, m = number of columns (x_train is unpacked as 2-D here).
n,m = x_train.shape

In [47]:
# Display the two sizes unpacked from x_train.shape.
n,m

(50000, 784)

c = number of classes: the largest digit label (9) + 1 = 10

In [48]:
# One more than the largest label value. Note: y_train.max() returns a 0-d tensor,
# so c is a tensor rather than a Python int.
c = y_train.max()+1

Number of hidden units (the width of the single hidden layer)

In [49]:
# Hidden-layer width: used as the second dimension of w1 and the length of b1.
nh = 50

Create architecture

In [50]:
# Layer 1 maps the m input columns to nh hidden units; layer 2 maps those to one output.
# Weights are standard-normal draws, biases start at zero. The two randn calls are kept
# in their original order so the seeded random stream is consumed identically.
w1, b1 = torch.randn(m, nh), torch.zeros(nh)
w2, b2 = torch.randn(nh, 1), torch.zeros(1)

In [51]:
def lin(inp,weight,bias):
    """Affine layer: matrix-multiply `inp` by `weight`, then add `bias`.

    `bias` is added with normal broadcasting rules, so a 1-D bias is applied
    to every row of the product.
    """
    projected = inp.matmul(weight)
    return projected + bias

In [52]:
# Sanity check: apply the first linear layer to the whole training set and display the result.
lin(x_train,w1,b1)

tensor([[ 12.53,  10.99,  -2.58,  ...,   6.05,  -0.14,  11.34],
        [ 28.33,   6.83,   4.17,  ..., -13.48, -14.75,  17.52],
        [  5.67,  -5.26,   0.55,  ...,  -1.61,   4.13,   4.68],
        ...,
        [  1.14,   2.96, -14.34,  ...,   1.26,   4.31,  19.45],
        [ -6.83,   4.90,  -4.14,  ...,   2.80,  -1.42,  12.29],
        [  8.51,  14.52,  -7.94,  ...,  14.50,   1.10,  18.45]])

In [53]:
def relu(inp):
    """Rectified linear unit: replace every value below zero with zero."""
    return inp.clamp_min(0.)

In [54]:
def model(inp):
    """Forward pass of the two-layer network: linear (w1, b1) -> ReLU -> linear (w2, b2).

    Reads the parameters w1, b1, w2, b2 from the notebook's global scope.
    """
    hidden = relu(lin(inp, w1, b1))
    return lin(hidden, w2, b2)


In [62]:
# Check the shape of the model output on the whole training set.
model(x_train).shape

torch.Size([50000, 1])

In [66]:
# Keep the model's predictions on the training set so the loss can be evaluated on them.
res = model(x_train)

Define the loss function

In [63]:
# The targets are 1-D, whereas the model output has a trailing dimension of size 1.
y_train.shape

torch.Size([50000])

In [68]:
def rse(preds,target):
    """Mean of the squared differences between predictions and targets.

    Despite the name, this is the mean squared error. Size-1 dimensions of
    `preds` are squeezed away before subtracting `target`.
    """
    err = preds.squeeze() - target
    squared = err ** 2
    return squared.mean()

In [69]:
# Loss of the freshly initialised (untrained) model on the training set.
rse(res,y_train)

tensor(4308.76)

Backward and forward prop

In [108]:
def forward_and_backward(inp,target):
    """Run the forward pass and attach the loss gradient to the output.

    First step of the manual backward pass: only the gradient of the mean
    squared error with respect to the network output is computed, stored as
    `out.g`. The loss value itself is computed but not returned.

    Returns the output tensor (with its `.g` attribute set).
    """
    hidden = relu(lin(inp, w1, b1))
    out = lin(hidden, w2, b2)
    err = out[:, 0] - target
    loss = err.pow(2).mean()

    # d(mean(err^2))/d(out) = 2 * err / n, reshaped to a column to match `out`.
    n_rows = inp.shape[0]
    out.g = 2. * err[:, None] / n_rows

    return out


In [111]:
# Shape of the tensor returned by this first version of forward_and_backward.
forward_and_backward(x_train,y_train).shape

torch.Size([50000, 1])

In [113]:
# Shape of the second-layer weights, needed to line up the gradient matmuls.
w2.shape

torch.Size([50, 1])

In [99]:
# Number of training rows; the loss gradient is divided by this value.
x_train.shape[0]

50000

In [103]:
# Transposed second-layer weights, as used when propagating the gradient back to the layer input.
w2.T

tensor([[-0.87,  0.51,  0.93, -0.03, -1.70, -1.88,  0.12,  1.11, -0.30,  1.23, -2.60,  1.11,  0.33,  0.57, -0.80, -0.68,
         -0.46, -0.88,  0.53, -0.94,  1.15,  0.98,  0.06,  0.72,  1.06,  0.80, -0.23, -0.57, -0.58,  2.09,  0.37,  1.58,
         -0.45,  0.56,  0.60, -0.48, -0.34,  0.18, -0.84, -2.61, -0.09,  0.67,  0.17, -0.74,  0.27, -0.34, -1.06, -0.00,
          1.50, -1.54]])

In [114]:
def lin_grad(inp,out,weight,bias):
    """Backward pass of a linear layer `out = inp @ weight + bias`.

    Reads the upstream gradient from `out.g` and stores the gradients of the
    layer's input and parameters as `.g` attributes on `inp`, `weight` and
    `bias`. Assumes `inp` and `out.g` are 2-D with one row per sample.

    Returns nothing; the results are the three `.g` attributes.
    """
    inp.g = out.g @ weight.T
    # Same sum over samples as the broadcast outer product
    # (inp[:, :, None] * out.g[:, None, :]).sum(0), but without materialising
    # the (n, in, out) intermediate, which is huge for the full training set.
    weight.g = inp.T @ out.g
    # Sum over the sample axis only, so bias.g keeps one entry per output unit
    # (a plain .sum() would collapse it to a single scalar).
    bias.g = out.g.sum(0)

In [117]:
def forward_and_backward(inp,target):
    """Run the forward pass, then back-propagate the MSE loss by hand.

    This definition replaces the earlier `forward_and_backward` in this
    notebook. Gradients are stored as `.g` attributes on the input, on the
    intermediate activations, and (through `lin_grad`) on w1, b1, w2 and b2,
    which are read from the global scope. The loss is computed but not
    returned; the function returns None.
    """
    # forward pass
    pre_act = lin(inp, w1, b1)
    hidden = relu(pre_act)
    out = lin(hidden, w2, b2)
    err = out[:, 0] - target
    loss = err.pow(2).mean()

    # backward pass
    n_rows = inp.shape[0]
    # Gradient of mean(err^2) with respect to the output, as a column like `out`.
    out.g = 2. * err[:, None] / n_rows
    lin_grad(hidden, out, w2, b2)
    # ReLU passes the gradient through only where its input was positive.
    relu_mask = (pre_act > 0).float()
    pre_act.g = relu_mask * hidden.g
    lin_grad(inp, pre_act, w1, b1)

In [118]:
# Run one full forward/backward pass over the training set; results are stored as .g attributes.
forward_and_backward(x_train,y_train)

In [119]:
# Inspect the gradient with respect to the input, set by the last lin_grad call in forward_and_backward.
x_train.g

tensor([[    -0.00,     -0.01,      0.00,  ...,     -0.00,      0.00,      0.00],
        [    -0.03,     -0.03,      0.01,  ...,     -0.04,     -0.01,     -0.01],
        [     0.00,      0.00,     -0.00,  ...,      0.00,     -0.00,      0.00],
        ...,
        [    -0.00,     -0.02,      0.01,  ...,     -0.00,     -0.00,      0.00],
        [    -0.02,     -0.01,      0.01,  ...,     -0.01,      0.01,     -0.00],
        [    -0.00,     -0.00,      0.00,  ...,     -0.00,     -0.00,     -0.00]])