In [26]:
#|export
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F

In [27]:
from fastcore.test import test_close

# Notebook-wide tensor print format, a fixed RNG seed, and a grayscale default colormap.
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

# Location of the gzipped, pickled MNIST archive.
path_data = Path('../data')
path_gz = path_data/'mnist.pkl.gz'

# NOTE: unpickling can execute arbitrary code; only load archives you trust.
with gzip.open(path_gz, 'rb') as f:
    # The third element of the archive is discarded.
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

# Convert the four loaded objects to torch tensors.
x_train, y_train, x_valid, y_valid = [tensor(a) for a in (x_train, y_train, x_valid, y_valid)]

## Initial setup

### Data

In [28]:
# n: number of rows in the training set, m: number of columns (features) per row.
n,m = x_train.shape
# Largest label plus one (a 0-d tensor) -- presumably the number of classes.
c = y_train.max()+1
# Number of hidden units passed to the model below.
nh = 50

In [29]:
class Model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features per example.
        nh: number of hidden units.
        n_out: number of outputs produced per example.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (rather than a plain Python list) registers the layers as
        # submodules, so `parameters()`, `state_dict()` and `.to(...)` can see them.
        # It is still iterable and indexable, so `for l in model.layers` keeps working.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])

    def forward(self, x):
        """Pass `x` through each layer in order and return the final output."""
        # Implementing `forward` instead of overriding `__call__` keeps nn.Module's
        # own call machinery (e.g. hooks) intact; `model(x)` still dispatches here.
        for l in self.layers: x = l(x)
        return x

In [30]:
# Input width m, hidden width nh, and 10 outputs (the labels shown below range over 0-9).
model = Model(m, nh, 10)

## Basic training loop

The training loop repeats the following steps:
- get the output of the model on a batch of inputs
- compare the output to the labels we have and compute a loss
- calculate the gradients of the loss with respect to every parameter of the model
- update said parameters with those gradients to make them a little bit better

In [31]:
# Loss used for the rest of the notebook: cross-entropy between model outputs and integer labels.
loss_func = F.cross_entropy

In [32]:
bs=50                  # batch size

xb = x_train[0:bs]     # a mini-batch from x
preds = model(xb)      # predictions
# Show the first row of predictions and the shape of the whole output.
preds[0], preds.shape

(tensor([-0.09, -0.21, -0.08,  0.10, -0.04,  0.08, -0.04, -0.03,  0.01,  0.06], grad_fn=<SelectBackward0>),
 torch.Size([50, 10]))

In [33]:
yb = y_train[0:bs]
yb

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1, 1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7, 6, 1, 8, 7, 9,
        3, 9, 8, 5, 9, 3])

In [34]:
loss_func(preds, yb)

tensor(2.30, grad_fn=<NllLossBackward0>)

In [35]:
preds.argmax(dim=1)

tensor([3, 9, 3, 8, 5, 9, 3, 9, 3, 9, 5, 3, 9, 9, 3, 9, 9, 5, 8, 7, 9, 5, 3, 8, 9, 5, 9, 5, 5, 9, 3, 5, 9, 7, 5, 7, 9, 9, 3, 9, 3, 5, 3, 8,
        3, 5, 9, 5, 9, 5])

In [36]:
def accuracy(out, yb):
    """Return the fraction of rows of `out` whose arg-max along dim 1 equals the matching entry of `yb`."""
    predicted = out.argmax(dim=1)
    hits = (predicted == yb).float()
    return hits.mean()

In [37]:
accuracy(preds, yb)

tensor(0.08)

In [52]:
lr = 0.5   # learning rate
epochs = 3 # how many epochs to train for

In [39]:
#|export
def report(loss, preds, yb):
    """Print `loss` and the accuracy of `preds` against `yb`, each formatted to two decimals."""
    print(f'{loss:.2f}, {accuracy(preds, yb):.2f}')

In [40]:
# Loss and accuracy of the current model on the first mini-batch.
xb = x_train[:bs]
yb = y_train[:bs]
preds = model(xb)
report(loss_func(preds, yb), preds, yb)

2.30, 0.08


### Stepping through the inner loop (first iter)

In [41]:
i = 0

In [42]:
# Index range of the current mini-batch; `min` clamps the end so a final partial batch stops at n.
s = slice(i, min(n,i+bs))
s

slice(0, 50, None)

In [43]:
xb,yb = x_train[s],y_train[s]
xb.shape,yb.shape

(torch.Size([50, 784]), torch.Size([50]))

Run the model on the mini-batch (forward pass)

In [44]:
preds = model(xb)

Compute the loss for these predictions

In [45]:
loss = loss_func(preds, yb)

In [46]:
# Back-propagate from the loss; the update step below reads the resulting `.grad` values.
loss.backward()

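Update the parameters inside `torch.no_grad()` so that the update itself is not tracked by autograd, then reset the gradients to zero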

In [47]:
# One manual gradient-descent step over every layer that has parameters.
with torch.no_grad():
    for l in model.layers:
        # Layers without a `weight` attribute have nothing to update.
        if not hasattr(l, 'weight'): continue
        print("Has attribute")
        for p in (l.weight, l.bias):
            p.sub_(p.grad * lr)   # in-place step against the gradient
            p.grad.zero_()        # clear the gradient for the next backward pass

Has attribute
Has attribute


Report loss and accuracy (both were computed before the update above, so they match the earlier report)

In [48]:
report(loss, preds, yb)

2.30, 0.08


### Putting it in a loop with number of epochs

In [53]:
# Full training run: `epochs` passes over the training set in mini-batches of size `bs`.
for epoch in range(epochs):
    for i in range(0, n, bs):
        # Select the current mini-batch; the end index is clamped to n.
        s = slice(i, min(n, i+bs))
        xb = x_train[s]
        yb = y_train[s]
        # Forward pass, loss, then back-propagation.
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        # Manual gradient-descent step, kept out of autograd tracking.
        with torch.no_grad():
            for l in model.layers:
                if not hasattr(l, 'weight'): continue
                for p in (l.weight, l.bias):
                    p.sub_(p.grad * lr)
                    p.grad.zero_()
    # Printed once per epoch, from the last mini-batch's loss and pre-update predictions.
    report(loss, preds, yb)

0.05, 0.98
0.10, 0.96
0.07, 0.96
