# A basic training loop

## MNIST data setup

In [1]:
from pathlib import Path

DATA_PATH = Path('data')
PATH = DATA_PATH/'mnist'

PATH.mkdir(parents=True, exist_ok=True)

In [2]:
import requests

URL='http://deeplearning.net/data/mnist/'
FILENAME='mnist.pkl.gz'

if not (PATH/FILENAME).exists():
    content = requests.get(URL+FILENAME).content
    (PATH/FILENAME).open('wb').write(content)

In [3]:
import pickle, gzip

with gzip.open(PATH/FILENAME, 'rb') as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

In [4]:
import torch 

x_train,y_train,x_valid,y_valid = map(torch.tensor, (x_train,y_train,x_valid,y_valid))

In [5]:
n,c = x_train.shape
x_train, x_train.shape, y_train.min(), y_train.max()

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 torch.Size([50000, 784]),
 tensor(0),
 tensor(9))

## Basic model and training loop

In [6]:
import math

weights = torch.randn(784,10)/math.sqrt(784)
weights.requires_grad_()
bias = torch.zeros(10, requires_grad=True)

In [7]:
import torch.nn.functional as F

def model(xb):
    xb = (xb @ weights) + bias
    return F.log_softmax(xb, dim=-1)

In [8]:
bs=64

In [9]:
preds = model(x_train[0:bs])
preds[0], preds.shape

(tensor([-2.3505, -2.4234, -2.0880, -1.9759, -2.5629, -2.5447, -2.2578, -2.4022,
         -1.9325, -2.8470], grad_fn=<SelectBackward>), torch.Size([64, 10]))

In [10]:
loss_fn = F.nll_loss

In [11]:
loss_fn(preds, y_train[0:bs])

tensor(2.3689, grad_fn=<NllLossBackward>)

In [12]:
import numpy as np

np.argmax(preds.detach(), axis=1), y_train[0:bs]

(tensor([8, 7, 6, 6, 7, 9, 2, 7, 5, 8, 7, 6, 6, 8, 2, 8, 6, 3, 2, 3, 6, 7, 1, 6,
         9, 7, 7, 6, 6, 6, 6, 1, 8, 2, 2, 6, 2, 2, 6, 8, 2, 3, 7, 3, 6, 0, 7, 8,
         8, 7, 7, 7, 0, 6, 0, 6, 5, 7, 7, 3, 1, 3, 2, 7]),
 tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1,
         1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7, 6, 1, 8, 7, 9, 3, 9, 8, 5,
         9, 3, 3, 0, 7, 4, 9, 8, 0, 9, 4, 1, 4, 4, 6, 0]))

In [13]:
def accuracy(preds, ys):
    return np.count_nonzero(np.argmax(preds.detach(), axis=1) == ys) / len(ys)

In [14]:
accuracy(preds, y_train[0:bs])

0.015625

In [15]:
def predict_batch(model, xb, yb, loss_fn=loss_fn, metric=accuracy):
    preds = model(xb)
    return loss_fn(preds, yb), metric(preds, yb)

In [16]:
predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(2.3689, grad_fn=<NllLossBackward>), 0.015625)

In [17]:
lr = 0.5
epochs = 2

In [18]:
from IPython.core.debugger import set_trace

In [19]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
#         set_trace()
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()

In [20]:
predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(0.2265, grad_fn=<NllLossBackward>), 0.953125)

## Refactor using nn.Module

In [21]:
from torch import nn

class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(784,10)/math.sqrt(784))
        self.bias = nn.Parameter(torch.zeros(10))

    def forward(self, xb):
        xb = (xb @ self.weights) + self.bias
        return F.log_softmax(xb, dim=-1)

In [22]:
model = Mnist_Logistic()

In [23]:
predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(2.3224, grad_fn=<NllLossBackward>), 0.078125)

In [24]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        with torch.no_grad():
            for p in model.parameters(): p -= p.grad * lr
            model.zero_grad()

In [25]:
predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(0.2241, grad_fn=<NllLossBackward>), 0.9375)

## Refactor using nn.Linear

In [26]:
class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = nn.Linear(784,10)

    def forward(self, xb):
        return F.log_softmax(self.lin(xb), dim=-1)

In [27]:
model = Mnist_Logistic()
predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(2.2881, grad_fn=<NllLossBackward>), 0.09375)

In [28]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        with torch.no_grad():
            for p in model.parameters(): p -= p.grad * lr
            model.zero_grad()

In [29]:
predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(0.2278, grad_fn=<NllLossBackward>), 0.953125)

## Refactor using optim

In [30]:
from torch import optim

In [31]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=lr)

predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(2.3499, grad_fn=<NllLossBackward>), 0.015625)

In [32]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [33]:
predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(0.2268, grad_fn=<NllLossBackward>), 0.953125)

## Refactor using Dataset

In [34]:
from torch.utils.data import TensorDataset

In [35]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=1.)

In [36]:
train_ds = TensorDataset(x_train, y_train)

In [37]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        xb,yb = train_ds[i*bs : i*bs+bs]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [38]:
predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(0.2217, grad_fn=<NllLossBackward>), 0.9375)

## Refactor using DataLoader

In [39]:
from torch.utils.data import DataLoader

In [40]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=lr)

In [41]:
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=bs)

In [42]:
for epoch in range(epochs):
    for xb,yb in train_dl:
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()

In [43]:
predict_batch(model, x_train[0:bs], y_train[0:bs])

(tensor(0.2274, grad_fn=<NllLossBackward>), 0.9375)

# Add validation

## First try

In [44]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=lr)

In [45]:
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)

valid_ds = TensorDataset(x_valid, y_valid)
valid_dl = DataLoader(valid_ds, batch_size=bs*2)

In [46]:
predict_batch(model, x_valid[0:bs], y_valid[0:bs])

(tensor(2.2690, grad_fn=<NllLossBackward>), 0.140625)

In [47]:
for epoch in range(epochs):
    model.train()
    for xb,yb in train_dl:
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        opt.step()
        opt.zero_grad()
        
    model.eval()
    with torch.no_grad():
        valid_loss = sum(loss_fn(model(xb), yb)
                         for xb,yb in valid_dl)

    print(epoch, valid_loss/len(valid_dl))

0 tensor(0.3133)
1 tensor(0.3212)


In [48]:
predict_batch(model, x_valid[0:bs], y_valid[0:bs])

(tensor(0.3723, grad_fn=<NllLossBackward>), 0.875)

## Create fit() and get_data()

In [49]:
def loss_batch(model, xb, yb, loss_fn, opt=None):
    loss, metric = predict_batch(model, xb, yb, loss_fn)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
        
    return loss.item(), len(xb), metric

In [50]:
import numpy as np

def fit(epochs, model, loss_fn, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb,yb in train_dl: loss_batch(model, xb, yb, loss_fn, opt)

        model.eval()
        with torch.no_grad():
            losses,nums,metric = zip(*[loss_batch(model, xb, yb, loss_fn)
                                for xb,yb in valid_dl])
        val_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)
        val_metric = np.sum(np.multiply(metric,nums)) / np.sum(nums)

        print(epoch, val_loss, val_metric)

In [51]:
def get_data(train_ds, valid_ds, bs):
    return (DataLoader(train_ds, batch_size=bs, shuffle=True),
            DataLoader(valid_ds, batch_size=bs*2))

train_dl,valid_dl = get_data(train_ds, valid_ds, bs)

In [52]:
model = Mnist_Logistic()
opt = optim.SGD(model.parameters(), lr=lr)

In [53]:
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 0.30872952551841737 0.9111
1 0.29135184154510496 0.9177


# Switch to CNN

## First try

In [54]:
class Mnist_CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1,  16, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(16, 10,  kernel_size=3, stride=2, padding=1)

    def forward(self, xb):
        xb = xb.view(-1,1,28,28)
        xb = F.relu(self.conv1(xb))
        xb = F.relu(self.conv2(xb))
        xb = F.relu(self.conv3(xb))
        xb = F.avg_pool2d(xb, 4)
        xb = xb.view(-1,xb.size(1))
        return F.log_softmax(xb, dim=-1)

In [55]:
lr=0.2

In [56]:
model = Mnist_CNN()
opt = optim.SGD(model.parameters(), lr=lr)

In [64]:
xb, yb = next(iter(valid_dl))
predict_batch(model, xb, yb)

(tensor(2.8568, grad_fn=<NllLossBackward>), 0.421875)

In [66]:
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 0.3227022711753845 0.8993
1 0.37818677072525025 0.8854


## nn.Sequential

In [59]:
class Lambda(nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func=func
        
    def forward(self, x): return self.func(x)

In [60]:
model = nn.Sequential(
    Lambda(lambda x: x.view(-1,1,28,28)),
    nn.Conv2d(1,  16,  kernel_size=3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(16, 16,  kernel_size=3, stride=2, padding=1), nn.ReLU(),
    nn.Conv2d(16, 10,  kernel_size=3, stride=2, padding=1), nn.ReLU(),
    nn.AvgPool2d(4),
    Lambda(lambda x: x.view(x.size(0),-1)),
    nn.LogSoftmax(-1),
)

In [61]:
opt = optim.SGD(model.parameters(), lr=lr)

In [62]:
xb, yb = next(iter(valid_dl))
predict_batch(model, xb, yb)

(tensor(2.2958, grad_fn=<NllLossBackward>), 0.15625)

In [63]:
fit(epochs, model, loss_fn, opt, train_dl, valid_dl)

0 0.8756247200965881 0.7174
1 2.7175918724060057 0.4372
