In [52]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, "../..")

import torch
import torch.nn as nn
from src.data import make_dataset
from pathlib import Path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
# Location of the raw data, relative to this notebook.
datadir = Path("../../data/raw/")
# get_MNIST is a project helper; it is unpacked here into a train and a test
# dataloader that yield batches of 64 samples.
train_dataloader, test_dataloader = make_dataset.get_MNIST(datadir, batch_size=64) 

We can obtain an item:

In [54]:
# Pull a single batch from the training dataloader: x holds the images, y the labels.
x, y = next(iter(train_dataloader))
# Last expression of the cell, so the notebook displays both shapes.
x.shape, y.shape

(torch.Size([64, 1, 28, 28]), torch.Size([64]))

Each image follows the channels-first convention: (channel, height, width), with the batch dimension in front. Each label is an integer.

Let's pass this batch through a Conv2d layer:

In [55]:
# One input channel in, 32 learned filters out. A 3x3 kernel with one pixel of
# padding on every side keeps the 28x28 spatial size unchanged.
conv = nn.Conv2d(
    in_channels=1, 
    out_channels=32,
    kernel_size=3,
    padding=(1,1))
out = conv(x)
out.shape

torch.Size([64, 32, 28, 28])

What is happening here? Can you explain all the parameters, and relate them to the output shape?

Let's see what happens if we change the padding:

In [56]:
# Same layer without padding: the 3x3 kernel can no longer be centred on the
# border pixels, so each spatial dimension shrinks by 2 (28 -> 26).
conv = nn.Conv2d(
    in_channels=1, 
    out_channels=32,
    kernel_size=3,
    padding=(0,0))
out = conv(x)
out.shape

torch.Size([64, 32, 26, 26])

And if we change the stride from the default 1 to 2:

In [57]:
# With stride 2 the kernel moves two pixels at a time, which halves each
# spatial dimension here (28 -> 14) even though padding is applied.
conv = nn.Conv2d(
    in_channels=1, 
    out_channels=32,
    kernel_size=3,
    padding=(1,1),
    stride=2)
out = conv(x)
out.shape

torch.Size([64, 32, 14, 14])

As you can see, you need to think about what is going in and out of the convolution. We can stitch multiple layers together like this:

In [58]:
# Three conv -> ReLU -> max-pool stages. Only the first convolution is padded;
# the other two rely on the Conv2d defaults (stride 1, no padding), so each of
# them trims a one-pixel border before the pooling step halves the size.
convolutions = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2d(2),
)
out = convolutions(x)
out.shape

torch.Size([64, 32, 2, 2])

As you can see, the dimensions of the feature maps have become really small. You need to take this into account: if we had started with a smaller image, we could get errors...

At this point we have 32 feature maps, each 2x2 in size.

If we want to pass them through a neural network (a dense layer), we will need to flatten them (do you understand what happens if you don't do that?)

In [59]:
# Collapse every sample's feature maps into a single vector; the batch
# dimension is left untouched (Flatten starts at dimension 1 by default).
input_nn = nn.Flatten()(out)
input_nn.shape

torch.Size([64, 128])

Let's combine it all together:

In [60]:
import torch
from torch import nn

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class CNN(nn.Module):
    """Small convolutional image classifier.

    Three conv -> ReLU -> max-pool stages extract feature maps, which are then
    flattened and passed through a three-layer dense head that produces one
    logit per class.

    The defaults reproduce the MNIST setup used in this notebook (1 input
    channel, 32 filters, 28x28 images, 10 classes). The input width of the
    first dense layer is derived from ``input_size`` instead of being
    hard-coded, so other image sizes work as long as they are large enough to
    pass through the three convolution/pooling stages.

    Args:
        in_channels: number of channels of the input images.
        filters: number of feature maps produced by every convolution.
        num_classes: number of logits returned by the network.
        input_size: (height, width) of the input images.

    Raises:
        RuntimeError: raised by PyTorch while building the model when
            ``input_size`` is too small for the convolution/pooling stack.
    """

    def __init__(
        self,
        in_channels: int = 1,
        filters: int = 32,
        num_classes: int = 10,
        input_size: tuple = (28, 28),
    ):
        super().__init__()

        self.convolutions = nn.Sequential(
            nn.Conv2d(in_channels, filters, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(filters, filters, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

        # Push a dummy image through the conv stack to find out how many
        # features reach the dense head (32 * 2 * 2 = 128 with the defaults).
        # No gradients are needed for this shape probe.
        with torch.no_grad():
            dummy = torch.zeros(1, in_channels, *input_size)
            n_features = self.convolutions(dummy).flatten(start_dim=1).shape[1]

        self.dense = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )

    def forward(self, x):
        """Return the unnormalised class scores (logits) for a batch of images.

        Args:
            x: batch of images shaped (batch, in_channels, height, width).

        Returns:
            Tensor of shape (batch, num_classes). No softmax is applied.
        """
        x = self.convolutions(x)
        logits = self.dense(x)
        return logits

# Instantiate the network and move its parameters to the selected device.
model = CNN().to(device)
# Printing an nn.Module lists its submodules layer by layer.
print(model)

Using cpu device
CNN(
  (convolutions): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (dense): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=128, out_features=64, bias=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=10, bias=True)
  )
)


In [61]:
from src.models import train_model
# Report the size of the model via the project helper
# (presumably the number of trainable parameters; confirm in src.models.train_model).
train_model.count_parameters(model)

29482

We have about 30k parameters. You will always need to judge that relative to your input data: how many observations do you have? Do you think the model needs a lot of complexity, or not so much?

What is the trade-off when adding more complexity? Or when reducing complexity?

Try to answer this trade-off in terms of:

- speed
- generalization
- accuracy

We will need to tell the model how well it is performing. To do that, we need to pick a loss function $\mathcal{L}$. We will discuss this in more depth later, but for now, just take my word for it that CrossEntropyLoss is a good pick.

In [62]:
import torch.optim as optim
# Adam updates every parameter of the model, using a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Cross-entropy loss works directly on the raw logits and integer class labels.
loss_fn = torch.nn.CrossEntropyLoss()

We now have everything we need to train the model.

In [63]:
# Run the project's training loop for 10 epochs. Its return value is rebound to
# `model` (presumably the trained model; confirm in src.models.train_model).
# The log output shows one train and one test value per epoch.
model = train_model.trainloop(
    epochs=10,
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader
)

2022-04-05 09:43:37.182 | INFO     | src.models.train_model:trainloop:39 - Epoch 0 train 0.0112 | test 0.0087
2022-04-05 09:44:09.813 | INFO     | src.models.train_model:trainloop:39 - Epoch 1 train 0.0069 | test 0.0064
2022-04-05 09:44:41.065 | INFO     | src.models.train_model:trainloop:39 - Epoch 2 train 0.0059 | test 0.0057
2022-04-05 09:45:14.385 | INFO     | src.models.train_model:trainloop:39 - Epoch 3 train 0.0052 | test 0.0059
2022-04-05 09:45:46.853 | INFO     | src.models.train_model:trainloop:39 - Epoch 4 train 0.0048 | test 0.0052
2022-04-05 09:46:21.576 | INFO     | src.models.train_model:trainloop:39 - Epoch 5 train 0.0045 | test 0.0047
2022-04-05 09:46:57.738 | INFO     | src.models.train_model:trainloop:39 - Epoch 6 train 0.0043 | test 0.0047
2022-04-05 09:47:33.093 | INFO     | src.models.train_model:trainloop:39 - Epoch 7 train 0.0040 | test 0.0048
2022-04-05 09:48:09.682 | INFO     | src.models.train_model:trainloop:39 - Epoch 8 train 0.0039 | test 0.0046
2022-04-05