# "Write a script to train a small CNN on MNIST, or find one you have written previously."

Day 1 turned out to be more of a "relearn how to actually build a CNN, and then implement one" type of day.

In [1]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms

In [2]:
class ConvNet(nn.Module):
    """Small convolutional network for single-label image classification.

    Two unpadded 5x5 convolutions, each followed by ReLU and 2x2 max-pooling,
    feed three fully connected layers that produce raw class scores (logits).

    Args:
        in_channels: Number of channels of the input images (1 for MNIST).
        image_size: Side length of the square input images (28 for MNIST).
        num_classes: Number of output scores per image (10 for MNIST).

    Raises:
        ValueError: If ``image_size`` is too small to survive both
            convolution/pooling stages.
    """

    def __init__(self, in_channels=1, image_size=28, num_classes=10):
        super().__init__()

        # Each stage shrinks a side by 4 (5x5 kernel, no padding, stride 1)
        # and then halves it, rounding down (2x2 max-pool).
        # For MNIST: 28 -> 24 -> 12 -> 8 -> 4, i.e. 4 * 4 = 16 positions per
        # channel, which is where the former hard-coded ``18 * 16`` came from.
        side = image_size
        for _ in range(2):
            side = (side - 4) // 2
        if side < 1:
            raise ValueError(
                f"image_size={image_size} is too small for two 5x5 conv + 2x2 pool stages"
            )

        self.conv1 = nn.Conv2d(in_channels, 6, 5)
        self.conv2 = nn.Conv2d(6, 18, 5)

        self.lin1 = nn.Linear(18 * side * side, 120)
        self.lin2 = nn.Linear(120, 60)
        self.lin3 = nn.Linear(60, num_classes)

    def forward(self, x):
        """Map a ``(batch, in_channels, image_size, image_size)`` tensor to
        ``(batch, num_classes)`` unnormalised scores."""
        # Convolutional feature extractor.
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))

        # Flatten everything except the batch dimension.
        x = t.flatten(x, 1)

        # Classifier head; the last layer has no activation so it emits logits.
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        return self.lin3(x)
        
# Model, loss and optimizer used by the training cells below:
# cross-entropy loss and plain SGD with a learning rate of 0.1.
net = ConvNet()
criterion = nn.CrossEntropyLoss()
optim = t.optim.SGD(params=net.parameters(), lr=0.1)

In [3]:
# MNIST train/test splits stored under ./data (download=True is requested for both);
# ToTensor converts every image to a tensor.
mnist_trainset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
mnist_testset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Mini-batches of 64 samples; both loaders shuffle.
train_dl = DataLoader(dataset=mnist_trainset, batch_size=64, shuffle=True)
test_dl = DataLoader(dataset=mnist_testset, batch_size=64, shuffle=True)

In [4]:
# Sanity check: size of the first training image (element 0 of the first (image, label) pair).
mnist_trainset[0][0].size()

torch.Size([1, 28, 28])

In [5]:
def train_model(model, train_dl, test_dl, criterion, optimizer, num_epochs: int, val_history):
    """Train ``model`` and evaluate it after every epoch.

    Each epoch runs a training pass over ``train_dl`` (with parameter updates)
    followed by a validation pass over ``test_dl`` (gradients disabled), and
    prints the mean per-sample loss and the accuracy of each pass.

    Args:
        model: Module mapping a batch of inputs to ``(batch, num_classes)`` scores.
        train_dl: Iterable of ``(inputs, labels)`` batches used for training;
            ``labels`` hold class ids.
        test_dl: Iterable of ``(inputs, labels)`` batches used for validation.
        criterion: Loss called as ``criterion(outputs, one_hot_targets)``; it is
            assumed to return the mean loss over the batch.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of train + validate rounds.
        val_history: List that receives one validation accuracy (0-dim tensor)
            per epoch; it is appended to in place.

    Returns:
        The tuple ``(model, val_history)`` holding the objects that were passed in.

    Raises:
        ValueError: If one of the loaders yields no samples.
    """
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase, current_dl in (('train', train_dl), ('val', test_dl)):
            is_training = phase == 'train'
            if is_training:
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0.0
            num_samples = 0

            for inputs, labels in current_dl:
                inputs = inputs.float()
                labels = labels.to(t.int64)
                batch_size = inputs.size(0)

                if is_training:
                    optimizer.zero_grad()

                with t.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    # Derive the class count from the model output instead of
                    # hard-coding 10, and match the output dtype so the loss is
                    # not silently promoted to float64.
                    targets = F.one_hot(labels, outputs.size(-1)).to(outputs.dtype)
                    loss = criterion(outputs, targets)

                    if is_training:
                        loss.backward()
                        optimizer.step()

                preds = t.argmax(outputs, dim=-1)

                # ``loss`` is a batch mean, so weight it by the batch size to
                # accumulate a sum of per-sample losses.
                running_loss += loss.item() * batch_size
                running_corrects += t.sum(preds == labels)
                num_samples += batch_size

            if num_samples == 0:
                raise ValueError('{} data loader yielded no samples'.format(phase))

            # Divide by the number of samples actually seen: this equals
            # len(dataset) for a plain DataLoader and stays correct when the
            # loader drops or subsamples data.
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects / num_samples
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val':
                val_history.append(epoch_acc)
        print()
    return model, val_history

In [6]:
# Per-epoch validation accuracies; train_model appends to this list in place.
val_history: list = []

In [7]:
# Train for 3 epochs; train_model appends each epoch's validation accuracy to val_history.
train_model(
    net,
    train_dl,
    test_dl,
    criterion,
    optim,
    num_epochs=3,
    val_history=val_history,
)

Epoch 0/2
----------
train Loss: 0.5253 Acc: 0.8233
val Loss: 0.4997 Acc: 0.8787

Epoch 1/2
----------
train Loss: 0.0849 Acc: 0.9734
val Loss: 0.1196 Acc: 0.9647

Epoch 2/2
----------
train Loss: 0.0601 Acc: 0.9811
val Loss: 0.0458 Acc: 0.9853



(ConvNet(
   (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
   (conv2): Conv2d(6, 18, kernel_size=(5, 5), stride=(1, 1))
   (lin1): Linear(in_features=288, out_features=120, bias=True)
   (lin2): Linear(in_features=120, out_features=60, bias=True)
   (lin3): Linear(in_features=60, out_features=10, bias=True)
 ),
 [tensor(0.8787), tensor(0.9647), tensor(0.9853)])

In [15]:
# Spot-check the network on three test digits.
# test_dl is built with shuffle=True, so every fresh iterator starts with a
# different batch; only the first sample of that batch is used.
with t.no_grad():  # inference only, so no autograd graph is needed
    for _ in range(3):
        images, targets = next(iter(test_dl))
        sample = images[0:1]  # slice rather than index to keep the batch dimension
        print("number: " + str(targets[0]))
        print("predicted: ")
        print(t.argmax(net(sample)))
        print("-" * 10)

number: tensor(8)
predicted: 
tensor(8)
----------
number: tensor(4)
predicted: 
tensor(4)
----------
number: tensor(6)
predicted: 
tensor(6)
----------
