<center><img src='https://drive.google.com/uc?id=1_utx_ZGclmCwNttSe40kYA6VHzNocdET' height="60"></center>

AI TECH - Akademia Innowacyjnych Zastosowań Technologii Cyfrowych. Program Operacyjny Polska Cyfrowa na lata 2014-2020
<hr>

<center><img src='https://drive.google.com/uc?id=1BXZ0u3562N_MqCLcekI-Ens77Kk4LpPm'></center>

<center>
Projekt współfinansowany ze środków Unii Europejskiej w ramach Europejskiego Funduszu Rozwoju Regionalnego
Program Operacyjny Polska Cyfrowa na lata 2014-2020,
Oś Priorytetowa nr 3 "Cyfrowe kompetencje społeczeństwa" Działanie  nr 3.2 "Innowacyjne rozwiązania na rzecz aktywizacji cyfrowej"
Tytuł projektu:  „Akademia Innowacyjnych Zastosowań Technologii Cyfrowych (AI Tech)”
    </center>

Code based on https://github.com/pytorch/examples/blob/master/mnist/main.py

In this exercise we use high-level abstractions from torch.nn, such as nn.Linear.
Note: during the next lab session we will go one level deeper and implement more things
by hand.

Tasks:

    1. Read the code.

    2. Check that the given implementation reaches 95% test accuracy for the architecture input-128-128-10 after a few epochs.

    3. Add the option to use SGD with momentum instead of ADAM.

    4. Experiment with different learning rates, plot the learning curves for different
    learning rates for both ADAM and SGD with momentum.

    5. Parameterize the constructor by a list of sizes of hidden layers of the MLP.
    Note that this requires creating a list of layers as an attribute of the Net class,
    and one can't use a standard Python list containing nn.Modules (why?).
    Check torch.nn.ModuleList.


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
class Net(nn.Module):
    """Fully connected classifier with the fixed layout 784-128-128-10.

    The forward pass returns log-probabilities (``log_softmax`` over the
    class dimension), which is the input ``F.nll_loss`` expects.
    """

    def __init__(self):
        super().__init__()
        # A 28x28 image flattens to 784 input features.
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch ``x``.

        Every dimension after the first (batch) one is flattened before the
        first linear layer is applied.
        """
        hidden = F.relu(self.fc1(torch.flatten(x, 1)))
        hidden = F.relu(self.fc2(hidden))
        return F.log_softmax(self.fc3(hidden), dim=1)


def train(model, device, train_loader, optimizer, epoch, log_interval, show=False):
    """Run one training epoch and return the loss of the last mini-batch.

    Args:
        model: Module whose output is passed to ``F.nll_loss``.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches. When ``show`` is
            true it must also support ``len()`` and expose ``.dataset``,
            because both are used in the progress message.
        optimizer: Optimizer that updates the parameters of ``model``.
        epoch: Epoch number; only used in the progress message.
        log_interval: A progress line is printed every ``log_interval``
            batches.
        show: Progress lines are printed only when this is true.

    Returns:
        The loss of the final batch as a Python float. This is a single-batch
        value, not an average over the epoch.

    Raises:
        ValueError: If ``train_loader`` yields no batches (previously this
            surfaced as an unbound-variable error on the return statement).
    """
    model.train()
    loss = None
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        # Check ``show`` first so that no logging arithmetic runs when the
        # output is disabled.
        if show and batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    if loss is None:
        raise ValueError('train_loader did not yield any batches')
    return loss.item()


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [3]:
# Experiment configuration shared by the cells below.
seed = 1                # passed to torch.manual_seed
use_cuda = True         # request the GPU; combined with torch.cuda.is_available() later
epochs = 5
lr = 0.01
batch_size = 256        # training batch size
test_batch_size = 1000  # evaluation batch size
log_interval = 100      # batches between progress messages in train()

In [4]:
# Fall back to the CPU when CUDA was requested but is not available.
use_cuda = use_cuda and torch.cuda.is_available()
torch.manual_seed(seed)
device = torch.device("cuda" if use_cuda else "cpu")

# Shuffle the training data on every device. Previously 'shuffle' was part of
# the CUDA-only options, so CPU runs trained on a fixed sample order and the
# test loader was shuffled for no benefit (test() only accumulates sums, which
# do not depend on the order).
train_kwargs = {'batch_size': batch_size, 'shuffle': True}
test_kwargs = {'batch_size': test_batch_size}
if use_cuda:
    # Loader options that are only applied when running on the GPU.
    cuda_kwargs = {'num_workers': 1,
                   'pin_memory': True}
    train_kwargs.update(cuda_kwargs)
    test_kwargs.update(cuda_kwargs)

In [5]:
# Values handed to transforms.Normalize as (mean, std); one entry per channel.
norm_mean, norm_std = (0.1307,), (0.3081,)
data_root = '../data'

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# Only the training split is requested with download=True.
dataset1 = datasets.MNIST(data_root, train=True, download=True, transform=transform)
dataset2 = datasets.MNIST(data_root, train=False, transform=transform)

train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

In [6]:
# Baseline run: the fixed 784-128-128-10 network trained with Adam.
model = Net().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
losses = []

for epoch in range(1, epochs + 1):
    # train() returns the loss of the last batch of the epoch.
    epoch_loss = train(model, device, train_loader, optimizer, epoch, log_interval)
    losses.append(epoch_loss)
    test(model, device, test_loader)


Test set: Average loss: 0.1360, Accuracy: 9558/10000 (96%)


Test set: Average loss: 0.1369, Accuracy: 9614/10000 (96%)


Test set: Average loss: 0.1174, Accuracy: 9650/10000 (96%)


Test set: Average loss: 0.1508, Accuracy: 9617/10000 (96%)


Test set: Average loss: 0.1538, Accuracy: 9597/10000 (96%)



In [7]:
# Same experiment as the Adam run, but with SGD and momentum 0.9.
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
losses = []

for epoch in range(1, epochs + 1):
    # train() returns the loss of the last batch of the epoch.
    epoch_loss = train(model, device, train_loader, optimizer, epoch, log_interval)
    losses.append(epoch_loss)
    test(model, device, test_loader)


Test set: Average loss: 0.2569, Accuracy: 9269/10000 (93%)


Test set: Average loss: 0.1830, Accuracy: 9451/10000 (95%)


Test set: Average loss: 0.1437, Accuracy: 9568/10000 (96%)


Test set: Average loss: 0.1208, Accuracy: 9646/10000 (96%)


Test set: Average loss: 0.1138, Accuracy: 9643/10000 (96%)



In [8]:
import plotly.express as px
import plotly.graph_objects as go

def full_train(optimizer, lr, epochs=5, momentum=0.8):
    """Train a fresh ``Net`` and return the training loss recorded per epoch.

    Args:
        optimizer: Optimizer name, ``'Adam'`` or ``'SGD'`` (matched
            case-insensitively).
        lr: Learning rate passed to the optimizer.
        epochs: Number of epochs to train for.
        momentum: Momentum used for SGD; ignored for Adam.

    Returns:
        A list with one entry per epoch, each being the value returned by
        ``train`` for that epoch.

    Raises:
        ValueError: If ``optimizer`` is not a recognised name. Previously any
            name other than ``'Adam'`` silently selected SGD.
    """
    # Validate before building the model so that a typo fails immediately.
    name = str(optimizer).lower()
    if name not in ('adam', 'sgd'):
        raise ValueError(
            f"unknown optimizer {optimizer!r}; expected 'Adam' or 'SGD'")

    model = Net().to(device)

    # Keep the optimizer object in its own variable instead of rebinding the
    # ``optimizer`` argument, which holds the name.
    if name == 'adam':
        opt = optim.Adam(model.parameters(), lr=lr)
    else:
        opt = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

    losses = []
    for epoch in range(1, epochs + 1):
        losses.append(train(model, device, train_loader, opt, epoch, log_interval))

    return losses

optimizers = ['Adam', 'SGD']
lrs = [1e-3, 1e-2, 1e-1]

# One figure per optimizer with one named curve per learning rate.
# The loop variables are deliberately not called ``optimizer`` / ``lr``: those
# names hold the module-level optimizer object and the configured learning
# rate, and overwriting them here would change what the other cells use when
# they are re-run.
for optimizer_name in optimizers:
    fig = go.Figure()

    for learning_rate in lrs:
        curve = full_train(optimizer_name, learning_rate)
        fig.add_trace(go.Scatter(
            x=list(range(1, len(curve) + 1)),  # epochs are numbered from 1
            y=curve,
            mode='lines',
            name=f"lr={learning_rate:g}",
        ))

    fig.update_layout(
        title=f"{optimizer_name}",
        xaxis_title='epoch',
        yaxis_title='training loss',
    )
    fig.show()

In [9]:
class NetList(nn.Module):
    """MLP whose layer widths are given by a sequence of sizes.

    ``layers`` lists the input size, the hidden sizes and the output size in
    that order; one ``nn.Linear`` is created for every consecutive pair. The
    layers are kept in an ``nn.ModuleList`` so that their parameters are
    registered with this module.

    Raises:
        ValueError: If ``layers`` has fewer than two entries, since no linear
            layer can be built from it. Previously this only failed later,
            inside ``forward``.
    """

    def __init__(self, layers):
        super().__init__()
        sizes = list(layers)
        if len(sizes) < 2:
            raise ValueError(
                'layers must contain at least an input and an output size, '
                f'got {sizes!r}')
        self.linears = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])])

    def forward(self, x):
        """Return per-class log-probabilities for a batch ``x``.

        Every dimension after the first one is flattened; ReLU follows each
        layer except the last, whose output goes through ``log_softmax``.
        """
        x = torch.flatten(x, 1)
        *hidden_layers, output_layer = self.linears
        for layer in hidden_layers:
            x = F.relu(layer(x))
        return F.log_softmax(output_layer(x), dim=1)

# Smoke test: a batch of 4 flattened inputs should yield a 4x10 output.
model = NetList([784, 256, 128, 64, 10])
dummy_batch = torch.ones(4, 784)
model(dummy_batch).size()

torch.Size([4, 10])

In [12]:
# Train the configurable MLP with Adam for 10 epochs, printing progress.
epochs = 10
model = NetList([784, 256, 128, 64, 10]).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
losses = []

for epoch in range(1, epochs + 1):
    # The trailing True enables the progress messages printed by train().
    epoch_loss = train(model, device, train_loader, optimizer, epoch, log_interval, True)
    losses.append(epoch_loss)
    test(model, device, test_loader)


Test set: Average loss: 0.1592, Accuracy: 9529/10000 (95%)


Test set: Average loss: 0.1066, Accuracy: 9663/10000 (97%)


Test set: Average loss: 0.0856, Accuracy: 9733/10000 (97%)


Test set: Average loss: 0.0822, Accuracy: 9750/10000 (98%)


Test set: Average loss: 0.0796, Accuracy: 9749/10000 (97%)


Test set: Average loss: 0.0781, Accuracy: 9760/10000 (98%)


Test set: Average loss: 0.0767, Accuracy: 9774/10000 (98%)


Test set: Average loss: 0.0886, Accuracy: 9764/10000 (98%)


Test set: Average loss: 0.0711, Accuracy: 9801/10000 (98%)


Test set: Average loss: 0.0854, Accuracy: 9774/10000 (98%)

