# ECS759P Lab6 Part 3: Neural networks for Classification

## Data
In this lab we are going to use the MNIST dataset, which contains images (28x28 pixels) of handwritten digits. `PyTorch` (through `torchvision`) provides an API to get the data:

In [None]:
import torch
import torch.nn as nn
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt

# Both splits live in the same directory and use the same transform
# (every image is converted to a tensor).
data_root = './data'
to_tensor = transforms.ToTensor()

# Training split; download=True asks torchvision to fetch the files if needed.
train_dataset = datasets.MNIST(data_root, train=True, download=True, transform=to_tensor)
# train=False selects the held-out split, used as the validation set in this lab.
validation_dataset = datasets.MNIST(data_root, train=False, transform=to_tensor)

Now that the data is downloaded, let's create data loaders, which will allow you to access the images and their corresponding labels:


In [None]:
# Number of samples per mini-batch, shared by both loaders.
batch_size = 35

# Training batches are shuffled; the validation order is kept fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

Let's now have a look at the data we are dealing with, as well as its dimensions (you can run the cell several times to see what happens):

In [None]:
# Pull one mini-batch from the training loader (shuffled, so each run differs).
input_data, label = next(iter(train_loader))

# Display the first image of the batch as a 28x28 picture.
first_image = input_data[0].numpy().reshape(28, 28)
plt.imshow(first_image, cmap="gray_r")

# Report the matching label and the sizes of the batch tensors.
print(f"Label is: {label[0]}")
print(f"Dimension of input data: {input_data.size()}")
print(f"Dimension of labels: {label.size()}")

Data loaders add additional functionalities (like picking batches, shuffling, etc).

There are many pre-made data loaders like MNIST, but there is also a possibility of creating custom dataloaders.

## Deep Network

**Q. By filling the gaps below, implement a Multi-layer Neural Network with several hidden layers (number to explore as a hyperparameter). Initially, we are only interested in using the sigmoid activation function and the `SGD` optimiser. Test your networks using the provided data loader and compute the accuracy on the validation set. What is the impact of the different hyperparameters: number of epochs, learning rate $\eta$, initial weights and number of neurons of the hidden layers, number of hidden layers?**


**A. The first thing to note is that the classification accuracy is usually already good with only one hidden layer. Adding more layers does not change the final performance much. However, with more layers we need more epochs, since there are more weights to learn. Initialising all weights to zeros or ones is usually worse than drawing them from a random distribution. It is quite difficult to draw any clear conclusion about the impact of the number of neurons per layer on the performance. A very low learning rate slows down convergence, whereas a learning rate that is too high makes training diverge and gives poor results.**

In [None]:
# Definition of the neural network
class MyMLP(nn.Module):
    """Multi-layer perceptron mapping a flattened 28x28 image to 10 class scores.

    The active configuration has one hidden layer of 256 units with a
    sigmoid activation. The output layer returns raw scores (no softmax).
    """

    def __init__(self):
        super().__init__()
        # TO DO
        # Single hidden layer: 784 inputs -> 256 hidden units -> 10 outputs
        self.fc = nn.Linear(28 * 28, 256)
        self.act = nn.Sigmoid()
        self.fc1 = nn.Linear(256, 10)

        # The same network written with the Sequential container:
        # self.fc_model = nn.Sequential(nn.Linear(28*28, 256), nn.Sigmoid(), nn.Linear(256,10))

        # Variant with 3 hidden layers and sigmoid activations:
        # self.fc_model = nn.Sequential(nn.Linear(28*28, 1046), nn.Sigmoid(), nn.Linear(1046,512), nn.Sigmoid(), nn.Linear(512, 256), nn.Sigmoid(), nn.Linear(256, 10))
        # Variant with 2 hidden layers and ReLU activations:
        # self.fc_model = nn.Sequential(nn.Linear(28*28, 512), nn.ReLU(), nn.Linear(512,256), nn.ReLU(), nn.Linear(256, 10))

    def forward(self, x):
        """Return the 10 class scores for each sample of the batch ``x``."""
        # TO DO
        # Collapse every sample to a flat vector, keeping the batch dimension.
        flat = x.view(x.size(0), -1)
        hidden = self.act(self.fc(flat))
        # With the Sequential variants above this becomes: self.fc_model(flat)
        return self.fc1(hidden)

def evaluation(dataloader):
    """Return the classification accuracy, in percent, of the global ``net``.

    Args:
        dataloader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Percentage of samples whose highest-scoring class equals the label,
        or ``0.0`` when the loader yields no samples.
    """
    total, correct = 0, 0
    # Leaves the network in evaluation mode; training code must call
    # net.train() again before the next optimisation step.
    net.eval()
    # Scoring needs no gradients, so do not build the autograd graph.
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            # Predicted class = index of the largest score in each row.
            pred = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (pred == labels).sum().item()
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return 100 * correct / total

def weights_init(layer):
    """Re-initialise the weight matrix of a linear layer (for ``net.apply``).

    Only ``nn.Linear`` modules are modified, and only their ``weight``;
    the bias is left untouched. Every other module is ignored.
    """
    if not isinstance(layer, nn.Linear):
        return
    # TO DO
    # Gaussian initialization with mean 0.5 and standard deviation 2
    nn.init.normal_(layer.weight, mean=0.5, std=2)
    # Alternative: uniform initialization between -1 and 1
    # nn.init.uniform_(layer.weight, -1, 1)
    # Alternative: initialization with only zeros
    # nn.init.zeros_(layer.weight)

# Use the first GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Build the network on the chosen device, then re-initialise its linear layers.
net = MyMLP().to(device)
net.apply(weights_init)

loss_fn = nn.CrossEntropyLoss()

# Plain SGD with a learning rate of 0.1
opt = torch.optim.SGD(net.parameters(), lr=0.1)
# Alternative: Adam with a learning rate of 0.05
# opt = torch.optim.Adam(net.parameters(), lr=0.05)
# Alternative: SGD with momentum (lr=0.025, momentum=0.9)
# opt = torch.optim.SGD(net.parameters(), lr=0.025, momentum=0.9)


# Training phase

# Change this value to make training longer
max_epochs = 5
loss_epoch_array = []  # summed mini-batch loss, one entry per epoch
loss_epoch = 0
train_accuracy = []  # accuracy (%) on the training set after each epoch
valid_accuracy = []  # accuracy (%) on the validation set after each epoch
for epoch in range(max_epochs):
    loss_epoch = 0
    print(epoch)
    # evaluation() switches the network to eval mode, so training mode has to
    # be restored once per epoch (not on every mini-batch).
    net.train()
    for inputs, labels in train_loader:
        # TO DO
        inputs, labels = inputs.to(device), labels.to(device)
        # Clear the gradients accumulated by the previous step.
        opt.zero_grad()
        outputs = net(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        opt.step()
        loss_epoch += loss.item()
        # TO DO
    loss_epoch_array.append(loss_epoch)
    train_accuracy.append(evaluation(train_loader))
    valid_accuracy.append(evaluation(validation_loader))
    print(f"Epoch {epoch + 1}: loss: {loss_epoch_array[-1]}, train accuracy: {train_accuracy[-1]}, valid accuracy:{valid_accuracy[-1]}")


**Q. Test other activation functions in your Deep Network, especially  `ReLU()`. Briefly explain what effect you observe on the performance.**

**A. Using ReLU does not seem to change the results significantly with only a few layers, but its impact becomes more obvious as the network grows.**

**Q. Use other optimizers such as `SGD with momentum` and `Adam`. What do you observe during the learning phase?**

**A. The loss change pattern is different. However, the results are not that different.**

**Q. When dealing with images, we usually prefer using Convolutional Neural Networks (CNN). By filling the gaps below, implement the LeNet-5 architecture (see lenet-5.png attached to the lab) using `PyTorch`. Do you see any improvement in terms of performance?**

**A. The performance obtained using LeNet-5 is better than any MLP tried before.**



In [None]:
# CNN implementation

class MyCNN(nn.Module):
    """LeNet-5 style network: two conv/tanh/average-pool stages, then three linear layers.

    For a 1x28x28 input the convolutional part yields 16 feature maps of
    size 4x4, i.e. the 256 features expected by the first linear layer.
    The last layer returns 10 raw class scores (no softmax).
    """

    def __init__(self):
        super().__init__()
        # TO DO
        # Stage 1: 1 -> 6 channels, 5x5 kernel, then 2x2 average pooling
        self.conv = nn.Conv2d(1, 6, kernel_size=5)
        self.act_conv = nn.Tanh()
        self.avg_pool = nn.AvgPool2d(2, stride=2)

        # Stage 2: 6 -> 16 channels, 5x5 kernel, then 2x2 average pooling
        self.conv1 = nn.Conv2d(6, 16, kernel_size=5)
        self.act_conv1 = nn.Tanh()
        self.avg_pool1 = nn.AvgPool2d(2, stride=2)

        # The same feature extractor written with the Sequential container:
        # self.cnn_model = nn.Sequential(nn.Conv2d(1, 6, kernel_size = 5), nn.Tanh(), nn.AvgPool2d(2, stride=2), nn.Conv2d(6, 16, kernel_size = 5), nn.Tanh(), nn.AvgPool2d(2, stride = 2))

        # Classifier head: 256 -> 120 -> 84 -> 10
        self.fc = nn.Linear(256, 120)
        self.act = nn.Tanh()
        self.fc1 = nn.Linear(120, 84)
        self.act1 = nn.Tanh()
        self.fc2 = nn.Linear(84, 10)

        # The same head written with the Sequential container:
        # self.fc_model = nn.Sequential(nn.Linear(256, 120), nn.Tanh(), nn.Linear(120,84), nn.Tanh(), nn.Linear(84, 10))

    def forward(self, x):
        """Return the 10 class scores for each image of the batch ``x``."""
        # TO DO
        features = self.avg_pool(self.act_conv(self.conv(x)))
        features = self.avg_pool1(self.act_conv1(self.conv1(features)))

        # Flatten the feature maps, keeping the batch dimension.
        flat = features.view(features.size(0), -1)

        hidden = self.act(self.fc(flat))
        hidden = self.act1(self.fc1(hidden))

        # With the Sequential variants above this becomes:
        # x = self.cnn_model(x)
        # x = x.view(x.size(0), -1)
        # x = self.fc_model(x)
        return self.fc2(hidden)

# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the cell also runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = MyCNN().to(device)
loss_fn = nn.CrossEntropyLoss()
# Adam with its default hyperparameters
opt = torch.optim.Adam(net.parameters())

def evaluation(dataloader):
    """Return the classification accuracy, in percent, of the global ``net``.

    Args:
        dataloader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Percentage of samples whose highest-scoring class equals the label,
        or ``0.0`` when the loader yields no samples.
    """
    total, correct = 0, 0
    # Leaves the network in evaluation mode; training code must call
    # net.train() again before the next optimisation step.
    net.eval()
    # Scoring needs no gradients, so do not build the autograd graph.
    with torch.no_grad():
        for inputs, labels in dataloader:
            # TO DO
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = net(inputs)
            # Predicted class = index of the largest score in each row.
            pred = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (pred == labels).sum().item()
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return 100 * correct / total

loss_epoch_array = []  # summed mini-batch loss, one entry per epoch
max_epochs = 50
loss_epoch = 0
train_accuracy = []  # accuracy (%) on the training set after each epoch
valid_accuracy = []  # accuracy (%) on the validation set after each epoch
for epoch in range(max_epochs):
    loss_epoch = 0
    # evaluation() switches the network to eval mode, so training mode has to
    # be restored once per epoch (not on every mini-batch).
    net.train()
    for inputs, labels in train_loader:
        # TO DO
        inputs, labels = inputs.to(device), labels.to(device)
        # Clear the gradients accumulated by the previous step.
        opt.zero_grad()
        outputs = net(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        opt.step()
        loss_epoch += loss.item()
        # TO DO
    loss_epoch_array.append(loss_epoch)
    train_accuracy.append(evaluation(train_loader))
    valid_accuracy.append(evaluation(validation_loader))
    print(f"Epoch {epoch + 1}: loss: {loss_epoch_array[-1]}, train accuracy: {train_accuracy[-1]}, valid accuracy:{valid_accuracy[-1]}")