<a href="https://colab.research.google.com/github/sc22lg/ML-Notebooks/blob/MNIST_Batching/Pytorch_MLP_on_MNIST_Batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLP For MNIST

In [1]:
try:
  %pip install fancy_einsum
  %pip install einops
  %pip install keras
except:
  print("Package failed to install")

from keras.datasets import mnist
import torch as t
import einops
from fancy_einsum import einsum
import torch.nn as nn
import numpy as np
import math
from dataclasses import dataclass


Collecting fancy_einsum
  Downloading fancy_einsum-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Downloading fancy_einsum-0.0.3-py3-none-any.whl (6.2 kB)
Installing collected packages: fancy_einsum
Successfully installed fancy_einsum-0.0.3


###Investigate MNIST dataset:

In [2]:
(train_X, train_y), (test_X, test_y) = mnist.load_data()

#printing the shapes of the vectors
print('X_train: ' + str(train_X.shape))
print('Y_train: ' + str(train_y.shape))
print('X_test:  '  + str(test_X.shape))
print('Y_test:  '  + str(test_y.shape))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
X_train: (60000, 28, 28)
Y_train: (60000,)
X_test:  (10000, 28, 28)
Y_test:  (10000,)


In [3]:
# expand each element of train_y to a 10-element tensor
# with a 1 in the position corresponding to that element's number
train_y_tensor = t.nn.functional.one_hot(t.tensor(train_y).to(t.long), num_classes=10).to(t.float)
print('original: ' + str(train_y[0]) + '\n new: ' + str(train_y_tensor[0]))
#repeat for test_y
test_y_tensor = t.nn.functional.one_hot(t.tensor(test_y).to(t.long), num_classes=10).to(t.float)

original: 5
 new: tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])


###Model Setup:

In [4]:
@dataclass
class Config:
  batch:int = 1
  d_img:int = 28
  n_first_layer:int = 56
  n_hidden:int = 16
  n_last_layer:int = 56
  n_out:int = 10

cfg = Config()

###MLP Layer:

In [5]:
class MLP_In_Layer(nn.Module):
  def __init__(self, cfg):
    super(MLP_In_Layer, self).__init__()
    self.cfg = cfg

    self.in_W = nn.Parameter(t.empty(cfg.d_img, cfg.n_first_layer))
    nn.init.kaiming_uniform_(self.in_W, nonlinearity='leaky_relu')

    self.in_B = nn.Parameter(t.randn(cfg.n_first_layer), requires_grad=True)
    nn.init.zeros_(self.in_B) # Initialize biases to zero

    self.activation = nn.LeakyReLU()

  def forward(self, data_in):
    #data_in format: [batch, d_img, d_img]
    first_layer = einsum('batch d_img d_img, d_img n_layer -> batch n_layer', data_in, self.in_W) + self.in_B
    #apply Leaky ReLU function
    post_ReLu = self.activation(first_layer)
    return post_ReLu

class MLP_Out_Layer(nn.Module):
  def __init__(self, cfg):
    super(MLP_Out_Layer, self).__init__()
    self.cfg = cfg

    self.out_W = nn.Parameter(t.empty(cfg.n_last_layer, cfg.n_out)) # weights
    nn.init.kaiming_uniform_(self.out_W, nonlinearity='leaky_relu') # initialise

    self.out_B = nn.Parameter(t.randn(cfg.n_out), requires_grad=True)
    nn.init.zeros_(self.out_B) # Initialize biases to zero

  def forward(self, data_in):
    #data_in format: [batch, n_layer]
    second_layer = einsum('batch n_layer, n_layer n_out -> batch n_out', data_in, self.out_W) + self.out_B
    return second_layer

class HiddenLayer(nn.Module):
  def __init__(self, dim1:int, dim2:int):
    super(HiddenLayer, self).__init__()
    self.layer_W = nn.Parameter(t.empty(dim1, dim2)) # where dim1 is output of prev layer and dim2 is output size
    nn.init.kaiming_uniform_(self.layer_W, nonlinearity='leaky_relu')

    self.layer_B = nn.Parameter(t.randn(dim2), requires_grad=True)
    nn.init.zeros_(self.layer_B) # Initialize biases to zero

    self.activation = nn.LeakyReLU()

  def forward(self, data_in):
    #data_in format: [batch, dim1]
    hidden_layer = einsum('batch dim1, dim1 dim2 -> batch dim2', data_in, self.layer_W) + self.layer_B
    activated = self.activation(hidden_layer)
    return activated

class Softmax(nn.Module):
  def __init__(self):
    super(Softmax, self).__init__()

  def forward(self, MLP_result):
    return nn.functional.softmax(MLP_result, dim=1)

###MLP Body

In [6]:
class MLP(nn.Module):
  def __init__(self, cfg):
    super(MLP, self).__init__()
    self.cfg = cfg
    self.in_layer = MLP_In_Layer(cfg)
    self.out_layer = MLP_Out_Layer(cfg)
    self.hidden_layer_1 = HiddenLayer(cfg.n_first_layer, cfg.n_hidden)
    self.hidden_layer_2 = HiddenLayer(cfg.n_hidden, cfg.n_last_layer)
    self.Predictor = Softmax()

  def forward(self, data_in):
    #data_in format: [batch, d_img, d_img]
    result = self.in_layer(data_in)
    result = self.hidden_layer_1(result)
    result = self.hidden_layer_2(result)
    logits = self.out_layer(result)
    return self.Predictor.forward(logits)

###Testing Forward Propagation

In [7]:
MLP = MLP(cfg)
for i in range(10):
  #create tensor of data, set as floats, add extra 'batch' dimension, run through network
  prediction = MLP.forward(t.unsqueeze(t.tensor(train_X[i]).float(), 0))
  print('Prediction: ' + str(prediction) + 'Actual: ' + str(train_y[i]))

Prediction: tensor([[0.0000e+00, 4.9830e-24, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
         1.1761e-06, 0.0000e+00, 0.0000e+00, 0.0000e+00]],
       grad_fn=<SoftmaxBackward0>)Actual: 5
Prediction: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 5.2319e-31, 1.0000e+00,
         2.2747e-23, 1.8976e-18, 0.0000e+00, 0.0000e+00]],
       grad_fn=<SoftmaxBackward0>)Actual: 0
Prediction: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 3.4472e-40, 5.6125e-41,
         0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00]],
       grad_fn=<SoftmaxBackward0>)Actual: 4
Prediction: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 6.3590e-02, 0.0000e+00, 3.2646e-32,
         9.3637e-01, 4.2575e-05, 0.0000e+00, 4.9786e-32]],
       grad_fn=<SoftmaxBackward0>)Actual: 1
Prediction: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 3.3631e-44, 0.0000e+00, 0.0000e+00,
         1.0000e+00, 1.2412e-13, 0.0000e+00, 0.0000e+00]],
       grad_fn=<SoftmaxBackward0>)Actual: 9
Prediction: tensor([

###Training

In [8]:
optimiser = t.optim.SGD(MLP.parameters(), lr=0.01)

In [9]:
log_every = 100
# MLP = MLP(cfg) # Removed redundant instantiation
for i in range(len(train_X)): # Iterate over training data
  # Zero gradients before calculating gradients for the current step
  optimiser.zero_grad()

  #create tensor of data, set as floats, add extra 'batch' dimension, run through network
  # Scale input data to be between 0 and 1
  input_data = t.unsqueeze(t.tensor(train_X[i]).float() / 255.0, 0)
  prediction = MLP.forward(input_data)

  # Use cross_entropy loss with the target class index
  loss = nn.functional.cross_entropy(prediction, t.tensor([train_y[i]]).long()) # Target should be class index, not one-hot

  # train network with loss
  loss.backward()
  optimiser.step()

  if i % log_every == 0:
    print('Loss: ' + str(loss))

Loss: tensor(2.2235, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2303, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2895, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3198, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3655, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2259, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2807, grad_fn=<NllLossBackward0>)
Loss: tensor(2.1785, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2782, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2552, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2829, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2746, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2899, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2473, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3203, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3210, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2095, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2257, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3574, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3414, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3397,

###Testing:

In [10]:
#Test model with test_X and test_Y data, output correct guess %
correct_guess = 0
for i in range(len(test_X)):
  prediction = MLP.forward(t.unsqueeze(t.tensor(test_X[i]).float(), 0))
  if t.argmax(prediction) == test_y[i]:
    correct_guess += 1
print('Score: ' +  str(correct_guess) + '/' + str(len(test_X)) + ', ' + str(correct_guess/len(test_X)*100) + '%')

Score: 4236/10000, 42.36%
