<a href="https://colab.research.google.com/github/sc22lg/ML-Notebooks/blob/MLP_in_Pytorch/Pytorch_MLP_on_MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MLP For MNIST

In [96]:
try:
  %pip install fancy_einsum
  %pip install einops
  %pip install keras
except:
  print("Package failed to install")

from keras.datasets import mnist
import torch as t
import einops
from fancy_einsum import einsum
import torch.nn as nn
import numpy as np
import math
from dataclasses import dataclass




###Investigate MNIST dataset:

In [119]:
(train_X, train_y), (test_X, test_y) = mnist.load_data()

#printing the shapes of the vectors
print('X_train: ' + str(train_X.shape))
print('Y_train: ' + str(train_y.shape))
print('X_test:  '  + str(test_X.shape))
print('Y_test:  '  + str(test_y.shape))

X_train: (60000, 28, 28)
Y_train: (60000,)
X_test:  (10000, 28, 28)
Y_test:  (10000,)


In [120]:
# expand each element of train_y to a 10-element tensor
# with a 1 in the position corresponding to that element's number
train_y_tensor = t.nn.functional.one_hot(t.tensor(train_y).to(t.long), num_classes=10).to(t.float)
print('original: ' + str(train_y[0]) + '\n new: ' + str(train_y_tensor[0]))
#repeat for test_y
test_y_tensor = t.nn.functional.one_hot(t.tensor(test_y).to(t.long), num_classes=10).to(t.float)

original: 5
 new: tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])


###Model Setup:

In [121]:
@dataclass
class Config:
  batch:int = 1
  d_img:int = 28
  n_layer:int = 56
  n_out:int = 10

cfg = Config()

###MLP Layer:

In [122]:
class MLPLayer(nn.Module):
  def __init__(self, cfg):
    super(MLPLayer, self).__init__()
    self.cfg = cfg
    # Use Kaiming He initialization for weights
    self.in_W = nn.Parameter(t.empty(cfg.d_img, cfg.n_layer))
    nn.init.kaiming_uniform_(self.in_W, nonlinearity='leaky_relu')

    self.in_B = nn.Parameter(t.randn(cfg.n_layer), requires_grad=True) # Biases often initialized to zero or a small constant
    nn.init.zeros_(self.in_B) # Initialize biases to zero

    self.out_W = nn.Parameter(t.empty(cfg.n_layer, cfg.n_out))
    nn.init.kaiming_uniform_(self.out_W, nonlinearity='leaky_relu')

    self.out_B = nn.Parameter(t.randn(cfg.n_out), requires_grad=True) # Biases often initialized to zero or a small constant
    nn.init.zeros_(self.out_B) # Initialize biases to zero

    self.activation = nn.LeakyReLU()

  def forward(self, data_in):
    #data_in format: [batch, d_img, d_img]
    first_layer = einsum('batch d_img d_img, d_img n_layer -> batch n_layer', data_in, self.in_W) + self.in_B
    #apply Leaky ReLU function
    post_ReLu = self.activation(first_layer)
    #second matrix:
    second_layer = einsum('batch n_layer, n_layer n_out -> batch n_out', post_ReLu, self.out_W) + self.out_B
    return second_layer

###Prediction Extraction

In [123]:
class Softmax(nn.Module):
  def __init__(self):
    super(Softmax, self).__init__()

  def forward(self, MLP_result):
    return nn.functional.softmax(MLP_result, dim=1)


###MLP Body

In [124]:
class MLP(nn.Module):
  def __init__(self, cfg):
    super(MLP, self).__init__()
    self.cfg = cfg
    self.layer = MLPLayer(cfg)
    self.Predictor = Softmax()

  def forward(self, data_in):
    #data_in format: [batch, d_img, d_img]
    MLP_result = self.layer(data_in)
    return self.Predictor.forward(MLP_result)

###Testing Forward Propagation

In [125]:
MLP = MLP(cfg)
for i in range(10):
  #create tensor of data, set as floats, add extra 'batch' dimension, run through network
  prediction = MLP.forward(t.unsqueeze(t.tensor(train_X[i]).float(), 0))
  print('Prediction: ' + str(prediction) + 'Actual: ' + str(train_y[i]))

Prediction: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]], grad_fn=<SoftmaxBackward0>)Actual: 5
Prediction: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 5.6606e-10]],
       grad_fn=<SoftmaxBackward0>)Actual: 0
Prediction: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], grad_fn=<SoftmaxBackward0>)Actual: 4
Prediction: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 1.8687e-37,
         0.0000e+00, 0.0000e+00, 3.4663e-35, 0.0000e+00]],
       grad_fn=<SoftmaxBackward0>)Actual: 1
Prediction: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 1.7947e-16, 0.0000e+00, 1.0000e+00,
         3.6337e-29, 3.2230e-44, 0.0000e+00, 0.0000e+00]],
       grad_fn=<SoftmaxBackward0>)Actual: 9
Prediction: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 1.1629e-38, 1.7620e-24]],
       grad_fn=<SoftmaxBackward0>)Actual: 2
Prediction: tens

###Training

In [126]:
optimiser = t.optim.SGD(MLP.parameters(), lr=0.01)

In [127]:
log_every = 100
# MLP = MLP(cfg) # Removed redundant instantiation
for i in range(len(train_X)): # Iterate over training data
  # Zero gradients before calculating gradients for the current step
  optimiser.zero_grad()

  #create tensor of data, set as floats, add extra 'batch' dimension, run through network
  # Scale input data to be between 0 and 1
  input_data = t.unsqueeze(t.tensor(train_X[i]).float() / 255.0, 0)
  prediction = MLP.forward(input_data)

  # Use cross_entropy loss with the target class index
  loss = nn.functional.cross_entropy(prediction, t.tensor([train_y[i]]).long()) # Target should be class index, not one-hot

  # train network with loss
  loss.backward()
  optimiser.step()

  if i % log_every == 0:
    print('Loss: ' + str(loss))

Loss: tensor(2.2774, grad_fn=<NllLossBackward0>)
Loss: tensor(2.1862, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2913, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3061, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3933, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3815, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2832, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3417, grad_fn=<NllLossBackward0>)
Loss: tensor(2.0487, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2596, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3719, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3597, grad_fn=<NllLossBackward0>)
Loss: tensor(1.7622, grad_fn=<NllLossBackward0>)
Loss: tensor(1.8954, grad_fn=<NllLossBackward0>)
Loss: tensor(1.9850, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2756, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2590, grad_fn=<NllLossBackward0>)
Loss: tensor(1.6113, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3219, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3700, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3692,

###Testing:

In [128]:
#Test model with test_X and test_Y data, output correct guess %
correct_guess = 0
for i in range(len(test_X)):
  prediction = MLP.forward(t.unsqueeze(t.tensor(test_X[i]).float(), 0))
  if t.argmax(prediction) == test_y[i]:
    correct_guess += 1
print('Score: ' +  str(correct_guess) + '/' + str(len(test_X)) + ', ' + str(correct_guess/len(test_X)*100) + '%')

Score: 5583/10000, 55.83%
