# MNIST Digit Classification with our own Framework

Lab Assignment from [AI for Beginners Curriculum](https://github.com/microsoft/ai-for-beginners).

### Reading the Dataset

This code download the dataset from the repository on the internet. You can also manually copy the dataset from `/data` directory of AI Curriculum repo.

In [1]:
import pylab
from matplotlib import gridspec
from sklearn.datasets import make_classification
import numpy as np
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
import pickle
import os
import gzip
#!rm *.pkl
#!wget https://raw.githubusercontent.com/microsoft/AI-For-Beginners/main/data/mnist.pkl.gz
#!gzip -d mnist.pkl.gz
file_path = r'c:\Users\nakam\Downloads\mnist.pkl.gz'  # Use raw string (r'...')
with gzip.open(file_path, 'rb') as mnist_pickle:
    mnist_tuple= pickle.load(mnist_pickle, encoding='latin1')
MNIST = {
    'Train': {'Features': mnist_tuple[0][0], 'Labels': mnist_tuple[0][1]},
    'Valid': {'Features': mnist_tuple[1][0], 'Labels': mnist_tuple[1][1]},
    'Test': {'Features': mnist_tuple[2][0], 'Labels': mnist_tuple[2][1]},
}


In [2]:
#import pickle
#with open('mnist.pkl','rb') as f:
#    MNIST = pickle.load(f)

In [3]:
labels = MNIST['Train']['Labels']
data = MNIST['Train']['Features']

Let's see what is the shape of data that we have:

In [4]:
data.shape

(50000, 784)

### Splitting the Data

We will use Scikit Learn to split the data between training and test dataset:

In [5]:
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(data,labels,test_size=0.2)

print(f"Train samples: {len(features_train)}, test samples: {len(features_test)}")

Train samples: 40000, test samples: 10000


In [20]:
import matplotlib.pyplot as plt
class Linear:
    def __init__(self,nin,nout):
        self.W = np.random.normal(0, 1.0/np.sqrt(nin), (nout, nin))
        self.b = np.zeros((1,nout))
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)
        
    def forward(self, x):
        self.x=x
        return np.dot(x, self.W.T) + self.b
    
    def backward(self, dz):
        dx = np.dot(dz, self.W)
        dW = np.dot(dz.T, self.x)
        db = dz.sum(axis=0)
        self.dW = dW
        self.db = db
        return dx
    
    def update(self,lr):
        self.W -= lr*self.dW
        self.b -= lr*self.db

class Softmax:
    def forward(self,z):
        self.z = z
        zmax = z.max(axis=1,keepdims=True)
        expz = np.exp(z-zmax)
        Z = expz.sum(axis=1,keepdims=True)
        return expz / Z
    def backward(self,dp):
        p = self.forward(self.z)
        pdp = p * dp
        return pdp - p * pdp.sum(axis=1, keepdims=True)
    
class CrossEntropyLoss:
    def forward(self,p,y):
        self.p = p
        self.y = y
        p_of_y = p[np.arange(len(y)), y]
        log_prob = np.log(p_of_y)
        return -log_prob.mean()
    def backward(self,loss):
        dlog_softmax = np.zeros_like(self.p)
        dlog_softmax[np.arange(len(self.y)), self.y] -= 1.0/len(self.y)
        return dlog_softmax / self.p
    
class Net:
    def __init__(self):
        self.layers = []
    
    def add(self,l):
        self.layers.append(l)
        
    def forward(self,x):
        for l in self.layers:
            x = l.forward(x)
        return x
    
    def backward(self,z):
        for l in self.layers[::-1]:
            z = l.backward(z)
        return z
    
    def update(self,lr):
        for l in self.layers:
            if 'update' in l.__dir__():
                l.update(lr)
    def get_max_weight_abs(self):
        # Track max absolute value of weights in the network
        max_abs = 0
        for layer in self.layers:
            if isinstance(layer, Linear):
                max_abs = max(max_abs, np.max(np.abs(layer.W)))
        return max_abs

class Tanh:
    def forward(self,x):
        y = np.tanh(x)
        self.y = y
        return y
    def backward(self,dy):
        return (1.0-self.y**2)*dy

class Sinh:
    def forward(self, x):
        self.y = np.sinh(x)  # Apply sinh function element-wise
        return self.y
    
    def backward(self, dy):
        return (1.0 - self.y**2) * dy  # Derivative of sinh(x) is 1 - sinh^2(x)

# Define the network
net = Net()
net.add(Linear(784, 128))   # First Linear layer: 784 -> 128 neurons
net.add(Tanh())              # Sinh activation function
net.add(Linear(128, 10))    # Second Linear layer: 128 -> 64 neurons
net.add(Softmax())          # Softmax activation functio

def get_loss_acc(x,y,loss=CrossEntropyLoss()):
    p = net.forward(x)
    l = loss.forward(p,y)
    pred = np.argmax(p,axis=1)
    acc = (pred==y).mean()
    return l,acc

print("Initial loss={}, accuracy={}: ".format(*get_loss_acc(features_train,labels_train)))

max_weight_abs_history = []

def train_epoch(net, train_x, train_labels, loss=CrossEntropyLoss(), batch_size=4, lr=0.1):
    for i in range(0,len(train_x),batch_size):
        xb = train_x[i:i+batch_size]
        yb = train_labels[i:i+batch_size]

        p = net.forward(xb)
        l = loss.forward(p,yb)
        dp = loss.backward(l)
        dx = net.backward(dp)
        net.update(lr)
    # Track max absolute weight value after each epoch
    max_weight_abs = net.get_max_weight_abs()
    max_weight_abs_history.append(max_weight_abs)
 
#train_epoch(net,features_train,labels_train)
        
print("Final loss={}, accuracy={}: ".format(*get_loss_acc(features_train,labels_train)))
print("Test loss={}, accuracy={}: ".format(*get_loss_acc(features_test,labels_test)))

epochs = 10
for epoch in range(epochs):
    train_epoch(net, features_train, labels_train)
    
    # Optionally print or track loss/accuracy
    if epoch % 10 == 0:
        train_loss, train_acc = get_loss_acc(features_train, labels_train)
        print(f"Epoch {epoch}, Loss: {train_loss}, Accuracy: {train_acc}")

# Plot max absolute weight values vs. epoch number
plt.plot(range(epochs), max_weight_abs_history)
plt.xlabel('Epoch')
plt.ylabel('Max Abs Value of Weights')
plt.title('Max Absolute Value of Weights vs. Epoch')
plt.show()

ValueError: shapes (40000,784) and (78,128) not aligned: 784 (dim 1) != 78 (dim 0)

### Instructions

1. Take the framework code from the lesson and paste it into this notebook, or (even better) into a separate Python module
1. Define and train one-layered perceptron, observing training and validation accuracy during training
1. Try to understand if overfitting took place, and adjust layer parameters to improve accuracy
1. Repeat previous steps for 2- and 3-layered perceptrons. Try to experiment with different activation functions between layers.
1. Try to answer the following questions:
    - Does the inter-layer activation function affect network performance?
    - Do we need 2- or 3-layered network for this task?
    - Did you experience any problems training the network? Especially as the number of layers increased.
    - How do weights of the network behave during training? You may plot max abs value of weights vs. epoch to understand the relation.