In [None]:
""" Implement a Multi-Layer Perceptron (MLP) to classify handwritten digits from the 
MNIST dataset. However, restrict the implementation to not use any deep learning 
library such as TensorFlow or PyTorch, except for data loading. Specifically, you can use 
libraries or utilities to load and preprocess the MNIST dataset, but all aspects of 
building and training the neural network should be implemented without relying on 
external deep learning libraries. Experiment with different architectures by varying the 
number of hidden layers and neurons to observe their effects on classification 
performance."""

In [5]:
import tensorflow as tf
import numpy as np

# Load the MNIST dataset (the only place a deep learning library is used).
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalize the images to [0, 1] range
x_train, x_test = x_train / 255.0, x_test / 255.0

# Flatten each image into a single row of 28*28 = 784 features
x_train = x_train.reshape(-1, 28*28)
x_test = x_test.reshape(-1, 28*28)

# Number of classes, taken once from the training labels. Converting to a
# Python int first keeps the "+ 1" out of the label array's own integer dtype
# (which could overflow if the labels are stored in a narrow dtype).
num_classes = int(y_train.max()) + 1

# One-hot encode the labels. Both splits use the SAME width so the encoded
# test labels always match the training labels and the network's output
# layer, even if the test split happens not to contain the highest class.
y_train_one_hot = np.zeros((y_train.size, num_classes))
y_train_one_hot[np.arange(y_train.size), y_train] = 1

y_test_one_hot = np.zeros((y_test.size, num_classes))
y_test_one_hot[np.arange(y_test.size), y_test] = 1

class MLP:
    """Fully connected feed-forward classifier implemented with NumPy only.

    Hidden layers use ReLU, the output layer uses softmax, and training is
    (mini-)batch gradient descent on the cross-entropy loss. Samples are
    stored row-wise: every row of ``X`` is one sample and every row of ``Y``
    is the matching one-hot label.

    Parameters
    ----------
    layer_sizes : sequence of int
        Layer widths from input to output, e.g. ``[784, 64, 10]``. At least
        an input size and an output size are required.
    weight_scale : float or None, default 0.01
        Factor applied to standard-normal draws when initialising weights.
        Pass ``None`` to use He scaling, ``sqrt(2 / fan_in)``, per layer.

    Raises
    ------
    ValueError
        If ``layer_sizes`` has fewer than two entries.
    """

    def __init__(self, layer_sizes, weight_scale=0.01):
        # With fewer than two sizes there is no weight matrix at all and the
        # forward pass would fail later with an opaque KeyError.
        if len(layer_sizes) < 2:
            raise ValueError('layer_sizes must contain at least an input and an output size')
        self.layer_sizes = layer_sizes
        self.weight_scale = weight_scale
        self.params = self.initialize_parameters()

    def initialize_parameters(self):
        """Return a dict with ``W1..WL`` (random normal) and ``b1..bL`` (zeros).

        ``W{k}`` has shape ``(layer_sizes[k-1], layer_sizes[k])`` and ``b{k}``
        has shape ``(1, layer_sizes[k])``. Draws come from NumPy's global
        random state, so ``np.random.seed`` controls reproducibility.
        """
        params = {}
        for i in range(len(self.layer_sizes) - 1):
            fan_in = self.layer_sizes[i]
            fan_out = self.layer_sizes[i+1]
            if self.weight_scale is None:
                scale = np.sqrt(2.0 / fan_in)  # He initialisation
            else:
                scale = self.weight_scale
            params[f'W{i+1}'] = np.random.randn(fan_in, fan_out) * scale
            params[f'b{i+1}'] = np.zeros((1, fan_out))
        return params

    def relu(self, Z):
        """Element-wise ``max(0, Z)``."""
        return np.maximum(0, Z)

    def relu_derivative(self, Z):
        """Boolean mask that is True where ``Z > 0`` (ReLU's gradient is 1 there)."""
        return Z > 0

    def softmax(self, Z):
        """Row-wise softmax; the row maximum is subtracted first to avoid overflow in ``exp``."""
        expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
        return expZ / np.sum(expZ, axis=1, keepdims=True)

    def forward_propagation(self, X):
        """Run the network on ``X``.

        Returns
        -------
        AL : ndarray
            Softmax output of the last layer, one row per sample.
        caches : dict
            ``A0`` (the input) plus ``Z{k}`` / ``A{k}`` for every layer ``k``,
            as needed by ``backward_propagation``.
        """
        L = len(self.layer_sizes) - 1
        caches = {'A0': X}
        A = X
        # Hidden layers 1..L-1: affine transform followed by ReLU.
        for i in range(1, L):
            Z = np.dot(A, self.params[f'W{i}']) + self.params[f'b{i}']
            A = self.relu(Z)
            caches[f'Z{i}'] = Z
            caches[f'A{i}'] = A
        # Output layer L: affine transform followed by softmax.
        ZL = np.dot(A, self.params[f'W{L}']) + self.params[f'b{L}']
        AL = self.softmax(ZL)
        caches[f'Z{L}'] = ZL
        caches[f'A{L}'] = AL
        return AL, caches

    def compute_loss(self, AL, Y):
        """Mean cross-entropy between probabilities ``AL`` and one-hot ``Y``.

        Probabilities are floored at a tiny epsilon before the logarithm:
        softmax can underflow to exactly 0, and ``0 * log(0)`` would otherwise
        turn the whole loss into ``nan``.
        """
        m = Y.shape[0]
        eps = 1e-12
        loss = -np.sum(Y * np.log(np.maximum(AL, eps))) / m
        return loss

    def backward_propagation(self, caches, Y):
        """Compute gradients of the mean cross-entropy loss.

        Parameters
        ----------
        caches : dict
            The cache returned by ``forward_propagation`` for the same batch.
        Y : ndarray
            One-hot labels for that batch.

        Returns
        -------
        dict
            ``dW{k}`` and ``db{k}`` for every layer ``k``, shaped like the
            corresponding parameters.
        """
        grads = {}
        L = len(self.layer_sizes) - 1
        m = Y.shape[0]
        AL = caches[f'A{L}']
        # Softmax combined with cross-entropy gives this simple output error.
        dZ_next = AL - Y
        grads[f'dW{L}'] = np.dot(caches[f'A{L-1}'].T, dZ_next) / m
        grads[f'db{L}'] = np.sum(dZ_next, axis=0, keepdims=True) / m

        # Walk back through the hidden layers, L-1 down to 1.
        for i in reversed(range(1, L)):
            dZ = np.dot(dZ_next, self.params[f'W{i+1}'].T) * self.relu_derivative(caches[f'Z{i}'])
            grads[f'dW{i}'] = np.dot(caches[f'A{i-1}'].T, dZ) / m
            grads[f'db{i}'] = np.sum(dZ, axis=0, keepdims=True) / m
            dZ_next = dZ

        return grads

    def update_parameters(self, grads, learning_rate):
        """Apply one in-place gradient-descent step to every weight and bias."""
        for i in range(len(self.layer_sizes) - 1):
            self.params[f'W{i+1}'] -= learning_rate * grads[f'dW{i+1}']
            self.params[f'b{i+1}'] -= learning_rate * grads[f'db{i+1}']

    def _gradient_step(self, X, Y, learning_rate):
        """Forward pass, backward pass and update on one batch; returns the pre-update loss."""
        AL, caches = self.forward_propagation(X)
        loss = self.compute_loss(AL, Y)
        grads = self.backward_propagation(caches, Y)
        self.update_parameters(grads, learning_rate)
        return loss

    def train(self, X_train, Y_train, learning_rate=0.1, epochs=100, batch_size=None):
        """Train the network with gradient descent.

        Parameters
        ----------
        X_train : ndarray
            Training inputs, one row per sample.
        Y_train : ndarray
            One-hot labels with the same number of rows as ``X_train``.
        learning_rate : float, default 0.1
            Step size for each parameter update.
        epochs : int, default 100
            Number of passes over the training data.
        batch_size : int or None, default None
            ``None`` (or any value >= the number of samples) performs one
            full-batch update per epoch. A smaller value reshuffles the data
            every epoch (using NumPy's global random state) and performs one
            update per mini-batch.

        Raises
        ------
        ValueError
            If the training set is empty, ``X_train`` and ``Y_train`` disagree
            on the number of samples, or ``batch_size`` is not positive.

        Notes
        -----
        Every 10th epoch the loss is printed. For mini-batches this is the
        sample-weighted mean of the per-batch losses seen during the epoch.
        """
        n_samples = X_train.shape[0]
        if n_samples == 0:
            raise ValueError('X_train must contain at least one sample')
        if Y_train.shape[0] != n_samples:
            raise ValueError('X_train and Y_train must have the same number of samples')
        if batch_size is None:
            batch_size = n_samples
        elif batch_size < 1:
            raise ValueError('batch_size must be a positive integer')
        batch_size = min(batch_size, n_samples)

        for epoch in range(epochs):
            if batch_size == n_samples:
                # Full-batch path: exactly one update per epoch, no shuffling.
                loss = self._gradient_step(X_train, Y_train, learning_rate)
            else:
                order = np.random.permutation(n_samples)
                weighted_loss = 0.0
                for start in range(0, n_samples, batch_size):
                    batch = order[start:start + batch_size]
                    batch_loss = self._gradient_step(X_train[batch], Y_train[batch], learning_rate)
                    # Weight by batch length because the last batch may be shorter.
                    weighted_loss += batch.size * batch_loss
                loss = weighted_loss / n_samples

            if epoch % 10 == 0:
                print(f'Epoch {epoch}, Loss: {loss}')

    def predict(self, X):
        """Return the index of the most probable class for every row of ``X``."""
        AL, _ = self.forward_propagation(X)
        return np.argmax(AL, axis=1)
# Seed NumPy's global random state before the network is created: the weights
# are drawn from it, so without a seed the loss curve and test accuracy change
# on every run and the recorded output cannot be reproduced.
np.random.seed(42)

# Define the architecture of the MLP (e.g., input layer size = 784, one hidden layer with 64 neurons, output layer with 10 neurons)
mlp = MLP(layer_sizes=[784, 64, 10])

# Train the MLP
mlp.train(X_train=x_train, Y_train=y_train_one_hot, learning_rate=0.1, epochs=100)

# Predict on test data
y_pred = mlp.predict(x_test)

# Evaluate accuracy: fraction of test samples whose predicted class equals the integer label
accuracy = np.mean(y_pred == y_test)
print(f'Test Accuracy: {accuracy}')



Epoch 0, Loss: 2.3028984347786023
Epoch 10, Loss: 2.295956480918371
Epoch 20, Loss: 2.28382324551763
Epoch 30, Loss: 2.25715080089639
Epoch 40, Loss: 2.199692965005896
Epoch 50, Loss: 2.0887234503393053
Epoch 60, Loss: 1.9116015997133406
Epoch 70, Loss: 1.6829110201371704
Epoch 80, Loss: 1.4441953657773008
Epoch 90, Loss: 1.235629926979568
Test Accuracy: 0.7602
