In [1]:
#imports
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('MNIST.csv')
data = np.array(data)
np.random.shuffle(data)

In [3]:
m, n = data.shape

data_test = data[0:1000].T
Y_test = data_test[0]
X_test = data_test[1:n]
X_test = X_test / 255

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255

_, m_train = X_train.shape

In [4]:
data_train.shape

(785, 41000)

In [5]:
class Neural_Network:
    def __init__(self, input_size, hidden_1_size, hidden_2_size, output_size, learning_rate, data_num, bias = 1):
        self.input_size = input_size
        self.hidden_1_size = hidden_1_size
        self.hidden_2_size = hidden_2_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.m = data_num

        # self.W1 = np.random.rand(hidden_1_size, input_size) - 0.5
        # self.b1 = np.random.rand(hidden_1_size, bias) - 0.5

        # self.W2 = np.random.rand(hidden_2_size, hidden_1_size) - 0.5
        # self.b2 = np.random.rand(hidden_2_size, bias) - 0.5

        # self.W3 = np.random.rand(output_size, hidden_2_size) - 0.5
        # self.b3 = np.random.rand(output_size, bias) - 0.5

        # Xavier initialization for weights
        self.W1 = np.random.randn(hidden_1_size, input_size) * np.sqrt(1.0 / input_size)
        self.b1 = np.zeros((hidden_1_size, bias))

        self.W2 = np.random.randn(hidden_2_size, hidden_1_size) * np.sqrt(1.0 / hidden_1_size)
        self.b2 = np.zeros((hidden_2_size, bias))

        self.W3 = np.random.randn(output_size, hidden_2_size) * np.sqrt(1.0 / hidden_2_size)
        self.b3 = np.zeros((output_size, bias))

    def set_learning_rate(self, new_LR):
        self.learning_rate = new_LR

    def forward_prop(self, X):
        self.Z1 = self.W1.dot(X) + self.b1
        self.A1 = ReLU(self.Z1)

        self.Z2 = self.W2.dot(self.A1) + self.b2
        self.A2 = ReLU(self.Z2)

        self.Z3 = self.W3.dot(self.A2) + self.b3
        self.A3 = softmax(self.Z3)

        return self.A3, self.Z3

    def backward_prop(self, X, y):
        one_hot_Y = one_hot(y)

        # print("Shape of self.A3:", self.A3.shape)
        # print("Shape of one_hot_Y:", one_hot_Y.shape)


        self.dZ3 = self.A3 - one_hot_Y
        self.dW3 = 1 / self.m * self.dZ3.dot(self.A2.T)
        self.db3 = 1 / self.m * np.sum(self.dZ3, axis = 1, keepdims = True)

        self.dZ2 = self.W3.T.dot(self.dZ3) * ReLU_prime(self.Z2)
        self.dW2 = 1 / self.m * self.dZ2.dot(self.A1.T)
        self.db2 = 1 / self.m * np.sum(self.dZ2, axis = 1, keepdims = True)

        self.dZ1 = self.W2.T.dot(self.dZ2) * ReLU_prime(self.Z1)
        self.dW1 = 1 / self.m * self.dZ1.dot(X.T)
        self.db1 = 1 / self.m * np.sum(self.dZ1, axis = 1, keepdims = True)


    def gradient_descent(self):
        self.W1 = self.W1 - self.dW1 * self.learning_rate
        self.b1 = self.b1 - self.db1 * self.learning_rate

        self.W2 = self.W2 - self.dW2 * self.learning_rate
        self.b2 = self.b2 - self.db2 * self.learning_rate

        self.W3 = self.W3 - self.dW3 * self.learning_rate
        self.b3 = self.b3 - self.db3 * self.learning_rate


    def train_GD(self, epochs):
        for epoch in range(epochs):
            
            self.forward_prop(X_train)
            self.backward_prop(X_train, Y_train)
            self.gradient_descent()

            if epoch % 100 == 0:
                self.forward_prop(X_train)
                predictions = np.argmax(self.A3, axis=0)        
                print(f'Epoch {epoch}: {round((np.sum(predictions == Y_train) / Y_train.size) * 100, 4)}%')

        self.forward_prop(X_test)
        predictions = np.argmax(self.A3, axis=0)
        print("Training complete\n")
        print(f'Test Set Accuracy: {round((np.sum(predictions == Y_test) / Y_test.size) * 100, 4)}%')

    def train_SGD(self, epochs, batch = 64):
        num_samples = X_train.shape[1]

        for epoch in range(epochs):
            indicies = np.random.permutation(num_samples)
            X_train_shuffled = X_train[:, indicies]
            Y_train_shuffled = Y_train[indicies]

            for i in range(0, num_samples, batch):
                X_batch = X_train_shuffled[:, i:i+batch]
                Y_batch = Y_train_shuffled[i:i+batch]

                self.forward_prop(X_batch)
                self.backward_prop(X_batch, Y_batch)
                self.gradient_descent()

            if epoch % 100 == 0:
                self.forward_prop(X_train)
                predictions = np.argmax(self.A3, axis=0)        
                print(f'Epoch {epoch}: {round((np.sum(predictions == Y_train) / Y_train.size) * 100, 4)}%')

        self.forward_prop(X_test)
        predictions = np.argmax(self.A3, axis=0)
        print("Training complete\n")
        print(f'Test Set Accuracy: {round((np.sum(predictions == Y_test) / Y_test.size) * 100, 4)}%')

    def train_adam(self, epochs):
        #Init Extra Weights
        self.beta1 = 0.9 # Exp decay rate for mean of gradients
        self.beta2 = 0.999 # Exp decay rate for varience of gradients
        self.epsilon = 1e-8 # Prevent divisions by 0
        self.m_W1 = np.zeros_like(self.W1) # Moving average of gradients
        self.v_W1 = np.zeros_like(self.W1) # Squared moving averages of gradients
        self.m_b1 = np.zeros_like(self.b1)
        self.v_b1 = np.zeros_like(self.b1)
        self.m_W2 = np.zeros_like(self.W2)
        self.v_W2 = np.zeros_like(self.W2)
        self.m_b2 = np.zeros_like(self.b2)
        self.v_b2 = np.zeros_like(self.b2)
        self.m_W3 = np.zeros_like(self.W3)
        self.v_W3 = np.zeros_like(self.W3)
        self.m_b3 = np.zeros_like(self.b3)
        self.v_b3 = np.zeros_like(self.b3)

        for epoch in range(epochs):
            self.forward_prop(X_train)
            self.backward_prop(X_train, Y_train)
            self.adam_update()

            if epoch % 100 == 0:
                self.forward_prop(X_train)
                predictions = np.argmax(self.A3, axis=0)        
                print(f'Epoch {epoch}: {round((np.sum(predictions == Y_train) / Y_train.size) * 100, 4)}%')

        self.forward_prop(X_test)
        predictions = np.argmax(self.A3, axis=0)
        print("Training complete\n")
        print(f'Test Set Accuracy: {round((np.sum(predictions == Y_test) / Y_test.size) * 100, 4)}%')

    def adam_update(self):
        self.m_W1 = self.beta1 * self.m_W1 + (1 - self.beta1) * self.dW1
        self.v_W1 = self.beta2 * self.v_W1 + (1 - self.beta2) * (self.dW1 ** 2)
        m_W1_hat = self.m_W1 / (1 - self.beta1)
        v_W1_hat = self.v_W1 / (1 - self.beta2)
        self.W1 -= self.learning_rate * m_W1_hat / (np.sqrt(v_W1_hat) + self.epsilon)

        self.m_b1 = self.beta1 * self.m_b1 + (1 - self.beta1) * self.db1
        self.v_b1 = self.beta2 * self.v_b1 + (1 - self.beta2) * (self.db1 ** 2)
        m_b1_hat = self.m_b1 / (1 - self.beta1)
        v_b1_hat = self.v_b1 / (1 - self.beta2)
        self.b1 -= self.learning_rate * m_b1_hat / (np.sqrt(v_b1_hat) + self.epsilon)

        self.m_W2 = self.beta1 * self.m_W2 + (1 - self.beta1) * self.dW2
        self.v_W2 = self.beta2 * self.v_W2 + (1 - self.beta2) * (self.dW2 ** 2)
        m_W2_hat = self.m_W2 / (1 - self.beta1)
        v_W2_hat = self.v_W2 / (1 - self.beta2)
        self.W2 -= self.learning_rate * m_W2_hat / (np.sqrt(v_W2_hat) + self.epsilon)

        self.m_b2 = self.beta1 * self.m_b2 + (1 - self.beta1) * self.db2
        self.v_b2 = self.beta2 * self.v_b2 + (1 - self.beta2) * (self.db2 ** 2)
        m_b2_hat = self.m_b2 / (1 - self.beta1)
        v_b2_hat = self.v_b2 / (1 - self.beta2)
        self.b2 -= self.learning_rate * m_b2_hat / (np.sqrt(v_b2_hat) + self.epsilon)

        self.m_W3 = self.beta1 * self.m_W3 + (1 - self.beta1) * self.dW3
        self.v_W3 = self.beta2 * self.v_W3 + (1 - self.beta2) * (self.dW3 ** 2)
        m_W3_hat = self.m_W3 / (1 - self.beta1)
        v_W3_hat = self.v_W3 / (1 - self.beta2)
        self.W3 -= self.learning_rate * m_W3_hat / (np.sqrt(v_W3_hat) + self.epsilon)

        self.m_b3 = self.beta1 * self.m_b3 + (1 - self.beta1) * self.db3
        self.v_b3 = self.beta2 * self.v_b3 + (1 - self.beta2) * (self.db3 ** 2)
        m_b3_hat = self.m_b3 / (1 - self.beta1)
        v_b3_hat = self.v_b3 / (1 - self.beta2)
        self.b3 -= self.learning_rate * m_b3_hat / (np.sqrt(v_b3_hat) + self.epsilon)



    def train(self, method, epochs):
        if method == 'GD':
            self.train_GD(epochs)
        elif method == 'SGD':
            self.train_SGD(epochs)
        elif method == 'adam':
            self.train_adam(epochs)
        else:
            raise ValueError('Invalid Training Method')
            # Add more cases for different training methods


        

class PReLU:
    def __init__(self, alpha_init = 0.01):
        self.alpha = alpha_init

    def forward(self, Z):
        return np.maximum(self.alpha * Z, Z)

    def prime(self, Z):
        return np.where(Z > 0, 1, self.alpha)

def update_alpha(alpha, dZ):
    learning_rate = 0.01
    alpha -= learning_rate * np.mean(np.where(dZ < 0, dZ * alpha, 0))
    return alpha


def ReLU(Z):
    return np.maximum(Z, 0)

def ReLU_prime(Z):
    return Z > 0



def sigmoid(Z):
    A = 1 / (1 + np.exp(np.clip(-Z, -4, 4)))
    return A

def sigmoid_prime(Z):
    A = (sigmoid(Z) * (1 - sigmoid(Z)))
    return A



def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A



def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, 10))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

In [8]:
nn = Neural_Network(784, 64, 32, 10, 0.05, 41000)

In [10]:
nn.set_learning_rate(0.001)
nn.train('adam', 1000)

Epoch 0: 99.1829%
Epoch 100: 99.2634%
Epoch 200: 99.3%
Epoch 300: 99.3244%
Epoch 400: 99.3512%
Epoch 500: 99.3732%
Epoch 600: 99.4146%
Epoch 700: 99.4512%
Epoch 800: 99.4659%
Epoch 900: 99.4805%
Training complete

Test Set Accuracy: 92.3%
