In [1]:
#imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Read the CSV straight into a (copied, writable) NumPy array, then shuffle its
# rows in place so the split made from it is random.
data = np.array(pd.read_csv('MNIST.csv'))
np.random.shuffle(data)

In [3]:
# m rows and n columns in the loaded array; column 0 is used as the label,
# the remaining columns as features.
m, n = data.shape

# The first 1000 rows are held out for testing. Transposing puts one sample per
# column, so row 0 holds the labels and rows 1..n-1 hold the features.
data_test = data[:1000].T
Y_test, X_test = data_test[0], data_test[1:n] / 255  # features scaled by 1/255

# Every remaining row is used for training, laid out the same way.
data_train = data[1000:].T
Y_train, X_train = data_train[0], data_train[1:n] / 255

_, m_train = X_train.shape

In [4]:
# Shape check of the transposed training block: rows are the label row plus the
# feature rows, columns are samples.
data_train.shape

(785, 41000)

In [35]:
class Neural_Network:
    """Fully connected classifier with two ReLU hidden layers and a softmax output.

    Data is laid out column-wise: ``X`` has shape (input_size, samples) and every
    activation has shape (layer_size, samples).

    ``forward_prop`` and ``backward_prop`` call the module-level helpers ``ReLU``,
    ``ReLU_prime`` and ``softmax``, which must be defined before they are used.
    """

    def __init__(self, input_size, hidden_1_size, hidden_2_size, output_size, learning_rate, data_num, bias = 1):
        """Store the hyper-parameters and initialise weights and biases.

        Args:
            input_size: number of features per sample (rows of ``X``).
            hidden_1_size: number of units in the first hidden layer.
            hidden_2_size: number of units in the second hidden layer.
            output_size: number of output classes.
            learning_rate: step size applied by ``gradient_descent``.
            data_num: sample count used to average the gradients in
                ``backward_prop`` (stored as ``self.m``). It should equal the
                number of columns of the ``X`` passed to ``backward_prop``.
            bias: number of columns in each bias array. The default of 1 gives
                column vectors that broadcast across all samples.
        """
        self.input_size = input_size
        self.hidden_1_size = hidden_1_size
        self.hidden_2_size = hidden_2_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.m = data_num

        # Xavier-style initialisation: standard-normal weights scaled by
        # sqrt(1 / fan_in); biases start at zero.
        self.W1 = np.random.randn(hidden_1_size, input_size) * np.sqrt(1.0 / input_size)
        self.b1 = np.zeros((hidden_1_size, bias))

        self.W2 = np.random.randn(hidden_2_size, hidden_1_size) * np.sqrt(1.0 / hidden_1_size)
        self.b2 = np.zeros((hidden_2_size, bias))

        self.W3 = np.random.randn(output_size, hidden_2_size) * np.sqrt(1.0 / hidden_2_size)
        self.b3 = np.zeros((output_size, bias))

    def set_learning_rate(self, new_LR):
        """Replace the step size used by ``gradient_descent``."""
        self.learning_rate = new_LR

    def forward_prop(self, X):
        """Run the network on ``X`` and cache every intermediate on ``self``.

        Args:
            X: array of shape (input_size, samples).

        Returns:
            Tuple ``(A3, Z3)``: the softmax output and the pre-softmax values,
            both of shape (output_size, samples).
        """
        self.Z1 = self.W1.dot(X) + self.b1
        self.A1 = ReLU(self.Z1)

        self.Z2 = self.W2.dot(self.A1) + self.b2
        self.A2 = ReLU(self.Z2)

        self.Z3 = self.W3.dot(self.A2) + self.b3
        self.A3 = softmax(self.Z3)

        return self.A3, self.Z3

    def _one_hot(self, y):
        """Encode integer labels as an (output_size, samples) one-hot matrix.

        The row count comes from ``self.output_size`` rather than from the
        largest label present, so the result always lines up with ``self.A3``
        even when a batch does not contain every class.
        """
        labels = np.asarray(y)
        encoded = np.zeros((self.output_size, labels.size))
        encoded[labels, np.arange(labels.size)] = 1
        return encoded

    def backward_prop(self, X, y):
        """Compute gradients for all weights and biases and store them on ``self``.

        Must be called after ``forward_prop`` on the same ``X``, because it reads
        the cached ``Z*`` / ``A*`` arrays. Gradients are averaged by ``self.m``.

        Args:
            X: array of shape (input_size, samples) used in the forward pass.
            y: 1-D array of integer class labels, one per column of ``X``.
        """
        one_hot_Y = self._one_hot(y)

        # Output layer: softmax output minus the one-hot target.
        self.dZ3 = self.A3 - one_hot_Y
        self.dW3 = 1 / self.m * self.dZ3.dot(self.A2.T)
        self.db3 = 1 / self.m * np.sum(self.dZ3, axis = 1, keepdims = True)

        # Hidden layers: propagate through the weights, then gate by the ReLU derivative.
        self.dZ2 = self.W3.T.dot(self.dZ3) * ReLU_prime(self.Z2)
        self.dW2 = 1 / self.m * self.dZ2.dot(self.A1.T)
        self.db2 = 1 / self.m * np.sum(self.dZ2, axis = 1, keepdims = True)

        self.dZ1 = self.W2.T.dot(self.dZ2) * ReLU_prime(self.Z1)
        self.dW1 = 1 / self.m * self.dZ1.dot(X.T)
        self.db1 = 1 / self.m * np.sum(self.dZ1, axis = 1, keepdims = True)

    def gradient_descent(self):
        """Apply one gradient step using the gradients stored by ``backward_prop``."""
        self.W1 = self.W1 - self.dW1 * self.learning_rate
        self.b1 = self.b1 - self.db1 * self.learning_rate

        self.W2 = self.W2 - self.dW2 * self.learning_rate
        self.b2 = self.b2 - self.db2 * self.learning_rate

        self.W3 = self.W3 - self.dW3 * self.learning_rate
        self.b3 = self.b3 - self.db3 * self.learning_rate

    def train_GD(self, epochs, X = None, Y = None):
        """Train with full-batch gradient descent, printing accuracy every 100 epochs.

        Args:
            epochs: number of forward / backward / update iterations to run.
            X: training inputs of shape (input_size, samples). When omitted, the
                notebook-level ``X_train`` is used (the original behaviour).
            Y: integer labels for ``X``. When omitted, the notebook-level
                ``Y_train`` is used.
        """
        if X is None:
            X = X_train
        if Y is None:
            Y = Y_train

        for epoch in range(epochs):
            self.forward_prop(X)
            self.backward_prop(X, Y)
            self.gradient_descent()

            if epoch % 100 == 0:
                # Accuracy of the forward pass made before this epoch's update.
                predictions = np.argmax(self.A3, 0)
                print("Epoch", epoch, ":", round((np.sum(predictions == Y) / Y.size) * 100, 10), "%")
        print("Training complete")

            

# def get_predictions(A3):
#     return np.argmax(A3, 0)

# def get_accuracy(predictions, Y):
#     return round((np.sum(predictions == Y) / Y.size) * 100, 2)
        

class PReLU:
    """Leaky rectifier whose negative-side slope is ``alpha``."""

    def __init__(self, alpha_init = 0.01):
        """Remember the initial slope applied to the scaled branch."""
        self.alpha = alpha_init

    def forward(self, Z):
        """Return the element-wise maximum of ``alpha * Z`` and ``Z``."""
        leaky_branch = self.alpha * Z
        return np.maximum(leaky_branch, Z)

    def prime(self, Z):
        """Return 1 where ``Z`` is strictly positive and ``alpha`` everywhere else."""
        is_positive = Z > 0
        return np.where(is_positive, 1, self.alpha)

def update_alpha(alpha, dZ, learning_rate = 0.01):
    """Return ``alpha`` after one update step computed from ``dZ``.

    The step is ``learning_rate`` times the mean, over all entries of ``dZ``, of
    ``dZ * alpha`` where ``dZ < 0`` and 0 elsewhere.

    The input ``alpha`` is never modified in place, so passing an ndarray does
    not change the caller's array.

    NOTE(review): confirm that ``dZ * alpha`` on the ``dZ < 0`` entries is the
    intended gradient signal for alpha.

    Args:
        alpha: current slope (scalar or array broadcastable against ``dZ``).
        dZ: array of gradient values.
        learning_rate: step size; defaults to 0.01.

    Returns:
        The updated alpha.
    """
    negative_part = np.where(dZ < 0, dZ * alpha, 0)
    return alpha - learning_rate * np.mean(negative_part)


def ReLU(Z):
    """Rectified linear unit: element-wise maximum of ``Z`` and 0."""
    rectified = np.maximum(Z, 0)
    return rectified

def ReLU_prime(Z):
    """Derivative of ReLU as a boolean mask: True where ``Z`` is strictly positive."""
    positive_mask = Z > 0
    return positive_mask



def sigmoid(Z):
    """Element-wise logistic sigmoid ``1 / (1 + exp(-Z))``.

    Only ``exp(-|Z|)`` is evaluated, which lies in (0, 1], so the function never
    overflows and does not need to clip its input. Clipping the input would
    instead cap the output short of 0 and 1.

    Args:
        Z: scalar or array of pre-activations.

    Returns:
        Array of the same shape as ``Z`` with values in [0, 1].
    """
    Z = np.asarray(Z)
    decay = np.exp(-np.abs(Z))
    # Z >= 0: 1 / (1 + e^-Z);  Z < 0: e^Z / (1 + e^Z). Both are the same function.
    return np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))

def sigmoid_prime(Z):
    """Derivative of the sigmoid, computed as ``s * (1 - s)`` with ``s = sigmoid(Z)``."""
    activation = sigmoid(Z)
    return activation * (1 - activation)



def softmax(Z):
    """Softmax along axis 0, so each column of a 2-D ``Z`` sums to 1.

    The per-column maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the output from becoming NaN) for large inputs.

    Args:
        Z: array of scores with the classes along axis 0.

    Returns:
        Array of the same shape as ``Z`` holding the normalised probabilities.
    """
    shifted = Z - np.max(Z, axis = 0, keepdims = True)
    exp_Z = np.exp(shifted)
    return exp_Z / np.sum(exp_Z, axis = 0, keepdims = True)



def one_hot(Y, num_classes = None):
    """One-hot encode integer labels with one column per sample.

    Args:
        Y: 1-D array-like of non-negative integer class labels.
        num_classes: number of rows in the result. When omitted it is inferred
            as ``Y.max() + 1`` (the original behaviour). Pass it explicitly when
            ``Y`` may not contain the highest class, e.g. for a small batch.

    Returns:
        Float array of shape (num_classes, Y.size) with a single 1 per column.

    Raises:
        ValueError: if ``Y`` is empty and ``num_classes`` is not given.
    """
    labels = np.asarray(Y)
    if num_classes is None:
        if labels.size == 0:
            raise ValueError("one_hot needs num_classes when Y is empty")
        num_classes = labels.max() + 1

    # Built directly in (classes, samples) layout, so no transpose is needed.
    one_hot_Y = np.zeros((num_classes, labels.size))
    one_hot_Y[labels, np.arange(labels.size)] = 1
    return one_hot_Y

In [22]:
# Network layout: input features -> 64 -> 32 -> 10 output classes, learning rate 0.05.
# The input width and the sample count used to average gradients are read from
# the training data, so they cannot drift from what was actually loaded.
nn = Neural_Network(X_train.shape[0], 64, 32, 10, 0.05, m_train)

In [34]:
# (Re)set the step size, then run 5000 full-batch gradient-descent epochs.
nn.set_learning_rate(new_LR = 0.05)
nn.train_GD(epochs = 5000)

Epoch 0 : 99.94 %
Epoch 10 : 99.94 %
Epoch 20 : 99.94 %
Epoch 30 : 99.94 %
Epoch 40 : 99.94 %
Epoch 50 : 99.94 %
Epoch 60 : 99.94 %
Epoch 70 : 99.94 %
Epoch 80 : 99.94 %
Epoch 90 : 99.94 %
Epoch 100 : 99.94 %
Epoch 110 : 99.94 %
Epoch 120 : 99.94 %
Epoch 130 : 99.94 %
Epoch 140 : 99.94 %
Epoch 150 : 99.94 %
Epoch 160 : 99.94 %
Epoch 170 : 99.94 %
Epoch 180 : 99.94 %
Epoch 190 : 99.94 %
Epoch 200 : 99.94 %
Epoch 210 : 99.94 %
Epoch 220 : 99.94 %
Epoch 230 : 99.94 %
Epoch 240 : 99.94 %
Epoch 250 : 99.94 %
Epoch 260 : 99.94 %
Epoch 270 : 99.95 %
Epoch 280 : 99.95 %
Epoch 290 : 99.95 %
Epoch 300 : 99.95 %
Epoch 310 : 99.95 %
Epoch 320 : 99.95 %
Epoch 330 : 99.95 %
Epoch 340 : 99.95 %
Epoch 350 : 99.95 %
Epoch 360 : 99.95 %
Epoch 370 : 99.95 %
Epoch 380 : 99.95 %
Epoch 390 : 99.95 %
Epoch 400 : 99.95 %
Epoch 410 : 99.95 %
Epoch 420 : 99.95 %
Epoch 430 : 99.95 %
Epoch 440 : 99.95 %
Epoch 450 : 99.95 %
Epoch 460 : 99.95 %
Epoch 470 : 99.95 %
Epoch 480 : 99.95 %
Epoch 490 : 99.95 %
Epoch 500 :