In [1]:
#imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Fix the global NumPy seed so the shuffle below (and the np.random-based weight
# initialisation used later in the notebook) is repeatable under Restart & Run All.
SEED = 42
np.random.seed(SEED)

# np.array(...) copies the frame into a plain, writable array, which the
# in-place shuffle needs.
data = np.array(pd.read_csv('MNIST.csv'))
np.random.shuffle(data)  # shuffles the rows (first axis) in place

In [3]:
# m = number of rows in the shuffled array, n = number of columns.
m, n = data.shape

# The first 1000 rows are held out for testing. After the transpose every
# column is one sample: row 0 is used as the label vector, rows 1..n-1 as inputs.
data_test = data[:1000].T
Y_test, X_test = data_test[0], data_test[1:n] / 255  # presumably 8-bit pixel values scaled to [0, 1] -- confirm

# Every remaining row is used for training, laid out the same way.
data_train = data[1000:m].T
Y_train, X_train = data_train[0], data_train[1:n] / 255

_, m_train = X_train.shape  # m_train = number of training samples (columns)

In [4]:
# Display the shape of the transposed training matrix (label row + input rows, one column per sample).
data_train.shape

(785, 41000)

In [5]:
class Neural_Network:
    """Fully connected classifier: two ReLU hidden layers and a softmax output.

    Inputs are laid out as (features, samples) and activations as
    (units, samples), so every column corresponds to one example.

    The class relies on the module-level helpers ``ReLU``, ``ReLU_prime`` and
    ``softmax``.
    """

    def __init__(self, input_size, hidden_1_size, hidden_2_size, output_size, learning_rate, data_num, bias = 1):
        """Store the hyper-parameters and initialise the three layers.

        Args:
            input_size: number of input features (rows of X).
            hidden_1_size: number of units in the first hidden layer.
            hidden_2_size: number of units in the second hidden layer.
            output_size: number of output classes.
            learning_rate: step size applied by ``gradient_descent``.
            data_num: nominal number of training samples. Stored as ``self.m``
                for backward compatibility; gradients are averaged over the
                batch actually passed to ``backward_prop``.
            bias: number of columns of each bias array. The default of 1 gives
                column vectors that broadcast over the samples.
        """
        self.input_size = input_size
        self.hidden_1_size = hidden_1_size
        self.hidden_2_size = hidden_2_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.m = data_num

        # Weights: standard normal draws scaled by sqrt(1 / fan_in); biases start at zero.
        self.W1 = np.random.randn(hidden_1_size, input_size) * np.sqrt(1.0 / input_size)
        self.b1 = np.zeros((hidden_1_size, bias))

        self.W2 = np.random.randn(hidden_2_size, hidden_1_size) * np.sqrt(1.0 / hidden_1_size)
        self.b2 = np.zeros((hidden_2_size, bias))

        self.W3 = np.random.randn(output_size, hidden_2_size) * np.sqrt(1.0 / hidden_2_size)
        self.b3 = np.zeros((output_size, bias))

    def set_learning_rate(self, new_LR):
        """Replace the step size used by subsequent ``gradient_descent`` calls."""
        self.learning_rate = new_LR

    def forward_prop(self, X):
        """Run a forward pass and cache every pre-activation and activation.

        Args:
            X: input array of shape (input_size, samples).

        Returns:
            Tuple ``(A3, Z3)``: the softmax output and the raw output-layer
            pre-activations.
        """
        self.Z1 = self.W1.dot(X) + self.b1
        self.A1 = ReLU(self.Z1)

        self.Z2 = self.W2.dot(self.A1) + self.b2
        self.A2 = ReLU(self.Z2)

        self.Z3 = self.W3.dot(self.A2) + self.b3
        self.A3 = softmax(self.Z3)

        return self.A3, self.Z3

    def backward_prop(self, X, y):
        """Compute the gradients for the batch last seen by ``forward_prop``.

        Must be called after ``forward_prop(X)`` with the same ``X``; the
        results are stored on the instance as ``dW1..dW3`` / ``db1..db3``.

        Args:
            X: input array of shape (input_size, samples).
            y: 1-D NumPy array of integer class labels, one per sample.
        """
        # Average over the samples actually present in this batch rather than
        # the constructor's data_num, so any batch size is scaled correctly.
        batch_size = X.shape[1]

        # One-hot targets with a fixed height of output_size. Inferring the
        # height from y.max() would yield a shorter matrix whenever the highest
        # class is missing from the batch, and that could silently broadcast
        # against A3.
        targets = np.zeros((self.output_size, y.size))
        targets[y, np.arange(y.size)] = 1

        self.dZ3 = self.A3 - targets
        self.dW3 = self.dZ3.dot(self.A2.T) / batch_size
        self.db3 = np.sum(self.dZ3, axis = 1, keepdims = True) / batch_size

        self.dZ2 = self.W3.T.dot(self.dZ3) * ReLU_prime(self.Z2)
        self.dW2 = self.dZ2.dot(self.A1.T) / batch_size
        self.db2 = np.sum(self.dZ2, axis = 1, keepdims = True) / batch_size

        self.dZ1 = self.W2.T.dot(self.dZ2) * ReLU_prime(self.Z1)
        self.dW1 = self.dZ1.dot(X.T) / batch_size
        self.db1 = np.sum(self.dZ1, axis = 1, keepdims = True) / batch_size

    def gradient_descent(self):
        """Apply one plain gradient-descent step using the stored gradients."""
        self.W1 = self.W1 - self.dW1 * self.learning_rate
        self.b1 = self.b1 - self.db1 * self.learning_rate

        self.W2 = self.W2 - self.dW2 * self.learning_rate
        self.b2 = self.b2 - self.db2 * self.learning_rate

        self.W3 = self.W3 - self.dW3 * self.learning_rate
        self.b3 = self.b3 - self.db3 * self.learning_rate

    def train_epochs(self, epochs, X = None, Y = None):
        """Train with full-batch gradient descent for ``epochs`` iterations.

        Args:
            epochs: number of forward/backward/update iterations.
            X: inputs of shape (input_size, samples). Defaults to the
                notebook-level ``X_train`` so existing calls keep working.
            Y: integer labels, one per sample. Defaults to the notebook-level
                ``Y_train``.

        Every 10th epoch prints the accuracy of the forward pass made just
        before that epoch's parameter update.
        """
        if X is None:
            X = X_train
        if Y is None:
            Y = Y_train

        for epoch in range(epochs):
            self.forward_prop(X)
            self.backward_prop(X, Y)
            self.gradient_descent()

            if epoch % 10 == 0:
                predictions = np.argmax(self.A3, 0)
                print("Epoch", epoch, ":", round((np.sum(predictions == Y) / Y.size) * 100, 2), "%")
        print("Training complete")

# def get_predictions(A3):
#     return np.argmax(A3, 0)

# def get_accuracy(predictions, Y):
#     return round((np.sum(predictions == Y) / Y.size) * 100, 2)
        

class PReLU:
    """Parametric ReLU: identity for positive inputs, slope ``alpha`` otherwise."""

    def __init__(self, alpha_init = 0.01):
        """Create the activation with slope ``alpha_init`` for non-positive inputs."""
        self.alpha = alpha_init

    def forward(self, Z):
        """Return ``Z`` where ``Z > 0`` and ``alpha * Z`` elsewhere.

        Written as an explicit branch on the sign of ``Z`` so the result stays
        consistent with ``prime`` for any ``alpha``; ``max(alpha * Z, Z)`` only
        matches this definition when 0 <= alpha <= 1.
        """
        return np.where(Z > 0, Z, self.alpha * Z)

    def prime(self, Z):
        """Return the element-wise slope of ``forward``: 1 where ``Z > 0``, else ``alpha``."""
        return np.where(Z > 0, 1, self.alpha)

def update_alpha(alpha, dZ, learning_rate = 0.01):
    """Return ``alpha`` after one update step driven by the negative entries of ``dZ``.

    The step is ``learning_rate * mean(where(dZ < 0, dZ * alpha, 0))``.

    Args:
        alpha: current slope value (scalar or array).
        dZ: array whose negative entries contribute to the step.
        learning_rate: step size; defaults to the previously hard-coded 0.01.

    Returns:
        The updated slope. The input ``alpha`` is never modified in place, so
        passing an array does not change the caller's copy.

    NOTE(review): the formula is kept exactly as originally written. A PReLU
    slope gradient is usually computed from the layer's pre-activations as
    well -- confirm this is the intended update.
    """
    step = learning_rate * np.mean(np.where(dZ < 0, dZ * alpha, 0))
    return alpha - step


def ReLU(Z):
    """Rectified linear unit: element-wise maximum of ``Z`` and 0."""
    rectified = np.maximum(Z, 0)
    return rectified

def ReLU_prime(Z):
    """Derivative of ReLU expressed as the boolean mask ``Z > 0``.

    The mask acts as 0/1 when multiplied with a numeric array.
    """
    is_active = Z > 0
    return is_active



def sigmoid(Z):
    """Logistic function ``1 / (1 + exp(-Z))`` with the exponent clipped to [-4, 4].

    NOTE(review): because of the clip, inputs with |Z| > 4 all map to the same
    value, so the output is confined to roughly [0.018, 0.982] instead of
    approaching 0 and 1.
    """
    exponent = np.clip(-Z, -4, 4)
    return 1 / (1 + np.exp(exponent))

def sigmoid_prime(Z):
    """Return ``s * (1 - s)`` where ``s = sigmoid(Z)``."""
    s = sigmoid(Z)
    return s * (1 - s)



def softmax(Z):
    """Softmax along axis 0, so each column of a (classes, samples) array sums to 1.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``exp`` from overflowing on
    large logits, which would otherwise turn the output into NaN.

    Args:
        Z: array of logits; 1-D input is treated as a single distribution.

    Returns:
        Array of the same shape as ``Z`` holding the normalised probabilities.
    """
    shifted = Z - np.max(Z, axis = 0, keepdims = True)
    exp_Z = np.exp(shifted)
    return exp_Z / np.sum(exp_Z, axis = 0, keepdims = True)



def one_hot(Y, num_classes = None):
    """One-hot encode integer labels into a (classes, samples) array.

    Args:
        Y: 1-D NumPy array of non-negative integer class labels.
        num_classes: number of rows in the result. When omitted it is inferred
            as ``Y.max() + 1``, which under-counts if the highest class is
            missing from ``Y``; pass it explicitly for partial batches.

    Returns:
        Float array of shape (num_classes, Y.size) with a single 1 per column.
    """
    if num_classes is None:
        num_classes = Y.max() + 1
    encoded = np.zeros((num_classes, Y.size))
    encoded[Y, np.arange(Y.size)] = 1
    return encoded

In [6]:
# Derive the input width and sample count from the training arrays instead of
# repeating them as literals, so the cell stays correct if the split changes.
# NOTE(review): the training log recorded under this cell has no matching
# train_epochs call here -- it appears to come from an earlier version of the
# cell; confirm under Restart & Run All.
nn = Neural_Network(
    input_size = X_train.shape[0],
    hidden_1_size = 64,
    hidden_2_size = 32,
    output_size = 10,
    learning_rate = 0.05,
    data_num = m_train,
)

epoch 0 : 9.77 %
epoch 10 : 30.01 %
epoch 20 : 41.84 %
epoch 30 : 52.21 %
epoch 40 : 61.09 %
epoch 50 : 66.73 %
epoch 60 : 71.89 %
epoch 70 : 76.38 %
epoch 80 : 79.22 %
epoch 90 : 80.8 %
epoch 100 : 82.09 %
epoch 110 : 82.96 %
epoch 120 : 83.71 %
epoch 130 : 84.38 %
epoch 140 : 85.0 %
epoch 150 : 85.49 %
epoch 160 : 85.95 %
epoch 170 : 86.34 %
epoch 180 : 86.74 %
epoch 190 : 87.11 %
epoch 200 : 87.38 %
epoch 210 : 87.63 %
epoch 220 : 87.83 %
epoch 230 : 88.06 %
epoch 240 : 88.27 %
epoch 250 : 88.5 %
epoch 260 : 88.66 %
epoch 270 : 88.82 %
epoch 280 : 88.97 %
epoch 290 : 89.15 %
epoch 300 : 89.25 %
epoch 310 : 89.38 %
epoch 320 : 89.47 %
epoch 330 : 89.57 %
epoch 340 : 89.66 %
epoch 350 : 89.78 %
epoch 360 : 89.85 %
epoch 370 : 89.93 %
epoch 380 : 90.02 %
epoch 390 : 90.09 %
epoch 400 : 90.18 %
epoch 410 : 90.23 %
epoch 420 : 90.31 %
epoch 430 : 90.42 %
epoch 440 : 90.55 %
epoch 450 : 90.61 %
epoch 460 : 90.7 %
epoch 470 : 90.76 %
epoch 480 : 90.82 %
epoch 490 : 90.9 %
epoch 500 : 90.95

In [8]:
# Lower the step size and keep training the existing network for another 1000 epochs.
# NOTE(review): the output recorded below starts at 95.17 % while the previous
# recorded log ends at 90.95 %, and there is no In [7] cell -- an intermediate
# training run appears to be missing from the notebook; confirm under Restart & Run All.
nn.set_learning_rate(0.01)
nn.train_epochs(1000)

epoch 0 : 95.17 %
epoch 10 : 95.17 %
epoch 20 : 95.17 %
epoch 30 : 95.17 %
epoch 40 : 95.17 %
epoch 50 : 95.18 %
epoch 60 : 95.17 %
epoch 70 : 95.19 %
epoch 80 : 95.19 %
epoch 90 : 95.19 %
epoch 100 : 95.19 %
epoch 110 : 95.2 %
epoch 120 : 95.21 %
epoch 130 : 95.21 %
epoch 140 : 95.21 %
epoch 150 : 95.21 %
epoch 160 : 95.21 %
epoch 170 : 95.21 %
epoch 180 : 95.21 %
epoch 190 : 95.22 %
epoch 200 : 95.22 %
epoch 210 : 95.23 %
epoch 220 : 95.24 %
epoch 230 : 95.24 %
epoch 240 : 95.25 %
epoch 250 : 95.25 %
epoch 260 : 95.26 %
epoch 270 : 95.26 %
epoch 280 : 95.26 %
epoch 290 : 95.26 %
epoch 300 : 95.26 %
epoch 310 : 95.27 %
epoch 320 : 95.27 %
epoch 330 : 95.27 %
epoch 340 : 95.27 %
epoch 350 : 95.28 %
epoch 360 : 95.28 %
epoch 370 : 95.28 %
epoch 380 : 95.28 %
epoch 390 : 95.28 %
epoch 400 : 95.28 %
epoch 410 : 95.29 %
epoch 420 : 95.3 %
epoch 430 : 95.3 %
epoch 440 : 95.3 %
epoch 450 : 95.3 %
epoch 460 : 95.31 %
epoch 470 : 95.31 %
epoch 480 : 95.31 %
epoch 490 : 95.32 %
epoch 500 : 95.3