In [None]:
# This copy of the notebook was created to test replacing the hard-coded layer sizes with variables.
# Outcome: the constants were successfully replaced by neurons_1, neurons_2 and neurons_3.
# Known issue: training no longer converges as well as it did before; it probably needs hyperparameter tuning.

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Read the training CSV into a DataFrame.
# NOTE(review): read_csv treats the first row as a header by default - confirm the file actually has one.
df_train = pd.read_csv('mnist_train.csv')

# Transpose so that every column is one sample.
train = df_train.to_numpy().T

# Row 0 is used as the label row, the remaining rows as the network input.
# Only the first 10000 samples are kept.
train_data = train[1:, :10000]
train_labels = train[0, :10000]

print(train_data.shape)

# Initialization
num_of_input_neurons, num_of_train = train_data.shape

# Layer sizes: two hidden layers and the output layer
neurons_1 = 50
neurons_2 = 50
neurons_3 = 10

# Seeded generator so the weight initialisation (and therefore a run) is reproducible
SEED = 42
rng = np.random.default_rng(SEED)

# He initialisation for ReLU layers: std = sqrt(2 / fan_in), where fan_in is the
# number of inputs feeding the layer (not the 784 input pixels for every layer).
W_2 = rng.normal(loc=0.0, scale=np.sqrt(2 / num_of_input_neurons), size=(neurons_1, num_of_input_neurons))
W_3 = rng.normal(loc=0.0, scale=np.sqrt(2 / neurons_1), size=(neurons_2, neurons_1))
W_4 = rng.normal(loc=0.0, scale=np.sqrt(2 / neurons_2), size=(neurons_3, neurons_2))

b_2 = np.zeros(neurons_1)
b_3 = np.zeros(neurons_2)
b_4 = np.zeros(neurons_3)

# Per-batch gradient accumulators (same shapes as the parameters they belong to)
dCdW_2_store = np.zeros((neurons_1, num_of_input_neurons))
dCdW_3_store = np.zeros((neurons_2, neurons_1))
dCdW_4_store = np.zeros((neurons_3, neurons_2))

dCdb_2_store = np.zeros(neurons_1)
dCdb_3_store = np.zeros(neurons_2)
dCdb_4_store = np.zeros(neurons_3)

# Per-batch loss accumulator and per-epoch loss history
C_store = 0
C_epoch = np.array([])

# ADAM m Parameter (first-moment estimates)
m_W_2 = np.zeros((neurons_1, num_of_input_neurons))
m_W_3 = np.zeros((neurons_2, neurons_1))
m_W_4 = np.zeros((neurons_3, neurons_2))

m_b_2 = np.zeros(neurons_1)
m_b_3 = np.zeros(neurons_2)
m_b_4 = np.zeros(neurons_3)

# ADAM v Parameter (second-moment estimates)
v_W_2 = np.zeros((neurons_1, num_of_input_neurons))
v_W_3 = np.zeros((neurons_2, neurons_1))
v_W_4 = np.zeros((neurons_3, neurons_2))

v_b_2 = np.zeros(neurons_1)
v_b_3 = np.zeros(neurons_2)
v_b_4 = np.zeros(neurons_3)

# ADAM Timestep (number of parameter updates performed so far)
time = 0

# HYPERPARAMETERS
step = 0.0001  # alpha = step
epoch = 0
batch_size = 20
total_epochs = 100

# ADAM parameters
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1.e-8


###############################################################################################################
###################################################FUNCTIONS###################################################
###############################################################################################################

def ReLu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    rectified = np.maximum(0, z)
    return rectified


def softmax(z):
    """Return the softmax of a 1-D score vector.

    The input is left untouched: the shift by the maximum is applied to a new
    array instead of being written back into ``z``.

    Parameters
    ----------
    z : array_like
        1-D vector of scores.

    Returns
    -------
    numpy.ndarray
        Non-negative values with the same shape as ``z`` that sum to 1.
    """
    z = np.asarray(z)
    # Subtracting the maximum does not change the result mathematically,
    # but keeps exp() from overflowing for large scores.
    shifted = z - np.max(z)
    exps = np.exp(shifted)
    return exps / np.sum(exps)


def grad_ReLu(z):
    """Derivative of ReLU: 1 where ``z`` > 0, otherwise 0 (including at ``z`` == 0)."""
    value_at_zero = 0
    return np.heaviside(z, value_at_zero)


def calc_error(a_out, true_val):
    """Cross-entropy loss of one sample and its one-hot target vector.

    Parameters
    ----------
    a_out : sequence of float
        Network output (class probabilities).
    true_val : int
        Index of the correct class.

    Returns
    -------
    C : float
        ``-log(a_out[true_val])``.
    t : numpy.ndarray
        One-hot vector with the same length as ``a_out`` and a 1 at ``true_val``.
    """
    # Size the target from the network output instead of a hard-coded 10,
    # so it keeps matching when the number of output neurons changes.
    t = np.zeros(len(a_out))
    t[true_val] = 1

    C = -np.log(a_out[true_val])
    return C, t


def forward_prop(W_2, W_3, W_4, b_2, b_3, b_4, a_in):
    """Run one input vector through the three-layer network.

    Each layer computes ``z = W @ a_prev + b`` followed by ReLU; the last
    activation is then passed through softmax.

    Parameters
    ----------
    W_2, W_3, W_4 : weight matrices of the three layers.
    b_2, b_3, b_4 : bias vectors of the three layers.
    a_in : input vector.

    Returns
    -------
    a_2, a_3, a_4 : ReLU activations of the three layers.
    a_out : softmax of ``a_4``.
    z_2, z_3, z_4 : pre-activation values of the three layers.
    """
    z_2 = W_2 @ a_in + b_2
    a_2 = ReLu(z_2)

    z_3 = W_3 @ a_2 + b_3
    a_3 = ReLu(z_3)

    z_4 = W_4 @ a_3 + b_4
    # NOTE(review): applying ReLU to the output layer before softmax is unusual;
    # feeding z_4 straight into softmax is the common choice. Changing it here
    # would also require dropping grad_ReLu(z_4) from del_4 in back_prop.
    a_4 = ReLu(z_4)

    # Pass a copy so the a_4 handed back to the caller is exactly ReLu(z_4),
    # regardless of whether softmax modifies its argument.
    a_out = softmax(a_4.copy())
    return a_2, a_3, a_4, a_out, z_2, z_3, z_4


def back_prop(a_in, a_2, a_3, a_4, a_out, z_2, z_3, z_4, t, W_2, W_3, W_4):
    """Back-propagate the error of one sample through the three layers.

    Returns ``dCdW_4, dCdW_3, dCdW_2, dCdb_4, dCdb_3, dCdb_2``.
    ``dCdb_4`` is a 1-D vector, whereas ``dCdb_3`` and ``dCdb_2`` are
    single-column 2-D arrays. ``a_4`` and ``W_2`` are accepted but not used.
    """

    def as_column(vec):
        # (n,) -> (n, 1)
        return vec.reshape(len(vec), 1)

    def as_row(vec):
        # (n,) -> (1, n)
        return vec.reshape(1, len(vec))

    # Output layer: the delta stays 1-D, a column view is used for the matrix products
    delta_out = grad_ReLu(z_4) * (a_out - t)
    delta_out_col = as_column(delta_out)
    dCdW_4 = delta_out_col @ as_row(a_3)
    dCdb_4 = delta_out

    # Second hidden layer: delta is a column vector
    delta_hidden_2 = as_column(grad_ReLu(z_3)) * (W_4.T @ delta_out_col)
    dCdW_3 = delta_hidden_2 @ as_row(a_2)
    dCdb_3 = delta_hidden_2

    # First hidden layer: delta is a column vector
    delta_hidden_1 = as_column(grad_ReLu(z_2)) * (W_3.T @ delta_hidden_2)
    dCdW_2 = delta_hidden_1 @ as_row(a_in)
    dCdb_2 = delta_hidden_1
    return dCdW_4, dCdW_3, dCdW_2, dCdb_4, dCdb_3, dCdb_2


def store_grad(dCdW_4, dCdW_3, dCdW_2, dCdb_4, dCdb_3, dCdb_2,
               dCdW_4_store, dCdW_3_store, dCdW_2_store,
               dCdb_4_store, dCdb_3_store, dCdb_2_store):
    """Add one sample's gradients onto the running accumulators.

    The accumulation uses ``+=``, so array accumulators are updated in place;
    the accumulators are also returned, in the order
    ``dCdW_4_store, dCdW_3_store, dCdW_2_store, dCdb_4_store, dCdb_3_store, dCdb_2_store``.
    """
    # Weight gradients
    dCdW_4_store += dCdW_4
    dCdW_3_store += dCdW_3
    dCdW_2_store += dCdW_2

    # Bias gradients: layers 3 and 2 arrive as 2-D arrays, only their first column is added
    dCdb_4_store += dCdb_4
    first_col_b_3 = dCdb_3[:, 0]
    dCdb_3_store += first_col_b_3
    first_col_b_2 = dCdb_2[:, 0]
    dCdb_2_store += first_col_b_2

    accumulators = (dCdW_4_store, dCdW_3_store, dCdW_2_store,
                    dCdb_4_store, dCdb_3_store, dCdb_2_store)
    return accumulators


def update_grad(dCdW_4_store, dCdW_3_store, dCdW_2_store, dCdb_4_store,
                dCdb_3_store, dCdb_2_store, batch_size):
    """Divide every accumulated gradient by ``batch_size``.

    Returns the six quotients in the same order as the arguments; the inputs
    themselves are not modified.
    """
    accumulated = (dCdW_4_store, dCdW_3_store, dCdW_2_store,
                   dCdb_4_store, dCdb_3_store, dCdb_2_store)
    return tuple(total / batch_size for total in accumulated)


def update_weight_bias(W_2, W_3, W_4, b_2, b_3, b_4, dCdW_4_store, dCdW_3_store, dCdW_2_store, dCdb_4_store,
                       dCdb_3_store, dCdb_2_store, step):
    """Plain gradient-descent update: ``parameter - step * gradient`` for every weight and bias.

    Returns the new ``W_2, W_3, W_4, b_2, b_3, b_4``; the inputs are not modified.
    """

    def descend(parameter, gradient):
        # One gradient-descent step for a single parameter
        return parameter - step * gradient

    return (
        descend(W_2, dCdW_2_store),
        descend(W_3, dCdW_3_store),
        descend(W_4, dCdW_4_store),
        descend(b_2, dCdb_2_store),
        descend(b_3, dCdb_3_store),
        descend(b_4, dCdb_4_store),
    )


###############################################################################################################
###################################################TRAINING####################################################
###############################################################################################################


# Train with mini-batch ADAM. After every epoch the whole training set is
# re-classified, and training stops early once fewer than `npercent` of the
# training samples are misclassified.
npercent = 0.03

for i in range(0, total_epochs):  # Total epochs
    C_epoch = np.append(C_epoch, 0)

    for j in range(0, int(num_of_train / batch_size)):

        # Accumulate the per-sample gradients and loss of one mini-batch
        for k in range(0, batch_size):
            sample_idx = j * batch_size + k
            a_in = train_data[:, sample_idx] / 255
            true_val = train_labels[sample_idx]

            a_2, a_3, a_4, a_out, z_2, z_3, z_4 = forward_prop(W_2, W_3, W_4, b_2, b_3, b_4, a_in)
            C, t = calc_error(a_out, true_val)
            dCdW_4, dCdW_3, dCdW_2, dCdb_4, dCdb_3, dCdb_2 = back_prop(
                a_in, a_2, a_3, a_4, a_out, z_2, z_3, z_4, t, W_2, W_3, W_4)
            (dCdW_4_store, dCdW_3_store, dCdW_2_store,
             dCdb_4_store, dCdb_3_store, dCdb_2_store) = store_grad(
                dCdW_4, dCdW_3, dCdW_2, dCdb_4, dCdb_3, dCdb_2,
                dCdW_4_store, dCdW_3_store, dCdW_2_store,
                dCdb_4_store, dCdb_3_store, dCdb_2_store)
            C_store = C_store + C

        # Turn the accumulated sums into mini-batch averages
        (dCdW_4_store, dCdW_3_store, dCdW_2_store,
         dCdb_4_store, dCdb_3_store, dCdb_2_store) = update_grad(
            dCdW_4_store, dCdW_3_store, dCdW_2_store,
            dCdb_4_store, dCdb_3_store, dCdb_2_store, batch_size)

        #####################  ADAM OPTIMIZER  #####################
        time = time + 1

        # UPDATE m
        m_W_2 = m_W_2 * beta_1 + (1 - beta_1) * dCdW_2_store
        m_W_3 = m_W_3 * beta_1 + (1 - beta_1) * dCdW_3_store
        m_W_4 = m_W_4 * beta_1 + (1 - beta_1) * dCdW_4_store

        m_b_2 = m_b_2 * beta_1 + (1 - beta_1) * dCdb_2_store
        m_b_3 = m_b_3 * beta_1 + (1 - beta_1) * dCdb_3_store
        m_b_4 = m_b_4 * beta_1 + (1 - beta_1) * dCdb_4_store

        # Update v
        v_W_2 = v_W_2 * beta_2 + (1 - beta_2) * np.square(dCdW_2_store)
        v_W_3 = v_W_3 * beta_2 + (1 - beta_2) * np.square(dCdW_3_store)
        v_W_4 = v_W_4 * beta_2 + (1 - beta_2) * np.square(dCdW_4_store)

        v_b_2 = v_b_2 * beta_2 + (1 - beta_2) * np.square(dCdb_2_store)
        v_b_3 = v_b_3 * beta_2 + (1 - beta_2) * np.square(dCdb_3_store)
        v_b_4 = v_b_4 * beta_2 + (1 - beta_2) * np.square(dCdb_4_store)

        # Bias corrected m
        m_W_2_cor = m_W_2 / (1 - (beta_1 ** time))
        m_W_3_cor = m_W_3 / (1 - (beta_1 ** time))
        m_W_4_cor = m_W_4 / (1 - (beta_1 ** time))

        m_b_2_cor = m_b_2 / (1 - (beta_1 ** time))
        m_b_3_cor = m_b_3 / (1 - (beta_1 ** time))
        m_b_4_cor = m_b_4 / (1 - (beta_1 ** time))

        # Bias corrected v
        v_W_2_cor = v_W_2 / (1 - (beta_2 ** time))
        v_W_3_cor = v_W_3 / (1 - (beta_2 ** time))
        v_W_4_cor = v_W_4 / (1 - (beta_2 ** time))

        v_b_2_cor = v_b_2 / (1 - (beta_2 ** time))
        v_b_3_cor = v_b_3 / (1 - (beta_2 ** time))
        v_b_4_cor = v_b_4 / (1 - (beta_2 ** time))

        # UPDATE WEIGHT AND BIAS
        W_2 = W_2 - step * m_W_2_cor / (np.sqrt(v_W_2_cor) + epsilon)
        W_3 = W_3 - step * m_W_3_cor / (np.sqrt(v_W_3_cor) + epsilon)
        W_4 = W_4 - step * m_W_4_cor / (np.sqrt(v_W_4_cor) + epsilon)

        b_2 = b_2 - step * m_b_2_cor / (np.sqrt(v_b_2_cor) + epsilon)
        b_3 = b_3 - step * m_b_3_cor / (np.sqrt(v_b_3_cor) + epsilon)
        b_4 = b_4 - step * m_b_4_cor / (np.sqrt(v_b_4_cor) + epsilon)

        # (update_weight_bias() is the plain gradient-descent alternative to the ADAM step above.)

        C_epoch[i] += C_store

        # Empty stores for next batch
        dCdW_2_store = np.zeros((neurons_1, num_of_input_neurons))
        dCdW_3_store = np.zeros((neurons_2, neurons_1))
        dCdW_4_store = np.zeros((neurons_3, neurons_2))

        dCdb_2_store = np.zeros(neurons_1)
        dCdb_3_store = np.zeros(neurons_2)
        dCdb_4_store = np.zeros(neurons_3)

        C_store = 0

    epoch += 1

    # Re-classify the whole training set with the updated parameters.
    # The array is preallocated: np.append inside the loop would copy the
    # whole array on every iteration.
    pred = np.zeros(num_of_train, dtype=int)
    for cnt in range(0, num_of_train):
        a_in = train_data[:, cnt] / 255
        a_2, a_3, a_4, a_out, z_2, z_3, z_4 = forward_prop(W_2, W_3, W_4, b_2, b_3, b_4, a_in)
        pred[cnt] = np.argmax(a_out)

    num_misclassified = np.count_nonzero(pred != train_labels)

    print('------------------------------------------')
    print("Finished Epoch " + str(epoch) + " of " + str(total_epochs))
    print('\n')
    print("Global Error for Epoch #" + str(epoch) + ' is: ' + str(C_epoch[i] / num_of_train))
    print("Number of missclassified: " + str(num_misclassified) + " out of " + str(num_of_train))
    print('------------------------------------------')
    print('\n')

    # Stop once fewer than npercent of the training samples actually used are
    # misclassified (never after the very first epoch).
    if i > 0 and num_misclassified < npercent * num_of_train:
        print('Reached ' + str(npercent * 100) + ' percent missclassifications. Stopping...')
        break

# Mean loss per training sample for every completed epoch (epochs numbered from 1)
plt.plot(np.array(range(0, i + 1)) + 1, C_epoch / num_of_train, marker='o', markersize=7.5, markerfacecolor='r')
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Global Error')
plt.title('Epoch vs. Global Error')
plt.show()





In [None]:
# Number of training samples whose prediction differs from its label
np.count_nonzero(train_labels != pred)

In [None]:
# Read the test CSV and transpose it so that every column is one sample.
df_test = pd.read_csv('mnist_test.csv')
test = df_test.to_numpy().T

# Row 0 is used as the label row, the remaining rows as the network input.
test_data = test[1:, :]
test_labels = test[0, :]

In [None]:
#TESTING THE MODEL
# Classify every test sample with the trained parameters and report the error rate.

dummy, num_of_test = test_data.shape  # only the sample count (second dimension) is needed

# Preallocate the prediction array: np.append inside the loop would copy the
# whole array on every iteration.
predictions = np.zeros(num_of_test, dtype=int)

for cnt in range(0, num_of_test):
    a_in = test_data[:, cnt] / 255
    a_2, a_3, a_4, a_out, z_2, z_3, z_4 = forward_prop(W_2, W_3, W_4, b_2, b_3, b_4, a_in)
    predictions[cnt] = np.argmax(a_out)

num_of_missclassified = np.count_nonzero(predictions != test_labels)
print("Number of missclassified from Test Data: " + str(num_of_missclassified) + " out of " + str(num_of_test))
print('\n')
# Computed directly as misclassified / total; the former "100 - accuracy" form
# introduced floating-point cancellation noise in the printed value.
print('Error rate: ' + str(100 * num_of_missclassified / num_of_test) + ' percent')
