In [159]:
import pandas as pd
import numpy as np
import math
from layer import *  # HiddenLayer, LastLayer
from network import * # Network
from sklearn.metrics import accuracy_score
import warnings
from collections import defaultdict

In [160]:
# Read the dataset from "mnist_data.csv", print its (rows, columns) shape and
# preview the first rows.
data = pd.read_csv("mnist_data.csv")
print(data.shape)
data.head()

(42000, 785)


Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
# Drop every row that contains at least one missing value, then show the
# resulting shape so it can be compared with the shape printed at load time.
data = data.dropna(how="any")
data.shape

(42000, 785)

In [162]:
# Count the rows per label value to see how balanced the classes are.
data["label"].value_counts()

label
1    4684
7    4401
3    4351
9    4188
2    4177
6    4137
0    4132
4    4072
8    4063
5    3795
Name: count, dtype: int64

In [163]:
# balancing categories
def balance_df(df, label_col):
    """Downsample every class to the size of the rarest class.

    For each distinct value in ``df[label_col]`` (in order of first
    appearance) the first ``n`` matching rows are kept, where ``n`` is the
    smallest per-class row count. No random sampling is involved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column.
    label_col : str
        Name of the column holding the class labels.

    Returns
    -------
    pandas.DataFrame
        The kept rows, grouped by label, with a fresh 0..N-1 index.
    """
    # Size of the rarest class; every class is trimmed down to this many rows.
    rows_per_class = df[label_col].value_counts().min()

    trimmed_groups = [
        df[df[label_col] == class_value].reset_index(drop=True).head(rows_per_class)
        for class_value in df[label_col].unique()
    ]

    return pd.concat(trimmed_groups).reset_index(drop=True)

In [164]:
# Trim every label class down to the size of the rarest one (see balance_df).
data = balance_df(data, "label")

In [165]:
# Shuffle the rows: frac=1 keeps every row, and the fixed random_state makes
# the permutation reproducible. Index labels are not reset.
data = data.sample(frac=1, random_state=9499)
data

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
27138,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1886,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16039,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26010,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30721,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10504,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6425,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27144,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1115,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [166]:
# Sanity check: total number of missing cells across the whole frame.
print("Total NaN:", data.isna().sum().sum())

Total NaN: 0


In [167]:
# One-hot encode the "label" column into integer indicator columns
# (label_0 ... label_9 in the output below); the original column is replaced.
data = pd.get_dummies(data, columns=["label"], dtype=int)
data

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9
27138,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1886,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
16039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
26010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30721,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10504,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6425,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
27144,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1115,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [168]:
# Hold-out split on the already shuffled frame: the first N_TRAIN rows train
# the network and the last N_TEST rows evaluate it.
N_TRAIN = 30000
N_TEST = 2000

# head() and tail() overlap silently when the frame has fewer than
# N_TRAIN + N_TEST rows, which would leak training rows into the test set.
assert len(data) >= N_TRAIN + N_TEST, (
    f"need at least {N_TRAIN + N_TEST} rows for a disjoint split, got {len(data)}"
)

train = data.head(N_TRAIN)
test = data.tail(N_TEST)

# Names of the one-hot target columns produced by pd.get_dummies.
label_cols = [f"label_{i}" for i in range(10)]

train_x = train.drop(label_cols, axis=1)
train_y = train[label_cols]

test_x = test.drop(label_cols, axis=1)
test_y = test[label_cols]

In [169]:
# Convert to NumPy and transpose so that each column is one sample.
# Inputs are divided by 255 (presumably 0-255 pixel intensities -> [0, 1]; confirm).
# test_y is not converted here and stays a DataFrame.
train_x = np.array(train_x).T / 255
train_y = np.array(train_y).T
test_x = np.array(test_x).T / 255

# Print the scaled test matrix as a quick visual check.
print(test_x)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [170]:
# Activation functions and their derivatives
def tanh(x):
    """Hyperbolic tangent activation, applied element-wise via ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def tanh_derivative(x):
    """Derivative of tanh with respect to its input: ``1 - tanh(x)**2``.

    ``x`` is the pre-activation value, not the tanh output.
    """
    tanh_value = np.tanh(x)
    return 1 - tanh_value ** 2

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def sigmoid_derivative(x):
    """Derivative of the logistic sigmoid with respect to its input.

    Computed as ``s * (1 - s)`` where ``s = sigmoid(x)``; ``x`` is the
    pre-activation value, not the sigmoid output.
    """
    activation = sigmoid(x)
    return activation * (1 - activation)

def hard_sigmoid(x):
    """Sigmoid of the input after clamping it to the interval [-5, 5].

    Note that this is a *clipped* logistic sigmoid (the output saturates at
    ``sigmoid(-5)`` and ``sigmoid(5)``), not the piecewise-linear function that
    is sometimes called "hard sigmoid".

    Parameters
    ----------
    x : scalar or array-like
        Pre-activation values. ``np.clip`` is used instead of ``x.copy()``
        followed by two ``np.where`` passes, so plain Python numbers and lists
        are accepted in addition to NumPy arrays. The input is never modified.

    Returns
    -------
    numpy scalar or numpy.ndarray
        ``sigmoid(clip(x, -5, 5))`` with the same shape as ``x``.
    """
    clamped = np.clip(x, -5, 5)
    return sigmoid(clamped)

def hard_sigmoid_derivative(x):
    """Derivative of ``hard_sigmoid`` with respect to its input.

    The gradient is 0 wherever the input lies in the clamped region
    (``x <= -5`` or ``x >= 5``) and equals ``sigmoid_derivative(x)`` elsewhere.
    """
    saturated = (x <= -5) | (x >= 5)
    inner_gradient = sigmoid_derivative(x)
    return np.where(saturated, 0, inner_gradient)


    

def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Derivative of ReLU: 1 where ``x > 0``, otherwise 0 (including at 0).

    The result is an integer array of ones and zeros shaped like ``x``.
    """
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

def log_softmax(x):
    """Numerically stable log-softmax along axis 0.

    Computes ``x - log(sum(exp(x), axis=0))`` column by column. The column
    maximum is subtracted before exponentiating (log-sum-exp trick), so large
    logits no longer overflow ``np.exp`` and produce ``-inf`` results.

    The previous ``try/except RuntimeWarning`` has been removed: under the
    default warning filters it never triggered, and when warnings were promoted
    to errors it left ``exp_values`` unbound, so the function failed with
    ``UnboundLocalError``.

    Parameters
    ----------
    x : array-like
        Logits; the softmax is taken over axis 0.

    Returns
    -------
    numpy.ndarray
        Log-probabilities with the same shape as ``x``; ``exp`` of each
        column sums to 1.
    """
    x = np.asarray(x)
    # Shifting by the per-column maximum leaves the result unchanged
    # mathematically but keeps every exponent <= 0.
    shifted = x - np.max(x, axis=0, keepdims=True)
    log_normaliser = np.log(np.sum(np.exp(shifted), axis=0, keepdims=True))
    return shifted - log_normaliser

def log_softmax_derivative(x):
    """Element-wise ``1 - softmax(x)``, with the softmax taken along axis 0.

    This is the derivative of each log-softmax output with respect to its own
    input (the diagonal terms only). The per-column maximum is subtracted
    before exponentiating to avoid overflow.
    """
    column_peak = np.max(x, axis=0, keepdims=True)
    exps = np.exp(x - column_peak)
    # Sum of the exponentials of all *other* entries in the same column.
    others = np.sum(exps, axis=0, keepdims=True) - exps
    softmax_value = exps / (exps + others)
    return 1 - softmax_value


In [171]:
# Layer stack 784 -> 128 -> 64 -> 10: two hidden layers with ReLU and an output
# layer using the clipped sigmoid, each paired with its derivative function.
# HiddenLayer / LastLayer come from the local `layer` module; reading the size
# lists as [inputs, outputs] is an assumption from how they chain — TODO confirm.
hidden1 = HiddenLayer([784, 128], relu, relu_derivative)
hidden2 = HiddenLayer([128, 64], relu, relu_derivative)
out = LastLayer([64,10], hard_sigmoid, hard_sigmoid_derivative)

In [172]:
# Assemble the three layers into a network and run the first training phase.
# NOTE(review): no random seed is set in the visible cells; if the layers
# initialise their weights randomly, the reported error will vary between
# runs — confirm in layer.py / network.py.
net = Network([hidden1, hidden2, out])
net.fit(train_x, train_y, batch_size=100, epochs=30, learning_rate=0.1)

Epoch: 29	Error:0.019675316220614763

In [173]:
# Second training phase on the same network object: fewer epochs and half the
# learning rate. Presumably fit() continues from the current weights rather
# than re-initialising them — confirm in network.py.
net.fit(train_x, train_y, batch_size=100, epochs=10, learning_rate=0.05)

Epoch: 9	Error:0.018482245168303296

In [174]:
# Run the test inputs through the network and keep the last element of the
# result (presumably the output-layer activations — confirm in network.py).
y_pred = net.forward_propagate(test_x)[-1]

In [175]:
# Decode network outputs and one-hot targets into class indices.
# y_pred has one column per sample, so the arg-max runs along axis 0 (this
# replaces the explicit transpose); test_y has one row per sample, so it runs
# along axis 1. np.argmax returns the first maximal index, which matches the
# tie-breaking of list.index(max) used previously, and .tolist() keeps the
# results as plain lists of ints.
# This cell overwrites y_pred and test_y, so run it once per forward pass.
y_pred = np.argmax(np.asarray(y_pred), axis=0).tolist()
test_y = np.argmax(np.asarray(test_y), axis=1).tolist()

In [176]:
# Fraction of test samples whose predicted class index equals the true one.
accuracy_score(test_y, y_pred)

0.954