In [6]:
# Load the dataset
import numpy as np
import matplotlib.pyplot as plt

X_train = np.loadtxt("datasets/mnist_small_train_in.txt", delimiter=',', usecols=range(784))
y_train = np.loadtxt("datasets/mnist_small_train_out.txt")
X_test = np.loadtxt("datasets/mnist_small_test_in.txt", delimiter=',', usecols=range(784))
y_test = np.loadtxt("datasets/mnist_small_test_out.txt")

In [7]:
# One hot encoding for labels
def one_hot_encoding(y_train):
    n = y_train.shape[0]
    y_onehot = np.zeros((n, 10))
    for i in range(n):
        number = int(y_train[i])
        y_onehot[i][number] = 1
    return y_onehot.T

# activation function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
    
# for classification at the output layer
def softmax(z):
    return np.exp(z) / np.sum(np.exp(z), axis=0)
    

# categorical cross entropy loss function
def compute_loss(Y, pred):
    return -np.sum(Y * np.log(pred))
    

In [28]:
learning_rate = 2.
h1 = 500 # number of neurons in the first hidden layer
h2 = 25 # number of neurons in the second hidden layer
n = X_train.shape[1] # number of features, i.e. number of neurons in the input layer
d = 10 # number of digits, i.e. number of neurons in the output layer
m = X_train.shape[0] # number of training examples

batch_size = 128
batches = m // batch_size


# Initialize weight and bias parameters for first and second layer
w1 = np.random.randn(h1, n) * np.sqrt(1. / n) # divide by the variance for better initialization
b1 = np.zeros((h1, 1)) * np.sqrt(1. / n)
w2 = np.random.randn(h2, h1) * np.sqrt(1. / h1)
b2 = np.zeros((h2, 1)) * np.sqrt(1. / h1)
w3 = np.random.randn(d, h2) * np.sqrt(1. / h2)
b3 = np.zeros((d, 1)) * np.sqrt(1. / h2)

X = X_train.T
Y = one_hot_encoding(y_train)

# for gradient descent with momentum
b = .9
dw3_mom = np.zeros(w3.shape)
db3_mom = np.zeros(b3.shape)
dw2_mom = np.zeros(w2.shape)
db2_mom = np.zeros(b2.shape)
dw1_mom = np.zeros(w1.shape)
db1_mom = np.zeros(b1.shape)

# train
for i in range(3000): # number of epochs
    
    permutation = np.random.permutation(X.shape[1])
    X = X[:, permutation]
    Y = Y[:, permutation]
    
    
    # forward pass
    z1 = w1@X + b1
    a1 = sigmoid(z1)
    z2 = w2@a1 + b2
    a2 = sigmoid(z2)
    z3 = w3@a2 + b3
    a3 = softmax(z3)
    
    loss = compute_loss(Y, a3)
    
    # backward pass
    dz3 = a3 - Y
    dw3 = (dz3@a2.T) / m
    db3 = dz3@np.ones((m, 1)) / m 
    
    da2 = w3.T@dz3
    dz2 = da2 * sigmoid(z2) * (1 - sigmoid(z2)) 
    dw2 = dz2@a1.T / m
    db2 = dz2@np.ones((m ,1)) / m  
    
    da1 = w2.T@dz2
    dz1 = da1 * sigmoid(z1) * (1 - sigmoid(z1)) 
    dw1 = dz1@X.T / m
    db1 = dz1@np.ones((m ,1)) / m  
    
    # optimization of parameters
    dw3_mom = (b * dw3_mom) + (1 - b) * dw3
    db3_mom = (b * db3_mom) + (1 - b) * db3
    dw2_mom = (b * dw2_mom) + (1 - b) * dw2
    db2_mom = (b * db2_mom) + (1 - b) * db2
    dw1_mom = (b * dw1_mom) + (1 - b) * dw1
    db1_mom = (b * db1_mom) + (1 - b) * db1
    
    w3 = w3 - learning_rate * dw3_mom
    b3 = b3 - learning_rate * db3_mom
    w2 = w2 - learning_rate * dw2_mom
    b2 = b2 - learning_rate * db2_mom
    w1 = w1 - learning_rate * dw1_mom
    b1 = b1 - learning_rate * db1_mom
    
    if i % 500 == 0:
        print(i, loss)

0 15259.964317734743
500 788.5058954680916
1000 215.4832272953175
1500 81.88800600951497
2000 42.861859321574485
2500 27.906208832578013


In [29]:
from sklearn.metrics import classification_report

# Test on the test set
Y_test = one_hot_encoding(y_test)
X = X_test.T

# forward pass with the test data
z1 = w1@X + b1
a1 = sigmoid(z1)
z2 = w2@a1 + b2
a2 = sigmoid(z2)
z3 = w3@a2 + b3
a3 = softmax(z3)

predictions = np.argmax(a3, axis=0)
labels = np.argmax(Y_test, axis=0)
print(classification_report(predictions, labels))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95        99
           1       0.99      0.95      0.97       119
           2       0.92      0.91      0.92       105
           3       0.89      0.92      0.90        98
           4       0.87      0.91      0.89        94
           5       0.82      0.86      0.84        86
           6       0.95      0.94      0.94        97
           7       0.94      0.91      0.92       107
           8       0.92      0.93      0.92        97
           9       0.90      0.89      0.90       102

    accuracy                           0.92      1004
   macro avg       0.92      0.92      0.92      1004
weighted avg       0.92      0.92      0.92      1004

