In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/prathameshks/ML-From-Scratch/main/MNIST_DATA/train.csv')

In [3]:
data = np.array(data)
m, n = data.shape
np.random.shuffle(data) # shuffle before splitting into test and training sets

data_test = data[0:1000].T
Y_test = data_test[0]
X_test = data_test[1:n]
X_test = X_test / 255.

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.
_,m_train = X_train.shape

In [4]:
print(f"{X_train.shape = }")
print(f"{X_test.shape = }")
print(f"{Y_train.shape = }")
print(f"{Y_test.shape = }")

X_train.shape = (784, 41000)
X_test.shape = (784, 1000)
Y_train.shape = (41000,)
Y_test.shape = (1000,)


In [5]:
class NeuralNetwork:
    def __init__(self):
        self.w1 = np.random.rand(16, 784) - 0.5
        self.b1 = np.random.rand(16, 1) - 0.5
        self.w2 = np.random.rand(16, 16) - 0.5
        self.b2 = np.random.rand(16, 1) - 0.5 
        self.w3 = np.random.rand(10, 16) - 0.5
        self.b3 = np.random.rand(10, 1) - 0.5        

    def ReLU(self, x):
        return np.maximum(0, x)

    def softMax(self, x):
        # limit overflow
        exp_x = np.exp(np.maximum(x, -60))  # Clip values to prevent overflow
        sum_exp_x = np.sum(exp_x, axis=0)
        return exp_x / sum_exp_x

    def one_hot(self,y):
        one_hot_y = np.zeros((y.size , y.max() + 1))
        one_hot_y[np.arange(y.size) , y] = 1
        one_hot_y = one_hot_y.T
        return one_hot_y

    def deriv_ReLU(self,x):
        return x>0
    
    def forward_prop(self, X):
        z1 = self.w1.dot(X) + self.b1
        a1 = self.ReLU(z1)
        z2 = self.w2.dot(a1) + self.b2
        a2 = self.ReLU(z2)
        z3 = self.w3.dot(a2) + self.b3
        a3 = self.softMax(z3)

        return z1,a1,z2,a2,z3,a3

    def back_prop(self, z1, a1, z2, a2,z3,a3,x, y):
        one_hot_y = self.one_hot(y)
        dz3 = a3 - one_hot_y
        dw3 = 1/m * dz3.dot(a2.T)
        db3 = 1/m * np.sum(dz3)
        
        dz2 = self.w3.T.dot(dz3) * self.deriv_ReLU(z2)
        dw2 = 1/ m * dz2.dot(a1.T)
        db2 = 1 / m * np.sum(dz2)

        dz1 = self.w2.T.dot(dz2) * self.deriv_ReLU(z1)
        dw1 = 1 / m * dz1.dot(x.T)
        db1 = 1 / m * np.sum(dz1)

        return dw1,db1,dw2,db2,dw3,db3

    def update_params(self, dw1,db1,dw2,db2,dw3,db3,alpha):
        self.w1 = self.w1 - alpha*dw1
        self.b1 = self.b1 - alpha*db1
        self.w2 = self.w2 - alpha*dw2
        self.b2 = self.b2 - alpha*db2
        self.w3 = self.w3 - alpha*dw3 
        self.b3 = self.b3 - alpha*db3

    def get_predictions(self, a):
        return np.argmax(a, 0)

    def get_accuracy(self, predictions, y):
        # print(predictions, y)
        return np.sum(predictions == y) / y.size

    def gradient_decent(self,x,y,iterations,alpha):
        for i in range(iterations):
            z1,a1,z2,a2,z3,a3 = self.forward_prop(x)
            dw1,db1,dw2,db2,dw3,db3 = self.back_prop(z1,a1,z2,a2,z3,a3,x,y)
            self.update_params(dw1,db1,dw2,db2,dw3,db3,alpha)
            if(i % 50 == 0):
                print("Iteration: ",i)
                predictions = self.get_predictions(a3)
                print("Accuracy: ",self.get_accuracy(predictions,y))
    

    def make_predictions(self,x):
        _,_,_,_,_,a3 = self.forward_prop(x)
        predictions = self.get_predictions(a3)
        return predictions

    def show_training_predictions(self,index,x,y):
        cur_img = x[:,index,None]
        np.save("img.txt",cur_img)
        prediction = self.make_predictions(cur_img)
        print("Prediction: ", prediction)
        print("Label: ", y[index])
        cur_img = cur_img.reshape((28,28))
        plt.imshow(cur_img, cmap='gray')
        plt.show()

    def save(self,file="4_l_digit_model.npz"):
        np.savez(file, w1=self.w1, b1=self.b1, w2=self.w2, b2=self.b2,w3 = self.w3,b3 = self.b3)
        
    def load(self,file="4_l_digit_model.npz"):
        npz = np.load(file)
        self.w1 = npz['w1']
        self.b1 = npz['b1']
        self.w2 = npz['w2']
        self.b2 = npz['b2']
        self.w3 = npz['w3']
        self.b3 = npz['b3']

In [14]:
# get testing accuracy

def get_accuracy(predictions, y):
    # print(predictions, y)
    return np.sum(predictions == y) / y.size


In [7]:
model = NeuralNetwork()

In [8]:
model.load()

In [12]:
# training
model.gradient_decent(X_train, Y_train, 101, 0.05)


Iteration:  0
Accuracy:  0.9233658536585366
Iteration:  50
Accuracy:  0.9236585365853659
Iteration:  100
Accuracy:  0.9238780487804878


In [13]:
model.save()

In [15]:
# test accuracy
predictions = model.make_predictions(X_test)
print("Accuracy: ",get_accuracy(predictions,Y_test))

Accuracy:  0.914


In [None]:
model.show_training_predictions(104,X_train,Y_train)