A simple digit recognizer for the well-known MNIST dataset, often called the "Hello World!" of neural networks. This project draws on a Kaggle notebook and two YouTube videos.

1. Building a neural network FROM SCRATCH (no Tensorflow/Pytorch, just numpy & math) by Samson Zhang - https://www.youtube.com/watch?v=w8yWXqWQYmU
2. Neural networks Series by 3Blue1Brown (First 4 videos are good enough for the project) - https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi


In [None]:
#Importing the needed libraries and connecting to the dataset
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Load the training CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="train.csv")

In [8]:
# Convert the loaded table to a NumPy array and record its dimensions:
# m is the number of rows and n the number of columns.
data = np.array(data)
m, n = data.shape
print(m, n)

42000 785


In [21]:
# The dataset may be ordered, so shuffle the rows in place before splitting.
np.random.shuffle(data)

# Split the dataset into dev and training sets. Each slice is transposed so
# that every column is one example: row 0 is used as the label vector and
# rows 1..n-1 as the pixel values.

# Dev dataset: the first 1000 shuffled rows. The slice starts at row 0 so
# that no sample is silently dropped from both sets.
data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
# Normalize the pixel values from 0-255 (grayscale) to 0-1
X_dev = X_dev / 255.0

# Training dataset: every remaining row
data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.0

# Number of training examples (columns of X_train)
_, m_train = X_train.shape

In [22]:
m_train

41000

Neural Network Architecture
Input Layer a[0]:

    784 units

    One for each pixel in a 28×28 image

Hidden Layer a[1]:

    10 units

    Uses ReLU activation function

Output Layer a[2]:

    10 units

    One for each digit class (0–9)

    Uses Softmax activation to output class probabilities

In [24]:
#Defining the input parameters or initial weight and biases for the neural network

def init_params():
    """Create the initial weights and biases of the two-layer network.

    Every entry is drawn with ``np.random.rand`` and shifted by 0.5, so the
    values lie in [-0.5, 0.5).

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with shapes (10, 784), (10, 1), (10, 10)
        and (10, 1).
    """
    # Shapes are listed in draw order so the random stream is consumed as
    # W1, b1, W2, b2.
    param_shapes = [(10, 784), (10, 1), (10, 10), (10, 1)]
    W1, b1, W2, b2 = [np.random.rand(*shape) - 0.5 for shape in param_shapes]
    return W1, b1, W2, b2


#Let's define the ReLU activation function
def ReLU(Z):
    """Apply the rectified linear unit element-wise.

    Returns the element-wise maximum of 0 and ``Z``, so negative entries
    become 0 and the rest pass through unchanged.
    """
    rectified = np.maximum(0, Z)
    return rectified

#Let's define the softmax function - it converts the activation values into probabilities
def softmax(Z):
    """Compute the softmax of ``Z`` along axis 0 (one distribution per column).

    Args:
        Z: Array of scores; for a 2-D input each column is normalised
            independently.

    Returns:
        Array of the same shape as ``Z`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and the ratio
    # from becoming nan) when the scores are large.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_Z = np.exp(shifted)
    A = exp_Z / np.sum(exp_Z, axis=0, keepdims=True)
    return A

#Let's define the forward propagation function
def forward_prop(W1, b1, W2, b2, X):
    """Run one forward pass through the two-layer network.

    Args:
        W1, b1: Weights and biases of the hidden layer.
        W2, b2: Weights and biases of the output layer.
        X: Input matrix passed to the first layer.

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: the pre-activation and activation of the
        hidden layer (ReLU) followed by those of the output layer (softmax).
    """
    # Hidden layer: affine transform followed by ReLU
    hidden_pre = np.dot(W1, X) + b1
    hidden_act = ReLU(hidden_pre)

    # Output layer: affine transform followed by softmax
    output_pre = np.dot(W2, hidden_act) + b2
    output_act = softmax(output_pre)

    return hidden_pre, hidden_act, output_pre, output_act

#Lets define the one-hot encoding function
def one_hot(Y, num_classes=None):
    """One-hot encode a vector of integer class labels.

    Args:
        Y: 1-D array of non-negative integer labels.
        num_classes: Number of rows (classes) in the result. Defaults to
            ``Y.max() + 1``. Pass it explicitly when a batch may not contain
            the highest class, so that the encoding keeps a fixed height.

    Returns:
        Array of shape ``(num_classes, Y.size)`` in which column ``j`` holds
        a single 1 at row ``Y[j]`` and 0 elsewhere.
    """
    Y = np.asarray(Y)
    if num_classes is None:
        num_classes = Y.max() + 1
    # Build the matrix directly in (classes, examples) orientation: one
    # column per example.
    one_hot_Y = np.zeros((num_classes, Y.size))
    one_hot_Y[Y, np.arange(Y.size)] = 1
    return one_hot_Y

#Lets define the ReLU derivative function
def ReLU_deriv(Z):
    """Return the derivative of ReLU as a mask.

    The result is true where ``Z`` is strictly positive and false elsewhere.
    """
    is_positive = Z > 0
    return is_positive

#Let's define the backward propagation function
def backward_prop(W1, Z1, A1, W2, A2, Z2, X, Y):
    """Compute the gradients of the loss for the two-layer network.

    Args:
        W1: Hidden-layer weights (unused; kept for a stable signature).
        Z1: Hidden-layer pre-activations from the forward pass.
        A1: Hidden-layer activations from the forward pass.
        W2: Output-layer weights.
        A2: Output-layer activations from the forward pass.
        Z2: Output-layer pre-activations (unused; kept for a stable signature).
        X: Input matrix used in the forward pass, one example per column.
        Y: Vector of integer labels, one per example.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)``. The weight gradients have the shapes
        of ``W1`` and ``W2``; each bias gradient is a column vector with one
        entry per unit of its layer.
    """
    # Average over the examples actually in this batch rather than a
    # module-level row count, which also includes rows held out of training.
    batch_size = Y.size

    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    # Each column of A1 is the hidden activation for one example and each
    # column of dZ2 is the error signal for that same example, hence A1.T.
    dW2 = np.dot(dZ2, A1.T) / batch_size
    # Sum across examples only (axis=1) so every output unit keeps its own
    # bias gradient instead of all units sharing one scalar.
    db2 = np.sum(dZ2, axis=1, keepdims=True) / batch_size

    # Hadamard (element-wise) product: keep the gradient only where the
    # hidden neuron was active (Z1 > 0).
    dZ1 = np.dot(W2.T, dZ2) * ReLU_deriv(Z1)
    dW1 = np.dot(dZ1, X.T) / batch_size
    db1 = np.sum(dZ1, axis=1, keepdims=True) / batch_size
    return dW1, db1, dW2, db2

#Let's define the update-parameters function, which also takes the learning-rate hyperparameter alpha
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    """Take one gradient-descent step on every parameter.

    Each parameter is moved against its gradient, scaled by the learning
    rate ``alpha``.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` holding the updated parameters.
    """
    return (
        W1 - alpha * dW1,
        b1 - alpha * db1,
        W2 - alpha * dW2,
        b2 - alpha * db2,
    )

In [25]:
#defining the get predictions function
def get_predictions(A2):
    """Return the predicted class for each column of ``A2``.

    The prediction is the row index of the largest value in each column.
    """
    predicted_classes = np.argmax(A2, axis=0)
    return predicted_classes

#defining the accuracy function
def get_accuracy(predictions, Y):
    """Return the fraction of ``predictions`` that match the labels ``Y``.

    Both arrays are printed before the accuracy is computed. The count of
    matching entries is divided by ``Y.size``.
    """
    print(predictions, Y)
    num_correct = np.sum(predictions == Y)
    return num_correct / Y.size

#defining the gradient descent function
def gradient_descent(X,Y,alpha, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X: Input matrix fed to the forward pass.
        Y: Labels matching the examples in ``X``.
        alpha: Learning rate passed to ``update_params``.
        iterations: Number of update steps to run.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with the parameters after the last step.
    """
    params = init_params()
    for step in range(iterations):
        W1, b1, W2, b2 = params
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(W1, Z1, A1, W2, A2, Z2, X, Y)
        params = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)

        # Report progress every tenth step; the accuracy uses the
        # activations computed before this step's parameter update.
        if step % 10 != 0:
            continue
        print("iteration: ", step)
        print(get_accuracy(get_predictions(A2), Y))

    W1, b1, W2, b2 = params
    return W1, b1, W2, b2

In [None]:
# Train on the training set: learning rate 0.10 for 500 iterations.
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, alpha=0.10, iterations=500)

In [32]:
def make_predictions(X, W1, b1, W2, b2):
    """Predict a class for every example in ``X`` with the given parameters.

    Runs a forward pass and converts the output-layer activations into
    class indices.
    """
    *_, output_activations = forward_prop(W1, b1, W2, b2, X)
    return get_predictions(output_activations)

#Let's test some individual predictions on images from the training dataset
def test_predictions(index, W1, b1, W2, b2):
    """Print the prediction for one training image and display the image.

    The image and its label are read from the module-level ``X_train`` and
    ``Y_train`` at column/position ``index``.

    Args:
        index: Position of the example within ``X_train`` / ``Y_train``.
        W1, b1, W2, b2: Network parameters used for the prediction.
    """
    # Indexing with None keeps the example as a single-column matrix.
    current_image = X_train[:, index, None]
    label = Y_train[index]
    prediction = make_predictions(current_image, W1, b1, W2, b2)
    print(f"Prediction: {prediction}, Actual: {label}")

    # Reshape the column to a 28x28 image and rescale by 255 for display.
    pixels = current_image.reshape((28, 28)) * 255
    plt.gray()
    plt.imshow(pixels, interpolation='nearest')
    plt.show()

In [None]:
# Show the prediction, the true label and the image for training example 74.
test_predictions(index=74, W1=W1, b1=b1, W2=W2, b2=b2)

In [None]:
# Evaluate the trained network on the held-out dev dataset.
dev_predictions = make_predictions(X=X_dev, W1=W1, b1=b1, W2=W2, b2=b2)
get_accuracy(predictions=dev_predictions, Y=Y_dev)