In [1]:
import numpy as np
import pandas as pd

data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [51]:
data = np.array(data)
d, e = data.shape
np.random.shuffle(data) #shuffles the data so training is better

data_test = data[0:1000].T #transpose so that each column is a picture instead of row
Y_test = data_test[0] #labels row
X_test = data_test[1:e] #all data thats not labels
X_test = X_test / 255. #regularize pixels between 0 and 1

data_train = data[1000:d].T
Y_train = data_train[0]
X_train = data_train[1:e]
X_train = X_train / 255.
_,d_train = X_train.shape #x is number of training examples

In [52]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
Y_train

array([5, 4, 5, ..., 0, 8, 4])

In [54]:
x_train

41000

In [63]:
def the_params():
    w1 = np.random.normal(size=(10, 784)) * np.sqrt(1./(784)) #xavier initialization that takes care of vanishing gradients
    b1 = np.random.normal(size=(10, 1)) * np.sqrt(1./10)
    w2 = np.random.normal(size=(10, 10)) * np.sqrt(1./20)
    b2 = np.random.normal(size=(10, 1)) * np.sqrt(1./(784))
    return w1, b1, w2, b2

def softmax(Z):
    exp = np.exp(Z - np.max(Z))
    return exp / exp.sum(axis=0)


def relU(C):
    K = np.maximum(C, 0)
    return K

def forward_prop(w1, b1, w2, b2, x):
    z1 = w1.dot(x) + b1
    a1 = relU(z1)
    z2 = w2.dot(a1) + b2
    a2 = softmax(z2)
    return z1, a1, z2, a2

In [60]:
def relU_der(Z):
    return Z > 0  #derivative of relU function

def one_hot(Y):
    one_hot_Y = np.zeros((Y.max()+1,Y.size))
    one_hot_Y[Y,np.arange(Y.size)] = 1
    return one_hot_Y

def backward_prop(z1, a1, z2, a2, w1, w2, X, Y):
    one_hot_Y = one_hot(Y)
    dz2 = a2 - one_hot_Y
    dw2 = 1 / d * dz2.dot(a1.T)
    db2 = 1 / d * np.sum(dz2)
    dz1 = w2.T.dot(dz2) * relU_der(z1)
    dw1 = 1 / d * dz1.dot(X.T)
    db1 = 1 / d * np.sum(dz1)
    return dw1, db1, dw2, db2

def update_wb(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
    w1 = w1 - alpha * dw1
    w2 = w2 - alpha * dw2 
    b1 = b1 - alpha * db1    
    b2 = b2 - alpha * db2    
    return w1, b1, w2, b2

In [72]:
def get_predictions(a2):
    return np.argmax(a2, 0) # converts the output activations into class labels by highest prob

def get_accuracy(pred, Y):
    print(pred, Y)
    return np.sum(pred == Y) / Y.size # calculates the accuracy of the model's predictions by comparing them to the true class labels

def gradient_descent(X, Y, alpha, iterations):
    w1, b1, w2, b2 = the_params()

    for i in range(iterations):
        z1, a1, z2, a2 = forward_prop(w1, b1, w2, b2, X)
        dw1, db1, dw2, db2 = backward_prop(z1, a1, z2, a2, w1, w2, X, Y)
        w1, b1, w2, b2 = update_wb(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha)

        if i % 10 == 0:
            print(f"Iteration: {i}")
            accuracy = get_accuracy(get_predictions(a2), Y)
            print(f"Accuracy: {accuracy}")

    return w1, b1, w2, b2

def get_the_predictions(X, w1, b1, w2, b2):
    _, _, _, a2 = forward_prop(w1, b1, w2, b2, X)
    predict = get_predictions(a2)
    return predict


In [62]:
w1, b1, w2, b2 = gradient_descent(X_train, Y_train, 0.10, 500)

Iteration: 0
[3 4 2 ... 1 3 3] [5 4 5 ... 0 8 4]
Accuracy: 0.1318048780487805
Iteration: 10
[3 4 4 ... 0 3 3] [5 4 5 ... 0 8 4]
Accuracy: 0.41136585365853656
Iteration: 20
[3 7 4 ... 0 3 3] [5 4 5 ... 0 8 4]
Accuracy: 0.4858048780487805
Iteration: 30
[3 7 4 ... 0 3 3] [5 4 5 ... 0 8 4]
Accuracy: 0.5832195121951219
Iteration: 40
[3 7 4 ... 0 8 3] [5 4 5 ... 0 8 4]
Accuracy: 0.6785121951219513
Iteration: 50
[3 9 4 ... 0 8 4] [5 4 5 ... 0 8 4]
Accuracy: 0.7294390243902439
Iteration: 60
[3 9 4 ... 0 8 4] [5 4 5 ... 0 8 4]
Accuracy: 0.7603414634146342
Iteration: 70
[3 9 4 ... 0 8 4] [5 4 5 ... 0 8 4]
Accuracy: 0.7847317073170732
Iteration: 80
[3 9 4 ... 0 8 4] [5 4 5 ... 0 8 4]
Accuracy: 0.8050243902439025
Iteration: 90
[3 9 5 ... 0 8 4] [5 4 5 ... 0 8 4]
Accuracy: 0.8202682926829268
Iteration: 100
[3 9 5 ... 0 8 4] [5 4 5 ... 0 8 4]
Accuracy: 0.8318780487804878
Iteration: 110
[5 9 5 ... 0 8 4] [5 4 5 ... 0 8 4]
Accuracy: 0.8412439024390244
Iteration: 120
[5 9 5 ... 0 8 4] [5 4 5 ... 0 8 4]

In [64]:
w1, b1, w2, b2 = gradient_descent(X_test, Y_test, 0.10, 500)

Iteration: 0
[6 6 6 6 6 6 6 2 4 6 6 6 4 6 6 6 6 6 9 6 6 6 6 2 6 6 6 6 6 6 6 6 6 6 6 4 6
 6 6 2 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 2 6 6 6 6 6 6 6 6 6 6 6 4 6 6 6 6 6
 6 6 6 6 6 6 6 2 4 0 6 6 6 6 9 6 6 6 6 6 6 6 6 6 6 6 6 3 6 6 6 2 6 6 4 6 6
 6 4 6 6 6 6 6 6 6 4 6 6 6 6 4 6 6 6 6 6 6 6 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 4 6 6 6 6 6 6 6 6 6 6 6 6 6 4 6 6 6 6 6 6 6 4 6 6 6 6 6 6 6 6 6 6 6
 6 6 0 6 6 6 6 6 6 6 6 6 6 0 6 6 6 6 6 6 6 6 6 4 6 4 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 9 6 6 6 6 6 6 6 4 6 6 6 6 6 6 6 2 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 2 2 6 0 6 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 4 4 6 6 6 6 6 4 6 6 6 6
 6 6 6 6 6 9 6 4 6 6 6 6 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 4 6 6 4 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 4 6 6 6 6 4 6 6 0 6 6 6 6 0 6 4 2 6 0 3 0 6 6 6 6 6
 6 6 6 4 6 6 6 6 6 2 6 6 6 6 6 6 6 3 6 6 2 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 4 6 6 6 6 6 6 6 6 6 6 6 6 6 2 9 6 2 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 4 6 6

In [73]:
test_pred = get_the_predictions(X_test, w1, b1, w2, b2)
get_accuracy(test_pred, Y_test)

[0 2 2 6 6 6 4 2 4 2 6 4 2 6 2 6 6 4 2 0 6 6 6 4 2 2 4 2 2 6 6 0 6 2 0 4 6
 2 6 2 6 6 6 4 6 6 6 6 6 6 2 6 4 4 4 6 2 2 6 4 2 6 0 6 6 0 6 6 4 6 6 6 6 0
 0 2 6 6 6 6 2 2 4 0 6 2 4 4 2 6 6 6 2 6 3 4 6 2 6 6 6 6 4 4 0 2 6 6 4 4 6
 6 4 4 2 6 6 4 4 6 0 0 2 6 2 4 2 6 6 4 4 4 2 4 6 6 6 4 6 6 6 6 6 6 6 4 0 6
 6 6 0 4 6 4 6 0 6 6 0 2 0 6 0 6 0 2 6 0 2 4 4 6 4 4 4 0 6 6 4 2 6 2 2 6 4
 2 6 0 6 0 6 6 6 6 6 0 2 6 0 0 4 0 6 6 6 6 6 6 4 4 0 6 4 0 4 2 6 6 4 0 6 6
 6 6 2 2 2 0 2 4 0 4 6 0 4 4 6 4 6 6 4 2 6 0 6 4 6 6 0 6 6 2 4 2 2 6 6 6 6
 2 4 4 0 6 4 2 6 6 6 6 6 4 6 4 6 0 6 4 6 6 4 6 2 6 4 4 4 6 6 6 6 6 4 6 6 6
 2 6 2 6 2 0 4 4 6 2 4 4 2 6 2 6 6 6 6 4 6 6 6 0 6 2 0 6 0 2 4 6 2 0 6 2 6
 6 2 6 0 4 2 6 4 2 6 2 6 4 2 6 4 6 0 4 4 6 6 6 6 6 6 6 6 0 6 6 0 6 6 2 4 4
 6 2 4 2 6 6 4 6 6 0 6 4 2 6 4 6 0 4 6 0 6 6 2 0 0 0 4 2 4 0 2 0 0 6 4 6 6
 6 6 6 4 2 0 2 6 6 2 2 0 0 4 4 0 2 6 6 2 4 6 6 2 2 6 6 6 6 6 6 4 4 6 0 6 0
 6 2 2 0 6 6 6 6 6 6 6 6 6 6 2 6 6 2 4 6 2 6 4 2 2 6 2 2 4 4 6 6 0 6 6 4 6
 0 6 6 4 0 2 6 0 6 4 6 4 

0.16