In [34]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [35]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
data = np.array(data).T
y = data[0]
X = data[1:]

In [37]:
X = X/255

In [38]:
data.shape

(785, 42000)

In [39]:
m, n = X.shape

In [40]:
m, n

(784, 42000)

In [41]:
y.shape, X.shape

((42000,), (784, 42000))

In [42]:
def init_param():
    w1 = np.random.rand(50, 784) - 0.5
    b1 = np.random.rand(50, 1) - 0.5
    w2 = np.random.rand(25, 50) - 0.5
    b2 = np.random.rand(25, 1) - 0.5
    w3 = np.random.rand(10, 25) - 0.5
    b3 = np.random.rand(10, 1) - 0.5
    w4 = np.random.rand(10, 10) - 0.5
    b4 = np.random.rand(10, 1) - 0.5
    return w1, b1, w2, b2, w3, b3, w4, b4


In [43]:
def relu(x):
    return np.maximum(x, 0)

def softmax(x):
    exp_shifted = np.exp(x - np.max(x, axis=0, keepdims=True))
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

def fwd_prop(w1, b1, w2, b2, w3, b3, w4, b4, X):
    z1 = w1.dot(X) + b1
    a1 = relu(z1)
    z2 = w2.dot(a1) + b2
    a2 = relu(z2)
    z3 = w3.dot(a2) + b3
    a3 = relu(z3)
    z4 = w4.dot(a3) + b4
    a4 = softmax(z4)
    return z1, a1, z2, a2, z3, a3, z4, a4

In [44]:
def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

In [45]:
def deriv_relu(x):
    return x > 0

In [46]:
def back_prop(a1, z1, a2, z2, a3, z3, a4, z4, w1, w2, w3, w4, X, y):
    one_hot_y = one_hot(y)

    dz4 = a4 - one_hot_y  # y is already encoded
    dw4 = 1/m * dz4.dot(a3.T)
    db4 = 1/m * np.sum(dz4, axis=1, keepdims=True)

    dz3 = w4.T.dot(dz4) * deriv_relu(z3)
    dw3 = 1/m * dz3.dot(a2.T)
    db3 = 1/m * np.sum(dz3, axis=1, keepdims=True)

    dz2 = w3.T.dot(dz3) * deriv_relu(z2)
    dw2 = 1/m * dz2.dot(a1.T)
    db2 = 1/m * np.sum(dz2, axis=1, keepdims=True)

    dz1 = w2.T.dot(dz2) * deriv_relu(z1)
    dw1 = 1/m * dz1.dot(X.T)
    db1 = 1/m * np.sum(dz1, axis=1, keepdims=True)

    return dw1, db1, dw2, db2, dw3, db3, dw4, db4

In [47]:
def update(w1, b1, w2, b2, w3, b3, w4, b4, dw1, db1, dw2, db2, dw3, db3, dw4, db4, alpha):
    w1 = w1 - alpha*dw1
    b1 = b1 - alpha*db1
    w2 = w2 - alpha*dw2
    b2 = b2 - alpha*db2
    w3 = w3 - alpha*dw3
    b3 = b3 - alpha*db3
    w4 = w4 - alpha*dw4
    b4 = b4 - alpha*db4
    return w1, b1, w2, b2, w3, b3, w4, b4

In [51]:
def get_predictions(a4):
    return np.argmax(a4, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, y, iterations, alpha):
    
    w1, b1, w2, b2, w3, b3, w4, b4 = init_param()
    for i in range(iterations):
        z1, a1, z2, a2, z3, a3, z4, a4 = fwd_prop(w1, b1, w2, b2, w3, b3, w4, b4, X)
        dw1, db1, dw2, db2, dw3, db3, dw4, db4 = back_prop(a1, z1, a2, z2, a3, z3, a4, z4, w1, w2, w3, w4, X, y)
        w1, b1, w2, b2, w3, b3, w4, b4  = update(w1, b1, w2, b2, w3, b3, w4, b4, dw1, db1, dw2, db2, dw3, db3, dw4, db4, alpha)
        if i%100 == 0:
            print(f"Iteration: {i}")
            print(f"Accuracy: {get_accuracy(get_predictions(a4), y)}")
    return w1, b1, w2, b2, w3, b3, w4, b4

In [52]:
w1, b1, w2, b2, w3, b3, w4, b4 = gradient_descent(X, y, 1001, 0.01)

Iteration: 0
[3 6 3 ... 6 3 9] [1 0 1 ... 7 6 9]
Accuracy: 0.11426190476190476
Iteration: 100
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.6661904761904762
Iteration: 200
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.854904761904762
Iteration: 300
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.9004285714285715
Iteration: 400
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.9255238095238095
Iteration: 500
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.934
Iteration: 600
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.9419761904761905
Iteration: 700
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.9469285714285715
Iteration: 800
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.9454761904761905
Iteration: 900
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.9551904761904761
Iteration: 1000
[1 0 1 ... 7 6 9] [1 0 1 ... 7 6 9]
Accuracy: 0.9602619047619048


In [60]:
test = pd.read_csv("test.csv")
test = np.array(test).T
z1, a1, z2, a2, z3, a3, z4, a4 = fwd_prop(w1, b1, w2, b2, w3, b3, w4, b4, test)
prediction = get_predictions(a4)

In [61]:
prediction.shape

(28000,)

In [63]:
df = pd.DataFrame(data={"ImageId": np.arange(1, len(test[0])+1), "Label": prediction})

In [64]:
df

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3
...,...,...
27995,27996,9
27996,27997,7
27997,27998,3
27998,27999,9


In [65]:
df.to_csv("submission.csv", sep=",", index=False, encoding="utf-8")

In [69]:
np.save("w1.npy", w1)
np.save("w2.npy", w2)
np.save("w3.npy", w3)
np.save("w4.npy", w4)
np.save("b1.npy", b1)
np.save("b2.npy", b2)
np.save("b3.npy", b3)
np.save("b4.npy", b4)
