In [10]:
import pandas as pd
import numpy as np
from numpy import genfromtxt
import time
import sys
#np.set_printoptions(threshold=sys.maxsize)
# Wall-clock start of the notebook run; the export cell prints time.time() - timer as "Time taken".
timer = time.time()

In [11]:
# Prefix joined to every CSV filename by plain string concatenation (path + name),
# so a non-empty value must end with a path separator. '' resolves files relative
# to the current working directory.
path = ''

In [3]:
# Read the four header-less CSV files (train/test images and labels) in one sweep.
# Each file becomes a plain NumPy array with one sample per row.
X_train, Y_train, X_test, Y_test = (
    pd.read_csv(path + csv_name, header=None).values
    for csv_name in (
        'train_image.csv',
        'train_label.csv',
        'test_image.csv',
        'test_label.csv',
    )
)

In [21]:
# X_train = genfromtxt(path+'train_image.csv', delimiter=',')
# Y_train = genfromtxt(path+'train_label.csv', delimiter=',')
# X_test = genfromtxt(path+'test_image.csv', delimiter=',')
# Y_test = genfromtxt(path+'test_label.csv', delimiter=',')

In [22]:
# Number of target classes (one row per class in the encoded label matrices).
classes = 10


def one_hot(labels, n_classes):
    """One-hot encode integer class labels, one column per sample.

    Args:
        labels: array-like of class ids; any shape, it is flattened first.
        n_classes: number of classes, i.e. number of rows in the result.

    Returns:
        Float array of shape (n_classes, n_samples) where entry [c, i] is 1.0
        when sample i has label c and 0.0 otherwise.

    Raises:
        ValueError: if a label falls outside [0, n_classes). Without this check
            a negative label would silently wrap around to the last classes
            through NumPy's negative indexing.
    """
    idx = np.asarray(labels).reshape(-1).astype('int32')
    if idx.size and (idx.min() < 0 or idx.max() >= n_classes):
        raise ValueError(
            "labels must lie in [0, {}), got range [{}, {}]".format(
                n_classes, idx.min(), idx.max()))
    # Row k of the identity matrix is the one-hot vector of class k; picking
    # one row per label and transposing yields the (classes, samples) layout.
    return np.eye(n_classes)[idx].T


# one-hot encoding train labels
num = Y_train.shape[0]
Y_train = one_hot(Y_train, classes)

# one-hot encoding test labels
num = Y_test.shape[0]
Y_test = one_hot(Y_test, classes)

In [23]:
# Flip both image matrices so that each sample becomes a column
# (features x samples), the layout the weight-times-input matmuls use.
# Running this cell a second time flips them back.
X_train, X_test = np.transpose(X_train), np.transpose(X_test)

In [24]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Args:
        z: scalar or NumPy array of pre-activations.

    Returns:
        Element-wise sigmoid of ``z``, with the same shape as ``z``.
    """
    # exp(-z) overflows for large negative z. logaddexp(0, -z) evaluates
    # log(1 + exp(-z)) safely, and exponentiating its negative is the sigmoid.
    s = np.exp(-np.logaddexp(0, -z))
    return s

def relu(z):
    """Rectified linear unit: element-wise max(0, z)."""
    floor = 0
    rectified = np.maximum(floor, z)
    return rectified

def dReLU(x):
    """Derivative of ReLU: 1 where x is strictly positive, 0 elsewhere."""
    positive = x > 0
    # Multiplying the boolean mask by 1 turns True/False into integer 1/0.
    return positive * 1

def compute_loss(Y, Y_hat):
    """Mean cross-entropy between targets and predicted probabilities.

    Args:
        Y: target matrix with one column per sample (one-hot in this notebook).
        Y_hat: predicted probabilities, same layout as ``Y``.

    Returns:
        -(1 / n_samples) * sum(Y * log(Y_hat)), where n_samples = Y.shape[1].
        Entries with Y == 0 contribute exactly 0, even when the matching
        prediction is 0.
    """
    # A predicted probability that underflows to 0 on a non-target class would
    # give 0 * log(0) = 0 * -inf = NaN and poison the whole sum. Replacing the
    # prediction by 1 wherever the target is 0 makes that term log(1) * 0 = 0.
    active = Y != 0
    safe_probs = np.where(active, Y_hat, 1.0)
    L_sum = np.sum(np.multiply(Y, np.log(safe_probs)))
    L = -(1/Y.shape[1]) * L_sum
    return L

In [25]:
def feed_forward(X, params):
    """Forward pass of the three-layer network (ReLU, ReLU, softmax).

    Args:
        X: input matrix with one sample per column.
        params: dict holding weights "W1", "W2", "W3" and biases "b1", "b2", "b3".

    Returns:
        dict with the pre-activations "Z1".."Z3" and the activations "A1".."A3".
        Each column of "A3" holds the class probabilities of one sample.
    """
    cache = {}

    cache["Z1"] = np.matmul(params["W1"], X) + params["b1"]
    cache["A1"] = relu(cache["Z1"])

    cache["Z2"] = np.matmul(params["W2"], cache["A1"]) + params["b2"]
    cache["A2"] = relu(cache["Z2"])

    cache["Z3"] = np.matmul(params["W3"], cache["A2"]) + params["b3"]
    # Softmax over the class axis (axis 0). Subtracting each column's maximum
    # leaves the result mathematically unchanged but keeps every exponent <= 0,
    # so large logits cannot overflow to inf and produce inf / inf = NaN.
    shifted = cache["Z3"] - np.max(cache["Z3"], axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    cache["A3"] = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)

    return cache

def back_propagate(X, Y, params, cache, m):
    """Backward pass: gradients for every weight matrix and bias vector.

    Args:
        X: input batch, one sample per column.
        Y: target matrix for the batch, same layout as cache["A3"].
        params: dict with the current weights; "W2" and "W3" are read here.
        cache: forward-pass results ("Z1", "Z2", "A1", "A2", "A3").
        m: number of samples in the batch; all gradients are scaled by 1/m.

    Returns:
        dict with keys "dW1", "db1", "dW2", "db2", "dW3", "db3".
    """
    scale = 1/m

    def layer_grads(delta, layer_input):
        # Weight gradient is delta times the layer input transposed; the bias
        # gradient sums delta over the samples. Both are averaged with 1/m.
        weight_grad = scale * np.matmul(delta, layer_input.T)
        bias_grad = scale * np.sum(delta, axis=1, keepdims=True)
        return weight_grad, bias_grad

    # Output layer: the error signal is simply prediction minus target.
    delta3 = cache["A3"] - Y
    dW3, db3 = layer_grads(delta3, cache["A2"])

    # Second hidden layer: push the error back through W3, gate by ReLU'.
    delta2 = np.matmul(params["W3"].T, delta3) * dReLU(cache["Z2"])
    dW2, db2 = layer_grads(delta2, cache["A1"])

    # First hidden layer: push the error back through W2, gate by ReLU'.
    delta1 = np.matmul(params["W2"].T, delta2) * dReLU(cache["Z1"])
    dW1, db1 = layer_grads(delta1, X)

    return {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2, "dW3": dW3, "db3": db3}

In [26]:
def xavier(n_cur,n_prev):
    """Return sqrt(6 / (n_cur + n_prev)), the Xavier (Glorot) uniform limit formula."""
    fan_total = n_cur + n_prev
    return np.sqrt(6 / fan_total)
def kaiming(n_prev):
    """Return sqrt(2 / n_prev), the Kaiming (He) scaling formula."""
    variance = 2 / n_prev
    return np.sqrt(variance)

In [27]:
# Layer sizes: input features -> 128 -> 128 -> classes.
n_x = X_train.shape[0]
n_h1 = 128
n_h2 = 128
# initialization
# NOTE(review): the Xavier limits below are computed but not applied; the
# weights are drawn with a fixed 0.01 scale instead. Confirm which was intended.
init1 = xavier(n_h1, n_x)
init2 = xavier(n_h2,n_h1)
init3 = xavier(classes,n_h2)

# A seeded, local generator makes the initial weights (and therefore the whole
# training run) reproducible without touching NumPy's global random state.
SEED = 0
rng = np.random.default_rng(SEED)

# Weights: small Gaussian noise (standard normal * 0.01); biases start at zero.
params = {"W1": rng.standard_normal((n_h1, n_x)) * 0.01,"b1": np.zeros((n_h1, 1)),
          "W2": rng.standard_normal((n_h2, n_h1)) * 0.01,"b2": np.zeros((n_h2, 1)),
          "W3": rng.standard_normal((classes, n_h2)) * 0.01,"b3": np.zeros((classes, 1))}

In [28]:
# Training hyper-parameters.
epochs, batch_size = 20, 64
beta = 0.85  # momentum: weight kept from the previous averaged gradient
lr = 0.01    # learning rate: step size of each parameter update
# Mini-batches per epoch, rounded up so a shorter final batch is still visited.
batches = int(np.ceil(X_train.shape[1] / batch_size))

In [29]:
# Mini-batch gradient descent with momentum; prints the full training-set
# loss after every epoch and the elapsed time (in minutes) at the end.
t = time.time()
backup = []
n_train = X_train.shape[1]
for i in range(epochs):

    # Momentum buffers (exponential moving averages of the gradients),
    # one per parameter, restarted from zero at the beginning of every epoch.
    velocity = {key: 0 for key in params}

    for j in range(batches):

        a = j * batch_size
        # Slice ends are exclusive, so the cap is the sample count itself.
        # Capping at count - 1 would skip the last training sample and give an
        # empty batch (m == 0, division by zero) when count % batch_size == 1.
        b = min(a + batch_size, n_train)
        X = X_train[:, a:b]
        Y = Y_train[:, a:b]
        m = b - a

        cache = feed_forward(X, params)
        grads = back_propagate(X, Y, params, cache, m)

        for key in velocity:
            # The gradient of parameter "W1" is stored under "dW1", and so on.
            velocity[key] = beta * velocity[key] + (1 - beta) * grads["d" + key]
            # gradient descent
            params[key] = params[key] - lr * velocity[key]

    # forward pass on training set
    backup = feed_forward(X_train, params)
    train_loss = compute_loss(Y_train, backup["A3"])

    print("Epoch {}: training loss = {}".format(i + 1, train_loss))

# Forward pass on the test set with the final weights. Only this last result is
# read afterwards (via cache["A3"]), so it is computed once instead of per epoch.
cache = feed_forward(X_test, params)
print('Time taken for Predictions - ',(time.time()-t)/60)

Epoch 1: training loss = 0.1394328612642224
Epoch 2: training loss = 0.08765555420472494
Epoch 3: training loss = 0.07725672761648951
Epoch 4: training loss = 0.06280135611328559
Epoch 5: training loss = 0.045084621559952506
Epoch 6: training loss = 0.056319887814142044
Epoch 7: training loss = 0.03374082094558189
Epoch 8: training loss = 0.026228804788225184
Epoch 9: training loss = 0.04102899172221208
Epoch 10: training loss = 0.02244475603344319
Epoch 11: training loss = 0.0257766899946393
Epoch 12: training loss = 0.023012051844750886
Epoch 13: training loss = 0.015060984721757195
Epoch 14: training loss = 0.01980176322197511
Epoch 15: training loss = 0.024690482498565352
Epoch 16: training loss = 0.011018727922597999
Epoch 17: training loss = 0.008117709753188243
Epoch 18: training loss = 0.01586675375062442
Epoch 19: training loss = 0.011435696327844034
Epoch 20: training loss = 0.009911402620447637
Time taken for Predictions -  1.1233383019765217


In [30]:
def _accuracy_pct(probs, onehot):
    """Percentage of columns whose arg-max class equals the one-hot target's."""
    hits = np.sum(np.argmax(probs, axis=0) == np.argmax(onehot, axis=0))
    return hits / (onehot.shape[1] / 100)


# backup holds the last training-set forward pass, cache the test-set one.
print('Train Accuracy - ', _accuracy_pct(backup['A3'], Y_train))
print('Test Accuracy - ', _accuracy_pct(cache['A3'], Y_test))
print('Train Size - ', Y_train.shape[1], ' Test Size - ', cache['A3'].shape[1])

Train Accuracy -  99.66
Test Accuracy -  98.01
Train Size -  60000  Test Size -  10000


In [48]:
# Predicted class per test sample: arg-max over the class axis of the softmax output.
predicted_class = np.argmax(cache['A3'], axis=0)
df = {'predictions': predicted_class}
df2 = pd.DataFrame(data=df)
# header=None / index=None: write the bare values, one prediction per line.
df2.to_csv(path + 'test_predictions.csv', header=None, index=None)
# Seconds elapsed since the timer set in the first cell.
elapsed = time.time() - timer
print('Time taken - ', elapsed)

Time taken -  1124.0676889419556


In [49]:
# Write the predicted test labels as integers, one per line. This targets the
# same file name as the pandas export and therefore overwrites that file.
out_file = path + 'test_predictions.csv'
predicted_labels = np.argmax(cache['A3'], axis=0)
np.savetxt(out_file, predicted_labels, fmt='%i', delimiter=",")