In [410]:
import numpy as np
import tensorflow as tf
import pickle
import matplotlib.pyplot as plt
import time
import gzip
import pickle
import pandas as pd

In [2]:
# Load the three pickled MNIST splits from the gzip archive.
# encoding='latin1' is needed to read a pickle written by Python 2 from Python 3.
# NOTE: unpickling can execute arbitrary code; only load archives from a trusted source.
with gzip.open('MNIST_data/mnist.pkl.gz', 'rb') as ff:
    train, val, test = pickle.load(ff, encoding='latin1')

In [3]:
# Each split is indexed as [0] -> sample array, [1] -> label array.
train_data, train_data_labels = train[0], train[1]
test_data, test_data_labels = test[0], test[1]

# Print the four array shapes, one per line.
print(*(np.shape(arr) for arr in (train_data, train_data_labels, test_data, test_data_labels)),
      sep="\n")

(50000, 784)
(50000,)
(10000, 784)
(10000,)


In [4]:
# Build one-hot (depth 10) encodings of the integer labels as TensorFlow ops,
# then evaluate them to NumPy arrays.
cls_one_hot = tf.one_hot(train_data_labels, depth=10)
cls_t_one_hot = tf.one_hot(test_data_labels, depth=10)

# Use the session as a context manager so it is the default session for
# .eval() and is closed (releasing its resources) once the arrays are materialised.
with tf.Session() as sess:
    train_target = cls_one_hot.eval()
    test_target = cls_t_one_hot.eval()

In [93]:
# Draw one random row order and apply it to both the samples and their one-hot
# targets so every sample keeps its own label.
# NOTE(review): no seed is set in the visible cells before this shuffle, so the
# resulting order is presumably different on every fresh kernel - confirm.
shfl = np.arange(train_data.shape[0])
np.random.shuffle(shfl)
train_data_shfl, train_target_shfl = train_data[shfl], train_target[shfl]

In [108]:
# Centre each set by subtracting a single scalar mean taken over all of its entries.
# NOTE(review): the test set is centred with its own mean, not the training mean.
train_data_mean = train_data_shfl.mean()
train_data_shfl2 = train_data_shfl - train_data_mean

test_data_mean = test_data.mean()
test_data2 = test_data - test_data_mean

In [426]:
# Scale the centred data to unit standard deviation.
# Despite the *_var names, these hold the square root of the variance (the std).
# NOTE(review): the test set is scaled with its own statistic, not the training one.
train_data_var = np.sqrt(train_data_shfl2.var())
train_data_shfl3 = train_data_shfl2 / train_data_var

test_data_var = np.sqrt(test_data2.var())
test_data3 = test_data2 / test_data_var

In [120]:
# Sanity check: print the overall variance of each scaled set, one per line.
print(np.var(train_data_shfl3), np.var(test_data3), sep="\n")

1.0
1.0


In [114]:
# Print the shapes of the samples and the one-hot targets, one per line.
print(*(np.shape(arr) for arr in (train_data, train_target, test_data, test_target)),
      sep="\n")

(50000, 784)
(50000, 10)
(10000, 784)
(10000, 10)


In [6]:
def rgb_to_y(rgb):
    """Collapse the trailing RGB axis of `rgb` into one weighted channel.

    The last axis (length 3) is dotted with the column vector
    (0.299, 0.587, 0.114). The product is then reshaped to its first three
    dimensions, so a 4-D input loses its trailing singleton axis.
    """
    luma_weights = np.array([0.299, 0.587, 0.114]).reshape(3, 1)
    luma = np.dot(rgb, luma_weights)
    return np.reshape(luma, (luma.shape[0], luma.shape[1], luma.shape[2]))

# Batch Normalization

In [368]:
def initialize_w(shape):
    """Return a (shape[0], shape[1]) matrix of standard-normal draws scaled by 0.1.

    Draws come from NumPy's global random state via np.random.randn.
    """
    n_rows, n_cols = shape[0], shape[1]
    gaussian = np.random.randn(n_rows, n_cols)
    return gaussian * 0.1

def initialize_b(shape):
    """Return a float64 array of zeros with the requested shape."""
    bias = np.zeros(shape, dtype=np.float64)
    return bias

In [369]:
def initialize_parameters(layers):
    """Create the weight and bias arrays for every layer transition.

    `layers` lists the layer widths from input to output. For transition k
    (1-based) the result holds "W<k>" of shape (layers[k], layers[k-1]) and
    "b<k>" of shape (layers[k], 1).

    Side effect: re-seeds NumPy's global random state with 10 on every call.
    """
    np.random.seed(10)
    parameters = {}
    n_transitions = np.size(layers) - 1
    for k in range(1, n_transitions + 1):
        fan_in, fan_out = layers[k - 1], layers[k]
        parameters["W%d" % k] = initialize_w([fan_out, fan_in])
        parameters["b%d" % k] = initialize_b([fan_out, 1])

    return parameters

In [370]:
def calc_z(X, W, b):
    """Return the affine pre-activation W . X^T + b.

    X is transposed before the product, so the result has one row per row of
    W and one column per row of X; b is added by broadcasting.
    """
    return np.dot(W, X.T) + b

In [433]:
def calc_activation_fn(Z, type, dropout=0):
    """Apply an activation to the pre-activations Z and return the result transposed.

    Parameters
    ----------
    Z : array laid out as produced by calc_z (one row per unit, one column per sample).
    type : 'relu' selects ReLU; any other value selects softmax over axis 0.
    dropout : used only as a truthy flag on the ReLU path, not as a drop rate.
        When set, whole rows of the activation are zeroed: one standard-normal
        draw is made per row, and rows whose draw does not exceed the mean of
        the draws are dropped. The same mask applies to every column, and the
        surviving activations are not rescaled.

    Returns
    -------
    The activation transposed (A.T), i.e. one row per column of Z.
    """
    if type == 'relu':
        A = np.maximum(0, Z)
        if dropout:
            drop_rows = np.random.randn(A.shape[0], 1)
            mid_point = np.mean(drop_rows)
            drop_rows = np.where(drop_rows > mid_point, 1, 0)
            A = np.multiply(A, drop_rows)

    else:  # softmax
        # Subtract each column's maximum before exponentiating. Softmax is
        # invariant to this shift, and it keeps np.exp from overflowing to inf
        # (which would turn the ratio into NaN) for large pre-activations.
        shifted = Z - np.max(Z, axis=0, keepdims=True)
        exp_z = np.exp(shifted)
        A = exp_z / np.sum(exp_z, axis=0, keepdims=True)

    return A.T

In [372]:
def compute_relu_grad(Z):
    """Return the elementwise ReLU derivative: 1 where Z > 0, otherwise 0."""
    is_positive = Z > 0
    return np.where(is_positive, 1, 0)

In [428]:
def forward_pass(X, parameters, num_layers, dropout=0):
    """Run the network forward and return every intermediate value.

    Layers 1 .. num_layers-1 use ReLU (with `dropout` forwarded to the
    activation helper); layer num_layers uses softmax without dropout.

    The returned dict holds "A0" (the input X) and, for each layer k, "Z<k>"
    (the output of calc_z) and "A<k>" (the output of calc_activation_fn).
    """
    cache = {"A0": X}
    activation = X

    # Hidden layers.
    for layer in range(1, num_layers):
        tag = str(layer)
        pre_activation = calc_z(activation, parameters["W" + tag], parameters["b" + tag])
        cache["Z" + tag] = pre_activation
        activation = calc_activation_fn(pre_activation, "relu", dropout)
        cache["A" + tag] = activation

    # Output layer.
    out_tag = str(num_layers)
    logits = calc_z(activation, parameters["W" + out_tag], parameters["b" + out_tag])
    cache["Z" + out_tag] = logits
    cache["A" + out_tag] = calc_activation_fn(logits, "softmax")

    return cache

In [374]:
def backward_pass(Y, parameters, cache, num_layers):
    """Back-propagate through the network and return the parameter gradients.

    The output-layer error term is (A_L - Y) transposed. For each earlier
    layer the error is pushed back through the next layer's weights and
    masked with the ReLU derivative of that layer's cached pre-activation.
    Every gradient is scaled by 1/m, where m = Y.shape[0].

    Returns a dict with "dW<k>" and "db<k>" for k = 1 .. num_layers.

    NOTE(review): any dropout mask applied in the forward pass is not stored
    in `cache`, so it is not reflected in these gradients.
    """
    m = Y.shape[0]
    scale = 1 / m
    grads = {}

    top = str(num_layers)
    delta = (cache["A" + top] - Y).T
    grads["dW" + top] = scale * np.dot(delta, cache["A" + str(num_layers - 1)])
    grads["db" + top] = scale * np.sum(delta, axis=1, keepdims=True)

    for layer in range(num_layers - 1, 0, -1):
        tag = str(layer)
        propagated = np.dot(delta.T, parameters["W" + str(layer + 1)]).T
        delta = propagated * compute_relu_grad(cache["Z" + tag])
        grads["dW" + tag] = scale * np.dot(delta, cache["A" + str(layer - 1)])
        grads["db" + tag] = scale * np.sum(delta, axis=1, keepdims=True)

    return grads

In [375]:
def update_weights(parameters, grads, learning_rate, num_layers):
    """Take one gradient-descent step on every "W<k>" and "b<k>".

    Each entry is updated in place with `-=`, and the same `parameters` dict
    is returned.
    """
    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            parameters[key] -= learning_rate * grads["d" + key]
    return parameters

In [376]:
def compute_cost(Ypred, Y):
    """Return the elementwise binary cross-entropy summed over all entries, divided by m.

    m is Y.shape[0]. 1e-10 is added inside both logarithms so that
    predictions of exactly 0 or 1 do not produce log(0).
    """
    m = Y.shape[0]
    eps = 1e-10
    hit_term = np.multiply(Y, np.log(Ypred + eps))
    miss_term = np.multiply((1 - Y), np.log(1 - Ypred + eps))
    return -(1 / m) * np.sum(hit_term + miss_term)

In [429]:
def model_mb(X, Y, layers, num_iterations=100, learning_rate=0.001, mini_batch_size=100, dropout=0):
    """Train the network with mini-batch gradient descent and return its parameters.

    Parameters
    ----------
    X, Y : sample and target arrays, sliced along axis 0 into mini-batches.
    layers : layer widths from input to output.
    num_iterations : number of passes over the mini-batches.
    learning_rate : step size passed to update_weights.
    mini_batch_size : rows per mini-batch. Only whole batches are used, so
        trailing rows that do not fill a batch are never trained on.
    dropout : forwarded unchanged to forward_pass.

    Returns
    -------
    The parameter dict after training.

    The mean per-batch cost is printed every 100th iteration.
    """
    parameters = initialize_parameters(layers)
    num_layers = np.size(layers)-1
    num_mini_batches = int(X.shape[0]/mini_batch_size)

    for i in range(num_iterations):
        # Batch boundaries are fixed; only the visiting order changes per iteration.
        shfl = np.arange(num_mini_batches).astype(int)
        np.random.shuffle(shfl)
        cost = 0
        for j in range(num_mini_batches):
            batch_idx = shfl[j]
            X_batch = X[batch_idx*mini_batch_size:(batch_idx+1)*mini_batch_size,:]
            Y_batch = Y[batch_idx*mini_batch_size:(batch_idx+1)*mini_batch_size,:]

            cache = forward_pass(X_batch, parameters, num_layers, dropout)
            grads = backward_pass(Y_batch, parameters, cache, num_layers)
            parameters = update_weights(parameters, grads, learning_rate, num_layers)

            # The cost uses activations computed before this batch's update.
            cost += compute_cost(cache["A"+str(num_layers)], Y_batch)

        if(0 == ((i+1)%100)):
            # Apply the format string; passing the values as extra print()
            # arguments left the %d / %.3f placeholders in the output.
            print("Iteration: %d, Cost: %.3f" % (i+1, cost/num_mini_batches))

    return parameters

In [378]:
def predictions(labels, data, params, num_layers):
    """Return the classification accuracy of the network on `data`.

    Parameters
    ----------
    labels : one-hot targets, one row per sample.
    data : samples, one row per sample.
    params : parameter dict passed to forward_pass.
    num_layers : number of layers; selects the output activation "A<num_layers>".

    Returns
    -------
    The fraction of rows whose highest-scoring output index matches the
    index of the label's maximum.
    """
    cache = forward_pass(data, params, num_layers)
    preds = cache["A"+str(num_layers)]

    # Compare class indices rather than one-hot rows. Counting one-hot
    # mismatches and halving them over-counts errors when a row has several
    # equal maxima; argmax always selects exactly one class per row.
    predicted_class = np.argmax(preds, axis=1)
    true_class = np.argmax(labels, axis=1)

    num_wrong = np.sum(predicted_class != true_class)
    accuracy = 1 - (num_wrong/labels.shape[0])

    return accuracy

# Case 11: Dropout (the run below passes dropout=0, so dropout is disabled)

In [436]:
# Layer widths: 784 -> 32 -> 16 -> 10.
nl0, nl1, nl2, nl3 = 28*28, 32, 16, 10
layers = [nl0, nl1, nl2, nl3]
# dropout=0 leaves the dropout branch switched off for this run.
params = model_mb(
    train_data_shfl3,
    train_target_shfl,
    layers,
    num_iterations=100,
    learning_rate=0.001,
    mini_batch_size=250,
    dropout=0,
)

Iteration: %d, Cost: %.3f 100 0.643667760508


In [437]:
num_layers = len(layers) - 1
# Accuracy on the (shuffled, standardised) training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl3,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the standardised test split.
accuracy = predictions(labels=test_target, data=test_data3,
                       params=params, num_layers=num_layers)
print(accuracy)

0.88852
0.8958


# Kaggle competition

In [416]:
# Layer widths: 784 -> 128 -> 64 -> 32 -> 10.
nl0, nl1, nl2, nl3, nl4 = 28*28, 128, 64, 32, 10
layers = [nl0, nl1, nl2, nl3, nl4]
params = model_mb(
    train_data_shfl3,
    train_target_shfl,
    layers,
    num_iterations=500,
    learning_rate=0.001,
    mini_batch_size=250,
)

Iteration: %d, Cost: %.3f 100 0.341124978776
Iteration: %d, Cost: %.3f 200 0.231072942059
Iteration: %d, Cost: %.3f 300 0.173907728941
Iteration: %d, Cost: %.3f 400 0.135792527549
Iteration: %d, Cost: %.3f 500 0.107177961263


In [417]:
num_layers = len(layers) - 1
# Accuracy on the (shuffled, standardised) training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl3,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the standardised test split.
accuracy = predictions(labels=test_target, data=test_data3,
                       params=params, num_layers=num_layers)
print(accuracy)

0.9831
0.9619


In [418]:
# Read the Kaggle test CSV (path relative to the notebook) into a plain NumPy array.
test_images = (
    pd.read_csv('../Kaggle/MNIST/test.csv')
    .values
)

In [425]:
# Standardise the Kaggle images with their own scalar mean and std;
# the 1e-10 term guards against division by a zero std.
test_images2 = (test_images - test_images.mean()) / (np.sqrt(test_images.var()) + 1e-10)

In [423]:
# Forward pass without dropout, then take the highest-scoring output per row.
# Relies on `params` and `num_layers` from the preceding training/evaluation cells.
cache = forward_pass(test_images2, params, num_layers)
preds = cache["A%d" % num_layers]
kaggle_output = preds.argmax(axis=1)

In [424]:
# Write the submission file: a header row, then one "ImageId,Label" line per
# image with ids counting up from 1.
np.savetxt(
    '../Kaggle/MNIST/submission.csv',
    np.c_[np.arange(1, len(test_images) + 1), kaggle_output],
    fmt='%d',
    delimiter=',',
    header='ImageId,Label',
    comments='',
)

# Case 10: Batch normalization (incomplete)

# Case 9: Changed hyperparameters to save time

In [176]:
# Layer widths: 784 -> 32 -> 16 -> 10.
nl0, nl1, nl2, nl3 = 28*28, 32, 16, 10
layers = [nl0, nl1, nl2, nl3]
params = model_mb(
    train_data_shfl3,
    train_target_shfl,
    layers,
    num_iterations=100,
    learning_rate=0.001,
    mini_batch_size=250,
)

Iteration: %d, Cost: %.3f 100 0.643667760508


In [177]:
num_layers = len(layers) - 1
# Accuracy on the (shuffled, standardised) training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl3,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the standardised test split.
accuracy = predictions(labels=test_target, data=test_data3,
                       params=params, num_layers=num_layers)
print(accuracy)

0.88852
0.8958


# Case 8: Mini-batch

In [155]:
# Layer widths: 784 -> 32 -> 16 -> 10.
nl0, nl1, nl2, nl3 = 28*28, 32, 16, 10
layers = [nl0, nl1, nl2, nl3]
# NOTE(review): the recorded output below logs iteration 0, which the visible
# model_mb never prints; presumably it came from an earlier revision - confirm.
params = model_mb(
    train_data_shfl3,
    train_target_shfl,
    layers,
    num_iterations=700,
    learning_rate=0.05,
    mini_batch_size=100,
)

Iteration: %d, Cost: %.3f 0 1.22120072629
Iteration: %d, Cost: %.3f 100 0.00330191896417
Iteration: %d, Cost: %.3f 200 0.000677789697588
Iteration: %d, Cost: %.3f 300 0.000338467524802
Iteration: %d, Cost: %.3f 400 0.000217997485401
Iteration: %d, Cost: %.3f 500 0.000157799447052
Iteration: %d, Cost: %.3f 600 0.000122214119799


In [156]:
num_layers = len(layers) - 1
# Accuracy on the (shuffled, standardised) training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl3,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the standardised test split.
accuracy = predictions(labels=test_target, data=test_data3,
                       params=params, num_layers=num_layers)
print(accuracy)

1.0
0.9636


# Case 7: Smaller network, fewer iterations (to reduce computation time)

In [126]:
# Layer widths: 784 -> 32 -> 16 -> 10.
nl0, nl1, nl2, nl3 = 28*28, 32, 16, 10
layers = [nl0, nl1, nl2, nl3]
# NOTE(review): `model` is not defined in any visible cell, so this raises
# NameError on a fresh kernel. Presumably it was a full-batch trainer from a
# since-removed cell - confirm and restore it, or switch to model_mb.
params = model(train_data_shfl3, train_target_shfl, layers, 700, 0.05)

Iteration: %d, Cost: %.3f 0 3.28322205529
Iteration: %d, Cost: %.3f 100 1.52498070491
Iteration: %d, Cost: %.3f 200 0.925394513908
Iteration: %d, Cost: %.3f 300 0.735371843318
Iteration: %d, Cost: %.3f 400 0.642561162086
Iteration: %d, Cost: %.3f 500 0.58322147419
Iteration: %d, Cost: %.3f 600 0.540691518373


In [127]:
num_layers = len(layers) - 1
# Accuracy on the (shuffled, standardised) training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl3,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the standardised test split.
accuracy = predictions(labels=test_target, data=test_data3,
                       params=params, num_layers=num_layers)
print(accuracy)

0.91178
0.9162


# Case 6: Unit std dev (correct)

In [121]:
# Layer widths: 784 -> 64 -> 32 -> 10.
nl0, nl1, nl2, nl3 = 28*28, 64, 32, 10
layers = [nl0, nl1, nl2, nl3]
# NOTE(review): `model` is not defined in any visible cell, so this raises
# NameError on a fresh kernel. Presumably it was a full-batch trainer from a
# since-removed cell - confirm and restore it, or switch to model_mb.
params = model(train_data_shfl3, train_target_shfl, layers, 1000, 0.05)

Iteration: %d, Cost: %.3f 0 3.32719518049
Iteration: %d, Cost: %.3f 100 0.920747641883
Iteration: %d, Cost: %.3f 200 0.667183892741
Iteration: %d, Cost: %.3f 300 0.569935887113
Iteration: %d, Cost: %.3f 400 0.51119767978
Iteration: %d, Cost: %.3f 500 0.468540201849
Iteration: %d, Cost: %.3f 600 0.434777414317
Iteration: %d, Cost: %.3f 700 0.406875158972
Iteration: %d, Cost: %.3f 800 0.383157315886
Iteration: %d, Cost: %.3f 900 0.362787210359


In [122]:
num_layers = len(layers) - 1
# Accuracy on the (shuffled, standardised) training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl3,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the standardised test split.
accuracy = predictions(labels=test_target, data=test_data3,
                       params=params, num_layers=num_layers)
print(accuracy)

0.9401
0.9397


# Case 5: Unit variance

In [115]:
# Layer widths: 784 -> 64 -> 32 -> 10.
nl0, nl1, nl2, nl3 = 28*28, 64, 32, 10
layers = [nl0, nl1, nl2, nl3]
# NOTE(review): `model` is not defined in any visible cell, so this raises
# NameError on a fresh kernel - confirm and restore it, or switch to model_mb.
# NOTE(review): this call is textually identical to the "Case 6" call yet the
# recorded costs differ; presumably train_data_shfl3 was divided by the
# variance rather than the std when this cell ran - confirm.
params = model(train_data_shfl3, train_target_shfl, layers, 1000, 0.05)

Iteration: %d, Cost: %.3f 0 4.04739617434
Iteration: %d, Cost: %.3f 100 0.702949176404
Iteration: %d, Cost: %.3f 200 0.515706832665
Iteration: %d, Cost: %.3f 300 0.436034542052
Iteration: %d, Cost: %.3f 400 0.385267458556
Iteration: %d, Cost: %.3f 500 0.348151210512
Iteration: %d, Cost: %.3f 600 0.319520331222
Iteration: %d, Cost: %.3f 700 0.296176147099
Iteration: %d, Cost: %.3f 800 0.276604907786
Iteration: %d, Cost: %.3f 900 0.259939585609


In [116]:
num_layers = len(layers) - 1
# Accuracy on the (shuffled, scaled) training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl3,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the scaled test split.
accuracy = predictions(labels=test_target, data=test_data3,
                       params=params, num_layers=num_layers)
print(accuracy)

0.95818
0.9508


# Case 4: Add another layer

In [111]:
# Layer widths: 784 -> 128 -> 64 -> 32 -> 10.
nl0, nl1, nl2, nl3, nl4 = 28*28, 128, 64, 32, 10
layers = [nl0, nl1, nl2, nl3, nl4]
# Trains on the mean-centred (not std-scaled) data.
# NOTE(review): `model` is not defined in any visible cell, so this raises
# NameError on a fresh kernel - confirm and restore it, or switch to model_mb.
params = model(train_data_shfl2, train_target_shfl, layers, 1000, 0.05)

Iteration: %d, Cost: %.3f 0 3.25586302648
Iteration: %d, Cost: %.3f 100 2.72440423418
Iteration: %d, Cost: %.3f 200 1.18542395667
Iteration: %d, Cost: %.3f 300 0.799798939977
Iteration: %d, Cost: %.3f 400 0.670901248039
Iteration: %d, Cost: %.3f 500 0.600259972392
Iteration: %d, Cost: %.3f 600 0.551252278373
Iteration: %d, Cost: %.3f 700 0.512885402917
Iteration: %d, Cost: %.3f 800 0.48096523522
Iteration: %d, Cost: %.3f 900 0.453369933814


In [112]:
num_layers = len(layers) - 1
# Accuracy on the (shuffled, mean-centred) training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl2,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the mean-centred test split.
accuracy = predictions(labels=test_target, data=test_data2,
                       params=params, num_layers=num_layers)
print(accuracy)

0.9264
0.9262


# Case 3: Zero mean

In [109]:
# Layer widths: 784 -> 64 -> 32 -> 10.
nl0, nl1, nl2, nl3 = 28*28, 64, 32, 10
layers = [nl0, nl1, nl2, nl3]
# Trains on the mean-centred (not std-scaled) data.
# NOTE(review): `model` is not defined in any visible cell, so this raises
# NameError on a fresh kernel - confirm and restore it, or switch to model_mb.
params = model(train_data_shfl2, train_target_shfl, layers, 1000, 0.05)

Iteration: %d, Cost: %.3f 0 3.25593657005
Iteration: %d, Cost: %.3f 100 2.30840618614
Iteration: %d, Cost: %.3f 200 1.1881943244
Iteration: %d, Cost: %.3f 300 0.864869208823
Iteration: %d, Cost: %.3f 400 0.730145121645
Iteration: %d, Cost: %.3f 500 0.656298411581
Iteration: %d, Cost: %.3f 600 0.607328476808
Iteration: %d, Cost: %.3f 700 0.571203471051
Iteration: %d, Cost: %.3f 800 0.542487802948
Iteration: %d, Cost: %.3f 900 0.518415656222


In [110]:
num_layers = len(layers) - 1
# Accuracy on the (shuffled, mean-centred) training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl2,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the mean-centred test split.
accuracy = predictions(labels=test_target, data=test_data2,
                       params=params, num_layers=num_layers)
print(accuracy)

0.91354
0.9172


# Case 2: Shuffled data

In [95]:
# Layer widths: 784 -> 64 -> 32 -> 10.
nl0, nl1, nl2, nl3 = 28*28, 64, 32, 10
layers = [nl0, nl1, nl2, nl3]
# Trains on the shuffled data without centring or scaling.
# NOTE(review): `model` is not defined in any visible cell, so this raises
# NameError on a fresh kernel - confirm and restore it, or switch to model_mb.
params = model(train_data_shfl, train_target_shfl, layers, 1000, 0.05)

Iteration: %d, Cost: %.3f 0 3.26419535528
Iteration: %d, Cost: %.3f 100 2.33337878448
Iteration: %d, Cost: %.3f 200 1.16612822578
Iteration: %d, Cost: %.3f 300 0.855196248566
Iteration: %d, Cost: %.3f 400 0.729524977565
Iteration: %d, Cost: %.3f 500 0.660332431973
Iteration: %d, Cost: %.3f 600 0.613826190547
Iteration: %d, Cost: %.3f 700 0.578924857411
Iteration: %d, Cost: %.3f 800 0.550779046348
Iteration: %d, Cost: %.3f 900 0.52690506348


In [97]:
num_layers = len(layers) - 1
# Accuracy on the shuffled training split.
accuracy = predictions(labels=train_target_shfl, data=train_data_shfl,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the unmodified test split.
accuracy = predictions(labels=test_target, data=test_data,
                       params=params, num_layers=num_layers)
print(accuracy)

0.91328
0.9184


# Case 1: Normal

In [80]:
# Layer widths: 784 -> 64 -> 32 -> 10.
nl0, nl1, nl2, nl3 = 28*28, 64, 32, 10
layers = [nl0, nl1, nl2, nl3]
# Trains on the data as loaded (no shuffling, centring or scaling).
# NOTE(review): `model` is not defined in any visible cell, so this raises
# NameError on a fresh kernel - confirm and restore it, or switch to model_mb.
params = model(train_data, train_target, layers, 1000, 0.05)

Iteration: %d, Cost: %.3f 0 3.26419535528
Iteration: %d, Cost: %.3f 100 2.33337878448
Iteration: %d, Cost: %.3f 200 1.16612822578
Iteration: %d, Cost: %.3f 300 0.855196248566
Iteration: %d, Cost: %.3f 400 0.729524977565
Iteration: %d, Cost: %.3f 500 0.660332431973
Iteration: %d, Cost: %.3f 600 0.613826190547
Iteration: %d, Cost: %.3f 700 0.578924857411
Iteration: %d, Cost: %.3f 800 0.550779046348
Iteration: %d, Cost: %.3f 900 0.52690506348


In [81]:
num_layers = len(layers) - 1
# Accuracy on the training split as loaded.
accuracy = predictions(labels=train_target, data=train_data,
                       params=params, num_layers=num_layers)
print(accuracy)
# Accuracy on the unmodified test split.
accuracy = predictions(labels=test_target, data=test_data,
                       params=params, num_layers=num_layers)
print(accuracy)

0.91328
0.9184
