# Samer Baslan
# CMPE-258: Deep Learning, Vijay Eranti
# Spring 2021, SJSU
# Homework 4 Part 1: MNIST classifier with various training knobs with Numpy


Resources used:
1. Grokking Deep Learning (Chapter 8)
2. https://machinelearningmastery.com/handwritten-digit-recognition-using-convolutional-neural-networks-python-keras/


Note: Some functionality (confusion matrix, update learning rate, show errors) was difficult to implement in this version because all the examples use models created in Keras and the fit/predict API. They will be implemented in part 2.


##Imports

In [1]:
import numpy as np
import sys

In [61]:
from keras.datasets import mnist
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

##Function Definitions

In [3]:
#return x if x>0; 0 otherwise
def relu(x):
  return (x >= 0) * x 

#returns 1 for input > 0
def relu2deriv(output):
  return output >= 0

In [56]:
def plot_confusion_matrix(true, pred):
  cm = confusion_matrix(true, pred)
  plt.imshow(cm, interpolation = "nearest", mcap = plt.cm.rainbow)
  labels = range(10)
  ticks = np.arange(len(labels))
  plt.xticks(ticks, labels, rotation = 50)
  plt.yticks(tick_marks, labels)
  thresh = cm.max() / 2.0
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, cm[i, j], horizontalalignment = "center", color = "white" if cm[i, j] > thresh else "black")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()
    

##Parameters

In [4]:
batch_size = 100
alpha, iterations = (0.001, 300)
pixels_per_image, num_labels, hidden_size = (784, 10, 100)

##Load/Scale/Normalize Data

In [5]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [39]:
normalization_layer = preprocessing.Normalization()
normalization_layer.adapt(X_train)
normalization_layer(X_train)

<tf.Tensor: shape=(60000, 28, 28), dtype=float32, numpy=
array([[[-0.00300378, -0.00990673, -0.02825189, ..., -0.06605659,
         -0.02814181, -0.00788948],
        [-0.00300378, -0.00990673, -0.02825189, ..., -0.06605659,
         -0.02814181, -0.00788948],
        [-0.00300378, -0.00990673, -0.02825189, ..., -0.06605659,
         -0.02814181, -0.00788948],
        ...,
        [-0.00300378, -0.00990673, -0.02825189, ..., -0.06605659,
         -0.02814181, -0.00788948],
        [-0.00300378, -0.00990673, -0.02825189, ..., -0.06605659,
         -0.02814181, -0.00788948],
        [-0.00300378, -0.00990673, -0.02825189, ..., -0.06605659,
         -0.02814181, -0.00788948]],

       [[-0.00300378, -0.00990673, -0.02825189, ..., -0.06605659,
         -0.02814181, -0.00788948],
        [-0.00300378, -0.00990673, -0.02825189, ..., -0.06605659,
         -0.02814181, -0.00788948],
        [-0.00300378, -0.00990673, -0.02825189, ..., -0.06605659,
         -0.02814181, -0.00788948],
        ..

In [45]:
normalization_layer.adapt(X_test)
normalization_layer(X_test)

<tf.Tensor: shape=(10000, 28, 28), dtype=float32, numpy=
array([[[-0.00329633, -0.01032096, -0.02632251, ..., -0.06433523,
         -0.02871007, -0.00899782],
        [-0.00329633, -0.01032096, -0.02632251, ..., -0.06433523,
         -0.02871007, -0.00899782],
        [-0.00329633, -0.01032096, -0.02632251, ..., -0.06433523,
         -0.02871007, -0.00899782],
        ...,
        [-0.00329633, -0.01032096, -0.02632251, ..., -0.06433523,
         -0.02871007, -0.00899782],
        [-0.00329633, -0.01032096, -0.02632251, ..., -0.06433523,
         -0.02871007, -0.00899782],
        [-0.00329633, -0.01032096, -0.02632251, ..., -0.06433523,
         -0.02871007, -0.00899782]],

       [[-0.00329633, -0.01032096, -0.02632251, ..., -0.06433523,
         -0.02871007, -0.00899782],
        [-0.00329633, -0.01032096, -0.02632251, ..., -0.06433523,
         -0.02871007, -0.00899782],
        [-0.00329633, -0.01032096, -0.02632251, ..., -0.06433523,
         -0.02871007, -0.00899782],
        ..

In [46]:
images, labels = (X_train[0:1000].reshape(1000, 28*28) / 255, y_train[0:1000])

In [47]:
one_hot_labels = np.zeros((len(labels), 10))
for i, l in enumerate(labels):
  one_hot_labels[i][l] = 1

labels = one_hot_labels #now labels is (1000, 10)

In [48]:
test_images = X_test.reshape(len(X_test), 28*28) / 255
test_labels = np.zeros((len(y_test), 10))

for i, l in enumerate(y_test):
  test_labels[i][l] = 1

I found that using this structure gave me most consistent results

In [52]:
np.random.seed(1)

weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size * 2)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size * 2, 80)) - 0.1
weights_2_3 = 0.2*np.random.random((80, num_labels)) - 0.1

## 3 Layer Neural Network, RELU nonlinearity, no dropout (overfitting expected)

In [53]:
alpha, iterations, hidden_size = (0.005, 300, 100)
pixels_per_image, num_labels = (784, 10)

weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size * 2)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size * 2, 80)) - 0.1
weights_2_3 = 0.2*np.random.random((80, num_labels)) - 0.1

for j in range(iterations):
    error, correct_cnt = (0.0,0)
    for i in range(len(images)):
        layer_0 = images[i:i+1]
        layer_1 = relu(np.dot(layer_0,weights_0_1))
        layer_2 = relu(np.dot(layer_1,weights_1_2))
        layer_3 = np.dot(layer_2, weights_2_3)

        error += np.sum((labels[i:i+1] - layer_3) ** 2)
        correct_cnt += int(np.argmax(layer_3) == np.argmax(labels[i:i+1]))
        layer_3_delta = (labels[i:i+1] - layer_3)
        layer_2_delta = layer_3_delta.dot(weights_2_3.T) * relu2deriv(layer_2)
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)

        weights_2_3 += alpha * layer_2.T.dot(layer_3_delta)
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    if(j%10 == 0):
        test_error = 0.0
        test_correct_cnt = 0

        for i in range(len(test_images)):
            layer_0 = test_images[i:i+1]
            layer_1 = relu(np.dot(layer_0,weights_0_1))
            layer_2 = relu(np.dot(layer_1, weights_1_2))
            layer_3 = np.dot(layer_2, weights_2_3)

            test_error += np.sum((test_labels[i:i+1] - layer_3) ** 2)
            test_correct_cnt += int(np.argmax(layer_3) == np.argmax(test_labels[i:i+1]))

        sys.stdout.write("\n" + \
                         "I:" + str(j) + \
                         " Test-Err:" + str(test_error/ float(len(test_images)))[0:5] +\
                         " Test-Acc:" + str(test_correct_cnt/ float(len(test_images)))+\
                         " Train-Err:" + str(error/ float(len(images)))[0:5] +\
                         " Train-Acc:" + str(correct_cnt/ float(len(images))))


I:0 Test-Err:0.588 Test-Acc:0.6518 Train-Err:0.705 Train-Acc:0.563
I:10 Test-Err:0.369 Test-Acc:0.8405 Train-Err:0.240 Train-Acc:0.933
I:20 Test-Err:0.355 Test-Acc:0.8493 Train-Err:0.179 Train-Acc:0.967
I:30 Test-Err:0.358 Test-Acc:0.8408 Train-Err:0.148 Train-Acc:0.984
I:40 Test-Err:0.367 Test-Acc:0.838 Train-Err:0.128 Train-Acc:0.99
I:50 Test-Err:0.377 Test-Acc:0.8326 Train-Err:0.115 Train-Acc:0.994
I:60 Test-Err:0.390 Test-Acc:0.8243 Train-Err:0.105 Train-Acc:0.996
I:70 Test-Err:0.405 Test-Acc:0.8147 Train-Err:0.098 Train-Acc:0.998
I:80 Test-Err:0.421 Test-Acc:0.8095 Train-Err:0.094 Train-Acc:0.998
I:90 Test-Err:0.437 Test-Acc:0.8002 Train-Err:0.090 Train-Acc:0.998
I:100 Test-Err:0.452 Test-Acc:0.793 Train-Err:0.087 Train-Acc:0.998
I:110 Test-Err:0.467 Test-Acc:0.7879 Train-Err:0.084 Train-Acc:0.998
I:120 Test-Err:0.482 Test-Acc:0.7828 Train-Err:0.082 Train-Acc:0.998
I:130 Test-Err:0.493 Test-Acc:0.7786 Train-Err:0.080 Train-Acc:0.999
I:140 Test-Err:0.502 Test-Acc:0.7745 Train-Err:

##3 Layer Neural Network, RELU nonlinearity, with dropout to reduce overfitting

In [54]:
alpha, iterations, hidden_size = (0.001, 160, 100)
pixels_per_image, num_labels = (784, 10)

weights_0_1 = 0.2*np.random.random((pixels_per_image, hidden_size * 2)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size * 2, 80)) - 0.1
weights_2_3 = 0.2*np.random.random((80, num_labels)) - 0.1

for j in range(iterations):
    error, correct_cnt = (0.0,0)
    for i in range(len(images)):
        layer_0 = images[i:i+1]
        layer_1 = relu(np.dot(layer_0,weights_0_1))
        dropout_mask_1 = np.random.randint(2, size = layer_1.shape)
        layer_1 *= dropout_mask_1 * 2
        layer_2 = relu(np.dot(layer_1,weights_1_2))
        dropout_mask_2 = np.random.randint(2, size = layer_2.shape)
        layer_2 *= dropout_mask_2 * 2
        layer_3 = np.dot(layer_2, weights_2_3)

        error += np.sum((labels[i:i+1] - layer_3) ** 2)
        correct_cnt += int(np.argmax(layer_3) == np.argmax(labels[i:i+1]))
        layer_3_delta = (labels[i:i+1] - layer_3)
        layer_2_delta = layer_3_delta.dot(weights_2_3.T) * relu2deriv(layer_2)
        layer_2_delta *= dropout_mask_2
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
        layer_1_delta *= dropout_mask_1

        weights_2_3 += alpha * layer_2.T.dot(layer_3_delta)
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    if(j%10 == 0):
        test_error = 0.0
        test_correct_cnt = 0

        for i in range(len(test_images)):
            layer_0 = test_images[i:i+1]
            layer_1 = relu(np.dot(layer_0,weights_0_1))
            layer_2 = relu(np.dot(layer_1, weights_1_2))
            layer_3 = np.dot(layer_2, weights_2_3)

            test_error += np.sum((test_labels[i:i+1] - layer_3) ** 2)
            test_correct_cnt += int(np.argmax(layer_3) == np.argmax(test_labels[i:i+1]))

        sys.stdout.write("\n" + \
                         "I:" + str(j) + \
                         " Test-Err:" + str(test_error/ float(len(test_images)))[0:5] +\
                         " Test-Acc:" + str(test_correct_cnt/ float(len(test_images)))+\
                         " Train-Err:" + str(error/ float(len(images)))[0:5] +\
                         " Train-Acc:" + str(correct_cnt/ float(len(images))))


I:0 Test-Err:0.885 Test-Acc:0.261 Train-Err:1.151 Train-Acc:0.103
I:10 Test-Err:0.750 Test-Acc:0.6466 Train-Err:0.765 Train-Acc:0.47
I:20 Test-Err:0.655 Test-Acc:0.7006 Train-Err:0.663 Train-Acc:0.585
I:30 Test-Err:0.599 Test-Acc:0.7437 Train-Err:0.616 Train-Acc:0.618
I:40 Test-Err:0.560 Test-Acc:0.7699 Train-Err:0.592 Train-Acc:0.636
I:50 Test-Err:0.541 Test-Acc:0.7731 Train-Err:0.582 Train-Acc:0.647
I:60 Test-Err:0.530 Test-Acc:0.7777 Train-Err:0.569 Train-Acc:0.672
I:70 Test-Err:0.524 Test-Acc:0.7777 Train-Err:0.556 Train-Acc:0.681
I:80 Test-Err:0.515 Test-Acc:0.7735 Train-Err:0.549 Train-Acc:0.711
I:90 Test-Err:0.520 Test-Acc:0.778 Train-Err:0.559 Train-Acc:0.681
I:100 Test-Err:0.518 Test-Acc:0.7701 Train-Err:0.550 Train-Acc:0.708
I:110 Test-Err:0.514 Test-Acc:0.7785 Train-Err:0.573 Train-Acc:0.698
I:120 Test-Err:0.508 Test-Acc:0.783 Train-Err:0.553 Train-Acc:0.698
I:130 Test-Err:0.515 Test-Acc:0.7875 Train-Err:0.554 Train-Acc:0.685
I:140 Test-Err:0.517 Test-Acc:0.8018 Train-Err:0