<a href="https://colab.research.google.com/github/rlempka/mnist/blob/master/mnist_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal
from sklearn.metrics import accuracy_score
import pandas as pd

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

In [2]:
# First step is to get the mnist data
# We will use the keras.datasets library to load mnist

(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
# Note that the data is already split into train and test datasets
# Let's check the shape of our data

print("Training data shape:", X_train.shape)
print("Training labels shape:", y_train.shape)
print("Testing data sahpe:", X_test.shape)
print("Testing labels shape:", y_test.shape)

# Normalize data
X_train = X_train / 255.0
X_test = X_test / 255.0

Training data shape: (60000, 28, 28)
Training labels shape: (60000,)
Testing data sahpe: (10000, 28, 28)
Testing labels shape: (10000,)


In [4]:
# Proportion of data allocated to testing is therefore
print("{:.2f}%".format((X_test.shape[0] / (X_train.shape[0] + X_test.shape[0]))*100))

14.29%


In [0]:
# However because we want to use early stopping we will need to create a 
# validation set because this is how tensorflow determines early stopping

# Note that our grid search will still use cross validation on the 
# training data that is not part of the validation set

# The validation set will only be used at the end of each epoch

# We use 10% of the training data to create validation sets

X_valid, y_valid = X_train[:int(len(X_train)*.1)], y_train[:int(len(X_train)*.1)]

In [0]:
# We will try three approaches to solving the mnist problem. Our goal is to see
# if we can break 98% accuracy on the test set

# Our approaches will be as follows

# 1) Try using a simple MLP with a few hidden layers
# 2) Try using a simple convnet

# Let's start with the MLP
# We will use the Sequential library from keras

# We will define the model architecture in a function so that we can wrap
# the function in an sklearn wrapper and perform randomized grid search to
# tune hyperparameters

def build_mnist_mlp(n_hidden=2, n_neurons=30, learning_rate=10e-3, 
                    input_shape=[28, 28], activation="relu"):

  model = keras.models.Sequential()
  model.add(keras.layers.Flatten())

  for i in range(n_hidden):
    model.add(keras.layers.Dense(n_neurons, activation = activation))
  
  # Note that this is the output layer for which we need 10 neurons
  # to predict the 10 possible classes found in the MNIST dataset
  model.add(keras.layers.Dense(10, activation="softmax"))

  optimizer= keras.optimizers.SGD(lr=learning_rate)
  model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
                metrics=['accuracy'])
  return model


mnist_mlp = keras.wrappers.scikit_learn.KerasClassifier(build_mnist_mlp)

In [0]:
# Let's now create our parameter grid
# This will be sampled from during our randomized parameter space search

params = {
    "n_hidden" : [1, 2, 3, 4],
    "n_neurons" : [10, 20, 30, 40, 50],
    "learning_rate" : [10e-5,10e-4,10e-3]
}

In [0]:
# Note in the params distribution we sample from the reciprocal 
# distribution, the pdf is plotted below, note that the most likely
# values are near 0 hence the learning rateis highly unlikely to
# get larger than 0.01

# Note the pdf of the reciprocal random variable is (1 / (x*log(b/a)))
# For more information see the scipy documentation on scipy.stats.reciprocal

# a, b = 3e-4, 3e-2
# fig, ax = plt.subplots(1,1)
# x = np.linspace(reciprocal.ppf(0.01, a, b),
#                 reciprocal.ppf(0.99, a, b), 100)
# ax.plot(x, reciprocal.pdf(x, a, b), 'r-', lw=5, alpha=0.6, label="reciprocal pdf")

# Due to an error this causes I have to change the values in the learning_rate
# parameter search space to discrete values

In [9]:
checkpoint_cb = keras.callbacks.ModelCheckpoint("mnist_mlp.h5") 

rnd_search_cv = RandomizedSearchCV(mnist_mlp, params,
                                   n_iter=3, cv=3)
rnd_search_cv.fit(X_train, y_train, epochs=50,
                  validation_data = (X_valid, y_valid),
                  callbacks=[keras.callbacks.EarlyStopping(patience=10), checkpoint_cb])

Train on 40000 samples, validate on 6000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Train on 40000 samples, validate on 6000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f4ac62b05c0>,
                   iid='deprecated', n_iter=3, n_jobs=None,
                   param_distributions={'learning_rate': [0.0001, 0.001, 0.01],
                                        'n_hidden': [1, 2, 3, 4],
                                        'n_neurons': [10, 20, 30, 40, 50]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [10]:
best_mnist_mlp_params = rnd_search_cv.best_params_

print("Best params:", best_mnist_mlp_params)
print("Best score: {:.2f}%".format(rnd_search_cv.best_score_*100))

final_mnist_mlp = build_mnist_mlp(**best_mnist_mlp_params)

# Note we use 150 epochs instead of 100 on our final model to attempt to improve
# further

history = final_mnist_mlp.fit(X_train, y_train, epochs=150, 
                              validation_data=(X_valid, y_valid),
                        callbacks=[keras.callbacks.EarlyStopping(patience=10)])

Best params: {'n_neurons': 30, 'n_hidden': 3, 'learning_rate': 0.01}
Best score: 95.92%
Train on 60000 samples, validate on 6000 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
E

In [0]:
# Save the final mnist model

final_mnist_mlp.save("final_mnist_mlp.h5")

In [0]:
# # Now let's build our convolutional neural network

# We will use the keras sequential API

# The architecutre will be simple with the following design
# Input -> (Conv -> Relu -> Pool)*N -> FC
# Or Input -> (Conv -> Relu)*N -> FC
# Where N specifies the number of time the layer pattern within
# the parenthesis is repeated

# Note that although the general pattern will repeat we will tend to
# reduce spatial dimensions and increase the depth of our network
# as we move towards the FC (i.e. output) layer



def build_mnist_convnet(input_shape=[28,28, 1], kernel_size=2, strides=(1,1),
                        activation='relu', num_layers=3, pool_layer=True,
                        padding="same", dropout=False, 
                        learning_rate=10e-4):
 
  model = keras.models.Sequential()
  current_filter_size = 32
  
  # Note that we have the kernel size as 4 for the first conv layer
  # instead of 2 to capture a larger set of the spatial dimension
  # in the first layer
  model.add(keras.layers.Conv2D(current_filter_size, 4, activation=activation, 
                                input_shape=input_shape, padding=padding))
  for i in range(num_layers):
    # increase depth of network by a factor of two every layer
    current_filter_size *= 2
    model.add(keras.layers.Conv2D(current_filter_size, kernel_size, 
                                  activation=activation, padding=padding))
    if pool_layer:
      model.add(keras.layers.MaxPooling2D(2))
  
  # Architecture at the end of the network flattens output from final pool
  # or conv layer above and then connects to a dense network with two hidden
  # layers

  # Depth is now reduced by a factor of 2 until the final output layer of 10
  # neurons is reached

  model.add(keras.layers.Flatten())
  model.add(keras.layers.Dense(128, activation=activation))
  if dropout:
    model.add(keras.layers.Dropout(0.5))
  model.add(keras.layers.Dense(64, activation=activation))
  if dropout:
    model.add(keras.layers.Dropout(0.5))
  model.add(keras.layers.Dense(10, activation="softmax"))

  optimizer= keras.optimizers.SGD(lr=learning_rate)
  model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
                metrics=['accuracy'])
  return model

convnet_mnist = keras.wrappers.scikit_learn.KerasClassifier(build_mnist_convnet)
  

In [0]:
cnet_params = {
    "num_layers" : [1, 2, 3, 4],
    "dropout" : [True, False],
    "learning_rate" : [10e-4,10e-3, 10e-2]

}

In [0]:
# Keras expects a depth dimension for the matrices hence we need to rehsape
# so that each instance is [28,28,1] instead of [28,28]

X_train_conv = X_train.reshape(-1, 28, 28, 1)
X_valid_conv = X_valid.reshape(-1, 28, 28, 1)
X_test_conv = X_test.reshape(-1, 28, 28, 1)

In [15]:
checkpoint_cb = keras.callbacks.ModelCheckpoint("mnist_mlp.h5") 


rnd_search_conv_cv = RandomizedSearchCV(convnet_mnist, cnet_params,
                                   n_iter=5, cv=2)

# To save time I am going to lower the epochs from 100 to 20 here and we will
# train on a higher number of epochs based on the best params

# Also we've trained cv to 2

rnd_search_conv_cv.fit(X_train_conv, y_train, epochs=20,
                  validation_data = (X_valid_conv, y_valid),
                  callbacks=[keras.callbacks.EarlyStopping(patience=10), 
                             checkpoint_cb,
                             keras.callbacks.TensorBoard('./')])

Train on 30000 samples, validate on 6000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Train on 30000 samples, validate on 6000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 30000 samples, validate on 6000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 30000 samples, validate on 6000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


RandomizedSearchCV(cv=2, error_score=nan,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f4a105b8e48>,
                   iid='deprecated', n_iter=5, n_jobs=None,
                   param_distributions={'dropout': [True, False],
                                        'learning_rate': [0.001, 0.01, 0.1],
                                        'num_layers': [1, 2, 3, 4]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [17]:
best_convnet_params = rnd_search_conv_cv.best_params_

print("Best convnet params: ", best_convnet_params)
print("Best score {:.2f}".format(rnd_search_conv_cv.best_score_*100))



Best convnet params:  {'num_layers': 3, 'learning_rate': 0.1, 'dropout': False}
Best score 99.08


In [45]:
y_conv_preds = rnd_search_conv_cv.best_estimator_.predict(
        np.expand_dims(X_test,axis=-1))
y_mlp_preds = rnd_search_cv.best_estimator_.predict(
        X_test.reshape(-1,28*28))

print("Convnet accuracy: {:2f}%".format(
    accuracy_score(y_test,y_conv_preds)))

print("MLP accuracy: {:2f}%".format(
    accuracy_score(y_test, y_mlp_preds)
))


Convnet accuracy: 0.993700%
MLP accuracy: 0.967800%


In [0]:
# Hence the convolutional networks performs the best out of our two
# models

# The best conv net was not very deep having only 3 layers. Also, we did
# not use dropout on the winning convet and had a learning rate of .1 which
# is pretty high

# If we were to spend more time on this problem we could increase the learning
# rate even further (perhaps to .15) or increase the number of epochs at the
# current best learning rate of .1


In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
X_kaggle_test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/test.csv')

In [0]:
y_kaggle_preds = rnd_search_conv_cv.best_estimator_.predict(np.asarray(X_kaggle_test).reshape(-1,28,28,1)/255.0)

In [0]:
submission = pd.DataFrame(np.hstack([np.arange(1,len(y_kaggle_preds)+1).reshape(-1,1),y_kaggle_preds.reshape(-1,1)]), columns=['ImageId', 'Label'])

In [0]:
from google.colab import files
submission.to_csv("/content/drive/My Drive/Colab Notebooks/submission.csv", index=False)
files.download("/content/drive/My Drive/Colab Notebooks/submission.csv")


# Note that this submission should get you in the top 10% on Kaggle