# MNIST Recognizing Digits
MNIST is the "hello world" dataset for image recognition (classification). The objective is to recognize handwritten digits from 0 to 9. This notebook builds four models:
1. XGBoost Model 
2. Multi Layer Perceptron
3. Basic CNN
4. Large CNN
We will compare these models using the accuracy metric.

The following image shows the Kaggle submission scores of these four models. The XGBoost model is not far behind, but even a simple MLP outperforms it, and the CNNs improve the performance further by a good margin.


![Kaggle Performance of Models](https://lh3.googleusercontent.com/qb8VQRuRjuzPIUp4764gfRkVaxYzdgQRWSmggiQXmyQX-M4nO7XcgwFo6nZ3FCATtSV5lxe9jrDPDsWrJ08pWgv2dyzA_HhKjTwRRchKj4TacEL8YSvYqw1HFyvVKk9f93ySWFtoqU6WZYh9UD63hxwN-xaiEZigFSzF0yvV93A4L2qeDZ8TIzN0hywpBCHOVrLfGqrpeDupdrwPo4eS96Q86WathAYspMT4TVVVdSp7tmPH00byAp4Olgb03Opry1roEp_3NPwSe6B5EFXfLr0jT3JDYY8kb__uO8kgc7w-8edkGx3jkB_Q-evPgpdCEANW6fJlP98OXt702koT329uGBIpsKUIb42H5ZKkEl7ErYIRwIsFBiCtJzxgKc66J0f6Pc_Xv_PfcghnoNH-okSgmMw4-T7xoCoyGrkmHta6VuHHqWe9_OBm8Jc6t9xss5vpN5ziFPD1eppmmB-B5AIpINmjp7-VgZ4Psjy1Ky2Eko4eLVE8bvwdXm1GTqHXhflOOYRMh4Rfaw3JUUkSHGZPqmdYdc0040HAgz1uvPfrP-rMSq05Xy7hy8RYzRf2iWV3dv1kHM94LLzImBcu3c4YfUfekUaNsU9gu9fK6ElT5qmH_oXcb6Pj8nx_hGSMmgZq3qLYoMLW7gw-kp7Zpz2uEH73O9T_fy9V5CsLOYUmRRxA7T4E1gEaeY09nx4=w1675-h814-no?authuser=0)

The details for these models are below

## Data Loadin' and Preppin'

In [1]:
import pandas as pd
import numpy as np

# Read the digit-recognizer CSVs: the labelled training rows, the rows to be
# scored, and the sample submission whose column names are reused later.
train = pd.read_csv('../input/digit-recognizer/train.csv')
test = pd.read_csv('../input/digit-recognizer/test.csv')
sub = pd.read_csv('../input/digit-recognizer/sample_submission.csv')

# Print (rows, columns) of both frames as a quick sanity check.
for caption, frame in (('Train Shape :', train), ('test Shape :', test)):
    print(caption, frame.shape)

Setting the SEED values to make the results reproducible.

In [1]:
from numpy.random import seed
from tensorflow.random import set_seed

# One seed value shared by the NumPy and TensorFlow global generators and by
# the `random_state` / `seed` arguments used further down.
SEED = 42

seed(SEED)      # NumPy global RNG
set_seed(SEED)  # TensorFlow global RNG

## 0. Pitting against XGBoost

In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Every column except `label` is a feature; `label` is the target.
X = train.drop(columns='label').to_numpy()
Y = train['label'].to_numpy()

# Hold out 10% of the labelled rows to validate the booster.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=SEED
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# xgb.train consumes DMatrix objects rather than raw arrays.
train_dm = xgb.DMatrix(data=X_train, label=y_train)
test_dm = xgb.DMatrix(data=X_test, label=y_test)

params = {
    'max_depth'             : 5, 
    'eta'                   : 0.03, 
    'min_child_weight'      : 50, 
    'num_boost_round'       : 250, 
    'objective'             :'multi:softprob', 
    'seed'                  : SEED, 
    'num_class'             : 10,
    'silent'                : 1,
    'colsample_bytree'      : 0.5
}
%time model  = xgb.train(params, train_dm, num_boost_round = params['num_boost_round'])

In [1]:
from sklearn.metrics import accuracy_score, classification_report
# Each prediction row holds one probability per class ('multi:softprob'); the
# position of the largest one is the predicted digit.
train_pred = model.predict(train_dm).argmax(axis=1)
val_pred = model.predict(test_dm).argmax(axis=1)

print('TRAIN ACCURACY : ', accuracy_score(y_train, train_pred))
print('VAL ACCURACY : ', accuracy_score(y_test, val_pred))

In [1]:
# Score the unlabelled rows and keep the most probable class for each one.
score_dm = xgb.DMatrix(data=test.values)
xgb_digits = model.predict(score_dm).argmax(axis=1)

# Index the predictions by 1-based row number, then borrow the sample
# submission's column names before writing the CSV.
image_ids = np.arange(1, test.shape[0] + 1)
sub_xgb = pd.Series(xgb_digits, index=image_ids).reset_index()
sub_xgb.columns = sub.columns
sub_xgb.to_csv('submission_xgboost.csv', index=False)

Reshaping the dataset to flat 784-pixel vectors (one per 28 x 28 image) and normalizing the pixel values by dividing by 255

## 1. Baseline Model - Multilayer Perceptron

In [1]:
# Lay every sample out as a flat 784-value vector and divide by 255 so the
# inputs are small floats (assumes 8-bit pixel intensities).
n_train, n_test = train.shape[0], test.shape[0]
X_train = train.drop('label', axis=1).values.reshape(n_train, 784).astype('float') / 255
y_train = train.label.values
X_test = test.values.reshape(n_test, 784).astype('float') / 255
print(X_train.shape, X_test.shape)

One-hot encoding of the labels

In [1]:
from tensorflow.keras.utils import to_categorical
# Encode from the raw label column rather than from `y_train` itself, so that
# re-running this cell cannot one-hot encode an already encoded array.
y_train = to_categorical(train.label.values)
print (y_train.shape)

In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def baseline_model():
    """Build and compile the single-hidden-layer perceptron baseline.

    Returns:
        A compiled Keras ``Sequential`` model: a 784-unit ReLU layer over the
        784 input features followed by a 10-unit softmax layer, using
        categorical cross-entropy loss, the Adam optimizer and the accuracy
        metric.
    """
    mlp = Sequential()
    mlp.add(Dense(784, input_dim=784, activation='relu'))
    mlp.add(Dense(10, activation='softmax'))
    mlp.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return mlp

#### Fit Model

In [1]:
# Instantiate the MLP baseline and print its layer / parameter summary.
model = baseline_model()
model.summary()

In [1]:
# Train silently (verbose=0) for 40 epochs with batch size 200, using
# validation_split=0.1; `%time` reports the duration and `history` keeps the
# per-epoch metrics.
%time history = model.fit(X_train, y_train, validation_split = 0.1, epochs=40, batch_size=200, verbose=0)

In [1]:
import matplotlib.pyplot as plt
# Draw one figure per metric. The second curve comes from the
# `validation_split` hold-out used during fit, so it is labelled
# "validation" rather than "test".
for metric in ('loss', 'accuracy'):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history[f'val_{metric}'], label='validation')
    ax.set(title=metric.upper(), xlabel='epoch', ylabel=metric)
    ax.legend()
    plt.show()

In [1]:
import numpy as np
# The predicted digit is the position of the largest softmax output per row.
mlp_digits = model.predict(X_test).argmax(axis=1)

# Index by 1-based row number, reuse the sample submission's column names,
# and write the CSV.
mlp_ids = np.arange(1, test.shape[0] + 1)
sub_mlp = pd.Series(mlp_digits, index=mlp_ids).reset_index()
sub_mlp.columns = sub.columns
sub_mlp.to_csv('submission_baseline.csv', index=False)

## 2. Training Simple CNN

In [1]:
# Reshape every sample to the (28, 28, 1) layout that the CNNs declare as
# their `input_shape`, and divide the pixel values by 255.
image_shape = (28, 28, 1)
X_train = train.drop('label', axis=1).values.reshape(train.shape[0], *image_shape).astype('float') / 255
y_train = to_categorical(train.label.values)
X_test = test.values.reshape(test.shape[0], *image_shape).astype('float') / 255
print(X_train.shape, X_test.shape, y_train.shape)

In [1]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten

def basic_cnn_model():
    """Build and compile the single-convolution CNN.

    Returns:
        A compiled Keras ``Sequential`` model: one 5x5 convolution with 32
        filters over (28, 28, 1) inputs, 2x2 max pooling, 20% dropout, then a
        128-unit ReLU layer and a 10-unit softmax layer. It uses categorical
        cross-entropy loss, the Adam optimizer and the accuracy metric.
    """
    cnn = Sequential()
    cnn.add(Conv2D(32, (5, 5), input_shape=(28, 28, 1), activation='relu'))
    cnn.add(MaxPooling2D(2, 2))
    cnn.add(Dropout(0.2))
    cnn.add(Flatten())
    cnn.add(Dense(128, activation='relu'))
    cnn.add(Dense(10, activation='softmax'))
    cnn.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return cnn

In [1]:
# Instantiate the basic CNN (rebinding `model`) and print its summary.
model = basic_cnn_model()
model.summary()

In [1]:
# Same fit settings as the MLP run (40 epochs, batch size 200,
# validation_split=0.1, verbose=0); `%time` reports how long training takes.
%time history = model.fit(X_train, y_train, validation_split = 0.1, epochs=40, batch_size=200, verbose=0)

In [1]:
import matplotlib.pyplot as plt
# Draw one figure per metric. The second curve comes from the
# `validation_split` hold-out used during fit, so it is labelled
# "validation" rather than "test".
for metric in ('loss', 'accuracy'):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history[f'val_{metric}'], label='validation')
    ax.set(title=metric.upper(), xlabel='epoch', ylabel=metric)
    ax.legend()
    plt.show()

### Note
* The curves in these plots are much closer together for the CNN version than for the multilayer perceptron version
* The model has fewer parameters than the MLP version
* The training time was slightly higher than for the MLP version

In [1]:
# Save the submission file: take the most probable class per row, index it by
# 1-based row number, and reuse the sample submission's column names.
cnn_digits = model.predict(X_test).argmax(axis=1)
cnn_ids = np.arange(1, test.shape[0] + 1)
sub_cnn = pd.Series(cnn_digits, index=cnn_ids).reset_index()
sub_cnn.columns = sub.columns
sub_cnn.to_csv('submission_basic_CNN.csv', index=False)

## 3. Training a Larger Convolutional Neural Network 

In [1]:
def large_cnn_model():
    """Build and compile the two-convolution CNN.

    Returns:
        A compiled Keras ``Sequential`` model: a 5x5 and then a 3x3
        convolution (32 filters each), each followed by 2x2 max pooling, then
        20% dropout, a 128-unit ReLU layer and a 10-unit softmax layer. It
        uses categorical cross-entropy loss, the Adam optimizer and the
        accuracy metric.
    """
    stack = [
        # Convolutional feature extractor.
        Conv2D(32, (5, 5), input_shape=(28, 28, 1), activation='relu'),
        MaxPooling2D(2, 2),
        Conv2D(32, (3, 3), activation='relu'),
        MaxPooling2D(2, 2),
        Dropout(0.2),
        # Dense classification head.
        Flatten(),
        Dense(128, activation='relu'),
        Dense(10, activation='softmax'),
    ]
    cnn = Sequential(stack)
    cnn.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return cnn

In [1]:
# Instantiate the larger CNN (rebinding `model`) and print its summary.
model = large_cnn_model()
model.summary()

In [1]:
# Same fit settings as the earlier runs (40 epochs, batch size 200,
# validation_split=0.1, verbose=0); `%time` reports how long training takes.
%time history = model.fit(X_train, y_train, validation_split = 0.1, epochs=40, batch_size=200, verbose=0)

In [1]:
# Save the submission file: take the most probable class per row, index it by
# 1-based row number, and reuse the sample submission's column names.
large_cnn_digits = model.predict(X_test).argmax(axis=1)
large_cnn_ids = np.arange(1, test.shape[0] + 1)
sub_large_cnn = pd.Series(large_cnn_digits, index=large_cnn_ids).reset_index()
sub_large_cnn.columns = sub.columns
sub_large_cnn.to_csv('submission_large_CNN.csv', index=False)

In [1]:
import matplotlib.pyplot as plt
# Draw one figure per metric. The second curve comes from the
# `validation_split` hold-out used during fit, so it is labelled
# "validation" rather than "test".
for metric in ('loss', 'accuracy'):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history[f'val_{metric}'], label='validation')
    ax.set(title=metric.upper(), xlabel='epoch', ylabel=metric)
    ax.legend()
    plt.show()

### Note
* This time the loss and accuracy curves of the train and validation sets are closer than in the previous two versions
* The model is trained with nearly 1/5th of the previous CNN's parameters and 1/6th of the MLP's parameters
* The training time was higher even though there were fewer parameters

In [1]:
ls -l