In [1]:
"""
    Digit Recognition using Deep Learning
    Author: Sadip Giri
    Date: 25th Oct. 2018
"""

'\n    Digit Recognition using Deep Learning\n    Author: Sadip Giri\n    Date: 25th Oct. 2018\n'

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the digit CSV files with pandas: the labelled training set and the test set.
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in ('./digit_data/train.csv', './digit_data/test.csv')
)

In [4]:
# Peek at the first three training rows (a label column followed by the pixel columns).
train_set.head(n=3)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# First three rows of the test frame, for comparison with the training frame.
test_set.head(n=3)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Feature frame: column positions 1..784 hold the pixel values; position 0 (the label) is left out.
pixel_columns = slice(1, 785)
trainset_without_labels = train_set.iloc[:, pixel_columns]

In [7]:
# Last three rows of the pixel-only training frame.
trainset_without_labels.tail(n=3)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
41997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Target vector: the first column of the training frame holds the digit label.
label_position = 0
trainset_labels = train_set.iloc[:, label_position]

In [11]:
# Show the first four labels.
trainset_labels.head(4)

0    1
1    0
2    1
3    4
Name: label, dtype: int64

In [12]:
# The test set has no label column, so the 784 pixel columns start at position 0.
test_pixel_columns = slice(0, 784)
X_test = test_set.iloc[:, test_pixel_columns]

In [15]:
# Last five rows of the test features.
X_test.tail(n=5)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
27995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20 percent of the labelled data as a validation set; the fixed
# random_state pins the shuffle used for the split.
split_parts = train_test_split(
    trainset_without_labels,
    trainset_labels,
    test_size=0.2,
    random_state=1111,
)
X_train, X_cv, Y_train, Y_cv = split_parts

In [22]:
# Convert the training features (33600 rows x 784 columns) to a plain NumPy matrix.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; np.asarray
# works on every pandas version and is a no-op if this cell is run again on an
# array. The frame is already 2-D, so the hard-coded reshape(33600, 784) is
# replaced by a check on the feature count that does not depend on the split size.
X_train = np.asarray(X_train)
assert X_train.ndim == 2 and X_train.shape[1] == 784, X_train.shape

In [23]:
# Display the converted training matrix (NumPy prints an abbreviated view).
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [24]:
# Same conversion for the validation features (8400 rows x 784 columns).
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; np.asarray
# works on every pandas version and is safe to re-run. The hard-coded
# reshape(8400, 784) was a no-op, so only the feature count is checked.
X_cv = np.asarray(X_cv)
assert X_cv.ndim == 2 and X_cv.shape[1] == 784, X_cv.shape

In [25]:
# Display the converted validation matrix (NumPy prints an abbreviated view).
X_cv

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
# Same conversion for the test features (28000 rows x 784 columns).
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; np.asarray
# works on every pandas version and is safe to re-run. The hard-coded
# reshape(28000, 784) was a no-op, so only the feature count is checked.
X_test = np.asarray(X_test)
assert X_test.ndim == 2 and X_test.shape[1] == 784, X_test.shape

In [27]:
# Display the converted test matrix (NumPy prints an abbreviated view).
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [28]:
# Data cleaning and normalization:
# first check the range of pixel intensities. Look at the whole training matrix
# rather than a single sample, because the scaling step divides every value by
# the assumed global maximum.
print(X_train.min(), X_train.max())

0 255


In [29]:
# Feature normalization: the pixel intensities currently lie between 0 and 255,
# so cast to float32 and divide by 255 (broadcast over the whole array).
# NOTE: this cell rebinds the same names, so running it a second time divides
# the already-scaled values by 255 again.
X_train = X_train.astype('float32') / 255
X_cv = X_cv.astype('float32') / 255
X_test = X_test.astype('float32') / 255

In [30]:
# Keras pieces used by the model cells.
from keras.models import Sequential
# NOTE(review): wildcard import. The cells visible here only use Dense and
# Dropout; consider importing those explicitly once the rest of the notebook
# has been checked for other layer names.
from keras.layers import *
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [31]:
# One-hot encode the labels: each class index becomes a binary vector with one
# entry per digit class. The same encoding is applied to the validation labels.
num_of_digits = 10
y_train, y_cv = (
    to_categorical(labels, num_classes=num_of_digits)
    for labels in (Y_train, Y_cv)  # training labels, then validation labels
)

In [32]:
# Sanity check: look at the first encoded training label.
y_train[0] # one-hot vector for the digit 7 (the 1 sits at index 7)

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0.], dtype=float32)

In [82]:
# Model fitting is an empirical process, so we try different models with
# different optimizers and tune the hyperparameters along the way.
# First model: four ReLU hidden layers followed by a softmax output layer.
model = Sequential([
    # input_dim = number of features = number of pixel columns = 784
    Dense(units=300, activation='relu', name='1st_Hidden_Layer', input_dim=784),
    Dense(units=100, activation='relu', name='2nd_Hidden_Layer'),
    Dense(units=100, activation='relu', name='3rd_Hidden_Layer'),
    Dense(units=200, activation='relu', name='4th_Hidden_Layer'),
    Dense(units=num_of_digits, activation='softmax', name='Output_Layer'),
])

In [83]:
# Print the layers of the first model with their output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1st_Hidden_Layer (Dense)     (None, 300)               235500    
_________________________________________________________________
2nd_Hidden_Layer (Dense)     (None, 100)               30100     
_________________________________________________________________
3rd_Hidden_Layer (Dense)     (None, 100)               10100     
_________________________________________________________________
4th_Hidden_Layer (Dense)     (None, 200)               20200     
_________________________________________________________________
Output_Layer (Dense)         (None, 10)                2010      
Total params: 297,910
Trainable params: 297,910
Non-trainable params: 0
_________________________________________________________________


In [84]:
# Compile the first model:
# - categorical cross-entropy loss, because there are 10 classes to classify
#   (a two-class problem would use a binary loss instead)
# - stochastic gradient descent as the optimizer
# - accuracy as the only metric we track for now
model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [85]:
# Fit the first model: batches of 100 samples for 20 epochs, with the
# validation split passed in so each epoch also reports val_loss / val_acc.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_cv, y_cv),
    epochs=20,
    batch_size=100,
    verbose=2,
)

Train on 33600 samples, validate on 8400 samples
Epoch 1/20
 - 3s - loss: 1.9196 - acc: 0.4617 - val_loss: 1.1477 - val_acc: 0.7515
Epoch 2/20
 - 2s - loss: 0.6910 - acc: 0.8259 - val_loss: 0.4694 - val_acc: 0.8708
Epoch 3/20
 - 2s - loss: 0.4105 - acc: 0.8835 - val_loss: 0.3593 - val_acc: 0.8944
Epoch 4/20
 - 2s - loss: 0.3372 - acc: 0.8995 - val_loss: 0.3115 - val_acc: 0.9074
Epoch 5/20
 - 2s - loss: 0.2973 - acc: 0.9124 - val_loss: 0.2872 - val_acc: 0.9148
Epoch 6/20
 - 2s - loss: 0.2702 - acc: 0.9201 - val_loss: 0.2692 - val_acc: 0.9192
Epoch 7/20
 - 2s - loss: 0.2491 - acc: 0.9270 - val_loss: 0.2450 - val_acc: 0.9264
Epoch 8/20
 - 2s - loss: 0.2301 - acc: 0.9332 - val_loss: 0.2340 - val_acc: 0.9282
Epoch 9/20
 - 2s - loss: 0.2155 - acc: 0.9369 - val_loss: 0.2162 - val_acc: 0.9357
Epoch 10/20
 - 2s - loss: 0.2019 - acc: 0.9408 - val_loss: 0.2073 - val_acc: 0.9360
Epoch 11/20
 - 2s - loss: 0.1895 - acc: 0.9455 - val_loss: 0.1978 - val_acc: 0.9408
Epoch 12/20
 - 2s - loss: 0.1784 - a

<keras.callbacks.History at 0x11f658b00>

In [33]:
# The model above finished with a score of 95.60% (noted as the validation / training score).
# We cannot compute a test score here, because the test set has no labels;
# submitting the predictions to Kaggle would reveal it.

In [34]:
# Adam is said to improve performance considerably, so build a second,
# separate model with the same layers to train with that optimizer.
# Note: layer names must not contain spaces.
model2 = Sequential([
    # the input layer dimension is the number of variables (features)
    Dense(activation='relu', units=300, name='1st_Hidden_Layer', input_dim=784),
    Dense(activation='relu', units=100, name='2nd_Hidden_Layer'),
    Dense(activation='relu', units=100, name='3rd_Hidden_Layer'),
    Dense(activation='relu', units=200, name='4th_Hidden_Layer'),
    Dense(activation='softmax', units=num_of_digits, name='Output_Layer'),
])

In [35]:
# Print the layers of the second model with their output shapes and parameter counts.
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1st_Hidden_Layer (Dense)     (None, 300)               235500    
_________________________________________________________________
2nd_Hidden_Layer (Dense)     (None, 100)               30100     
_________________________________________________________________
3rd_Hidden_Layer (Dense)     (None, 100)               10100     
_________________________________________________________________
4th_Hidden_Layer (Dense)     (None, 200)               20200     
_________________________________________________________________
Output_Layer (Dense)         (None, 10)                2010      
Total params: 297,910
Trainable params: 297,910
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Compile the second model: same loss and metric as before, but with Adam as the optimizer.
model2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [38]:
# Fit the second model with the same batch size, epoch count and validation data
# as the first one, so the two optimizers can be compared.
model2.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_cv, y_cv),
    epochs=20,
    batch_size=100,
    verbose=2,
)

Train on 33600 samples, validate on 8400 samples
Epoch 1/20
 - 4s - loss: 0.3392 - acc: 0.9008 - val_loss: 0.1405 - val_acc: 0.9550
Epoch 2/20
 - 3s - loss: 0.1207 - acc: 0.9639 - val_loss: 0.1095 - val_acc: 0.9658
Epoch 3/20
 - 3s - loss: 0.0779 - acc: 0.9766 - val_loss: 0.1011 - val_acc: 0.9693
Epoch 4/20
 - 3s - loss: 0.0579 - acc: 0.9819 - val_loss: 0.1122 - val_acc: 0.9655
Epoch 5/20
 - 3s - loss: 0.0447 - acc: 0.9854 - val_loss: 0.0980 - val_acc: 0.9726
Epoch 6/20
 - 3s - loss: 0.0321 - acc: 0.9903 - val_loss: 0.1174 - val_acc: 0.9662
Epoch 7/20
 - 3s - loss: 0.0262 - acc: 0.9915 - val_loss: 0.1000 - val_acc: 0.9712
Epoch 8/20
 - 3s - loss: 0.0308 - acc: 0.9901 - val_loss: 0.0959 - val_acc: 0.9739
Epoch 9/20
 - 3s - loss: 0.0227 - acc: 0.9926 - val_loss: 0.1054 - val_acc: 0.9749
Epoch 10/20
 - 3s - loss: 0.0203 - acc: 0.9935 - val_loss: 0.1443 - val_acc: 0.9654
Epoch 11/20
 - 3s - loss: 0.0173 - acc: 0.9940 - val_loss: 0.1358 - val_acc: 0.9705
Epoch 12/20
 - 3s - loss: 0.0202 - a

<keras.callbacks.History at 0x1205724a8>

In [39]:
"""
    As it turns out, it does appear to be the case that the optimizer plays a crucial part in the validation score. 
    In particular, the model which relies on 'Adam' as its optimizer tends to perform 1.5 - 2.5% better on average. 
    Going forward, we will use 'Adam' as our optimizer of choice.
"""

"\n    As it turns out, it does appear to be the case that the optimizer plays a crucial part in the validation score. \n    In particular, the model which relies on 'Adam' as its optimizer tends to perform 1.5 - 2.5% better on average. \n    Going forward, we will use 'Adam' as our optimizer of choice.\n"

In [43]:
# Next we add dropout (rate 0.3) to the model to prevent overfitting.
# Before that we tried adding another hidden layer to see whether it helps;
# it did not change performance much, so we keep the model2 architecture
# and insert dropout after the first three hidden layers.
model3 = Sequential([
    Dense(units=300, activation='relu', name='1st_hidden_layer', input_dim=784),
    Dropout(0.3),
    Dense(units=100, activation='relu', name='2nd_hidden_layer'),
    Dropout(0.3),
    Dense(units=100, activation='relu', name='3rd_hidden_layer'),
    Dropout(0.3),
    Dense(units=200, activation='relu', name='4th_hidden_layer'),
    Dense(units=num_of_digits, activation='softmax', name='Output_Layer'),
])

In [44]:
# Print the layers of the dropout model with their output shapes and parameter counts.
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1st_hidden_layer (Dense)     (None, 300)               235500    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
2nd_hidden_layer (Dense)     (None, 100)               30100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
3rd_hidden_layer (Dense)     (None, 100)               10100     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
4th_hidden_layer (Dense)     (None, 200)               20200     
__________

In [45]:
# Compile the dropout model with the same loss, optimizer (Adam) and metric as model2.
model3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [46]:
# Fit the dropout model with the same batch size, epoch count and validation data
# as the earlier models (verbosity is left at its default here).
model3.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_cv, y_cv),
    epochs=20,
    batch_size=100,
)

Train on 33600 samples, validate on 8400 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x127560eb8>