# Convolutional Neural Network for Digit Recognition

We use Keras to build and train a convolutional network.
The code below uses a TPU to speed up the training process; the setup is taken from https://www.kaggle.com/c/tpu-getting-started. This gives just above 99% accuracy if you run for about 300 epochs (the run below uses only 100).  
EDIT: In an effort to improve the generalisation of the model (and to achieve higher accuracy on the test set), we try data augmentation, i.e. we apply random rotations, shifts, etc. to our training images (see https://www.kaggle.com/alewicka/mnist-digit-recognizer-cnn-in-keras-99-57 for a nice example using Keras ImageDataGenerator).   
The TPU doesn't currently work with Keras ImageDataGenerator (I believe this will be added in a future version though), but we can loop through the ImageDataGenerator manually to retrieve the augmented images, as suggested here: https://www.kaggle.com/wahyusetianto/cnn-keras-cv-0-996-tpu.
We compare the performance of the model trained on the original data with that of the model trained on the augmented data.

In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers import Conv2D, Dense, Flatten, MaxPooling2D, Dropout
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

In [None]:
# Read the labelled training set from the Kaggle competition input folder
path = '/kaggle/input/digit-recognizer/'
data = pd.read_csv(f'{path}train.csv')
print(data.shape)

In [None]:
# Prepare input data: separate the labels from the pixel columns
# The labels are one-hot encoded here, which pairs with the
# categorical_crossentropy loss. Alternatively keep data['label'] as plain
# integers and use a sparse categorical loss instead.
label = tf.keras.utils.to_categorical(data['label'].values)
X = data.drop('label', axis=1).values

# Sizes derived from the data
num_images, m = X.shape  # number of examples/images, pixels per image
max_pixel = 255  # maximum pixel value for rescaling (improves convergence rate in training)
img_dim = np.sqrt(m).astype(int)  # side length of the (square) images
out_dim = label.shape[1]  # width of the one-hot encoding
print('Number of different labels/output dimension ' + f'{out_dim}')

# Reshape to (images, height, width, channels) and rescale by max_pixel
X = X.reshape(num_images, img_dim, img_dim, 1) / max_pixel
print('Shape of input data X ' + f'{X.shape}')

In [None]:
# Let's print the first couple of images
# matplotlib is imported in this cell because the notebook otherwise only
# imports it in the later plotting cells, so `plt` would be undefined here
# when the cells are run in order.
import matplotlib.pyplot as plt

plt.figure(figsize=(4,4))
for i in range(4*4):
    plt.subplot(4,4,i+1)
    # hide the axis ticks, they carry no information for an image
    plt.xticks([])
    plt.yticks([])
    plt.imshow(X[i].reshape(img_dim,img_dim))
    # show the true label underneath each image
    plt.xlabel(data['label'].iloc[i])
plt.subplots_adjust(hspace=0.5)
plt.show()

In [None]:
# We want to use the TPU to speed up training, so let's set it up.
# Detect the TPU first. TPUClusterResolver() raises ValueError when no TPU
# can be found; in that case we fall back to TensorFlow's default strategy so
# the rest of the notebook (tpu_strategy.scope(), num_replicas_in_sync) still
# runs on CPU/GPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    tpu = None

if tpu is not None:
    # connect to and initialise the TPU system
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # instantiate a distribution strategy
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    print('No TPU found, falling back to the default distribution strategy')
    tpu_strategy = tf.distribute.get_strategy()

In [None]:
# Let's write a function for our model
def create_model():
    """Build the (uncompiled) CNN used for digit classification.

    Architecture: 2 x (Conv2D > Dropout > MaxPooling2D) > Flatten >
    Dense(500) > Dense(out_dim) output layer. The hidden layers use 'relu'
    and the output layer uses 'softmax'.

    Reads the notebook-level globals ``img_dim`` (side length of the square
    input images) and ``out_dim`` (number of output classes).

    Returns:
        A ``Sequential`` model; compiling it is left to the caller.
    """
    my_model = Sequential()
    my_model.add(Conv2D(32,
                        kernel_size = (3,3),
                        activation = 'relu',
                        input_shape = (img_dim,img_dim,1)))
    my_model.add(Dropout(0.5))
    my_model.add(MaxPooling2D(pool_size=(2, 2), strides=None, padding="valid"))
    my_model.add(Conv2D(32, kernel_size = (3,3), activation = 'relu'))
    my_model.add(Dropout(0.5))
    my_model.add(MaxPooling2D(pool_size=(2, 2), strides=None, padding="valid"))
    my_model.add(Flatten())
    # Without a non-linearity this layer and the output layer would collapse
    # into a single linear map, so the hidden dense layer gets 'relu' too.
    my_model.add(Dense(units = 500, activation = 'relu'))
    my_model.add(Dense(out_dim, activation = 'softmax'))

    return my_model

In [None]:
# Build the CNN and compile it.
# Layer stack: 2x(Conv2D > Dropout > MaxPooling2D) > Flatten > Dense > Dense output layer,
# with 'relu' activation on the convolutional layers and 'softmax' on the output layer.
# The model is instantiated inside the strategy scope so that it is created on the TPU.

with tpu_strategy.scope():
    my_model = create_model()
    my_model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

In [None]:
# Show an overview of the model's layers
my_model.summary()

In [None]:
# Hold out 10% of the data for validation.
# > The built-in validation_split parameter was used before, but it made each
# > epoch take more than 3x as long as splitting up front with train_test_split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    label,
    test_size=0.1,
)

In [None]:
# Set the training parameters for our model
# Global batch size: 128 samples per replica of the distribution strategy
BATCH_SIZE = 128 * tpu_strategy.num_replicas_in_sync
EPOCHS = 100
# One epoch should be one pass over the *training* split only, so the step
# count is based on X_train (the validation images are never trained on),
# not on the size of the full data set. At least one step is always run.
STEPS_PER_EPOCH = max(1, len(X_train) // BATCH_SIZE)

In [None]:
# Train the model on the original (non-augmented) training split.
# batch_size has to be passed explicitly: otherwise fit() falls back to the
# Keras default of 32 and BATCH_SIZE is silently ignored.
# steps_per_epoch is derived from the training split so that one epoch is
# (up to a partial last batch) one pass over X_train.
history = my_model.fit(X_train, y_train,
                       batch_size = BATCH_SIZE,
                       epochs = EPOCHS,
                       steps_per_epoch = max(1, len(X_train) // BATCH_SIZE),
                       validation_data = (X_val, y_val))

In [None]:
# Plot the loss and accuracy curves (training vs validation) per epoch
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by fit()
train_loss = history.history['loss']
val_loss = history.history['val_loss']
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

epochs = range(len(train_loss))

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('Training progress')

# left panel: loss, right panel: accuracy
panels = (
    (ax1, 'Loss', train_loss, val_loss),
    (ax2, 'Accuracy', train_acc, val_acc),
)
for axis, y_label, train_curve, val_curve in panels:
    axis.plot(epochs, train_curve, label='train')
    axis.plot(epochs, val_curve, label='validation')
    axis.set_ylabel(y_label)
    axis.legend()

plt.show()

In [None]:
# Load the test images and give them the same shape and scaling as the training images
data_test = pd.read_csv(f'{path}test.csv').values
num_test = len(data_test)
data_test = data_test.reshape(num_test, img_dim, img_dim, 1) / max_pixel

In [None]:
# Let's try data augmentation and see whether it leads to better generalisation, i.e. better performance on the validation set

In [None]:
# Generator producing randomly rotated, zoomed and shifted variants of the images
datagen = ImageDataGenerator(
    width_shift_range=0.15,
    height_shift_range=0.15,
    rotation_range=20,
    zoom_range=0.20,
)

In [None]:
# Loop through the ImageDataGenerator once to materialise an augmented copy of
# the training set as plain arrays (the generator itself is not passed to
# fit() on the TPU, see the notebook introduction).
# shuffle=False keeps the augmented images in the same order as X_train/y_train.
aug_flow = datagen.flow(X_train, y_train,
                        batch_size=BATCH_SIZE, shuffle=False)

# The flow yields batches indefinitely, so stop after exactly one pass:
# len(aug_flow) is the number of batches per pass. zip() pulls from range()
# first, so no surplus batch is generated once the pass is complete.
x_batches, y_batches = [], []
for _, (x_batch, y_batch) in zip(range(len(aug_flow)), aug_flow):
    x_batches.append(x_batch)
    y_batches.append(y_batch)

# Concatenate once at the end instead of re-copying the growing array on
# every batch.
Train_x = np.concatenate(x_batches)
Train_y = np.concatenate(y_batches)

In [None]:
# Compare the first ten original images (top row) with their augmented
# counterparts (bottom row)
plt.figure(figsize=(10,4))
for row, images in enumerate((X_train, Train_x)):
    for col in range(10):
        plt.subplot(2, 10, row * 10 + col + 1)
        # no axis ticks, we only want to look at the digits
        plt.xticks([])
        plt.yticks([])
        plt.imshow(images[col].reshape(img_dim, img_dim))
plt.show()

In [None]:
# Looks like it worked, the bottom row looks a bit more wonky than the top row :)
# Let's see what it does to our model training

In [None]:
# Build and compile a second, fresh model inside the strategy scope;
# this one is trained on the augmented data
with tpu_strategy.scope():
    my_model_aug = create_model()
    my_model_aug.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

In [None]:
# Train the second model on the augmented training data.
# batch_size has to be passed explicitly: otherwise fit() falls back to the
# Keras default of 32 and BATCH_SIZE is silently ignored.
# steps_per_epoch is derived from the augmented set so that one epoch is
# (up to a partial last batch) one pass over Train_x.
history_aug = my_model_aug.fit(Train_x, Train_y,
                    batch_size = BATCH_SIZE,
                    epochs = EPOCHS,
                    steps_per_epoch = max(1, len(Train_x) // BATCH_SIZE),
                    validation_data = (X_val, y_val))

In [None]:
# Plot loss and accuracy curves of both models side by side:
# the model trained on the original data and the one trained on augmented data
import matplotlib.pyplot as plt

# Per-epoch metrics of the model trained on augmented data
train_loss_aug = history_aug.history['loss']
val_loss_aug = history_aug.history['val_loss']
train_acc_aug = history_aug.history['accuracy']
val_acc_aug = history_aug.history['val_accuracy']

epochs = range(len(train_loss))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('Training progress')

curve_labels = ('train', 'validation', 'train augmented', 'validation augmented')
# left panel: loss, right panel: accuracy; curves are drawn in label order
panels = (
    (ax1, 'Loss', (train_loss, val_loss, train_loss_aug, val_loss_aug)),
    (ax2, 'Accuracy', (train_acc, val_acc, train_acc_aug, val_acc_aug)),
)
for axis, y_label, curves in panels:
    for curve, curve_label in zip(curves, curve_labels):
        axis.plot(epochs, curve, label=curve_label)
    axis.set_ylabel(y_label)
    axis.legend()

plt.show()

In [None]:
# I find it hard to see if data augmentation actually improves generalisation ¯\_(ツ)_/¯

In [None]:
# Predict the test set with the model trained on the original data
my_predictions = my_model.predict(data_test)
# zero-based running index of the test images
imageId = np.arange(my_predictions.shape[0])
# predicted digit = index of the largest output per image
results = np.argmax(my_predictions, axis=1)

In [None]:
# Write the submission file; ImageId is one-based
submission = pd.DataFrame({'ImageId': imageId + 1, 'Label': results})
submission.to_csv('submission.csv', index=False)

In [None]:
# Predict the test set with the model trained on the augmented data
my_predictions_aug = my_model_aug.predict(data_test)
# zero-based running index of the test images
imageId = np.arange(my_predictions_aug.shape[0])
# predicted digit = index of the largest output per image
results_aug = np.argmax(my_predictions_aug, axis=1)

In [None]:
# Write the submission file for the augmented-data model; ImageId is one-based
submission_aug = pd.DataFrame({'ImageId': imageId + 1, 'Label': results_aug})
submission_aug.to_csv('submission_aug.csv', index=False)