In [None]:
# General Imports
import numpy as np
import pandas as pd
import time

# Tensorflow/Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.preprocessing import MinMaxScaler

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Logging/Warnings
import warnings
warnings.filterwarnings('ignore')
tf.autograph.set_verbosity(0)

# Read the competition files: the submission template, the labelled training
# set and the unlabelled test set.
submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
train = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

# Every training column except the target is a feature column.
features = [column for column in train.columns if column != 'label']

print("Train Shape:", train.shape)
print("Test Shape:", test.shape)

# Separate the feature columns from the target; the test frame has no label to drop.
X = train.drop(columns="label")
y = train["label"]
X_test = test

# **Pls Upvote if you like this notebook**  

# **Split Train/Val/Test data**

In [None]:
from sklearn.model_selection import train_test_split
# Image shape fed to the networks and the number of target classes.
INPUT_SIZE = (28,28,1)
NUM_CLASSES = 10
scaler = MinMaxScaler()

# Hold out 20% of the labelled rows for validation (fixed seed => repeatable split).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=41)

# The scaler learns a per-pixel min/max from the training split only, so the
# training data is guaranteed to land in [0, 1].
X_train = scaler.fit_transform(X_train)
# Held-out data is not: a pixel that is constant (or only faintly lit) in the
# training split is scaled by a tiny range, so validation/test values for that
# pixel can end up far above 1 (up to the raw 255). Clip them back into the
# range the network was trained on.
X_valid = np.clip(scaler.transform(X_valid), 0.0, 1.0)
X_test = np.clip(scaler.transform(test), 0.0, 1.0)

# Convert the flat pixel rows to image tensors of shape (n_samples, 28, 28, 1)
X_train = np.reshape(X_train, (X_train.shape[0], *INPUT_SIZE))
X_valid = np.reshape(X_valid, (X_valid.shape[0], *INPUT_SIZE))
X_test = np.reshape(X_test, (X_test.shape[0], *INPUT_SIZE))

# One-hot encode the integer labels for the categorical cross-entropy loss
Y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)
Y_valid = keras.utils.to_categorical(y_valid, NUM_CLASSES)

# **Model : Convolutional Neural Network**


In [None]:
# Baseline CNN: three convolution stages (the first two followed by 2x2
# max-pooling) feeding a small fully connected classifier head.
inp = keras.Input(shape=INPUT_SIZE)

x = layers.Conv2D(filters=16, kernel_size=4, strides=2, padding="same", activation="relu")(inp)
x = layers.MaxPooling2D(pool_size=(2, 2))(x)

x = layers.Conv2D(filters=32, kernel_size=3, strides=1, padding="same", activation="relu")(x)
x = layers.MaxPooling2D(pool_size=(2, 2))(x)

x = layers.Conv2D(filters=64, kernel_size=5, strides=1, padding="same", activation="relu")(x)

# Classifier head: flatten the feature maps, two hidden dense layers, then a
# 10-way softmax.
x = layers.Flatten()(x)
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dense(units=64, activation="relu")(x)
out = layers.Dense(units=10, activation="softmax")(x)

model = keras.Model(inputs=inp, outputs=out, name="model")
model.summary()

In [None]:
def get_lr_callback(epoch,lr):
    """Return the learning rate scheduled for the given epoch.

    The schedule has three phases: a linear ramp from the start rate up to
    the peak rate, a short hold at the peak, then an exponential decay
    toward a floor rate.

    Args:
        epoch: Epoch index; 0 is the first epoch of the ramp.
        lr: Current learning rate. Unused; present so the function accepts
            the two-argument (epoch, lr) call form.

    Returns:
        The learning rate for ``epoch`` as a float.
    """
    start_rate = 0.00001
    peak_rate = 0.01
    floor_rate = 0.0001
    ramp_epochs = 2
    hold_epochs = 1
    decay_factor = 0.7

    # Phase 1: linear warm-up from start_rate toward peak_rate.
    if epoch < ramp_epochs:
        return (peak_rate - start_rate) / ramp_epochs * epoch + start_rate

    # Phase 2: stay at the peak rate.
    if epoch < ramp_epochs + hold_epochs:
        return peak_rate

    # Phase 3: exponential decay of the excess over floor_rate.
    return (peak_rate - floor_rate) * decay_factor**(epoch - ramp_epochs - hold_epochs) + floor_rate

# Wrap the schedule function in a Keras callback so fit() queries it each epoch.
lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule=get_lr_callback, verbose=True)

# Preview the schedule over 60 epochs (the second argument is ignored by the schedule).
x = list(range(60))
plt.plot(x, [get_lr_callback(epoch_idx, .1) for epoch_idx in x])
plt.show()

The first part is the warm-up, then comes the fast-learning phase at the peak rate, and finally the rate decays toward its minimum so that training can converge.

In [None]:
# Configure training for the baseline model: Adam with default settings,
# categorical cross-entropy on the one-hot targets, accuracy as the metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

In [None]:
# Train the baseline on the in-memory arrays with the scheduled learning rate.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=64,
    epochs=20,
    validation_data=(X_valid, Y_valid),
    callbacks=[lr_callback],
)

In [None]:
# Training vs. validation loss for the baseline run; the first epoch is
# dropped from both curves.
plt.plot(history.history['loss'][1:], label='train')
plt.plot(history.history['val_loss'][1:], label='val')
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(loc='upper left')
plt.show()

The model is overfitting: as we can see, the training loss continues to decrease while the validation loss increases.

# **How to help the model to generalize ?**

With our first model, we can use several different methods :
   * Adding dropout between layers
   * Using Earlystopping
   * Adding noise in the inputs
   * SGD instead of Adam (Adam is faster but SGD tends to generalize better; in general we use Adam to speed up the process, then switch to SGD)
   * Adding BatchNormalization for a faster learning
   * Use Data-augmentation

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Data augmentation: small random rotations, shifts and zooms applied to the
# training images on the fly.
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
)

# NOTE(review): per the Keras docs, fit() is only required for the
# featurewise_* / zca_whitening options, none of which are enabled here —
# confirm whether this call is still wanted.
datagen.fit(X_train)

In [None]:
# Same backbone as the baseline, regularised with input noise, dropout and
# batch normalisation after every convolution.
inp = keras.Input(shape=INPUT_SIZE)
x = layers.GaussianNoise(stddev=.1)(inp)  # noise on the inputs

# Stage 1: conv -> batch norm -> dropout -> pool
x = layers.Conv2D(filters=16, kernel_size=4, strides=2, padding="same", activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(rate=.3)(x)
x = layers.MaxPooling2D(pool_size=(2, 2))(x)

# Stage 2: conv -> dropout -> batch norm -> pool
x = layers.Conv2D(filters=32, kernel_size=3, strides=1, padding="same", activation="relu")(x)
x = layers.Dropout(rate=.3)(x)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D(pool_size=(2, 2))(x)

# Stage 3: conv -> dropout -> batch norm (no pooling)
x = layers.Conv2D(filters=64, kernel_size=5, strides=1, padding="same", activation="relu")(x)
x = layers.Dropout(rate=.3)(x)
x = layers.BatchNormalization()(x)

# Classifier head, unchanged from the baseline.
x = layers.Flatten()(x)
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dense(units=64, activation="relu")(x)
out = layers.Dense(units=10, activation="softmax")(x)

model = keras.Model(inputs=inp, outputs=out, name="model")
model.summary()

In [None]:
# Early-stopping callback watching the validation loss with a patience of 15
# epochs; configured to restore the best weights.
earlystop = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    patience=15,
    restore_best_weights=True,
    verbose=1,
)

# Early stopping lets training continue for up to 15 more epochs (`patience=15`) after `val_loss` stops improving, then restores the best weights

In [None]:
# First training phase of the regularised model: Adam with default settings.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

In [None]:
# Adam phase on augmented batches. The training batch size is set by
# datagen.flow(); a fit-level `batch_size` does not apply to generator input
# (the Keras docs say not to pass it), so the value previously given there is
# expressed as what it actually controlled: the validation batch size.
hist1 = model.fit(
    datagen.flow(X_train, Y_train, batch_size=64),
    validation_data=(X_valid, Y_valid),
    validation_batch_size=64,
    epochs=20,
    callbacks=[lr_callback, earlystop],
)

In [None]:
# Second training phase: re-compile the same model with SGD (default settings).
# NOTE(review): this relies on compile() keeping the weights learned so far — confirm.
model.compile(
    optimizer=tf.keras.optimizers.SGD(),  # optimize with SGD
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

In [None]:
# Halve the learning rate when val_loss has not improved for 3 epochs,
# never going below 1e-4.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

In [None]:
# SGD phase on augmented batches. The training batch size comes from
# datagen.flow(), so it is set to 128 there: the fit-level `batch_size=128`
# that used to be passed never reached the generator, which kept producing
# batches of 64. The validation arrays are still evaluated in batches of 128.
hist2 = model.fit(
    datagen.flow(X_train, Y_train, batch_size=128),
    validation_data=(X_valid, Y_valid),
    validation_batch_size=128,
    epochs=50,
    callbacks=[earlystop, reduce_lr],
)

In [None]:
# Loss curves across both training phases (Adam, then SGD) joined end to end;
# the first Adam epoch is dropped.
plt.plot(hist1.history['loss'][1:] + hist2.history['loss'], label='train')
plt.plot(hist1.history['val_loss'][1:] + hist2.history['val_loss'], label='val')
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(loc='upper left')
plt.show()

We can see the val_loss still decreasing along with the training loss: the model generalizes.

However, we can notice that the model does not improve anymore.  We can either decrease the dropout to increase the representation capacity of the model or increase the size of the model. 

In [None]:
# Larger CNN: two stacks of three convolutions (the last one in each stack is
# strided to downsample), one wider convolution, then a softmax classifier.
# Each stage ends with dropout. The result must be assigned back to lowercase
# `x`: assigning it to `X` would leave the dropout layer out of the graph and
# overwrite the training DataFrame `X`.
inp = keras.Input(shape = INPUT_SIZE)
x = layers.GaussianNoise(.1)(inp)  # adding noise in the inputs

# Stage 1: 32 filters
x = layers.Conv2D(32, 3,strides=1, padding="same", activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Conv2D(32, 3,strides=1, padding="same", activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Conv2D(32, 5,strides=2, padding="same", activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(.4)(x)

# Stage 2: 64 filters
x = layers.Conv2D(64, 3,strides=1, padding="same", activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Conv2D(64, 3,strides=1, padding="same", activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Conv2D(64, 5,strides=2, padding="same", activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(.4)(x)

# Stage 3: 128 filters
x = layers.Conv2D(128, 4,strides=1, padding="same", activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(.4)(x)

# Classifier: flatten straight into the 10-way softmax.
x = layers.Flatten()(x)
out = layers.Dense(10, activation="softmax")(x)

model = keras.Model(inp, out, name="model")
model.summary()

In [None]:
# First training phase of the larger model: Adam with default settings.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

In [None]:
# Adam phase for the larger model: augmented batches of 64 for 10 epochs.
hist1 = model.fit(
    datagen.flow(X_train, Y_train, batch_size=64),
    epochs=10,
    validation_data=(X_valid, Y_valid),
    callbacks=[lr_callback, earlystop],
)

In [None]:
# Second training phase of the larger model: re-compile with SGD (default settings).
# NOTE(review): this relies on compile() keeping the weights learned so far — confirm.
model.compile(
    optimizer=tf.keras.optimizers.SGD(),  # optimize with SGD
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

In [None]:
# Fresh plateau callback for the larger model: halve the learning rate after
# 3 epochs without val_loss improvement, never going below 1e-4.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

In [None]:
# SGD phase for the larger model. The training batch size (128) is set by
# datagen.flow(); a fit-level `batch_size` does not apply to generator input
# (the Keras docs say not to pass it), so the value previously given there is
# expressed as what it actually controlled: the validation batch size.
hist2 = model.fit(
    datagen.flow(X_train, Y_train, batch_size=128),
    validation_data=(X_valid, Y_valid),
    validation_batch_size=128,
    epochs=40,
    callbacks=[earlystop, reduce_lr],
)

In [None]:
# Loss curves for the larger model across both phases (Adam, then SGD);
# the first Adam epoch is dropped.
plt.plot(hist1.history['loss'][1:] + hist2.history['loss'], label='train')
plt.plot(hist1.history['val_loss'][1:] + hist2.history['val_loss'], label='val')
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(loc='upper left')
plt.show()

# Mismatches

In [None]:
def viz_num(num):
    """Display one validation image with its true label and the model's prediction.

    Args:
        num: Positional index of the sample in ``X_valid`` / ``y_valid``.
    """
    sample = X_valid[num]
    # Run inference on this single sample (as a batch of one) rather than
    # re-predicting the whole validation set on every call.
    predicted = model.predict(sample[np.newaxis, ...])[0].argmax()
    # Reshape the 784 (28 * 28) pixel values to a 28x28 image
    image = sample.reshape([28,28])
    plt.title('Sample: %d  Label: %d , pred : %d' % (num, y_valid.iloc[num], predicted))
    plt.imshow(image, cmap=plt.get_cmap('gray'))
    plt.show()

In [None]:
# Predicted digit for every validation image.
pred = model.predict(X_valid).argmax(axis=1)

# Positions where the prediction differs from the true label. Comparing for
# inequality (rather than taking argmin of label - prediction over shrinking
# prefixes) guarantees that every selected sample is really misclassified and
# avoids calling argmin on an empty list.
mismatch_idx = np.flatnonzero(y_valid.to_numpy() != pred)
if mismatch_idx.size < 3:
    raise ValueError(
        "Expected at least 3 misclassified validation samples, found %d" % mismatch_idx.size
    )

# First three misclassified samples, as plain ints for indexing and formatting.
num1, num2, num3 = (int(i) for i in mismatch_idx[:3])

In [None]:
# Show the validation sample selected as num1.
viz_num(num1)

In [None]:
# Show the validation sample selected as num2.
viz_num(num2)

In [None]:
# Show the validation sample selected as num3.
viz_num(num3)

In [None]:
# The class with the highest predicted probability for each test image
# becomes the submitted label.
submission['Label'] = np.argmax(model.predict(X_test), axis=-1)
submission.head()

In [None]:
# Write the submission frame to submission.csv without the index column.
submission.to_csv('submission.csv', index=False)