In [None]:
from numpy.random import seed
seed(1)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools
import tensorflow as tf
from tensorflow import keras
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Input, BatchNormalization
from tensorflow.keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

# Warning
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Random seeds
# Seed TensorFlow, NumPy and Python's `random` with the same value for
# reproducibility. np.random.seed(319) re-seeds the NumPy global generator that
# the import cell seeded with seed(1), so 319 is the value in effect from here on.
import random

tf.random.set_seed(319)
np.random.seed(319)
random.seed(319)

<a id=0></a>
## <p style="background-color:lightblue; font-family:newtimeroman; font-size:120%; text-align:left; border-radius: 15px 50px;">Table of Contents</p>
* [1. Introduction and updates](#1)
* [2. Data Preparation](#2)
    * [2.1. Load Data](#2.1)
    * [2.2. Count record number per each category](#2.2)
    * [2.3. Check null and missing](#2.3)
    * [2.4. Preprocessing](#2.4)
    * [2.5 Data Augmentation](#2.5)
* [3. Model](#3)
    * [3.1 Define Model](#3.1)
    * [3.2 Reducing Learning Rate](#3.2)
    * [3.3 Re-instantiate the hypermodel and train it with the optimal number of epochs](#3.3)
    

* [References](#10)

<a id='1'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">1. Introduction and updates</p>
 

<a id=1.1 ></a>
<font size="+3" color="#5bc0de"><b>1.1. Introduction </b></font><br>
[Content](#0)

* In this kernel, the training data is augmented to improve validation accuracy.
* A simple CNN model is used.
* To prevent overfitting, the [ReduceLROnPlateau callback](https://keras.io/api/callbacks/reduce_lr_on_plateau/) is used to lower the learning rate during training.

<a id=1.2 ></a>
<font size="+3" color="#5bc0de"><b>1.2. Update via Versions </b></font><br>
[Content](#0)

### Current Version
* Fix the error reported [here](https://www.kaggle.com/c/digit-recognizer/discussion/290372)

### Version 7
* Add One BatchNormalization layer in front of the last layer in model.
* Increase Epoch to 30 to prevent Underfitting. 

### Version 6
* In this version, the decay hyperparameter of the optimizer is '5*learningrate/epoch'
* Using ImageDataGenerator for both Train/Validation Data

### Version 5
* In this version, the decay hyperparameter of the optimizer is '7*learningrate/epoch'

### Version 4
* The model uses the RMSprop optimizer. The decay parameter of this optimizer decays the learning rate over time, so training can move even closer to the local minimum towards the end. In the previous version this parameter was not used (set to zero); in this version its value is '5*learningrate/epoch'.


### Version 2,3
* Using Data Augmentation and running with the best epoch
* Update Markdowns

### Version 1
* No Data Augmentation --> 0.984



<a id='2'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;"> 2. Data Preparation</p>


<a id=2.1 ></a>
<font size="+3" color="#5bc0de"><b>2.1. Load Data </b></font><br>
[Content](#0)

In [None]:
# Read the Kaggle digit-recognizer CSVs into DataFrames.
# train_full carries a "label" column next to the pixel columns (it is split off
# in the next cell); test is used only for the final predictions.
train_full = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
test = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")

In [None]:
# Separate the target column from the pixel features.
# DataFrame.drop returns a new frame, so train_full itself is left untouched.
labels = train_full["label"]
pixels = train_full.drop(columns="label")

In [None]:
# Free memory: the combined frame is no longer needed once `pixels` and
# `labels` have been extracted from it.
del train_full

<a id=2.2 ></a>
<font size="+3" color="#5bc0de"><b>2.2. Count record number per each category </b></font><br>
[Content](#0)


In [None]:
# Plot how many training samples exist for each digit class.
# The Series is passed through the `x` keyword: seaborn 0.11 deprecated passing
# the data vector positionally, and later releases treat the first positional
# argument as `data` instead of `x`.
sns.countplot(x=labels)

### The data is distributed similarly across the digits 0 to 9

<a id=2.3 ></a>
<font size="+3" color="#5bc0de"><b>2.3. Check null and missing </b></font><br>
[Content](#0)


In [None]:
# One boolean per pixel column ("does this column contain any null?"), then
# summarised: a single unique value of False means no column has missing data.
pixels.isnull().any().describe()

In [None]:
# True if at least one training label is missing.
labels.isnull().any()

In [None]:
# Same per-column null summary as for the training pixels, applied to the test set.
test.isnull().any().describe()

### No corrupted images (no missing/null values inside)

<a id=2.4 ></a>
<font size="+3" color="#5bc0de"><b>2.4. Preprocessing </b></font><br>
[Content](#0)


In [None]:
# Normalization: divide every pixel value by 255 (presumably mapping 0-255
# grayscale intensities to [0, 1] — confirm against the raw data).
# NOTE: this cell rebinds its own inputs, so running it a second time in the
# same kernel divides by 255 again; run it exactly once after loading.
pixels, test = pixels / 255, test / 255

In [None]:
# Reshape: turn each flat row of pixel values into a 28x28 image with one
# channel, giving arrays of shape (n_samples, 28, 28, 1) as expected by Conv2D.
pixels = pixels.to_numpy().reshape((-1, 28, 28, 1))
test = test.to_numpy().reshape((-1, 28, 28, 1))

In [None]:
# Label encoding: one-hot encode the digit labels into 10-element vectors,
# matching the 10-unit softmax output and the categorical cross-entropy loss.
labels = to_categorical(labels, num_classes=10)

In [None]:
# Hold out 10% of the labelled images for validation; the fixed random_state
# keeps the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    pixels,
    labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Sanity check: shape of the training image array after the split.
X_train.shape

<a id=2.5 ></a>
<font size="+3" color="#5bc0de"><b>2.5 Data Augmentation </b></font><br>
[Content](#0)

   - Randomly rotate some training images by up to 10 degrees
   - Randomly zoom some training images by up to 10%
   - Randomly shift images horizontally by up to 10% of the width
   - Randomly shift images vertically by up to 10% of the height
   
**Vertical_flip** and **Horizontal_flip** are not applied because they could lead to misclassifying symmetrical digits such as 6 and 9.


In [None]:
BATCH_SIZE = 64 # mini-batch size for both generators; an arbitrary choice, tune it to your hardware.

In [None]:
# Augmenting generator for the training images. Each batch is randomly
# transformed on the fly; flips and all normalisation/whitening are disabled.
train_datagen = ImageDataGenerator(
    # geometric augmentation
    rotation_range=10,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=False,
    vertical_flip=False,
    # no dataset-level or per-sample normalisation
    featurewise_center=False,
    samplewise_center=False,
    featurewise_std_normalization=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
)
train_generator = train_datagen.flow(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# Validation images go through a generator with no augmentation options set,
# so they are only batched (and shuffled), never transformed.
val_datagen = ImageDataGenerator()
val_generator = val_datagen.flow(
    X_val,
    y_val,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

<a id=3 ></a>
## <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 20px 50px;">3. CNN Model</p>
[Content](#0)


<a id="3.1"></a>
<font size="+3" color="#5bc0de"><b>3.1. Define Model </b></font><br>
[Content](#0)

In [None]:
# Hyperparameters shared by create_model() and the training cells.
# EPOCH: number of epochs of the first training run; also used inside
#        create_model() to derive the optimizer's decay value.
# LEARNING_RATE: initial RMSprop learning rate.
EPOCH = 20
LEARNING_RATE = 0.001
# Seeded Glorot-normal initializer; this one instance is passed to every
# Conv2D and Dense layer built in create_model().
kernel_initializer = tf.keras.initializers.GlorotNormal(seed=319)

In [None]:
def create_model():
    """Build and compile the CNN digit classifier.

    Architecture: two feature blocks of (Conv 5x5 -> Conv 5x5 -> MaxPool 2x2 ->
    Dropout 0.25) with 32 and then 64 filters, followed by
    Flatten -> Dense(256, relu) -> Dropout(0.5) -> BatchNormalization ->
    Dense(10, softmax).

    Reads the module-level ``LEARNING_RATE``, ``EPOCH`` and
    ``kernel_initializer`` settings.

    Returns:
        A ``Sequential`` model compiled with RMSprop, categorical
        cross-entropy loss and the ``accuracy`` metric.
    """
    def conv5x5(filters):
        # Every convolution shares kernel size, padding, activation and initializer.
        return Conv2D(filters,
                      kernel_size=(5,5),
                      kernel_initializer=kernel_initializer,
                      padding="Same",
                      activation="relu")

    model = Sequential([
        Input(shape=(28,28,1)),

        # Feature block 1: 32 filters
        conv5x5(32),
        conv5x5(32),
        MaxPool2D(pool_size=(2,2)),
        Dropout(0.25, seed=319),

        # Feature block 2: 64 filters
        conv5x5(64),
        conv5x5(64),
        MaxPool2D(pool_size=(2,2), strides=(2,2)),
        Dropout(0.25, seed=319),

        # Classifier head
        Flatten(),
        Dense(256, kernel_initializer=kernel_initializer, activation="relu"),
        Dropout(0.5, seed=319),
        BatchNormalization(),
        Dense(10, kernel_initializer=kernel_initializer, activation="softmax"),
    ])

    # RMSprop whose decay is tied to the configured learning rate and epoch count.
    rmsprop = RMSprop(learning_rate=LEARNING_RATE,
                      rho=0.9,
                      epsilon=1e-08,
                      decay=5 * LEARNING_RATE / EPOCH)

    model.compile(optimizer=rmsprop,
                  loss="categorical_crossentropy",
                  metrics=['accuracy'])
    return model

<a id="3.2"></a>
<font size="+3" color="#5bc0de"><b>3.2  Reducing Learning Rate </b></font><br>
[Content](#0)

Reduce the learning rate by half if the validation accuracy has not improved after 3 epochs.

In [None]:
# Halve the learning rate (down to min_lr) when validation accuracy has not
# improved for 3 consecutive epochs.
# The monitored name must match the key Keras logs. The model is compiled with
# metrics=['accuracy'], so the validation metric is logged as 'val_accuracy'
# (the same key the plotting and best-epoch cells read). With 'val_acc' the
# metric is never found and the callback never lowers the learning rate.
lr_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                patience=3,
                                verbose=1, # update messages.
                                factor=0.5,
                                min_lr=0.00001)

In [None]:
# Build the compiled CNN and print its layer-by-layer summary.
model = create_model()
model.summary()

In [None]:
# First training run: EPOCH epochs on the augmented training generator,
# validated each epoch on the held-out generator. The returned history is used
# below to plot the curves and to pick the best epoch.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=EPOCH,
    callbacks=[lr_reduction],
    verbose=2,
)

In [None]:
def plot_loss_accuracy(history):
    """Plot training vs. validation loss and accuracy, one point per epoch.

    Draws a figure with two stacked panels: loss on top, accuracy below.

    Args:
        history: Object returned by ``model.fit``. Its ``history`` dict must
            contain the keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
    """
    metrics = history.history
    fig, (ax_loss, ax_acc) = plt.subplots(2, 1)

    # Line objects are attached to the axes they are plotted on, so no explicit
    # `axes=` keyword is needed.
    ax_loss.plot(metrics['loss'], color='b', label="Training loss")
    ax_loss.plot(metrics['val_loss'], color='r', label="validation loss")
    ax_loss.set_ylabel("Loss")
    ax_loss.legend(loc='best', shadow=True)

    ax_acc.plot(metrics['accuracy'], color='b', label="Training accuracy")
    ax_acc.plot(metrics['val_accuracy'], color='r', label="Validation accuracy")
    ax_acc.set_xlabel("Epoch")
    ax_acc.set_ylabel("Accuracy")
    ax_acc.legend(loc='best', shadow=True)

    # Keep the axis labels of the two panels from overlapping.
    fig.tight_layout()

plot_loss_accuracy(history)

<a id="3.3"></a>
<font size="+3" color="#5bc0de"><b>3.3  Re-instantiate the hypermodel and train it with the optimal number of epochs </b></font><br>
[Content](#0)


In [None]:
# Pick the epoch with the highest validation accuracy from the first run.
# list.index returns the first occurrence, so ties resolve to the earliest
# epoch; +1 converts the 0-based list index into a 1-based epoch count.
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = 1 + val_acc_per_epoch.index(max(val_acc_per_epoch))
(best_epoch, val_acc_per_epoch[best_epoch - 1])

In [None]:
# Train a freshly initialised model for exactly `best_epoch` epochs, using the
# same generators and learning-rate callback as the first run, then plot its curves.
model_best = create_model()
history_best = model_best.fit(
    train_generator,
    validation_data=val_generator,
    epochs=best_epoch,
    callbacks=[lr_reduction],
    verbose=2,
)
plot_loss_accuracy(history_best)

In [None]:
# predict results
# Use model_best, the model retrained for the optimal number of epochs;
# predicting with the first `model` would leave that retraining unused.
class_probabilities = model_best.predict(test)

# select the index with the maximum probability (the predicted digit) per image
results = pd.Series(np.argmax(class_probabilities, axis=1), name="Label")

In [None]:
# Build the submission table: 1-based ImageId next to the predicted Label.
# The id range is derived from the number of predictions instead of a
# hard-coded 28000, so the frame can never contain misaligned or NaN rows.
image_ids = pd.Series(range(1, len(results) + 1), name = "ImageId")
submission = pd.concat([image_ids, results],axis = 1)

submission.to_csv("cnn_mnist_datagen.csv",index=False)

<a id=10 ></a>
## <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 20px 50px;">Reference</p>

[Content](#0)


* [ZCA Whitening](https://martin-thoma.com/zca-whitening/)

* [Basic Data Augmentation](https://youtu.be/yYqAvlkRwUQ)

    * https://machinelearningmastery.com/how-to-configure-image-data-augmentation-when-training-deep-learning-neural-networks/
    
* [RMSprop](https://keras.io/api/optimizers/rmsprop/)