**Let's import our tools**

In [None]:
import os
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import tensorflow as tf
import random
from sklearn.model_selection import train_test_split
from PIL import Image,ImageFilter

**And our Data**

In [None]:
# Labelled training digits; further down the first column is taken as the
# label and the remaining columns as pixel values.
df = pd.read_csv(
    "/kaggle/input/digit-recognizer/train.csv"
)
# Competition images that the final predictions are made on.
X_test_main = pd.read_csv(
    "/kaggle/input/digit-recognizer/test.csv"
)

# Preprocessing

In [None]:
# Peek at the first rows of the training frame (rendered as the cell output).
df.head()

**So we split the data into the features X and the labels Y**

In [None]:
# Column 0 becomes the target; every remaining column is a feature.
X = df.iloc[:, 1:]
Y = df.iloc[:, 0]

**And convert them to numpy for preprocessing**

In [None]:
# Swap the pandas containers for plain NumPy arrays so they can be reshaped
# and fed to the network.
X, Y, X_test_main = X.to_numpy(), Y.to_numpy(), X_test_main.to_numpy()

**So we will reshape it into proper image shapes as we will be using convolutional networks**

In [None]:
# Fold every flat row back into a 28x28 grid; -1 lets NumPy infer the number
# of images.
X = np.reshape(X, (-1, 28, 28))
X_test_main = np.reshape(X_test_main, (-1, 28, 28))
print(X.shape)
print(X_test_main.shape)

**Let's have a look at our data**

In [None]:
# Draw a rows x columns grid of training digits on one figure.
w = 14
h = 14
columns = 4
rows = 5
fig = plt.figure(figsize=(w, h))
for i in range(1, rows * columns + 1):
    # Slot i shows the image at index i plus a random offset in [1, 299].
    img1 = X[i + random.randrange(1, 300)]
    ax = fig.add_subplot(rows, columns, i)
    ax.imshow(img1)
plt.show()

In [None]:
# Add the trailing channel axis that the model's (28, 28, 1) input expects.
# reshape (instead of expand_dims) keeps this cell idempotent: running it a
# second time does not stack yet another axis onto the arrays.
X = X.reshape(-1, 28, 28, 1)
X_test_main = X_test_main.reshape(-1, 28, 28, 1)

# Hold out 20% of the labelled data. The fixed random_state makes the split,
# and therefore every score reported below, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Sanity-check the shapes of both splits and of the competition images.
print(f"{X_train.shape} {Y_train.shape}")
print(f"{X_test.shape} {Y_test.shape}")
print(f"{X_test_main.shape}")

**Alright then, let's normalize the images and move on to the model**

In [None]:
# Scale every image array by 1/255 (this cell must run exactly once: each run
# divides the arrays again).
X_train = np.divide(X_train, 255.)
X_test = np.divide(X_test, 255.)
X_test_main = np.divide(X_test_main, 255.)
for scaled in (X_train, X_test, X_test_main):
    print(scaled.shape)

**So, 10 classes: one-hot encode the labels**

In [None]:
def one_hottie(labels,C):
    """
    Turn integer class labels into one-hot rows.

    Each label becomes a vector of length ``C`` holding a single 1 at the
    label's index, e.g. with C = 3:
        class 0 -> [1, 0, 0]
        class 1 -> [0, 1, 0]
        class 2 -> [0, 0, 1]
    The network ends in a softmax over the classes and is trained with
    categorical cross-entropy, so its targets need this layout.

    Args:
        labels: integer class ids (array-like).
        C: number of classes, i.e. the length of every one-hot vector.

    Returns:
        The evaluated result of ``tf.one_hot(labels, C)``.
    """
    # Build the one-hot tensor and materialise it through the Keras backend.
    return tf.keras.backend.eval(tf.one_hot(labels, C))

# Keep an untouched copy of the integer hold-out labels before Y_test is
# overwritten with its one-hot form.
Y_test_later = Y_test.copy()
Y_train, Y_test = one_hottie(Y_train, 10), one_hottie(Y_test, 10)
print(f"Y shape: {Y_train.shape}")
print(f"Y test shape: {Y_test.shape}")

In [None]:
# def res_net_block(input_data, filters=[128], conv_size=[3,5]):
#     x = tf.keras.layers.Conv2D(filters[0], conv_size[0], activation='relu', padding='same')(input_data)
#     x = tf.keras.layers.BatchNormalization()(x)
#     x = tf.keras.layers.Conv2D(filters[0], conv_size[1], activation=None, padding='same')(x)
#     x = tf.keras.layers.BatchNormalization()(x)
#     x = tf.keras.layers.Add()([x, input_data])
#     x = tf.keras.layers.Activation('relu')(x)
#     return x

# BUILDING THE MODEL

**Let's use a Sequential model to assemble the layers quickly**

As the data is quite simple, we won't need any complex model, so here is just a simple good enough model

A conv2d block = "**CONV2D(number of filters,size of filters) -> ReLU -> MAXPOOL2D**"

*INPUT DATA -> CONV2D BLOCK(64,3) -> CONV2D BLOCK(128,3) -> DROPOUT(0.2) -> CONV2D BLOCK(256,3) -> CONV2D BLOCK(256,3) -> 
Flatten out the output -> DENSE(100) -> ReLU -> DENSE(50) -> ReLU -> DENSE(10) -> SOFTMAX FOR CLASSIFICATION -> OUTPUT DATA (the likelihood of each particular class being correct)*

BETTER TO CHECK THIS OUT AT MODEL SUMMARY

In [None]:
# Four conv + max-pool stages followed by a small L2-regularised dense head.
model = tf.keras.Sequential()

# Stage 1: 64 filters on the 28x28x1 input.
model.add(tf.keras.layers.Conv2D(64, 3, activation='relu', input_shape=(28,28,1), padding="same"))
model.add(tf.keras.layers.MaxPool2D(strides=2))

# Stage 2: 128 filters, then the only dropout layer in the network.
model.add(tf.keras.layers.Conv2D(128, 3, activation='relu', padding="same"))
model.add(tf.keras.layers.MaxPool2D(strides=2))
model.add(tf.keras.layers.Dropout(0.2))

# Stages 3 and 4: 256 filters each.
model.add(tf.keras.layers.Conv2D(256, 3, activation='relu', padding="same"))
model.add(tf.keras.layers.MaxPool2D(strides=2))
model.add(tf.keras.layers.Conv2D(256, 3, activation='relu', padding="same"))
model.add(tf.keras.layers.MaxPool2D(strides=2))

# Classifier head: 100 -> 50 -> 10 units, softmax over the 10 digit classes.
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(100, kernel_regularizer=tf.keras.regularizers.l2(0.01), activation='relu'))
model.add(tf.keras.layers.Dense(50, kernel_regularizer=tf.keras.regularizers.l2(0.01), activation='relu'))
model.add(tf.keras.layers.Dense(10, kernel_regularizer=tf.keras.regularizers.l2(0.01), activation='softmax'))

## MODEL SUMMARY

In [None]:
# Print the layer-by-layer overview of the network defined above.
model.summary()

In [None]:
# initial_learning_rate = 0.001 #initial rate
# # Rate decay with exponential decay
# # new rate = initial_learning_rate * decay_rate ^ (step / decay_steps)

# lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
#     initial_learning_rate,
#     decay_steps=800,
#     decay_rate=0.5,
#     staircase=True)

**For simplicity , We won't be varying the learning rate with scheduler, instead we will just train it multiple times**

* For the 1st train, I'll boost the training with a 0.006 learning rate in just 20 epochs
* Actually less than that were required, i just made sure it gets a little stable

In [None]:
# Stage 1: Nadam at learning rate 0.006 for epochs 0-20.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.006),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy', 'Recall', 'Precision'],
)

# validation_split=0.1 sets 10% of the training arrays aside for validation;
# shuffle=False keeps the sample order fixed between epochs.
result = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=20,
    verbose=1,
    shuffle=False,
    initial_epoch=0,
    validation_split=0.1,
)

So right now we already hit 98%, but that is expected as the data is just too simple

LETS CHECK OUT THE PLOTS
* As you can see, it hits the 96+ immediately and gets stable there
* So now we will reduce the learning rate by a lot, and train the model for real

In [None]:
# Keras logs accuracy under 'acc' in older releases and 'accuracy' in newer
# ones; use whichever key this run produced instead of hard-coding one.
acc_key = 'acc' if 'acc' in result.history else 'accuracy'

# Accuracy per epoch, training vs. validation.
plt.plot(result.history[acc_key], label='train')
plt.plot(result.history['val_' + acc_key], label='valid')
plt.legend(loc='upper left')
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.show()

# Loss per epoch. 'val_loss' comes from the validation split, so it is
# labelled 'valid' like the accuracy curve (it was mislabelled 'test').
plt.plot(result.history['loss'], label='train')
plt.plot(result.history['val_loss'], label='valid')
plt.legend(loc='upper right')
plt.title('Model Cost')
plt.ylabel('Cost')
plt.xlabel('Epoch')
plt.show()

See those spikes? They tell me that the learning rate was a bit too high.

**2nd Training**
Now we start from 99%, but taking this to 100% would be hard with this model
* Also we need to reduce the loss a lot. Even for such a simple model, the loss is still high

In [None]:
# Stage 2: recompile with a much smaller learning rate (0.0001) and continue
# from epoch 20 up to epoch 40.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.0001),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy', 'Recall', 'Precision'],
)

result = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=40,
    verbose=1,
    shuffle=False,
    initial_epoch=20,
    validation_split=0.1,
)

**So we barely touched 99.1% here**
* I saw a variance problem here, so I actually reduced the dropouts.

In [None]:
# Keras logs accuracy under 'acc' in older releases and 'accuracy' in newer
# ones; use whichever key this run produced instead of hard-coding one.
acc_key = 'acc' if 'acc' in result.history else 'accuracy'

# Accuracy per epoch, training vs. validation.
plt.plot(result.history[acc_key], label='train')
plt.plot(result.history['val_' + acc_key], label='valid')
plt.legend(loc='upper left')
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.show()

# Loss per epoch. 'val_loss' comes from the validation split, so it is
# labelled 'valid' like the accuracy curve (it was mislabelled 'test').
plt.plot(result.history['loss'], label='train')
plt.plot(result.history['val_loss'], label='valid')
plt.legend(loc='upper right')
plt.title('Model Cost')
plt.ylabel('Cost')
plt.xlabel('Epoch')
plt.show()

***3rd Training***
We reduce the learning rate further to check whether more performance can be extracted here

In [None]:
# Stage 3: lower the learning rate once more (0.00006) and continue from
# epoch 40 up to epoch 60.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.00006),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy', 'Recall', 'Precision'],
)

result = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=60,
    verbose=1,
    shuffle=False,
    initial_epoch=40,
    validation_split=0.1,
)



**And when you check the model on the held-out test split**

In [None]:
# Score the model on the split held out by train_test_split; `check` keeps
# whatever model.evaluate returns for the compiled loss and metrics.
check = model.evaluate(X_test,Y_test)

**We have hit 99.3% now**
* So let's see where things went wrong with a confusion matrix

In [None]:
# X still holds raw pixel values (only X_train / X_test / X_test_main were
# divided by 255), so scale it here to match what the model was trained on.
# argmax over the softmax output replaces Sequential.predict_classes, which
# newer TensorFlow releases no longer provide.
preds = np.argmax(model.predict(X / 255.), axis=-1)
preds.shape

In [None]:
# X = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
# Y_test = X.iloc[:,0]
# Y_test = Y_test.to_numpy()

In [None]:
# confusion_matrix takes (labels, predictions): rows are the true digits and
# columns the predicted ones. The arguments were previously swapped, which
# transposed the matrix.
conf = tf.math.confusion_matrix(Y, preds)

In [None]:
# tf.Session only exists in TensorFlow 1.x. keras.backend.eval, which
# one_hottie already relies on, materialises the tensor without an explicit
# session, so the cell no longer depends on that API.
print(tf.keras.backend.eval(conf))

One more training. This time with data Augmentation

In [None]:
# Short alias for the Keras generator class used for both pipelines.
ImageDataGenerator = tf.keras.preprocessing.image.ImageDataGenerator

# Training pipeline: random rotation, zoom, shifts, shear and brightness
# changes (no horizontal flips), then rescale by 1/255.
train_gen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.20,
    horizontal_flip=False,
    brightness_range=[0.1,1],
    rescale=1./255,
)

# Validation pipeline: rescale only, no augmentation.
test_gen = ImageDataGenerator(rescale=1./255)

As you can see I included rescale in the data generator, but the images are already normalized. So we denormalize it before passing it to generator

In [None]:
# Final stage: fine-tune on augmented images with learning rate 0.00005.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.00005),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy', 'Recall', 'Precision'],
)

# The arrays were already divided by 255 and both generators rescale by 1/255
# again, so multiply by 255 before handing the data to them.
train_flow = train_gen.flow(X_train*255, Y_train, batch_size=64)
valid_flow = test_gen.flow(X_test*255, Y_test, batch_size=16)

result = model.fit(
    train_flow,
    validation_data=valid_flow,
    epochs=70,
    verbose=1,
)

# Make predictions

In [None]:
# Class id = index of the largest softmax output. This replaces
# Sequential.predict_classes, which newer TensorFlow releases no longer
# provide, and yields the same labels for a softmax classifier.
preds = np.argmax(model.predict(X_test_main), axis=-1)

In [None]:
# Show the shape of the prediction array as the cell output.
preds.shape

In [None]:
# Build the submission table: 1-based ImageId next to the predicted digit.
# The id range is derived from the number of predictions instead of the
# hard-coded 28000, so both columns always have the same length.
label = pd.DataFrame({
    "ImageId": np.arange(1, len(preds) + 1),
    "Label": preds,
})
label.head()

In [None]:
# Write the submission table to Y_test.csv with a header row and without the
# DataFrame index column.
label.to_csv('Y_test.csv',header=True,index = False)

In [None]:
# Persist the trained model under the path "saved_model".
model.save("saved_model")