In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Plotting
import matplotlib.pyplot as plt
import cv2 as cv

# Neural networks:
from keras.layers import Conv2D, Input, LeakyReLU, Dense, Activation, Flatten, Dropout, MaxPool2D
from keras import models
from keras.optimizers import Adam, RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

import pickle
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found there
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load and visualize the Dataset
#### MNIST Dataset:
* 60000 labeled images of handwritten numbers.
* Image size is 28 * 28.
* The images have only one color channel (grayscale).
* Labeled from 0 to 9.

In [None]:
# Seed NumPy's global RNG so the shuffle below produces the same order on every run
np.random.seed(1)

# Read the labelled training data
df_train = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")

# Shuffle the rows by indexing with a random permutation of the row positions
shuffled_positions = np.random.permutation(len(df_train))
df_train = df_train.iloc[shuffled_positions]

In [None]:
# Preview the top of the shuffled frame (head() returns five rows by default)
df_train.head()

In [None]:
# (number of rows, number of columns) of the shuffled training frame
df_train.shape

**The training set has 42000 images with 784 pixels each (the remaining column is the label).**

## Separate training and validation sets
In this case I'll be using 90% of the set for training and the other 10% for validation.

In [None]:
sample_size = df_train.shape[0]  # total number of labelled rows
validation_size = int(df_train.shape[0] * 0.1)  # rows held out for validation (10%)
train_size = sample_size - validation_size  # rows left for training

# The frame is already shuffled, so the first `train_size` rows form the training
# portion and the remaining rows form the validation portion
train_rows = df_train.iloc[:train_size]
val_rows = df_train.iloc[train_size:]

# Column 0 holds the label; every other column is a pixel.
# Images become (n, 28, 28, 1) arrays and labels become (n, 1) column vectors.
train_x = np.asarray(train_rows.iloc[:, 1:]).reshape([train_size, 28, 28, 1])
train_y = np.asarray(train_rows.iloc[:, 0]).reshape([train_size, 1])

val_x = np.asarray(val_rows.iloc[:, 1:]).reshape([validation_size, 28, 28, 1])
val_y = np.asarray(val_rows.iloc[:, 0]).reshape([validation_size, 1])

In [None]:
# Shapes of the training images and of their labels after the split
train_x.shape, train_y.shape

### **Load test.csv**

In [None]:
# Read the competition images that the final predictions are made on
df_test = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")
# All columns are used as pixels, so the whole frame is reshaped into 28x28 single-channel images
test_x = np.asarray(df_test).reshape(-1, 28, 28, 1)

### **Normalize the data:**
Each pixel value lies in the range [0, 255]. Inputs on this scale can make it harder for the model to learn, so we rescale the pixels to the range [0, 1].

In [None]:
# Largest raw pixel value; dividing by it rescales the images from [0, 255] to [0, 1]
PIXEL_MAX = 255

# NOTE: each assignment overwrites its own input, so running this cell twice rescales twice
train_x = train_x / PIXEL_MAX
val_x = val_x / PIXEL_MAX
test_x = test_x / PIXEL_MAX

## Visualize digits dataset frequency
An important step is to check the frequency of the classes in the dataset; it is always better to work with a balanced dataset.

**Frequency plot for the training set:**

In [None]:
# Count how often each digit occurs in the training portion, i.e. the first
# `sample_size - validation_size` rows of the shuffled frame
counts = df_train.iloc[:sample_size - validation_size, :].groupby('label')['label'].count()

f = plt.figure(figsize = (10, 6))
f.add_subplot(111)

plt.bar(counts.index, counts.values, width = 0.8, color = "orange")
# Pair every label with its own count. Using the label as a position into `counts.values`
# would annotate the wrong bar (or raise IndexError) if any digit were missing from the split.
for digit, frequency in zip(counts.index, counts.values):
    # Write the count slightly above the top of its bar
    plt.text(digit, frequency + 50, str(frequency), horizontalalignment = 'center', fontsize = 14)

plt.tick_params(labelsize = 14)
plt.xticks(counts.index)
plt.xlabel("Digits", fontsize = 16)
plt.ylabel("Frequency", fontsize = 16)
plt.title("Frequency Graph training set", fontsize = 20)
plt.savefig('digit_frequency_train.png')
plt.show()

**Frequency plot for the validation set:**

In [None]:
# Count how often each digit occurs in the validation portion, i.e. the last
# `validation_size` rows of the shuffled frame
counts = df_train.iloc[sample_size - validation_size:, :].groupby('label')['label'].count()

f = plt.figure(figsize = (10, 6))
f.add_subplot(111)

plt.bar(counts.index, counts.values, width = 0.8, color = "orange")
# Pair every label with its own count. Using the label as a position into `counts.values`
# would annotate the wrong bar (or raise IndexError) if any digit were missing from the split.
for digit, frequency in zip(counts.index, counts.values):
    # Write the count slightly above the top of its bar
    plt.text(digit, frequency + 5, str(frequency), horizontalalignment = 'center', fontsize = 14)

plt.tick_params(labelsize = 14)
plt.xticks(counts.index)
plt.xlabel("Digits", fontsize = 16)
plt.ylabel("Frequency", fontsize = 16)
plt.title("Frequency Graph Validation set", fontsize = 20)
plt.savefig('digit_frequency_Val.png')
plt.show()

It seems that there's a good balance of classes in both the training and validation sets.

### **Visualizing the digits**

In [None]:
rows, cols = 5, 6

# One 2x2-inch panel per image, arranged on a rows x cols grid
f, axes = plt.subplots(rows, cols, figsize = (2*cols, 2*rows))

# Show the first rows*cols training images, each with its label written underneath
for i, ax in enumerate(axes.flat):
    ax.imshow(train_x[i].reshape([28, 28]), cmap = "Blues")
    ax.axis("off")
    ax.set_title(str(train_y[i]), y = -0.15, color = "green")
plt.savefig("digits.png")

# Building the model
## Convolutional neural network
**Description of the model:**
* 2 convolutional blocks (with LeakyReLU activations), each followed by a MaxPool layer and a dropout layer.
* The output layer has 10 nodes with sigmoid activation

In [None]:
# Start from an empty Sequential container; the layers are appended in the next cell
model = models.Sequential()

In [None]:
# Block 1: two 3x3 convolutions (32 filters, "same" padding), each followed by LeakyReLU,
# then 2x2 max-pooling and 25% dropout. The first layer fixes the input to 28x28x1 images.
model.add(Conv2D(32, 3, padding = "same", input_shape = (28, 28, 1)))
model.add(LeakyReLU())
model.add(Conv2D(32, 3, padding = "same"))
model.add(LeakyReLU())
model.add(MaxPool2D(pool_size = (2, 2)))
model.add(Dropout(0.25))

# Block 2: same layout as block 1 with 64 filters per convolution
model.add(Conv2D(64, 3, padding = "same"))
model.add(LeakyReLU())
model.add(Conv2D(64, 3, padding = "same"))
model.add(LeakyReLU())
model.add(MaxPool2D(pool_size = (2, 2)))
model.add(Dropout(0.25))

# Classifier head: flatten the feature maps and pass them through two ReLU dense layers
model.add(Flatten())

model.add(Dense(256, activation = 'relu'))
model.add(Dense(32, activation = 'relu'))
# Softmax on the 10-way output: every image carries exactly one digit label and the model is
# trained with sparse categorical cross-entropy, so the 10 outputs should form one probability
# distribution. This replaces the sigmoid output named in the model description.
model.add(Dense(10, activation = "softmax"))

## Compile the model:
I'll be using "Sparse categorical crossentropy" as a loss, accuracy as the performance metric and Adam as optimization algorithm. 

In [None]:
initial_lr = 0.001  # starting learning rate for Adam
loss = "sparse_categorical_crossentropy"  # the labels are integer digits, not one-hot vectors
# `learning_rate` is the supported argument name; the `lr` alias used before is deprecated
model.compile(optimizer = Adam(learning_rate = initial_lr), loss = loss, metrics = ['accuracy'])
model.summary()

### **Training**

In [None]:
epochs = 20       # passes over the training data
batch_size = 256  # samples per gradient update

# First training run on the un-augmented images; the validation split is scored after each epoch
history_1 = model.fit(
    train_x,
    train_y,
    validation_data = (val_x, val_y),
    epochs = epochs,
    batch_size = batch_size,
)

## Training performance

In [None]:
# One wide figure holding two side-by-side panels
f = plt.figure(figsize = (20, 7))

# (subplot position, history key, panel title, y-axis label)
panels = [
    (121, "accuracy", "Accuracy Curve", "Accuracy"),
    (122, "loss", "Loss Curve", "Loss"),
]

for position, metric, panel_title, y_label in panels:
    ax = f.add_subplot(position)

    # Curve for the training set, then the curve for the validation set
    ax.plot(history_1.epoch, history_1.history[metric], label = metric)
    ax.plot(history_1.epoch, history_1.history["val_" + metric], label = "val_" + metric)

    ax.set_title(panel_title, fontsize = 18)
    ax.set_xlabel("Epochs", fontsize = 15)
    ax.set_ylabel(y_label, fontsize = 15)
    ax.grid(alpha = 0.3)
    ax.legend()

plt.show()

As we can see, the accuracy and loss on the validation set are worse than those on the training set, which suggests the model is overfitting.

Let's analyze the model's performance on the validation set in more depth.

## Confusion Matrix:
Each row of the matrix represents the instances in a predicted class, while each column represents the instances in an actual class (or vice versa). The name stems from the fact that it makes it easy to see if the system is confusing two classes.

In [None]:
# Predicted digit = index of the largest of the 10 model outputs
val_p = model.predict(val_x).argmax(axis = 1)

# val_y is an (n, 1) column; flatten it so it lines up with val_p
true_digits = val_y[:, 0]

# Fill the confusion matrix: rows are the true digit, columns the predicted digit.
# np.add.at accumulates repeated (row, column) pairs, like incrementing one sample at a time.
confusion_matrix = np.zeros([10, 10])
np.add.at(confusion_matrix, (true_digits, val_p), 1)

# Number of validation images whose prediction differs from the label
error = int(np.count_nonzero(true_digits != val_p))

print("Confusion Matrix: \n\n", confusion_matrix)
print("\nErrors in validation set: ", error)
print("\nError Persentage: ", (error * 100) / val_p.shape[0])
print("\nAccuracy: ", 100 - (error * 100) / val_p.shape[0])
print("\nValidation set Shape: ", val_p.shape[0])

In [None]:
# Plot the confusion matrix. Cell colours use log2(count + 1); the printed numbers are the raw counts.
log_counts = np.log2(confusion_matrix + 1)

f = plt.figure(figsize = (10, 10))
f.add_subplot(111)

plt.imshow(log_counts, cmap = "Blues")
plt.colorbar()
plt.tick_params(size = 5, color = "white")
plt.xticks(np.arange(0, 10), np.arange(0, 10))
plt.yticks(np.arange(0, 10), np.arange(0, 10))

# Switch the annotation colour at the midpoint of the *displayed* (log) scale. Thresholding the
# raw counts instead would leave black text on cells that are already drawn in dark blue.
threshold = log_counts.max()/2

for i in range(10):
    for j in range(10):
        # imshow puts columns on the x axis, so cell (row i, column j) sits at x = j, y = i
        plt.text(j, i, int(confusion_matrix[i, j]), horizontalalignment = "center", color = "white" if log_counts[i, j] > threshold else "black")


plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.savefig("Confusion_matrix1.png")
plt.show()

# Improve results by image augmentation
![image.png](attachment:1f2da43c-aa9c-40fa-9095-31004d05a34f.png)

To improve the results we'll use image augmentation, a technique widely used in machine learning in which we take training examples and slightly alter them, e.g. by rotating them or flipping them.

In [None]:
# Settings for the Keras image-augmentation generator
augmentation_options = dict(
    # No centering, standardisation or whitening of the inputs
    featurewise_center = False,
    samplewise_center = False,
    featurewise_std_normalization = False,
    samplewise_std_normalization = False,
    zca_whitening = False,
    # Random geometric changes applied to each generated image
    rotation_range = 10,       # rotation range
    zoom_range = 0.1,          # zoom range
    width_shift_range = 0.1,   # horizontal shift range
    height_shift_range = 0.1,  # vertical shift range
    # Flipping stays disabled
    horizontal_flip = False,
    vertical_flip = False,
)

datagen = ImageDataGenerator(**augmentation_options)
datagen.fit(train_x)

In [None]:
# Keras callback that reduces the learning rate when a metric has stopped improving:
# here it halves the rate after 2 epochs without a better validation accuracy,
# never going below 1e-5, and logs each reduction (verbose = 1)
lrr = ReduceLROnPlateau(
    monitor = 'val_accuracy',
    factor = 0.5,
    patience = 2,
    min_lr = 0.00001,
    verbose = 1,
)

## Training
We'll now use the `model.fit_generator()` function, which is the function used for training on generated batches.

In [None]:
epochs = 30

# Batches needed to cover every training image once per epoch. Taking the ceiling avoids the
# extra batch that `int(n / batch_size) + 1` schedules when n is an exact multiple of batch_size.
steps_per_epoch = int(np.ceil(train_x.shape[0] / batch_size))

# Continue training the same model on augmented batches. `model.fit` accepts the generator
# directly; it supersedes `fit_generator`, which is deprecated.
history_2 = model.fit(
    datagen.flow(train_x, train_y, batch_size = batch_size),
    steps_per_epoch = steps_per_epoch,
    epochs = epochs,
    validation_data = (val_x, val_y),
    callbacks = [lrr],  # halves the learning rate when val_accuracy plateaus
)


### Training performance
After the training with the new generated data, let's see the results.

In [None]:
# Shared x axis for both runs: the second run's epoch numbers are offset so that they
# continue after the last epoch of the first run
first_run_length = len(history_1.epoch)
all_epochs = history_1.epoch + [first_run_length + e for e in history_2.epoch]

# One wide figure holding two side-by-side panels
f = plt.figure(figsize = (20, 7))

# (subplot position, history key, panel title, y-axis label)
panels = [
    (121, "accuracy", "Accuracy Curve", "Accuracy"),
    (122, "loss", "Loss Curve", "Loss"),
]

for position, metric, panel_title, y_label in panels:
    ax = f.add_subplot(position)

    # Training curve first, then the validation curve; each joins run 1 and run 2 end to end
    for key in (metric, "val_" + metric):
        ax.plot(all_epochs, history_1.history[key] + history_2.history[key], label = key)

    ax.set_title(panel_title, fontsize = 18)
    ax.set_xlabel("Epochs", fontsize = 15)
    # The accuracy panel previously had no y-axis label
    ax.set_ylabel(y_label, fontsize = 15)
    ax.grid(alpha = 0.3)
    ax.legend()

plt.show()

## **Confusion Matrix**

In [None]:
# Predicted digit = index of the largest of the 10 model outputs (after the augmented run)
val_p = model.predict(val_x).argmax(axis = 1)

# val_y is an (n, 1) column; flatten it so it lines up with val_p
true_digits = val_y[:, 0]

# Rows are the true digit, columns the predicted digit.
# np.add.at accumulates repeated (row, column) pairs, like incrementing one sample at a time.
confusion_matrix = np.zeros([10, 10])
np.add.at(confusion_matrix, (true_digits, val_p), 1)

# Number of validation images whose prediction differs from the label
error = int(np.count_nonzero(true_digits != val_p))

print("Confusion Matrix: \n\n", confusion_matrix)
print("\nErrors in validation set: ", error)
print("\nError Persentage: ", (error * 100) / val_p.shape[0])
print("\nAccuracy: ", 100 - (error * 100) / val_p.shape[0])
print("\nValidation set Shape: ", val_p.shape[0])

In [None]:
# Plot the confusion matrix. Cell colours use log2(count + 1); the printed numbers are the raw counts.
log_counts = np.log2(confusion_matrix + 1)

f = plt.figure(figsize = (10, 10))
f.add_subplot(111)

plt.imshow(log_counts, cmap = "Blues")
plt.colorbar()
plt.tick_params(size = 5, color = "white")
plt.xticks(np.arange(0, 10), np.arange(0, 10))
plt.yticks(np.arange(0, 10), np.arange(0, 10))

# Switch the annotation colour at the midpoint of the *displayed* (log) scale. Thresholding the
# raw counts instead would leave black text on cells that are already drawn in dark blue.
threshold = log_counts.max()/2

for i in range(10):
    for j in range(10):
        # imshow puts columns on the x axis, so cell (row i, column j) sits at x = j, y = i
        plt.text(j, i, int(confusion_matrix[i, j]), horizontalalignment = "center", color = "white" if log_counts[i, j] > threshold else "black")


plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.savefig("Confusion_matrix2.png")
plt.show()

# Visualizing results
## Errors in the validation set:

In [None]:
rows = 4
cols = 9
max_panels = rows * cols  # the grid cannot hold more images than this

f = plt.figure(figsize = (2*cols, 2*rows))

# Positions of the misclassified validation images (val_y is an (n, 1) column, val_p is flat).
# Capped at the grid capacity: asking add_subplot for a position beyond rows*cols raises ValueError.
wrong_positions = np.flatnonzero(val_y[:, 0] != val_p)[:max_panels]

for subplot, i in enumerate(wrong_positions, start = 1):
    f.add_subplot(rows, cols, subplot)
    plt.imshow(val_x[i].reshape([28, 28]), cmap = "Blues")
    plt.axis("off")
    # T = true digit, P = predicted digit; scalars are used so the title reads "T: 3 P: 5"
    plt.title("T: " + str(val_y[i, 0]) + " P: " + str(val_p[i]), y = -0.15, color = "Red")

plt.savefig("error_plots.png")
plt.show()

# Predictions on the test set:

In [None]:
# Predicted digit for each test image = index of the largest of the 10 model outputs
test_y = model.predict(test_x).argmax(axis = 1)

In [None]:
rows, cols = 5, 10

# One 2x2-inch panel per image, arranged on a rows x cols grid
f, axes = plt.subplots(rows, cols, figsize = (2*cols, 2*rows))

# Show the first rows*cols test images with the predicted digit as the title
for i, ax in enumerate(axes.flat):
    ax.imshow(test_x[i].reshape([28, 28]), cmap = "Blues")
    ax.axis("off")
    ax.set_title(str(test_y[i]))

## Creating submission

In [None]:
# Two columns: the 1-based row position of each test image and its predicted digit
submission_columns = {"ImageId": df_test.index + 1, "Label": test_y}
df_submission = pd.DataFrame(submission_columns)
# Write the file without the frame's own index column
df_submission.to_csv("MySubmission.csv", index = False)

## 