In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook we will go through different classification algorithms that you might consider when you want to build a model that can distinguish between different handwritten digits. We will also look at how you can enhance your model and how to run a sensible sequence of experiments on your dataset until you find the optimum point.

# Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

from tensorflow.keras.layers import Dense, Conv2D, Input, Dropout, BatchNormalization, MaxPooling2D, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

import matplotlib.pyplot as plt
import numpy as np
import time

# Data Analysis

First, let's get the MNIST training data, which is stored in CSV format where each row represents one image sample and contains **785** columns.

The first column is the **Label** column which contains the ground truth of this image sample, and the rest of the **784 columns** are the values of pixels of this image sample. 

Let's load it

In [None]:
# Read the labelled training set into a DataFrame and preview its first rows.
train_csv_path = "/kaggle/input/digit-recognizer/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

Let's now divide the dataset into **Features (X)** and their corresponding **Labels (Y)**

In [None]:
# Separate the pixel columns (features) from the "label" column (targets)
# and convert both to plain numpy arrays.
X = train_df.drop(columns=["label"]).values
Y = train_df["label"].values

print("Shape of features:", X.shape)
# Fixed: this line reports the labels' shape, it used to be announced as "features" too.
print("Shape of labels:", Y.shape)

print("Type of X:", type(X))
print("Type of Y:", type(Y))

So basically now we have **42000 training samples** each one has **784 features** and **1 label** and they are represented in numpy arrays

Let's try to visualize the data to make sure that everything is good till now.<br>
In order to do this we need to reshape each sample from a flat row of shape (784,) into a square matrix of shape (28, 28)

In [None]:
# View every flat 784-value row as a 28x28 image.
X_square = X.reshape(-1, 28, 28)

# Just a value to index both the features and the labels
# Feel free to change this number to visualize the samples
index = 100

# Fixed: use `index` instead of a hard-coded 100, so that changing it above
# actually changes which sample is shown.
plt.imshow(X_square[index])
print(f"Label is {Y[index]}")

The second step you always need to do whenever you want to analyse a dataset is to check the distribution of your samples across the space of the targets. In simple words, you need to see how many samples this dataset has in each category. Here we have 10 categories (the digits from 0 to 9) and we want to check how many samples we have for each digit.

In [None]:
# Count how many samples each digit has and show the counts as a bar chart.
digit_values, digit_counts = np.unique(Y, return_counts=True)

fig, ax = plt.subplots()
ax.bar(digit_values, digit_counts)
ax.set_xlabel("Digit Category")
ax.set_ylabel("Number of samples")
plt.show()

So according to this bar chart, we have a good distribution of the samples for each category. Now we can safely state that this dataset is **Balanced**

Now if we take a look at the features, we will find that a single feature is an integer in the range 0 to 255. That is a large range for models that prefer to work with features that vary between 0 and 1. Therefore, we need to normalize the features.

In [None]:
# Scale pixel intensities from the 0-255 integer range down to [0, 1].
# Both arrays are re-derived from `train_df` rather than divided in place, so
# re-running this cell cannot normalise the data a second time.
X = train_df.drop(columns=["label"]).values / 255.
X_square = X.reshape(-1, 28, 28)

# Prepare Train and Test sets

In order to have a good model, we need to prepare a test set that covers most of the cases in our dataset to have a proper evaluation for our models to compare between them.

We also need to **unify** the test set, i.e. use the same test set for all of the following experiments.

In [None]:
# Hold out a fixed 10% of the samples as the common test set for every experiment.
test_ratio = 0.1  # 10%
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=test_ratio, random_state=42
)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# Overlay the per-digit sample counts of both subsets on one bar chart.
for subset_name, subset_labels in (("train", y_train), ("test", y_test)):
    uniques, count = np.unique(subset_labels, return_counts=True)
    plt.bar(uniques, count, label=subset_name)

plt.xlabel("Digit Category")
plt.ylabel("Number of samples")
plt.legend()
plt.show()


Well now we can say that we have a **balanced** and **well-distributed** test set

**We are now ready to do some experiments !!**

# Experiments

One important fact is that there is no such thing as the "Best Classifier Algorithm Ever". A classifier that fits well on a linearly separable dataset - for example the SVM - may not fit well on other types of data.

Therefore it's always a good practice to test multiple algorithms on your dataset before choosing one, then you can fine tune that model till you get yourself the **best model** for this specific **dataset**

In this section we will dive into different types of experiments to try to find the best classifier that can fit over this dataset. 

Let's start !!

## 1. Decision Tree

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the flattened images (seeded for reproducibility).
decision_tree_classifier = DecisionTreeClassifier(random_state=0)
decision_tree_classifier.fit(x_train, y_train)

# Only the prediction over the test set is timed, not the training.
start = time.time()
predictions = decision_tree_classifier.predict(x_test)
end = time.time()
elapsed = end - start

accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

print("Decision Tree")
print(f"Accuracy: {accuracy}")
print("Confusion Matrix\n", conf_matrix)
print(f"Elapsed Time: {elapsed: .4f} second")

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the flattened images.
# Fixed: seed the forest (as the decision tree above is seeded) so that
# "Restart & Run All" reproduces the same accuracy and confusion matrix.
random_forest_classifier = RandomForestClassifier(random_state=0)
random_forest_classifier.fit(x_train, y_train)

# Only the prediction over the test set is timed, not the training.
start = time.time()
predictions = random_forest_classifier.predict(x_test)
end = time.time()

accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

print("Random Forest")
print(f"Accuracy: {accuracy}")
print("Confusion Matrix\n", conf_matrix)
print(f"Elapsed Time: {end - start: .4f} second")

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Train a support vector classifier with scikit-learn's default settings.
svm_classifier = SVC()
svm_classifier.fit(x_train, y_train)

# Only the prediction over the test set is timed, not the training.
start = time.time()
predictions = svm_classifier.predict(x_test)
end = time.time()
elapsed = end - start

accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

print("Support Vector Machine")
print(f"Accuracy: {accuracy}")
print("Confusion Matrix\n", conf_matrix)
print(f"Elapsed Time: {elapsed: .4f} second")

# Neural Network

## Model 1 "simple Fully connected Neural Network"

In this model we're going to build a simple fully connected neural network with input shape = (784,) and output shape = (10,) activated with softmax so that we can have 10 probabilities for the input image.

The reason why we've used "Sparse Categorical Cross Entropy" is that this is a categorical classification problem, where the labels are integers ranged from 0 to 9. If we decided to change the labels to be one-hot encoded then we will need to change that loss function to be the normal "Categorical Cross Entropy"

In [None]:
# Model 1: plain fully connected network, 784 inputs -> 10 softmax outputs.
# Fixed: `shape` must be a tuple. `(n)` is just the int `n`; the trailing comma
# makes it the 1-tuple `(n,)`, which every Keras version accepts.
input_layer = Input(shape=(X.shape[-1],))

x = Dense(700, activation="relu")(input_layer)
x = Dense(350, activation="relu")(x)
x = Dense(175, activation="relu")(x)
x = Dense(100, activation="relu")(x)
x = Dense(50, activation="relu")(x)
x = Dense(25, activation="relu")(x)
x = Dense(15, activation="relu")(x)
output_layer = Dense(10, activation="softmax")(x)

model1 = Model(inputs=input_layer, outputs=output_layer)

model1.summary()

# Labels are integer class ids (not one-hot vectors), hence the sparse loss.
loss = SparseCategoricalCrossentropy()
optimizer = Adam(learning_rate=0.01)

model1.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
model1.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1
)

In [None]:
# Learning curves of Model 1: one line per metric recorded during `fit`.
ax = pd.DataFrame(model1.history.history).plot(title="Model 1 - training history")
ax.set_xlabel("Epoch")
plt.show()

As we can see the model overfitted because we over trained it on the data. So let's try to solve this problem by adding some Dropout layers and observe the difference.

## Model 2 "Add Dropouts"

In [None]:
# Model 2: same dense stack as Model 1, with a Dropout(0.2) after every second
# hidden layer.
# Fixed: `shape` must be a tuple; `(n)` is an int, `(n,)` is the intended 1-tuple.
input_layer = Input(shape=(X.shape[-1],))

x = Dense(700, activation="relu")(input_layer)
x = Dense(350, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(175, activation="relu")(x)
x = Dense(100, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(50, activation="relu")(x)
x = Dense(25, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(15, activation="relu")(x)
output_layer = Dense(10, activation="softmax")(x)

model2 = Model(inputs=input_layer, outputs=output_layer)
model2.summary()

# Labels are integer class ids (not one-hot vectors), hence the sparse loss.
loss = SparseCategoricalCrossentropy()
optimizer = Adam(learning_rate=0.01)

model2.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
model2.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1
)

In [None]:
# Learning curves of Model 2: one line per metric recorded during `fit`.
ax = pd.DataFrame(model2.history.history).plot(title="Model 2 - training history")
ax.set_xlabel("Epoch")
plt.show()

## Model 3 "Add Early Stopping"

It looks like the model was doing well until the third epoch, and then it overfitted. Let's try to add an early stopping callback that will stop the training and save the best weights whenever the model starts to overfit. In order to decide that the model has overfitted we will monitor the validation loss: if it does not improve for 2 consecutive epochs, the training stops.

In [None]:
# Model 3: Model 2's architecture, trained with an early-stopping callback.
# Fixed: `shape` must be a tuple; `(n)` is an int, `(n,)` is the intended 1-tuple.
input_layer = Input(shape=(X.shape[-1],))

x = Dense(700, activation="relu")(input_layer)
x = Dense(350, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(175, activation="relu")(x)
x = Dense(100, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(50, activation="relu")(x)
x = Dense(25, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(15, activation="relu")(x)
output_layer = Dense(10, activation="softmax")(x)

model3 = Model(inputs=input_layer, outputs=output_layer)
model3.summary()

loss = SparseCategoricalCrossentropy()
optimizer = Adam(learning_rate=0.01)
# Stop once val_loss has not improved for 2 epochs.
# Fixed: without `restore_best_weights=True` the model keeps the weights of the
# LAST epoch (i.e. after 2 epochs of degradation), not the best ones.
early_stopping = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

model3.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
model3.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping]
)

In [None]:
# Learning curves of Model 3: one line per metric recorded during `fit`.
ax = pd.DataFrame(model3.history.history).plot(title="Model 3 - training history")
ax.set_xlabel("Epoch")
plt.show()

## Model 4 "Add Decaying Learning Rate"

Well then, at least now we have a better model, but that's not enough. Why don't we try to reduce the learning rate to avoid getting stuck in some local minimum? The idea is to start with a "relatively" big learning rate to move quickly away from the initial point; then, whenever overfitting is detected, the learning rate is reduced by some factor to avoid it. Once we reach the minimum learning rate and overfitting starts to happen again, the early stopping will prevent it. Now we can safely increase the number of epochs.

In [None]:
# This function only to print out the learning rate
def get_lr_metric(optimizer):
    """Build a Keras-style metric function that reports the optimizer's learning rate.

    Args:
        optimizer: object exposing a `learning_rate` attribute (e.g. a Keras optimizer).

    Returns:
        A function `lr(y_true, y_pred)` that ignores both arguments and returns
        the optimizer's current learning rate.
    """
    # The inner function keeps the name `lr` on purpose: Keras labels a function
    # metric by its __name__, so the history column stays "lr".
    def lr(y_true, y_pred):
        # `learning_rate` is the canonical attribute; `lr` is only a legacy alias
        # that newer Keras optimizers no longer provide.
        return optimizer.learning_rate
    return lr


# Model 4: Model 3's setup plus a learning rate that decays on plateaus.
# Fixed: `shape` must be a tuple; `(n)` is an int, `(n,)` is the intended 1-tuple.
input_layer = Input(shape=(X.shape[-1],))

x = Dense(700, activation="relu")(input_layer)
x = Dense(350, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(175, activation="relu")(x)
x = Dense(100, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(50, activation="relu")(x)
x = Dense(25, activation="relu")(x)
x = Dropout(0.2)(x)
x = Dense(15, activation="relu")(x)
output_layer = Dense(10, activation="softmax")(x)

model4 = Model(inputs=input_layer, outputs=output_layer)
model4.summary()

loss = SparseCategoricalCrossentropy()
optimizer = Adam(learning_rate=0.01)
early_stopping = EarlyStopping(monitor="val_loss", patience=3)
# Multiply the learning rate by 0.1 (down to 1e-4) after 2 epochs without a
# sufficient val_loss improvement.
# NOTE(review): min_delta=0.1 is a large threshold for a cross-entropy loss -
# confirm it is intended, smaller improvements are treated as a plateau.
decaying_learning_rate = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_delta=0.1, min_lr=0.0001)
lr_metric = get_lr_metric(optimizer)

model4.compile(loss=loss, optimizer=optimizer, metrics=["accuracy", lr_metric])
model4.fit(
    x=x_train,
    y=y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping, decaying_learning_rate]
)

In [None]:
# Learning curves of Model 4: one line per metric recorded during `fit`.
ax = pd.DataFrame(model4.history.history).plot(title="Model 4 - training history")
ax.set_xlabel("Epoch")
plt.show()

**It looks like we're going somewhere now :D**

## Model 5 "Add Batch Normalization"

One good practice in training a neural network activated with ReLU is to add batch normalization after the activation to avoid any overshooting in the weights.

In [None]:
# This function only to print out the learning rate
def get_lr_metric(optimizer):
    """Build a Keras-style metric function that reports the optimizer's learning rate.

    Args:
        optimizer: object exposing a `learning_rate` attribute (e.g. a Keras optimizer).

    Returns:
        A function `lr(y_true, y_pred)` that ignores both arguments and returns
        the optimizer's current learning rate.
    """
    # The inner function keeps the name `lr` on purpose: Keras labels a function
    # metric by its __name__, so the history column stays "lr".
    def lr(y_true, y_pred):
        # `learning_rate` is the canonical attribute; `lr` is only a legacy alias
        # that newer Keras optimizers no longer provide.
        return optimizer.learning_rate
    return lr


# Model 5: Model 4's setup with BatchNormalization after every hidden Dense layer.
# Fixed: `shape` must be a tuple; `(n)` is an int, `(n,)` is the intended 1-tuple.
input_layer = Input(shape=(X.shape[-1],))

x = Dense(700, activation="relu")(input_layer)
x = BatchNormalization()(x)
x = Dense(350, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(175, activation="relu")(x)
x = BatchNormalization()(x)
x = Dense(100, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(50, activation="relu")(x)
x = BatchNormalization()(x)
x = Dense(25, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(15, activation="relu")(x)
x = BatchNormalization()(x)
output_layer = Dense(10, activation="softmax")(x)

model5 = Model(inputs=input_layer, outputs=output_layer)
model5.summary()

loss = SparseCategoricalCrossentropy()
optimizer = Adam(learning_rate=0.01)
early_stopping = EarlyStopping(monitor="val_loss", patience=3)
# Multiply the learning rate by 0.1 (down to 1e-4) after 2 epochs without a
# sufficient val_loss improvement.
decaying_learning_rate = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_delta=0.1, min_lr=0.0001)
lr_metric = get_lr_metric(optimizer)

model5.compile(loss=loss, optimizer=optimizer, metrics=["accuracy", lr_metric])
model5.fit(
    x=x_train,
    y=y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping, decaying_learning_rate]
)

In [None]:
# Learning curves of Model 5: one line per metric recorded during `fit`.
ax = pd.DataFrame(model5.history.history).plot(title="Model 5 - training history")
ax.set_xlabel("Epoch")
plt.show()

**Okay, That's a good gain to be honest :D**

Let's try this model over the test set

In [None]:
# Evaluate Model 5 on the held-out test set; only the prediction is timed.
start = time.time()
predictions = model5.predict(x_test)
end = time.time()

# The network outputs one softmax probability per class; the predicted digit
# is the index of the largest probability in each row.
predictions = np.argmax(predictions, axis=1)

accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

# Fixed: the heading used to say "Support Vector Machine" (copy-paste leftover).
print("Neural Network (Model 5)")
print(f"Accuracy: {accuracy}")
print("Confusion Matrix\n", conf_matrix)
print(f"Elapsed Time: {end - start: .4f} second")

# Convolutional Neural Network

Well, the neural network actually did very well. Let's try to improve it a little bit by changing the input itself. Let's try to extract some features first before feeding them to the network; in order to do that, let's add a base of convolutional and pooling layers in front of the network to get a set of features and feed it to our network.

but first we need to square our **x_train** and **x_test**

In [None]:
# Reshape the flat rows into 28x28 single-channel images.
# Fixed: the CNN's input layer is declared as (28, 28, 1), so the trailing
# channel axis is added explicitly instead of relying on Keras to add it.
x_train_square = x_train.reshape(-1, 28, 28, 1)
x_test_square = x_test.reshape(-1, 28, 28, 1)

In [None]:
# This function only to print out the learning rate
def get_lr_metric(optimizer):
    """Build a Keras-style metric function that reports the optimizer's learning rate.

    Args:
        optimizer: object exposing a `learning_rate` attribute (e.g. a Keras optimizer).

    Returns:
        A function `lr(y_true, y_pred)` that ignores both arguments and returns
        the optimizer's current learning rate.
    """
    # The inner function keeps the name `lr` on purpose: Keras labels a function
    # metric by its __name__, so the history column stays "lr".
    def lr(y_true, y_pred):
        # `learning_rate` is the canonical attribute; `lr` is only a legacy alias
        # that newer Keras optimizers no longer provide.
        return optimizer.learning_rate
    return lr


# Model 6: a convolutional feature extractor in front of Model 5's dense head.
input_layer = Input(shape=(*x_train_square.shape[1:3], 1))

# Four Conv -> BatchNorm -> MaxPool stages with 16, 32, 64 and 128 filters.
x = input_layer
for n_filters in (16, 32, 64, 128):
    x = Conv2D(n_filters, (2, 2), padding="same", activation="relu")(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

x = Flatten()(x)

# Dense head: (units, whether a Dropout(0.2) follows the BatchNorm).
dense_head = (
    (700, False),
    (350, True),
    (175, False),
    (100, True),
    (50, False),
    (25, True),
    (15, False),
)
for n_units, with_dropout in dense_head:
    x = Dense(n_units, activation="relu")(x)
    x = BatchNormalization()(x)
    if with_dropout:
        x = Dropout(0.2)(x)

output_layer = Dense(10, activation="softmax")(x)

model6 = Model(inputs=input_layer, outputs=output_layer)
model6.summary()

# Same training recipe as Model 5.
loss = SparseCategoricalCrossentropy()
optimizer = Adam(learning_rate=0.01)
early_stopping = EarlyStopping(monitor="val_loss", patience=3)
decaying_learning_rate = ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=2, min_delta=0.1, min_lr=0.0001
)
lr_metric = get_lr_metric(optimizer)

model6.compile(loss=loss, optimizer=optimizer, metrics=["accuracy", lr_metric])
model6.fit(
    x=x_train_square,
    y=y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stopping, decaying_learning_rate],
)

In [None]:
# Learning curves of Model 6: one line per metric recorded during `fit`.
ax = pd.DataFrame(model6.history.history).plot(title="Model 6 - training history")
ax.set_xlabel("Epoch")
plt.show()

**ALRIGHT that's a very optimistic result**, let's try it on the test set

In [None]:
# Evaluate Model 6 (the CNN) on the held-out test set; only the prediction is timed.
start = time.time()
predictions = model6.predict(x_test_square)
end = time.time()

# The network outputs one softmax probability per class; the predicted digit
# is the index of the largest probability in each row.
predictions = np.argmax(predictions, axis=1)

accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

# Fixed: the heading used to say "Support Vector Machine" (copy-paste leftover).
print("Convolutional Neural Network (Model 6)")
print(f"Accuracy: {accuracy}")
print("Confusion Matrix\n", conf_matrix)
print(f"Elapsed Time: {end - start: .4f} second")

# Submit

In [None]:
# Predict the competition test set with the CNN and write the submission file.
df_test = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")

# Apply the same preprocessing as for training: scale to [0, 1], then reshape
# to 28x28 images with an explicit channel axis to match the (28, 28, 1) input.
test_data = df_test.values / 255.
test_data_square = test_data.reshape(-1, 28, 28, 1)

# Predicted digit = index of the largest softmax probability in each row.
predictions = np.argmax(model6.predict(test_data_square), axis=1)

# ImageId is 1-based, in the order of the rows of test.csv.
submission = pd.DataFrame({
    "ImageId": np.arange(1, len(predictions) + 1),
    "Label": predictions,
})
submission.to_csv("/kaggle/working/submission.csv", index=False)

# Conclusion

There are a lot of other experiments that you can do on this dataset: try to increase the size of the network, reduce it, or try different activations and different combinations of training techniques.

Hope that was helpful and useful, if you enjoyed it please **Upvote** :D