In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

The MNIST database (Modified National Institute of Standards and Technology database) is a collection of handwritten digits. The version used here, from Kaggle's Digit Recognizer competition, consists of a training set of 42,000 examples and a test set of 28,000 examples. It is a subset of a larger set available from NIST. Additionally, the black and white images from NIST were size-normalized and centered to fit into a 28x28 pixel bounding box and anti-aliased, which introduced grayscale levels.

This database is widely used for training and testing in the fields of machine learning and image processing. It is a remixed subset of the original NIST datasets.

# Reading the MNIST data set

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the labelled training CSV and the unlabelled test CSV into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/digit-recognizer/train.csv',
        '/kaggle/input/digit-recognizer/test.csv',
    )
)

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Preview the first two rows of the training frame.
train.head(2)

In [None]:
# Preview the first two rows of the test frame.
test.head(2)

# Data processing

The images in the data set are 28 x 28 pixels. Every row of `train.csv` describes one image as 785 numbers: the first number is the label, i.e. the digit (0 to 9) depicted in the image, and the following 784 numbers, each between 0 and 255, are the pixels of the 28 x 28 image. `test.csv` has the same layout but without the label column, i.e. 784 numbers per row.

In [None]:
# Separate the digit labels (Y) from the pixel columns (X); `train` itself is left untouched.
Y = train.loc[:, 'label']
X = train.drop(columns='label')

In [None]:
# Per pixel column: does it contain any missing value? `describe()` then summarises
# those booleans (a single unique value of False means no column has nulls).
X.isna().any().describe()

In [None]:
# (rows, columns) of the feature frame, i.e. `train` without the label column.
X.shape

# Image representation

Now we reshape every row into a 3-dimensional image (height, width, channels):
+ `.values` converts the DataFrame to a NumPy array, and -1 lets NumPy infer the number of images, so it stays as it is
+ 28, 28 are the height and width
+ 1 is the number of channels (grayscale); for colour (RGB) images we would use 3.

In [None]:
# Turn the (n_images, 784) pixel table into an (n_images, 28, 28, 1) image array.
# np.asarray accepts both the DataFrame (first run) and the already-reshaped ndarray,
# so re-running this cell is a no-op instead of raising AttributeError on `.values`.
X = np.asarray(X).reshape(-1, 28, 28, 1)

In [None]:
# Show the first 100 training images in a tight 10 x 10 grid.
grid_side = 10
fig, axes = plt.subplots(grid_side, grid_side, figsize=(7, 7))
fig.subplots_adjust(wspace=0.025, hspace=0.025)

# `axes.flat` walks the grid row by row, so panel k shows image k.
for position, panel in enumerate(axes.flat):
    panel.imshow(X[position], cmap="gray")
    panel.axis("off")

plt.show()

In [None]:
X.shape  # shape of X after the reshape to (n_images, 28, 28, 1)

# Check if the dataset is unbalanced?

In [None]:
import seaborn as sns

# Bar chart of how many training samples each digit has.
# The labels must be passed as `x=`: in seaborn >= 0.12 the first positional argument is
# `data`, so `sns.countplot(Y)` no longer plots the per-class counts.
sns.countplot(x=Y)

# Check for null and missing values

In [None]:
# Same missing-value summary as for the training pixels, now for the test frame.
test.isna().any().describe()

# Now, we convert Y from series type to array type

In [None]:
# Convert the label Series to a NumPy array.
# np.asarray returns the same array as `.values` for a Series but also accepts an
# ndarray, so re-running this cell does not raise AttributeError.
Y = np.asarray(Y)
type(Y)

# Data normalization

Note that the algorithm converges faster on data in the range [0, 1] than on data in the range [0, 255]. The images of the MNIST dataset are greyscale, with pixel values ranging from 0 to 255 inclusive. We map these values into the interval [0.01, 1] by multiplying each pixel by 0.99 / 255 and adding 0.01 to the result. This way we avoid 0 values as inputs, which are capable of preventing weight updates.

In [None]:
# Rescale pixels with x / 255 * 0.99 + 0.01.
# The result is always rebuilt from the untouched `train` frame rather than from `X`
# itself: normalising `X` in place would silently rescale already-normalised data
# every time this cell is re-run.
X = np.array(train.drop('label', axis=1), dtype="float").reshape(-1, 28, 28, 1) / 255.0 * 0.99 + 0.01

# With everything set up, we now create a convolutional model using tensorflow.keras

Let’s create the architecture for our CNN model. The architecture is simple: two convolutional layers, each followed by max pooling, then dropout and two fully connected layers.

In [None]:
# CNN: two Conv2D + MaxPooling2D stages, then Flatten -> Dropout -> Dense(512) -> Dense(10, softmax).
keras_layers = tf.keras.layers

model = tf.keras.models.Sequential()
for layer in (
    keras_layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    keras_layers.MaxPooling2D(pool_size=2, strides=2),
    keras_layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    keras_layers.MaxPooling2D(pool_size=2, strides=2),
    keras_layers.Flatten(),
    keras_layers.Dropout(rate=0.5),
    keras_layers.Dense(units=512, activation='relu'),
    keras_layers.Dense(units=10, activation='softmax'),  # one output per digit class
):
    model.add(layer)

# Print the layer-by-layer overview of the network.
model.summary()

# Compile with Adam; the sparse loss is used because Y holds integer class labels
# rather than one-hot vectors. Accuracy is tracked under the key 'acc'.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

# Train the Model

Finally, let’s train our model and plot the training and validation accuracy and loss for every epoch!

In [None]:
# Train for 40 epochs; Keras holds out the last 10% of (X, Y) as validation data.
history = model.fit(X, Y, validation_split=0.1, epochs=40, batch_size=128, verbose=0)

# list all data in history
print(history.history.keys())

# One figure per tracked quantity: training curve vs. validation curve.
# The second curve comes from `validation_split`, so it is labelled 'validation'
# (not 'test' — the Kaggle test set is never seen during training).
for metric, label in (('acc', 'accuracy'), ('loss', 'loss')):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history['val_' + metric], label='validation')
    ax.set(title='model ' + label, ylabel=label, xlabel='epoch')
    ax.legend(loc='upper left')
    plt.show()

# Confusion matrix

A confusion matrix can be very helpful for seeing your model's drawbacks. We plot the confusion matrix of the training results and then of the validation results.

In [None]:
from sklearn.model_selection import train_test_split  # still needed by the final-training cell

# The first `model.fit(..., validation_split=0.1)` call trained on the leading 90% of
# (X, Y) and validated on the trailing 10% (Keras takes the validation samples from the
# end of the arrays, before shuffling). A fresh random split here would place samples
# the model has already been trained on into X_test, so the validation metrics and
# the confusion matrix below would look better than they really are.
# Re-use the same trailing hold-out instead, so X_test is genuinely unseen data.
VALIDATION_FRACTION = 0.1  # must match `validation_split` of the first fit
split_at = int(len(X) * (1.0 - VALIDATION_FRACTION))
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = Y[:split_at], Y[split_at:]

# Continue training for 6 more epochs while monitoring the held-out samples.
model.fit(X_train, y_train, epochs=6, validation_data=(X_test, y_test), batch_size=128, verbose=1)

from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are plotted as true labels, columns as predicted labels.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, every row is divided by its sum, so each cell shows the fraction of
        that true class. Rows that sum to zero are shown as 0 instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap for the heat map.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)

    # Normalise *before* drawing so that the heat-map colours, the printed numbers and
    # the text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # White text on dark cells, black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the axis labels are taken into account by the layout.
    plt.tight_layout()

# Class probabilities for the training split (data the model was fitted on)
y_pred = model.predict(X_train)
# Most probable class index per sample
y_pred_classes = y_pred.argmax(axis=1)
# Confusion matrix of true vs. predicted training labels
confusion_mtx = confusion_matrix(y_true=y_train, y_pred=y_pred_classes)
# Draw it with one tick per digit
plot_confusion_matrix(cm=confusion_mtx, classes=range(10))

In [None]:
# Class probabilities for the held-out validation split
y_pred = model.predict(X_test)
# Most probable class index per sample
y_pred_classes = y_pred.argmax(axis=1)
# Confusion matrix of true vs. predicted validation labels
confusion_mtx = confusion_matrix(y_true=y_test, y_pred=y_pred_classes)
# Draw it with one tick per digit
plot_confusion_matrix(cm=confusion_mtx, classes=range(10))

To better understand what happens, let's display some of the misclassified images.

In [None]:
# Boolean mask: True wherever the predicted label differs from the true label
errors = y_pred_classes != y_test

# Keep only the misclassified samples: predicted class, full probability vector,
# true label and image, in that order.
Y_pred_classes_errors, Y_pred_errors, Y_true_errors, X_val_errors = (
    values[errors] for values in (y_pred_classes, y_pred, y_test, X_test)
)

def display_errors(errors_index,img_errors,pred_errors, obs_errors):
    """Show up to 6 misclassified images with their predicted and real labels.

    Parameters
    ----------
    errors_index : sequence of int
        Positions (into the three other arguments) of the errors to show. Only the
        first 6 entries are used; if fewer are given, the unused panels are hidden.
    img_errors : array-like
        Images of the misclassified samples; each one is reshaped to 28 x 28.
    pred_errors : array-like
        Predicted labels of the misclassified samples.
    obs_errors : array-like
        True labels of the misclassified samples.

    Returns
    -------
    None
    """
    nrows = 2
    ncols = 3
    fig, ax = plt.subplots(nrows,ncols,sharex=True,sharey=True)
    panels = ax.ravel()  # row-major: left to right, top to bottom

    # zip stops at the shorter input, so fewer than 6 errors no longer raise IndexError.
    for panel, error in zip(panels, errors_index):
        panel.imshow((img_errors[error]).reshape((28,28)))
        panel.set_title("Predicted label :{}\nTrue label :{}".format(pred_errors[error],obs_errors[error]))

    # Hide the panels that received no image.
    for panel in panels[min(len(errors_index), panels.size):]:
        panel.set_visible(False)

# Probability the model gave to its (wrong) predicted class, per misclassified sample
Y_pred_errors_prob = np.max(Y_pred_errors, axis=1)

# Probability the model gave to the true class, per misclassified sample.
# Paired integer indexing reads Y_pred_errors[k, Y_true_errors[k]] directly instead of
# building an (n_errors x n_errors) matrix only to take its diagonal.
true_prob_errors = Y_pred_errors[np.arange(len(Y_true_errors)), Y_true_errors]

# Difference between the probability of the predicted label and the true label
delta_pred_true_errors = Y_pred_errors_prob - true_prob_errors

# Error positions sorted by that difference, smallest first
sorted_dela_errors = np.argsort(delta_pred_true_errors)

# Top 6 errors: display_errors shows 6 images, so take exactly the last 6 entries
# (a longer slice would display lower-ranked errors instead of the worst ones).
most_important_errors = sorted_dela_errors[-6:]

# Show the top 6 errors
display_errors(most_important_errors, X_val_errors, Y_pred_classes_errors, Y_true_errors)

# Process the prediction data set by reshaping & normalizing the data

Reshape each image into 3 dimensions (height = 28px, width = 28px, channels = 1)


In [None]:
# Same preparation as for the training images: reshape every row to (28, 28, 1) and
# apply x / 255 * 0.99 + 0.01. Note that this rebinds `test` from a DataFrame to an
# ndarray, so the cell can only be run once per kernel session.
test = test.values.reshape(-1, 28, 28, 1).astype("float") / 255.0 * 0.99 + 0.01
test.shape

Now, apply the model to predict the test dataset and check the result

In [None]:
# Final fit on almost all labelled data: only 0.1% is held out for monitoring.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.001, random_state=42)
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=6,
    batch_size=128,
    verbose=0,
)

# Class probabilities for every test image, then the most probable digit per image.
predictions = model.predict(test)
results = np.argmax(predictions, axis=-1)

# Spot-check one arbitrary test image: print the predicted digit and show the image
# so the two can be compared by eye.
sample_idx = 22250
print("Prediction result for a score {}".format(results[sample_idx]))
plt.imshow(test[sample_idx])

# Now, submit the prediction result

In [None]:
# Build the submission table: 1-based image ids next to the predicted digits.
# The id range is derived from the number of predictions instead of a hard-coded
# 28001, so it always matches `results`.
result = pd.DataFrame({
    'ImageId': range(1, len(results) + 1),
    'Label': results,
})
result.to_csv("output.csv", index = False)