# QMNIST: Using a Simple CNN to reach the Top 1% 

The goal of this notebook is to classify handwritten digits with the best possible accuracy. The input is a (28,28) grayscale "image". This notebook presents multiple techniques to achieve 99.9% accuracy:

- CNN
- Denser Dataset (we use MNIST images and the extended QMNIST Dataset)
- Data Augmentation

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings

import os
# Global notebook settings: silence warnings and use a large default figure.
warnings.filterwarnings("ignore")
plt.rcParams['figure.figsize'] = [20, 20]

# Print the full path of every file found under /kaggle/input so the dataset
# locations used further down can be checked at a glance.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

def plot_images(images, labels, shape=(3,3)):
    """Show images on a grid of subplots, each titled with its label.

    Args:
        images: Sized, indexable collection of images accepted by
            ``Axes.imshow``.
        labels: Sized, indexable collection of titles aligned with ``images``.
        shape: ``(rows, columns)`` of the subplot grid.

    The grid is filled row by row. When there are fewer images (or labels)
    than grid cells, the remaining cells are blanked out instead of raising
    ``IndexError``.
    """
    # squeeze=False always returns a 2-D array of axes, so one-row,
    # one-column and 1x1 grids are iterated the same way as larger grids.
    _, grid = plt.subplots(shape[0], shape[1], squeeze=False)
    available = min(len(images), len(labels))
    for i, ax in enumerate(grid.flat):
        if i < available:
            ax.imshow(images[i])
            ax.set_title(labels[i])
        else:
            # Nothing left to draw: hide the empty frame.
            ax.axis("off")

## Read the data

In [None]:
def unpickle(file):
    """Load and return the object stored in a pickle file.

    Args:
        file: Path of the pickle file to read.

    Returns:
        The object reconstructed by ``pickle.load``. The file is read with
        ``encoding='bytes'``.
    """
    import pickle

    # NOTE: unpickling can execute arbitrary code. Only call this on files
    # that come from a trusted source.
    with open(file, 'rb') as fo:
        return pickle.load(fo, encoding='bytes')

We will use the QMNIST extended data to boost the performance to the max, see https://www.kaggle.com/fedesoriano/qmnist-the-extended-mnist-dataset-120k-images

In [None]:
# Read the QMNIST data; the loaded object is indexed by 'data' and 'labels'.
qmnist = unpickle("/kaggle/input/qmnist-the-extended-mnist-dataset-120k-images/MNIST-120k")

# Load the competition test data
test = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")

# Scale the pixel values by 1/255 (assumes 8-bit grey levels) and reshape to
# (samples, 28, 28, 1): one channel per image.
X_qmnist = np.array(qmnist['data'], dtype="float32") / 255
X_qmnist = X_qmnist.reshape(-1, 28, 28, 1)

# Convert labels to one-hot vectors. num_classes is fixed to 10 so the width
# of the encoding never depends on which digits happen to be present.
y_qmnist = tf.keras.utils.to_categorical(qmnist['labels'], num_classes=10)

# Same preprocessing for the test images
X_test = np.array(test, dtype="float32") / 255
X_test = X_test.reshape(-1, 28, 28, 1)

# Preview the first nine samples, titled with the digit rather than with the
# raw one-hot vector.
plot_images(X_qmnist[:9], np.argmax(y_qmnist[:9], axis=1), shape=(3,3))

In order to get the biggest amount of data possible we will train our model with both the MNIST and the QMNIST data.

In [None]:
from tensorflow.keras.datasets import mnist

# Load MNIST data
(X_train_mnist, y_train_mnist), (X_test_mnist, y_test_mnist) = mnist.load_data()

# Merge MNIST's own train and test splits into one pool.
# NOTE(review): MNIST's test split becomes training data here, so it cannot
# serve as a held-out set afterwards.
X_mnist = np.concatenate((X_train_mnist, X_test_mnist))
y_mnist = np.concatenate((y_train_mnist, y_test_mnist))

# Preprocess MNIST to match our preprocessing
X_mnist = X_mnist.reshape(-1,28,28,1)
X_mnist = X_mnist.astype(np.float32) / 255
y_mnist = tf.keras.utils.to_categorical(y_mnist,num_classes=10)

# Combine MNIST and QMNIST (QMNIST samples first)
X_train = np.concatenate((X_qmnist, X_mnist))
y_train = np.concatenate((y_qmnist, y_mnist))

# Final dataset shapes. Each caption now prints the array it names; the two
# were swapped before.
print("MNIST image dataset shape:", X_mnist.shape)
print("QMNIST image dataset shape:", X_qmnist.shape)
print("Final image dataset shape:", X_train.shape)

# Preview the first nine samples, titled with the digit rather than with the
# raw one-hot vector.
plot_images(X_train[:9], np.argmax(y_train[:9], axis=1), shape=(3,3))

### Data Augmentation

To provide more data during the training process, we are also going to use Data Augmentation.

In [None]:
# Generator used for training: applies random rotations, horizontal shifts,
# shears and zooms, and reserves 25% of the samples for validation.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.20,
    shear_range=15,
    zoom_range=0.10,
    validation_split=0.25,
    horizontal_flip=False
)

# Generator used for validation: same split fraction but NO augmentation, so
# that val_loss / val_accuracy are measured on undistorted images. Both
# callbacks monitor val_loss, so this is the metric that picks the saved model.
# NOTE(review): this relies on both generators cutting X_train at the same
# index for the same validation_split — confirm for the installed Keras version.
validation_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    validation_split=0.25,
)

train_generator = datagen.flow(
    X_train,
    y_train,
    batch_size=256,
    subset='training',
)

validation_generator = validation_datagen.flow(
    X_train,
    y_train,
    batch_size=64,
    subset='validation',
)

# CNN

In [None]:
def create_model():
    """Build and compile the digit-classification CNN.

    The network is three convolutional stages (32, 64+64 and 128+128 filters,
    each followed by 2x2 max pooling), then three dense layers with dropout
    and a 10-way softmax output.

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    model = tf.keras.Sequential([

        # Bring every input to (28, 28, 1) before the convolutions.
        tf.keras.layers.Reshape((28, 28, 1)),
        # NOTE(review): input_shape is passed to the second layer rather than
        # the first — presumably it has no effect here; confirm before
        # relying on it.
        tf.keras.layers.Conv2D(filters=32, kernel_size=(5,5), activation="relu", padding="same", input_shape=(28,28,1)),
        tf.keras.layers.MaxPool2D((2,2)),

        tf.keras.layers.Conv2D(filters=64, kernel_size=(3,3), activation="relu", padding="same"),
        tf.keras.layers.Conv2D(filters=64, kernel_size=(3,3), activation="relu", padding="same"),
        tf.keras.layers.MaxPool2D((2,2)),

        tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), activation="relu", padding="same"),
        tf.keras.layers.Conv2D(filters=128, kernel_size=(3,3), activation="relu", padding="same"),
        tf.keras.layers.MaxPool2D((2,2)),

        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(512, activation="sigmoid"),
        tf.keras.layers.Dropout(0.25),

        tf.keras.layers.Dense(512, activation="sigmoid"),
        tf.keras.layers.Dropout(0.25),

        tf.keras.layers.Dense(256, activation="sigmoid"),
        tf.keras.layers.Dropout(0.1),

        # The ten digit classes are mutually exclusive, so the output must be
        # a probability distribution: softmax, not independent sigmoids, is
        # what categorical cross-entropy expects.
        tf.keras.layers.Dense(10, activation="softmax")
    ])

    model.compile(
        optimizer="adam",
        loss = 'categorical_crossentropy',
        metrics = ['accuracy']
    )

    return model

model = create_model()

In [None]:
# Multiply the learning rate by 0.1 once the validation loss has stopped
# improving for 5 epochs, never going below 1e-6.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=1e-6,
    verbose=1,
)

# Write the weights (not the full model) to 'model.hdf5', keeping only the
# best version according to the validation loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='model.hdf5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Train model

In [None]:
# Train for up to 100 epochs on the training generator and validate on the
# validation generator after each epoch. Model.fit accepts generators
# directly; Model.fit_generator is deprecated.
history = model.fit(
    train_generator,
    epochs=100,
    validation_data=validation_generator,
    callbacks=[reduce_lr, checkpoint],
    verbose=1,
)

In [None]:
# Restore the weights written by the checkpoint callback
model.load_weights('model.hdf5')

# Left panel: loss curves, right panel: accuracy curves
_, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

loss_ax.plot(history.history['loss'], color='b', label="Training loss")
loss_ax.plot(history.history['val_loss'], color='r', label="validation loss")
loss_ax.legend(loc='best', shadow=True)

acc_ax.plot(history.history['accuracy'], color='b', label="Training accuracy")
acc_ax.plot(history.history['val_accuracy'], color='r', label="Validation accuracy")
acc_ax.legend(loc='best', shadow=True)

# NOTE: this score is computed on the whole of X_train, i.e. on data the model
# was trained and validated on — it is not a held-out estimate.
final_loss, final_acc = model.evaluate(X_train,  y_train, verbose=2)
print("Model accuracy: ", final_acc, ", model loss: ", final_loss)

# Submit predictions

In [None]:
# Predict on X_test, the test images that were already normalised and reshaped
# to (samples, 28, 28, 1) when the data was read. This avoids parsing test.csv
# a second time and feeds the model the same input shape it was trained on.
probabilities = model.predict(X_test)

# Most likely digit per image, as a plain NumPy integer array
predictions = np.argmax(probabilities, axis=1)

# ImageId is 1-based and follows the row order of the test file
submission = pd.DataFrame({'ImageId': range(1, len(predictions) + 1), "Label": predictions})
submission.to_csv('submission.csv', index=False)