# Digit Recognizer (Kaggle), 0.991 Accuracy with Keras
<hr>
In this tutorial we are going to use __*Convolutional Neural Networks*__ to classify images from the __*MNIST*__ dataset.

- You can find the competition [here](https://www.kaggle.com/c/digit-recognizer/data)
- Blog post [here](https://thelastdev.com/2018/07/09/digit-recognizer-kaggle-0-991-accuracy-with-keras/)

In [None]:
# Load libraries
%pylab inline

import keras
from keras.models import Sequential
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D, MaxPool2D
from keras.datasets import cifar10
from keras import regularizers
from keras.callbacks import LearningRateScheduler, ModelCheckpoint, ReduceLROnPlateau
import numpy as np
from sklearn.model_selection import train_test_split

import csv
from tqdm import tqdm
import numpy.random

## Open the dataset
After downloading the dataset, we are going to do the following:

1. Open the file and load the data
2. Format the data and get the labels
3. Check for NaN values
4. Split the dataset into training and validation sets
5. Normalize the data

In [None]:
def open_train_data(path):
    """Load the training CSV into an array of (image, label) pairs.

    The first CSV row is treated as a header and skipped. In every other
    row the first column is the digit label and the remaining columns are
    the pixel values of one image.

    Args:
        path: Path to the training CSV file.

    Returns:
        An object array of shape (n_samples, 2). Column 0 holds each image
        as a float32 array of shape (28, 28, 1); column 1 holds the label
        as an int.

    Raises:
        ValueError: If a row does not contain exactly 28 * 28 numeric pixel
            values or its label is not an integer.
    """
    # newline='' is what the csv module expects for files it reads.
    with open(path, 'r', newline='') as f:
        # Ignore blank lines, then drop the header row.
        rows = [row for row in csv.reader(f) if row][1:]

    # Fill a pre-allocated object array instead of calling np.array() on a
    # list of [image, label] pairs: recent NumPy versions refuse to build
    # an array from such ragged nested sequences.
    train = np.empty((len(rows), 2), dtype=object)
    for idx, row in enumerate(tqdm(rows)):
        pixels = np.array(row[1:], dtype='float32')

        # Format the data to 28x28x1 (in grey scale)
        train[idx, 0] = pixels.reshape(28, 28, 1)
        train[idx, 1] = int(row[0])

    return train

In [None]:
def split_train_test(train):
    """Split (image, label) pairs into training and validation sets.

    Args:
        train: Sequence of pairs where item 0 is the image and item 1 is
            its label (a digit class, given as an int or a numeric string).
            The input is not modified.

    Returns:
        A tuple ``((x_train, y_train), (x_test, y_test))``. The x arrays
        stack the images; the y arrays hold the labels one-hot encoded
        over 10 classes. The "test" part is the validation set and
        receives 2.5% of the samples.
    """
    features = [sample[0] for sample in train]
    # Labels may arrive as strings straight from the CSV; make them ints.
    labels = [int(sample[1]) for sample in train]

    # Split the dataset to train and validation.
    # train_test_split shuffles on its own, so the input is no longer
    # shuffled in place first: that mutated the caller's array and made the
    # split differ from run to run despite the fixed random_state.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.025, random_state=42)

    # One-hot Encoding
    y_train = np_utils.to_categorical(y_train, 10)
    y_test = np_utils.to_categorical(y_test, 10)

    return (np.array(x_train), y_train), (np.array(x_test), y_test)
    

In [None]:
# Load the data, run only once
# train = open_train_data('dataset/train.csv')
# np.save('train.npy', train)

In [None]:
# If you have already run open_train_data and saved its result, load it here.
# The saved array has dtype=object (image/label pairs), so NumPy needs
# allow_pickle=True to read it back. Only load .npy files you created
# yourself: unpickling a file from an untrusted source can execute code.
train = np.load('train.npy', allow_pickle=True)

In [None]:
# Check for missing values
import pandas as pd

for idx, feature in enumerate(train):
    image, label = feature[0], feature[1]
    # Test the pixel array itself. Calling pd.isnull() on the whole
    # (image, label) pair treats the image as a single non-null object,
    # so NaN pixels inside it would never be detected.
    if pd.isnull(image).any() or pd.isnull(label):
        print('Found NaN value in feature %d' % idx)
        break

In [None]:
# Split into training and validation sets, then apply the normalization
# step by dividing both image arrays by 255.
(x_train, y_train), (x_test, y_test) = split_train_test(train)
x_train, x_test = x_train / 255.0, x_test / 255.0

In [None]:
# Display the shapes of the four arrays as a sanity check.
tuple(arr.shape for arr in (x_train, y_train, x_test, y_test))

# Create the model

In [None]:
# Create the model: a stack of 2x2 convolutions followed by a dense
# classifier that ends in a 10-way softmax.

# First stage: two 32-filter convolutions. Only the first one pads its
# input and declares the input shape (taken from the training images).
# NOTE(review): the (1, 1) pooling below does not downsample -- confirm
# that this is intended.
model_layers = [
    Conv2D(32, (2, 2), padding='same', input_shape=x_train.shape[1:]),
    Activation('relu'),
    Conv2D(32, (2, 2)),
    Activation('relu'),
    MaxPooling2D(pool_size=(1, 1)),
    Dropout(0.25),
]

# Three identical stages that differ only in the number of filters.
for n_filters in (64, 128, 256):
    model_layers += [
        Conv2D(n_filters, (2, 2), padding='same'),
        Activation('relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),
    ]

# Classifier head.
model_layers += [
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),
    Dense(64),
    Activation('relu'),
    Dropout(0.5),
    Dense(10),
    Activation('softmax'),
]

model = Sequential(model_layers)

# Compile the model

In [None]:
# Compile the model
batch_size = 64

# RMSprop with a fixed learning rate and no decay.
# Alternative optimizer kept for reference:
#   keras.optimizers.Adam(lr=0.001, decay=1e-6)
opt_rms = keras.optimizers.RMSprop(
    lr=1e-4,
    rho=0.9,
    epsilon=1e-8,
    decay=0.0,
)

# The labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer=opt_rms,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model

In [None]:
from time import time
epochs = 100

# NOTE(review): `datagen` was used below without being defined in any of
# the visible notebook cells, so training stopped with a NameError. The
# augmentation settings here are mild, reviewer-chosen values, not the
# author's original ones -- tune them as needed. Flips are left disabled
# because a mirrored digit is generally not a valid digit.
datagen = ImageDataGenerator(rotation_range=10,
                             zoom_range=0.1,
                             width_shift_range=0.1,
                             height_shift_range=0.1)

# Log each run to its own TensorBoard directory (named by start time).
tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph/{}'.format(time()),
                                         histogram_freq=0,
                                         write_graph=True,
                                         write_images=True)

# Save a checkpoint whenever the validation accuracy improves.
checkpoint = ModelCheckpoint('model-{epoch:03d}.h5',
                             verbose=1,
                             monitor='val_acc',
                             save_best_only=True,
                             mode='auto')

# Halve the learning rate after 3 epochs without validation improvement.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc',
                                            patience=3,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00001)

# learning_rate_reduction is now passed to the callbacks; it used to be
# created and then never used.
model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size),
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test),
                    callbacks=[tbCallBack, checkpoint, learning_rate_reduction])

# Make predictions

In [None]:
# Replace the placeholder with the path of the checkpoint (.h5) to restore.
weights_path = '<your model .h5 here>'
model.load_weights(weights_path)

In [None]:
# Load the test data
def open_test_data(path):
    """Load the test CSV into an array of (image, image_number) pairs.

    The first CSV row is treated as a header and skipped. Every other row
    holds the pixel values of one image. Images are numbered from 1 in
    file order.

    Args:
        path: Path to the test CSV file.

    Returns:
        An object array of shape (n_images, 2). Column 0 holds each image
        as a float32 array of shape (28, 28, 1); column 1 holds its
        1-based image number as an int.

    Raises:
        ValueError: If a row does not contain exactly 28 * 28 numeric pixel
            values.
    """
    # newline='' is what the csv module expects for files it reads.
    with open(path, 'r', newline='') as f:
        # Ignore blank lines, then drop the header row.
        rows = [row for row in csv.reader(f) if row][1:]

    # Fill a pre-allocated object array instead of calling np.array() on a
    # list of [image, number] pairs: recent NumPy versions refuse to build
    # an array from such ragged nested sequences.
    test = np.empty((len(rows), 2), dtype=object)
    for idx, row in enumerate(tqdm(rows)):
        pixels = np.array(row, dtype='float32')
        test[idx, 0] = pixels.reshape(28, 28, 1)
        test[idx, 1] = idx + 1  # image numbers start at 1

    return test

In [None]:
# test_data = open_test_data('dataset/test.csv')

In [None]:
# np.save('test.npy', test_data)

In [None]:
# test.npy holds an object array of (image, image_number) pairs, so NumPy
# needs allow_pickle=True to read it back. Only load .npy files you created
# yourself: unpickling a file from an untrusted source can execute code.
test_data = np.load('test.npy', allow_pickle=True)

In [None]:
import matplotlib.pyplot as plt

# Stack all test images into a single batch and scale them by 1/255, the
# same normalization that was applied to the training data. Without it the
# model is fed inputs on a different scale than the one it was trained on.
test_images = np.stack([entry[0] for entry in test_data]).astype('float32') / 255.0
image_ids = [entry[1] for entry in test_data]

# Predict the whole set in batches rather than one image per call, and use
# np.argmax explicitly instead of the bare name injected by %pylab.
probabilities = model.predict(test_images, verbose=1)
predicted_labels = np.argmax(probabilities, axis=1)

# Write one "ImageId,Label" row per test image.
with open('submission.csv', 'w') as f:
    f.write('ImageId,Label\n')
    f.writelines('%s,%s\n' % (image_id, label)
                 for image_id, label in zip(image_ids, predicted_labels))