In [None]:
%matplotlib inline

import numpy as np 
import pandas as pd 
import cv2
import ast
from matplotlib import pyplot as plt
import keras


import keras
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.optimizers import Adam
from keras.applications import MobileNet
from keras.applications.mobilenet import preprocess_input

import os
# Show the contents of the competition input directory.
print(os.listdir("../input/quickdraw-doodle-recognition"))

In [None]:
# Directory holding the per-class training CSV files, and the single test CSV.
train_simplified_path = '../input/quickdraw-doodle-recognition/train_simplified/'
test_simplified_path = '../input/quickdraw-doodle-recognition/test_simplified.csv'

# Default number of samples per batch produced by train_generator.
BATCHSIZE = 128
# Side length, in pixels, of the square single-channel images fed to the model.
SIZE = 96
# Number of classes; also the width of the one-hot label vectors.
NCLASSES = 340
# Number of leading rows of every class file that are used as validation data.
VALIDSAMPLES = 100

The `apk` and `mapk` metrics are sourced from https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py. They operate on lists of items (here, lists of label strings), so they can't be used during the training phase without modification. Keras has a top-k categorical accuracy metric that will be used during training, and the metrics below will be used afterwards to confirm the model's performance.

In [None]:
def apk(actual, predicted, k=10):
    """
    Average precision at k for a single query.

    Only the first ``k`` entries of ``predicted`` are scored. A prediction
    counts as a hit when it appears in ``actual`` and has not already been
    seen earlier in the ranking; each hit contributes the precision at its
    rank.

    Parameters
    ----------
    actual : list
             The relevant elements (order doesn't matter)
    predicted : list
                The ranked predictions (order does matter)
    k : int, optional
        The maximum number of predicted elements to consider

    Returns
    -------
    score : double
            Sum of the precisions at each hit divided by
            ``min(len(actual), k)``; 0.0 when ``actual`` is empty
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # A repeated prediction must not be rewarded twice.
        if item in actual and item not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / rank

    if not actual:
        return 0.0
    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over a collection of queries.

    ``actual`` and ``predicted`` are walked in parallel; ``apk`` is evaluated
    for every pair and the results are averaged with ``np.mean``.

    Parameters
    ----------
    actual : list
             A list of lists of relevant elements
             (order doesn't matter in the inner lists)
    predicted : list
                A list of lists of ranked predictions
                (order matters in the inner lists)
    k : int, optional
        The maximum number of predicted elements to consider per query

    Returns
    -------
    score : double
            The mean of the per-query average precision at k
    """
    per_query_scores = []
    for relevant, ranking in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranking, k))
    return np.mean(per_query_scores)

### Load and display some of the data
As specified in the description of the challenge, labels with multiple words need to be transformed into a single string by replacing spaces with underscores. Also, the drawing column is stored as a string and needs to be parsed into a list.

In [None]:
# Read a handful of rows from one class file, parse the stroke strings into
# Python lists and normalise the label to its underscore form.
data = pd.read_csv(
    train_simplified_path + 'roller coaster.csv', index_col='key_id', nrows=10
).assign(
    drawing=lambda frame: frame['drawing'].map(ast.literal_eval),
    word=lambda frame: frame['word'].map(lambda label: label.replace(' ', '_')),
)
data.head()

To display the images, we use the function proposed by this [kernel](https://www.kaggle.com/gaborfodor/greyscale-mobilenet-lb-0-892).
This method takes into account the temporal information, which is encoded in the color and controlled by the parameter ```time_color```, and also resizes the image.

In [None]:
def draw_cv2(raw_strokes, size=256, lw=6, time_color=True, base_size=256):
    """Rasterise a list of strokes into a single-channel uint8 image.

    Parameters
    ----------
    raw_strokes : sequence
        One entry per stroke; ``stroke[0]`` and ``stroke[1]`` are indexed in
        parallel and passed to ``cv2.line`` as the two point coordinates.
    size : int
        Side length of the returned image.
    lw : int
        Line thickness handed to ``cv2.line``.
    time_color : bool
        When true, stroke number ``t`` is drawn with intensity
        ``255 - min(t, 10) * 13``; otherwise every stroke is drawn at 255.
    base_size : int
        Side length of the canvas the strokes are drawn on before resizing.

    Returns
    -------
    numpy.ndarray
        The ``base_size`` canvas, resized to ``(size, size)`` with
        ``cv2.resize`` when the two sizes differ.
    """
    canvas = np.zeros((base_size, base_size), np.uint8)
    for stroke_no, stroke in enumerate(raw_strokes):
        n_segments = len(stroke[0]) - 1
        # The intensity depends only on the stroke number, not on the segment.
        shade = 255 - min(stroke_no, 10) * 13 if time_color else 255
        for k in range(n_segments):
            start = (stroke[0][k], stroke[1][k])
            end = (stroke[0][k + 1], stroke[1][k + 1])
            cv2.line(canvas, start, end, shade, lw)
    if size == base_size:
        return canvas
    return cv2.resize(canvas, (size, size))

In [None]:
# Render the tenth loaded drawing and invert it so strokes appear dark on a
# light background.
sample_image = draw_cv2(data.iloc[9]['drawing'])
plt.imshow(255 - sample_image, cmap=plt.cm.gray)
plt.title('Sample image of a roller coaster in grayscale')
plt.show()

#### Labels balance
Here we display the count of each of the labels. They are sorted in order to be visualized in a single figure and still be able to provide a good representation of their count.  
We see that around 240 labels have between 113614 and 150000 samples, while around 15 labels have between 250000 and 340030 samples. These differences will affect the trained models and make them lean toward the over-represented labels to some extent. Due to the large number of samples this effect might be limited, and differences in performance will depend on the balance of the test dataset.

In [None]:
# Count the lines of every class file without loading it into memory.
# Every line is counted, so each total includes the CSV header row.
train_files = os.listdir(train_simplified_path)
train_count = []
for file in train_files:
    with open(train_simplified_path + file) as f:
        # sum() gives 0 for an empty file instead of reusing the previous
        # file's count (or raising NameError on the first file).
        train_count.append(sum(1 for _ in f))
train_count = np.sort(train_count)

In [None]:
# Summarise the per-class line counts, then plot them in ascending order.
n_labels = len(train_count)
print('Number of classes: ', n_labels)
print('Minimum count of a label: ', np.min(train_count))
print('Maximum count of a label: ', np.max(train_count))

plt.scatter(np.arange(n_labels), train_count)
plt.title('The count of each of the labels sorted')
plt.show()

## Data generators
Since the dataset is large, we can't load it all into memory and preprocess it into a list of images. So we will use a custom data generator that reads each csv and creates the images from the strokes while learning.  
In addition to the huge number of samples, each class is provided in a separate csv, which adds a computational cost when creating batches with a uniform distribution of the classes. A possible way to solve this is to load all the data, shuffle it and save it again to sample from it later. However, the method used here is to randomly pick ```BATCHSIZE``` files and add a single sample from each one of them to the current batch.

In [None]:
# class index map
# os.listdir returns entries in arbitrary order; sorting keeps the
# label -> index assignment (and therefore the meaning of the network outputs
# and of any saved weights) identical from one run to the next.
train_files = sorted(os.listdir(train_simplified_path))
# Key: file name without its extension, spaces replaced by underscores.
class_index = {
    os.path.splitext(file)[0].replace(' ', '_'): i
    for i, file in enumerate(train_files)
}

In [None]:
def train_generator(size=SIZE, files_list=train_files, class_index=class_index, batch_size=BATCHSIZE, n_classes=NCLASSES, lw=6, time_color=True, max_row=113610):
    """Endlessly yield ``(x, y)`` training batches drawn from the class files.

    For every batch, ``batch_size`` files are picked at random (with
    replacement) from the first ``n_classes`` entries of ``files_list`` and
    the same row number ``ind`` is read from each of them. ``ind`` advances by
    one per batch and restarts once it reaches ``max_row``.

    Parameters
    ----------
    size : int
        Side length of the generated images.
    files_list : list of str
        Class file names, relative to ``train_simplified_path``.
    class_index : dict
        Maps a label (spaces replaced by underscores) to its class index.
    batch_size : int
        Number of samples per yielded batch.
    n_classes : int
        Width of the one-hot label vectors.
    lw, time_color
        Forwarded to ``draw_cv2``.
    max_row : int
        Exclusive upper bound for the row number read from each file.
        NOTE(review): presumably chosen to stay below the row count of the
        smallest class file - confirm against the dataset.

    Yields
    ------
    x : numpy.ndarray, float32, shape (batch_size, size, size, 1)
        Images after ``preprocess_input``.
    y : numpy.ndarray, shape (batch_size, n_classes)
        One-hot labels.
    """
    while True:
        # Line 0 of each file is the header and validation_data consumes data
        # rows 1..VALIDSAMPLES, so training must start on the row after them;
        # starting at VALIDSAMPLES would reuse the last validation row.
        ind = VALIDSAMPLES + 1
        while ind < max_row:
            files = [files_list[i] for i in np.random.randint(n_classes, size=batch_size)]
            x = np.zeros((batch_size, size, size, 1))
            y = np.zeros((batch_size, n_classes))
            for i, file in enumerate(files):
                # Skipping lines 1..ind-1 makes line `ind` the single row read.
                row = pd.read_csv(train_simplified_path + file, skiprows=range(1, ind), nrows=1).iloc[0]
                y[i, class_index[row['word'].replace(' ', '_')]] = 1.0
                x[i, :, :, 0] = draw_cv2(ast.literal_eval(row['drawing']), size=size, lw=lw, time_color=time_color)
            x = preprocess_input(x).astype(np.float32)
            ind += 1
            yield x, y

For the validation, the first ``VALIDSAMPLES`` samples from each class are loaded and preprocessed.

In [None]:
def validation_data(file_list=train_files, size=SIZE, class_index=class_index, n_samples_per_class=VALIDSAMPLES, n_classes=NCLASSES, lw=6, time_color=True):
    """Build the validation arrays from the leading rows of every class file.

    The first ``n_samples_per_class`` rows of each file in ``file_list`` are
    read, rendered with ``draw_cv2`` and stored contiguously, one block of
    ``n_samples_per_class`` rows per file.

    Returns
    -------
    x : numpy.ndarray, float32,
        shape (len(file_list) * n_samples_per_class, size, size, 1)
        Images after ``preprocess_input``.
    y : numpy.ndarray, shape (len(file_list) * n_samples_per_class, n_classes)
        One-hot labels.
    """
    total = len(file_list) * n_samples_per_class
    images = np.zeros((total, size, size, 1))
    labels = np.zeros((total, n_classes))
    for file_no, file in enumerate(file_list):
        first = file_no * n_samples_per_class
        last = first + n_samples_per_class
        frame = pd.read_csv(train_simplified_path + file, nrows=n_samples_per_class)
        class_ids = frame['word'].apply(lambda label: class_index[label.replace(' ', '_')])
        labels[first:last, :] = keras.utils.to_categorical(class_ids, num_classes=n_classes)
        for offset, encoded_strokes in enumerate(frame['drawing'].values):
            strokes = ast.literal_eval(encoded_strokes)
            images[first + offset, :, :, 0] = draw_cv2(strokes, size=size, lw=lw, time_color=time_color)
    return preprocess_input(images).astype(np.float32), labels

In [None]:
# Materialise the validation images and one-hot labels once, using the
# defaults of validation_data, and report their shapes as a sanity check.
x_validation, y_validation = validation_data()
print(x_validation.shape, y_validation.shape)

In [None]:
# Create the endless batch generator with its default settings; it is
# consumed by model.fit_generator in the training cell.
train_data_generator = train_generator()

## Model training
We have at our disposal a large number of samples that should be enough to reach high performance using the right model. So for the architecture choice, a predefined model is used and trained from scratch. As natural images and our sketches differ widely, the weights are initialized randomly (note that even if we used a pretrained model, after a certain number of epochs the pretrained and randomly initialized models should converge).  
Different popular architectures exist that make use of huge volumes of data, such as ResNet, InceptionV3 and MobileNet. The use of these models allows for deep architectures that do not suffer from vanishing gradients and optimization problems, in addition to a lower computational cost. In particular, MobileNet is fast and quite efficient, as it is designed to be applied in computer vision tasks that require quick responses, such as autonomous vehicles.
Our images contain far fewer features to uncover than natural images, so MobileNet is expected to perform well on our dataset.

In [None]:
# metric
def top_3_categorical_accuracy(y_true, y_pred):
    """Top-k categorical accuracy with k fixed to 3.

    Defined as a named function so that it can be passed in the ``metrics``
    list of ``model.compile``.
    """
    top_k = 3
    return top_k_categorical_accuracy(y_true, y_pred, k=top_k)

In [None]:
# MobileNet trained from scratch (weights=None) on single-channel SIZE x SIZE
# images, with one output per class.
model = MobileNet(input_shape=(SIZE, SIZE, 1), weights=None, classes=NCLASSES)
model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=[top_3_categorical_accuracy])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

In [None]:
# Both callbacks track the validation top-3 accuracy (higher is better).
monitored_metric = 'val_top_3_categorical_accuracy'

# Halve the learning rate after 2 epochs without improvement, down to 1e-5.
lr_schedule = ReduceLROnPlateau(monitor=monitored_metric, factor=0.5, patience=2,
                                mode='max', min_lr=1e-5, verbose=1)
# Keep only the weights of the best epoch on disk.
# NOTE(review): nothing visible in this notebook loads 'model.h5' again, so
# later predictions use the last-epoch weights - confirm this is intended.
checkpoint = ModelCheckpoint('model.h5', monitor=monitored_metric, mode='max',
                             save_best_only=True, save_weights_only=True)
callbacks = [lr_schedule, checkpoint]

hist = model.fit_generator(
    train_data_generator,
    steps_per_epoch=500,
    epochs=15,
    verbose=1,
    validation_data=(x_validation, y_validation),
    callbacks=callbacks,
)

In [None]:
print(hist.history.keys())
# One figure per tracked quantity: "top 3 Accuracy" first, then "Loss".
# Each entry is (history key, figure title, y-axis label).
history_plots = (
    ('top_3_categorical_accuracy', 'model top 3 accuracy', 'top 3 accuracy'),
    ('loss', 'model loss', 'loss'),
)
for history_key, figure_title, y_label in history_plots:
    plt.plot(hist.history[history_key])
    plt.plot(hist.history['val_' + history_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# index to class mapping (inverse of class_index)
index_class = dict((idx, label) for label, idx in class_index.items())

In [None]:
prediction = model.predict(x_validation)

# mapk expects, for every sample, a list of relevant labels and a ranked list
# of guesses. Passing plain strings (and in swapped order) made apk compare
# characters and substrings, so the reported score was meaningless.
y_true = [[index_class[i]] for i in np.argmax(y_validation, axis=1)]
# argsort is ascending: take the last three indices and reverse them to rank
# the three most probable classes from best to worst.
y_pred = [
    [index_class[j] for j in prediction[i].argsort()[-3:][::-1]]
    for i in range(prediction.shape[0])
]

# The validation arrays are no longer needed; release them before the test
# set is loaded.
del x_validation
del y_validation
print('Mean average precision (k=3) on the validation set: ', mapk(y_true, y_pred, k=3))

## Submission

In [None]:
test = pd.read_csv(test_simplified_path, index_col='key_id')
# Allocate float32 directly: with the default float64 buffer the per-row
# float32 cast was discarded on assignment and the array took twice the
# memory it needed.
x_test = np.zeros((len(test), SIZE, SIZE, 1), dtype=np.float32)
test['drawing'] = test['drawing'].apply(ast.literal_eval)
for i, raw_strokes in enumerate(test.drawing.values):
    x_test[i,:,:,0] = draw_cv2(raw_strokes, size=SIZE, lw=6, time_color=True)
# Scale the whole array in one call, as is done for the training and
# validation images.
x_test = preprocess_input(x_test)

In [None]:
prediction = model.predict(x_test)
# For every test image, join the names of the three most probable classes
# (best first) with single spaces.
y_pred = [
    ' '.join(index_class[j] for j in prediction[i].argsort()[-3:][::-1])
    for i in range(prediction.shape[0])
]
test['word'] = y_pred
# Drop the two remaining input columns before writing the frame to disk.
test.drop(['drawing', 'countrycode'], axis=1, inplace=True)
test.to_csv('submission.csv')