# Quick, Draw! Doodle Recognition Challenge

* local PC environment (GPU 사용)
* keras framework - 직관적이고 간결하게 모듈을 사용할 수 있어서 선택했습니다.
* MobileNet model - 이미지 데이터를 다루므로 CNN 기반 모델인 ResNet, DenseNet, MobileNet 3가지를 비교했고, 그중 정확도가 가장 높게 나온 MobileNet을 최종 모델로 사용했습니다. (학습 과정은 TensorBoard로 모니터링)  
  - ResNet: layer가 깊어지면서 생기는 문제를 skip connection으로 극복합니다.
  - DenseNet: 기존 skip connection을 확장하여 여러 level의 feature를 활용합니다.
  - MobileNet: depthwise separable convolution을 사용해 parameter 수와 연산량을 줄입니다.
* Dataset - 기존에 shuffle해서 저장해놓은 csv파일을 사용합니다. (train:validation = 99:1)
* Accuracy - accuracy, top_k_categorical_accuracy; 정확도와 상위 3개 예측에 대한 정확도(top-3 accuracy)를 사용했습니다.

# Data Load

In [None]:
%matplotlib inline
import os
import cv2
import json
import pandas as pd
import numpy as np
from glob import glob
from keras.utils.np_utils import to_categorical

In [None]:
# Directory holding the pre-shuffled training shards (read as train_k<N>.csv.gz).
SHUFFLE_DIR = 'E:\\project\\shuffle_data'
# Competition data directory; not referenced by name in the visible cells
# (the later reads spell the full path out again).
TEST_DIR = 'E:\\project\\data\\'

**parameter**

In [None]:
# Not referenced in the visible cells; drawing() hard-codes its own 256x256 canvas.
base_size = 256
# Presumably the number of shuffled shards; shards 0..shuffle_data_num-2 feed training.
shuffle_data_num = 100
# Width of the one-hot label vectors.
num_classes = 340
# Side length (pixels) of the square images fed to the model.
img_size = 64
# Passed to fit_generator as `epochs`.
epochs = 20
# Passed to fit_generator as `steps_per_epoch`.
steps = 600
# Rows per training chunk read from a shard; also the batch size used for predict().
batch_size = 800

**train dataset class name**

In [None]:
# One CSV per class; the position of a file in this list becomes the class index
# used to turn predictions back into names.
# NOTE(review): glob() does not guarantee any ordering -- confirm this order matches
# the one used when the `y` labels in the shuffled shards were assigned.
TRAIN_LIST = glob('E:\\project\\data\\train_simplified\\*.csv')
class_list = []

In [None]:
# Class name = CSV file name up to its first '.', with spaces turned into underscores.
class_list.extend(
    os.path.basename(csv_path).split('.')[0].replace(' ', '_')
    for csv_path in TRAIN_LIST
)

In [None]:
# Preview the first five class names.
class_list[:5]

**validation dataset**

In [None]:
# Validation set: the first 34,000 rows of shard train_k99, which the training
# generator below does not read (it is given shards 0..shuffle_data_num-2).
valid_df = pd.read_csv(os.path.join(SHUFFLE_DIR, 'train_k99.csv.gz'), nrows=34000)

In [None]:
# Inspect the first rows of the validation frame.
valid_df.head()

In [None]:
def drawing(raw_strokes, img_size, lw=6, time_color=True):
    """Rasterize one doodle's strokes into a grayscale image scaled to [0, 1].

    Args:
        raw_strokes: Sequence of strokes. Each stroke is indexed as
            ``stroke[0]`` (x coordinates) and ``stroke[1]`` (y coordinates);
            the coordinates are assumed to be integer pixel positions on a
            256x256 canvas.
        img_size: Side length of the returned square image. Any value other
            than 256 is produced by resizing the 256x256 canvas.
        lw: Line thickness handed to ``cv2.line``.
        time_color: If true, stroke number ``t`` is drawn with intensity
            ``255 - min(t, 10) * 13`` so earlier strokes are brighter.
            If false, every stroke is drawn at full intensity (255).

    Returns:
        Float array of shape ``(img_size, img_size)`` with values in [0, 1].
    """
    canvas = np.zeros((256, 256), np.uint8)
    for t, stroke in enumerate(raw_strokes):
        # Honor the flag: it used to be accepted but had no effect.
        color = 255 - min(t, 10) * 13 if time_color else 255
        xs, ys = stroke[0], stroke[1]
        # Connect consecutive points of the stroke with line segments.
        for i in range(len(xs) - 1):
            cv2.line(canvas, (xs[i], ys[i]), (xs[i + 1], ys[i + 1]), color, lw)
    if img_size != 256:
        canvas = cv2.resize(canvas, (img_size, img_size))
    return canvas / 255

In [None]:
def data_generator(df, img_size, lw=6, time_color=True):
    """Render every doodle in ``df`` into a single image batch.

    Args:
        df: DataFrame with a ``drawing`` column holding each doodle's strokes,
            either JSON-encoded text or an already decoded stroke list.
            The frame is not modified.
        img_size: Side length of each rendered square image.
        lw: Line thickness forwarded to ``drawing``.
        time_color: Stroke-shading flag forwarded to ``drawing``.

    Returns:
        Float array of shape ``(len(df), img_size, img_size, 1)``.
    """
    # Decode into a local list rather than overwriting df['drawing']: writing the
    # decoded lists back made a second call on the same frame fail inside json.loads.
    strokes_per_row = [
        json.loads(cell) if isinstance(cell, (str, bytes, bytearray)) else cell
        for cell in df['drawing'].values
    ]
    x = np.zeros((len(strokes_per_row), img_size, img_size, 1))
    for i, raw_strokes in enumerate(strokes_per_row):
        x[i, :, :, 0] = drawing(raw_strokes, img_size=img_size, lw=lw, time_color=time_color)
    return x

In [None]:
# Render the validation doodles and one-hot encode their integer labels (column `y`).
x_valid = data_generator(valid_df, img_size)
y_valid = to_categorical(valid_df.y, num_classes=num_classes)

In [None]:
# Sanity check: image batch shape and one-hot label shape.
print(x_valid.shape, y_valid.shape)

### train dataset

In [None]:
def train_generator(img_size, batch_size, iters, lw=6, time_color=True):
    """Endlessly yield ``(x, y)`` training batches read from the shuffled shards.

    Each pass visits the shard numbers in ``iters`` in a fresh random order and
    streams every shard file ``train_k<N>.csv.gz`` under ``SHUFFLE_DIR`` in
    chunks of ``batch_size`` rows.

    Args:
        img_size: Side length of each rendered square image.
        batch_size: Number of CSV rows read per chunk; a chunk can hold fewer
            rows (e.g. the last chunk of a file).
        iters: Shard numbers to read, in a form ``np.random.permutation`` accepts.
        lw: Line thickness forwarded to the renderer.
        time_color: Stroke-shading flag forwarded to the renderer.

    Yields:
        Tuple ``(x, y)``: ``x`` is the image batch returned by ``data_generator``
        and ``y`` the one-hot encoding of the chunk's ``y`` column with
        ``num_classes`` columns.
    """
    while True:
        # `shard` rather than `iter`, so the builtin is not shadowed.
        for shard in np.random.permutation(iters):
            filename = os.path.join(SHUFFLE_DIR, 'train_k{}.csv.gz'.format(shard))
            for df in pd.read_csv(filename, chunksize=batch_size):
                # Reuse the shared renderer instead of repeating its loop here.
                x = data_generator(df, img_size, lw=lw, time_color=time_color)
                y = to_categorical(df.y, num_classes=num_classes)
                yield x, y

In [None]:
# Train on shards 0 .. shuffle_data_num-2; with the current settings that leaves
# train_k99, the shard read for validation, out of training.
train_data = train_generator(img_size, batch_size, range(shuffle_data_num-1))

In [None]:
# Pull one batch to sanity-check the generator (this batch is consumed from the stream).
x, y = next(train_data)

In [None]:
# Shapes of one training batch: images and one-hot labels.
print(x.shape, y.shape)

# Modeling

In [None]:
import keras
from time import time
from keras import Model
from keras.models import load_model
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten
from keras.applications import MobileNet
from keras import optimizers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard
from keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy

**accuracy**

In [None]:
def top_3_accuracy(y_true, y_pred):
    """Top-3 categorical accuracy: ``top_k_categorical_accuracy`` with ``k=3``."""
    top_k = 3
    return top_k_categorical_accuracy(y_true, y_pred, k=top_k)

**create model**

In [None]:
# Single-channel MobileNet built without pretrained weights (weights=None). The output
# width follows num_classes so it cannot drift from the one-hot label width.
model = MobileNet(input_shape=(img_size, img_size, 1), alpha=1, weights=None, classes=num_classes)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Adam with learning rate 0.002. The canonical `Adam` class is used because the
# lowercase `adam` alias is not available in every Keras release.
c = keras.optimizers.Adam(lr=0.002)

In [None]:
# Multi-class cross-entropy loss; report plain accuracy and the custom top-3 accuracy.
model.compile(loss='categorical_crossentropy', optimizer=c, metrics=['accuracy', top_3_accuracy])

In [None]:
# All three monitoring callbacks watch the validation top-3 accuracy
# ('val_top_3_accuracy'), so learning-rate decay, early stopping and checkpoint
# selection react to held-out performance rather than to the training-batch metric.
callbacks = [
    ReduceLROnPlateau(monitor='val_top_3_accuracy', factor=0.8, patience=3, min_delta=0.001,
                      mode='max', min_lr=1e-5, verbose=1),
    # mode is spelled out because the monitored quantity is an accuracy (higher is better).
    EarlyStopping(patience=3, monitor='val_top_3_accuracy', mode='max'),
    # Keeps only the weights of the best epoch seen so far.
    ModelCheckpoint('model_mobilenet.h5', monitor='val_top_3_accuracy', mode='max',
                    save_best_only=True, save_weights_only=True),
    # One log directory per run, named by the current timestamp.
    TensorBoard(log_dir="logs/{}".format(time()))
]

In [None]:
# Train from the endless generator: each "epoch" is `steps` batches, not a full pass
# over the data. The in-memory validation arrays are passed as validation_data.
hist = model.fit_generator(
    train_data, steps_per_epoch=steps, epochs=epochs, verbose=1,
    validation_data=(x_valid, y_valid), callbacks=callbacks)

**validation**

In [None]:
# Per-class scores for every validation image, using the model as it stands after training.
valid_preds = model.predict(x_valid, batch_size=batch_size, verbose=1)

In [None]:
# Names of the three highest-scoring classes per validation sample, best first,
# joined with single spaces (no trailing separator).
answer = [
    ' '.join(class_list[j] for j in scores.argsort()[::-1][:3])
    for scores in valid_preds
]
valid_preds_df = pd.DataFrame(answer)

In [None]:
# Show the top-3 guesses for the first 20 validation samples.
valid_preds_df.head(20)

In [None]:
# Display the validation frame to compare against the predicted names.
valid_df

# Test

In [None]:
# Load the competition test set.
test = pd.read_csv('E:\\project\\data\\test_simplified.csv')

In [None]:
# Inspect the first rows of the test frame.
test.head()

In [None]:
# Render the test doodles with the same image size used for training.
x_test = data_generator(test, img_size)

In [None]:
# Per-class scores for every test image.
test_preds = model.predict(x_test, batch_size=batch_size, verbose=1)

In [None]:
def create_submission(test, test_preds):
    """Write the top-3 predicted class names per test row to a submission CSV.

    For every row of ``test_preds`` the three highest-scoring class indices are
    mapped to names through the module-level ``class_list`` and joined with
    single spaces, best guess first. The result is stored in a new ``word``
    column of ``test`` (the frame is modified in place) and the ``key_id`` and
    ``word`` columns are written to ``submission_<time()>.csv`` in the current
    working directory.

    Args:
        test: DataFrame with a ``key_id`` column and one row per row of
            ``test_preds``.
        test_preds: 2-D array of per-class scores; column ``j`` is looked up
            as ``class_list[j]``.

    Raises:
        ValueError: If ``test`` and ``test_preds`` have different lengths
            (raised by pandas on the column assignment).
    """
    words = [
        ' '.join(class_list[j] for j in scores.argsort()[::-1][:3])
        for scores in test_preds
    ]
    # Assign the plain list so rows are matched by position; assigning a DataFrame
    # aligns on the index and silently yields NaN when the indexes differ.
    test['word'] = words
    sub = test[['key_id', 'word']]
    sub.to_csv('submission_{}.csv'.format(time()), index=False)

In [None]:
# Write the submission file for the test predictions.
create_submission(test, test_preds)