# Import

In [None]:
import os
import re
from glob import glob
from tqdm import tqdm
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from PIL import Image, ImageDraw 

In [None]:
# Location of the competition data: INPUT_ROOT is the dataset folder and
# INPUT_DIR the sub-folder holding the per-class CSV files.
INPUT_ROOT = '../input/quickdraw-doodle-recognition'
INPUT_DIR = 'train_simplified'

# Show what the dataset folder contains.
root_entries = os.listdir(INPUT_ROOT)
print(root_entries)

## Read and Summarize the Data (keep only row counts and class names; row contents are discarded)

In [None]:
# Summarize every per-class CSV file: record its class name and its number
# of data rows.  Only these summaries are kept; row contents are discarded.
#
# The file list is sorted because glob returns paths in arbitrary order and
# the position of a file in this list is what later defines its class index.
csv_filenames = sorted(glob(os.path.join(INPUT_ROOT, INPUT_DIR, '*.csv')))
column_names = ['countrycode', 'drawing', 'key_id', 'recognized', 'timestamp', 'word']

row_counts = []
words = []
min_row_count = 999999999

import sys
totalmem_mb = 0
for csv_filename in tqdm(csv_filenames):
    # Iterating a DataFrame yields its column labels, not its rows, so the
    # rows are counted explicitly.  Reading a single column in fixed-size
    # chunks keeps memory use bounded regardless of file size.
    row_count = sum(
        len(chunk)
        for chunk in pd.read_csv(csv_filename, usecols=[0], chunksize=100000)
    )
    # Every row of a file carries the same class name; the first is enough.
    words.append(pd.read_csv(csv_filename, nrows=1)['word'].values[0])
    row_counts.append(row_count)
    min_row_count = min(min_row_count, row_count)

In [None]:
# Load only the first data row of the first class file and display the
# value in its sixth column (position 5).
df = pd.read_csv(csv_filenames[0], nrows=1)
df.iloc[0, 5]

## Generator Maker

In [None]:
# Size, in pixels, of the rendered images fed to the network.
IMHEIGHT = 64
IMWIDTH = 64

# One class per summarized CSV file.
NUM_CLASSES = len(row_counts)

In [None]:
def _read_recognized_strokes(csv_path, row_number):
    """Return the parsed drawing stored in one data row, or ``None``.

    ``row_number`` is 1-based and counts data rows (the header is kept, so
    column positions are stable).  ``None`` is returned when the file has no
    such row or when the row's ``recognized`` flag (column 3) is false.
    """
    # Skip data lines 1 .. row_number-1 but keep line 0 (the header), then
    # read exactly the one row that is needed.
    df = pd.read_csv(csv_path, skiprows=range(1, row_number), nrows=1)
    if df.empty or not bool(df.iloc[0, 3]):  # 3 -> recognized
        return None
    return ast.literal_eval(df.iloc[0, 1])  # 1 -> drawing


def _render_strokes(strokes, canvas_size=256, line_width=5):
    """Rasterize strokes into a float array of shape (IMHEIGHT, IMWIDTH).

    Each stroke is indexed as ``stroke[0]`` (x coordinates) and ``stroke[1]``
    (y coordinates); consecutive points are joined by black line segments on
    a white ``canvas_size`` x ``canvas_size`` canvas.  The result is scaled
    to [0, 1], so background pixels are 1.0 and ink pixels are 0.0.
    """
    # "L" is plain 8-bit grayscale, so pixel values are real intensities
    # rather than palette indices.
    image = Image.new("L", (canvas_size, canvas_size), color=255)
    image_draw = ImageDraw.Draw(image)
    for stroke in strokes:
        xs, ys = stroke[0], stroke[1]
        for x0, y0, x1, y1 in zip(xs, ys, xs[1:], ys[1:]):
            image_draw.line([x0, y0, x1, y1], fill=0, width=line_width)
    # PIL sizes are (width, height).  Nearest-neighbour resampling keeps the
    # pixels strictly black/white, as palette-mode resizing did before.
    image = image.resize((IMWIDTH, IMHEIGHT), resample=Image.NEAREST)
    return np.array(image) / 255.0


def random_row_gen_maker(csv_path, max_index, reservoir_min=3):
    """Create an endless generator of rendered drawings from one CSV file.

    Args:
        csv_path: Path of a CSV file whose columns are, by position,
            1 -> drawing and 3 -> recognized.
        max_index: Highest 1-based data-row number that may be sampled.
        reservoir_min: Number of drawings fetched each time the internal
            buffer runs empty.  Values below 1 are treated as 1.

    Returns:
        A generator yielding float arrays of shape (IMHEIGHT, IMWIDTH) with
        values in [0, 1].  Unrecognized or missing rows are skipped and
        another random row is drawn instead.
    """
    reservoir = []  # parsed drawings waiting to be rendered
    refill_size = max(1, reservoir_min)  # guarantees pop() below never fails

    def gen():
        while True:
            if not reservoir:
                while len(reservoir) < refill_size:
                    # randint's upper bound is exclusive: draws from [1, max_index]
                    r = np.random.randint(1, max_index + 1)
                    strokes = _read_recognized_strokes(csv_path, r)
                    if strokes is not None:
                        reservoir.append(strokes)
            yield _render_strokes(reservoir.pop())
    return gen()

## Testing Generator - Single Output

In [None]:
# Build a drawing generator for the 'star' class, sampling with max_index=100.
star_csv = csv_filenames[words.index('star')]
G = random_row_gen_maker(star_csv, 100)

In [None]:
# Pull the next rendered drawing from the generator and display it with a
# grayscale colormap.
img = next(G)
plt.imshow(img, cmap='gray')

## Testing Generator - Multiple Output

In [None]:
# Preview grid: each grid row shows `cols` drawings from one randomly chosen
# class, titled with that class name.
fig = plt.figure(figsize=(15,12))
rows, cols = 5, 8

for i in range(rows):
    row_idx = np.random.randint(0, len(words))
    G = random_row_gen_maker(csv_filenames[row_idx], 100)
    label = words[row_idx]
    for j in range(cols):
        ax = fig.add_subplot(rows, cols, i * cols + j + 1)
        ax.set_title(label)

        # hide the tick marks and tick labels on both axes
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # render the sample in grayscale
        ax.imshow(next(G), cmap='gray')

## Create a Generic CNN Model

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

### Generator for fit, test

In [None]:
def _load_row(csv_path, row_number):
    """Read one data row of a CSV file as a one-row DataFrame, or ``None``.

    ``row_number`` is 1-based and counts data rows; the header line is kept
    so that column positions stay stable.  ``None`` means the file has no
    such row.
    """
    # Skip data lines 1 .. row_number-1 but keep line 0 (the header), then
    # read exactly the one row that is needed.
    df = pd.read_csv(csv_path, skiprows=range(1, row_number), nrows=1)
    return None if df.empty else df


def _rasterize(strokes, canvas_size=256, line_width=5):
    """Draw strokes on a white canvas and return a (IMHEIGHT, IMWIDTH) array.

    Each stroke is indexed as ``stroke[0]`` (x coordinates) and ``stroke[1]``
    (y coordinates).  Values are scaled to [0, 1]: 1.0 is background, 0.0 ink.
    """
    # "L" is plain 8-bit grayscale, so pixel values are real intensities
    # rather than palette indices.
    image = Image.new("L", (canvas_size, canvas_size), color=255)
    image_draw = ImageDraw.Draw(image)
    for stroke in strokes:
        xs, ys = stroke[0], stroke[1]
        for x0, y0, x1, y1 in zip(xs, ys, xs[1:], ys[1:]):
            image_draw.line([x0, y0, x1, y1], fill=0, width=line_width)
    # PIL sizes are (width, height).  Nearest-neighbour resampling keeps the
    # pixels strictly black/white, as palette-mode resizing did before.
    image = image.resize((IMWIDTH, IMHEIGHT), resample=Image.NEAREST)
    return np.array(image) / 255.0


def gen(batchsize, max_index=1000):
    """Yield endless ``(x, y)`` batches sampled at random from all classes.

    Args:
        batchsize: Number of samples per yielded batch.
        max_index: Highest 1-based data-row number sampled from each file.

    Yields:
        Tuples ``(x, y)`` where ``x`` has shape
        (batchsize, IMHEIGHT, IMWIDTH, 1) with values in [0, 1] and ``y`` is
        the one-hot label matrix produced by ``to_categorical`` with
        NUM_CLASSES columns.  Rows that are missing or not flagged as
        recognized are skipped and re-drawn.
    """
    while True:
        images = []
        labels = []
        while len(images) < batchsize:
            csv_path = np.random.choice(csv_filenames)
            # randint's upper bound is exclusive: draws from [1, max_index]
            r = np.random.randint(1, max_index + 1)
            df = _load_row(csv_path, r)
            if df is None or not bool(df.iloc[0, 3]):  # 3 -> recognized
                continue

            ## Get label
            word = df.iloc[0, 5]  # 5 -> word

            ## Render the strokes onto an image
            strokes = ast.literal_eval(df.iloc[0, 1])  # 1 -> drawing
            images.append(_rasterize(strokes))
            labels.append(words.index(word))

        ## Transform for fitting/testing: add the trailing channel axis
        x = np.array(images).reshape(len(images), IMHEIGHT, IMWIDTH, 1)
        y = keras.utils.to_categorical(labels, NUM_CLASSES)
        yield (x, y)

In [None]:
# Two independent batch generators, one for training and one for validation.
# NOTE(review): both are created with identical arguments, so they sample
# from the same files and the same row range -- confirm this is intended.
BATCH_SIZE = 1024
train_gen, test_gen = gen(BATCH_SIZE), gen(BATCH_SIZE)
print(train_gen)

### Build Model - CPU/GPU

In [None]:
# Small CNN: two conv/pool stages, then a dense classifier head with dropout
# and a softmax output over NUM_CLASSES.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu',
           input_shape=(IMHEIGHT, IMWIDTH, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),
    Flatten(),
    Dense(680, activation='relu'),
    Dropout(0.5),
    Dense(NUM_CLASSES, activation='softmax'),
])
model.summary()

# One-hot targets -> categorical cross-entropy; track plain accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

### Build Model - TPU

In [None]:
# # detect and init the TPU
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# # instantiate a distribution strategy
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# # instantiating the model in the strategy scope creates the model on the TPU
# with tpu_strategy.scope():
#     model = Sequential()
#     model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu', input_shape=(IMHEIGHT, IMWIDTH, 1)))
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Dropout(0.2))
#     model.add(Flatten())
#     model.add(Dense(680, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(NUM_CLASSES, activation='softmax'))
#     model.summary()

#     model.compile(loss='categorical_crossentropy', optimizer='adam',
#                   metrics=['accuracy'])

### Training the model

In [None]:
# Train from the endless batch generators.  Model.fit accepts Python
# generators directly; fit_generator is deprecated and no longer exists in
# recent Keras releases.  Because the generators never terminate, the number
# of batches per epoch and per validation pass must be given explicitly.
history = model.fit(train_gen, epochs=50, verbose=1,
                    steps_per_epoch=16,
                    validation_data=test_gen, validation_steps=16)

# Persist the trained network (architecture + weights) in HDF5 format.
model.save('g-cnn-generator.h5')

### Plot Loss/Accuracy

In [None]:
# Plot training vs. validation accuracy per epoch.
#
# The former `import keras` in this cell was unused here and rebound the
# name `keras` from tensorflow.keras to the standalone package, which the
# batch generator relies on; it is intentionally not repeated.
from matplotlib import pyplot as plt

# Depending on the Keras version the accuracy metric is recorded as
# 'accuracy' or as 'acc'; use whichever key this run produced.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

## Predicting

In [None]:
# Draw one fresh batch of 16 samples and recover each true class name from
# its one-hot label.
test_gen = gen(16)
X, y = next(test_gen)
y = y.tolist()
answers = [words[one_hot.index(1)] for one_hot in y]

# Class indices of the three highest-scoring predictions for every sample.
y_pred = model.predict(X, verbose=0)
args = np.argsort(-y_pred)[:, 0:3]

# 4x4 grid: each panel shows the drawing, the true class with a (Y)/(N)
# marker for whether the top prediction matched, and the top-3 guesses.
fig = plt.figure(figsize=(15,12))
rows, cols = 4, 4
for idx in range(rows * cols):
    first, second, third = (words[k] for k in args[idx])
    verdict = ' (Y)' if first == answers[idx] else ' (N)'

    ax = fig.add_subplot(rows, cols, idx + 1)
    ax.set_title(answers[idx] + verdict, fontsize=15)

    infotext = "1. {}\n2. {}\n3. {}".format(first, second, third)
    ax.text(32, 58, infotext, style='italic', color='white', fontsize='large',
            bbox={'facecolor': '#486678', 'alpha': 0.8, 'pad': 10})

    # hide the tick marks and tick labels on both axes
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)

    # drop the channel axis and show the sample in grayscale
    ax.imshow(X[idx].reshape((IMHEIGHT, IMWIDTH)), cmap='gray')

### ~~Gather X and y~~

In [None]:
# X, y = [], []
# for idx in tqdm(range(len(draw_df))):
#     X.append(render(draw_df.iloc[idx].drawing))
#     y.append( words.index(draw_df.iloc[idx].word) )

# X, y = np.array(X), np.array(y)
# print(X.shape)
# print(y.shape)

### ~~Splitting to Test and Train Data~~

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y)

# y_train = keras.utils.to_categorical(y_train, num_classes)
# X_train = X_train.reshape(X_train.shape[0], imheight, imwidth, 1)
# y_test = keras.utils.to_categorical(y_test, num_classes)
# X_test = X_test.reshape(X_test.shape[0], imheight, imwidth, 1)

## ~~Preparing data~~

In [None]:
# print(y_train.shape)
# print(X_train.shape)
# print(y_test.shape)
# print(X_test.shape)

# # Correct should be
# #  (612000, 340) 
# #  (612000, 32, 32, 1) 
# #  (68000, 340) 
# #  (68000, 32, 32, 1)

## ~~Building the First Model (Generic CNN)~~

### ~~Building the Layers~~

In [None]:
# model = Sequential()
# model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu', input_shape=(imheight, imwidth, 1)))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Dropout(0.2))
# model.add(Flatten())
# model.add(Dense(680, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(num_classes, activation='softmax'))
# model.summary()

### ~~Compiling and Training the model~~

In [None]:
# model.compile(loss='categorical_crossentropy', optimizer='adam',
#               metrics=['accuracy'])

# history = model.fit(x=X_train, y=y_train, batch_size = 32, epochs = 10,
#           validation_data = (X_test, y_test), verbose = 1)

In [None]:
# model.save('g-cnn-testing.h5')

In [None]:
# import keras
# from matplotlib import pyplot as plt
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'val'], loc='upper left')
# plt.show()

## ~~Predicting~~

In [None]:
# fig = plt.figure(figsize=(15,12))
# rows, cols = 3, 3

# for i in range(rows):
#     for j in range(cols): 
#         idx = randint(0, len(X_test)-1)
#         img = render(draw_df.iloc[idx].drawing)
#         rsimg = np.reshape(img, (-1, imheight, imwidth, 1))
#         answer = draw_df.iloc[idx].word
        
#         # predict
#         preds = model.predict(rsimg, verbose=0)
#         args = np.argsort(-preds)[:, 0:3]  # top 3
#         args = args[0]
#         if words[args[0]] == answer:
#             answer += ' (Y)'
#         else:
#             answer += ' (N)'
        
#         # plot and titles
#         plt.subplot(rows, cols, i*cols+j+1)
#         plt.title(answer, fontsize=15)
#         infotext = "1. {}\n2. {}\n3. {}".format(words[args[0]], words[args[1]], words[args[2]])
        
#         plt.text(32, 58, infotext, style='italic', color='white', fontsize='large',
#             bbox={'facecolor': '#486678', 'alpha': 0.8, 'pad': 10})

#         # turn off axis ticks
#         plt.gca().axes.get_xaxis().set_visible(False)
#         plt.gca().axes.get_yaxis().set_visible(False)
        
#         # gray scale
#         plt.imshow(img, cmap='gray')