In [None]:
from __future__ import print_function
from future_builtins import map, filter, zip

In [None]:
import tensorflow as tf
import keras
import coremltools
import numpy as np

# Report the version of each core dependency; a package that does not
# expose `__version__` is reported with a `?` placeholder instead.
for name, package in (
    ('tensorflow', tf),
    ('keras', keras),
    ('coremltools', coremltools),
    ('numpy', np),
):
    version = getattr(package, '__version__', '?')
    print(name + ' v' + version)

In [None]:
# Load newline-delimited list of categories.
# Point `categories_path` at 'quickdraw-categories.txt' to use that
# category list instead of the 50-category one.
categories_path = '50-categories.txt'

# `with` guarantees the handle is closed even if reading fails part-way.
with open(categories_path, 'r') as f:
    categories = [line.rstrip() for line in f]


def filename_from_category(s):
    """Return the relative path of the .npy bitmap file for category `s`."""
    return '../../quickdraw/'+s+'.npy'


# One file path per category, in the same order as `categories`.
filenames = [filename_from_category(category) for category in categories]

In [None]:
# Check that the .npy file of every loaded category
# (see `categories` / `filenames` above) exists on disk

import os.path

# Gather every expected file that is absent, report each one, and only
# print the success message when nothing is missing.
missing_files = [path for path in filenames if not os.path.isfile(path)]
for path in missing_files:
    print('file `{}` does not exist'.format(path))

all_exist = not missing_files
if all_exist:
    print('All {} files found!'.format(len(categories)))

In [None]:
def one_hot_vector(index):
    """Return a 1-D int8 vector of length `len(categories)` with a 1 at `index`.

    All other entries are 0. `categories` is read from module scope.
    """
    n_classes = len(categories)
    encoded = np.zeros(n_classes, dtype=np.int8)
    encoded[index] = 1
    return encoded

def one_hot_array(index, is_column=False):
    """Return a 2-D int8 one-hot encoding of `index`.

    Parameters
    ----------
    index : int
        Position of the hot entry; must be a valid index into `categories`
        (read from module scope).
    is_column : bool, optional
        When False (the default, matching the previous hard-coded behaviour)
        the result is a row of shape ``(1, len(categories))``. When True the
        transposed column of shape ``(len(categories), 1)`` is returned.

    Returns
    -------
    numpy.ndarray
        Array of zeros with a single 1 at `index`.
    """
    hot = np.zeros((1, len(categories)), dtype=np.int8)
    hot[0, index] = 1
    return hot.T if is_column else hot

In [None]:
import random

def make_dataset():
    """Load the images of every category and build the labelled dataset.

    Reads the module-level `filenames`, `samples_per_category` and
    `image_size`. For each file, at most `samples_per_category` leading
    images are kept and reshaped to `image_size`; each kept image is paired
    with the one-hot label of its category (see `one_hot_vector`).

    Returns
    -------
    (data, target) : tuple of numpy.ndarray
        `data` has shape ``(n_images,) + image_size + (1,)`` (a trailing
        axis is added at position 3); `target` has one one-hot row per
        image, in the same order as `data`.
    """
    image_blocks = []
    label_blocks = []
    for category_index, filename in enumerate(filenames):
        label_one_hot_encoded = one_hot_vector(category_index)
        category_images = np.load(filename)[:samples_per_category]
        n_kept = category_images.shape[0]
        # Reshape the whole category at once instead of mutating `.shape`
        # image by image; the explicit leading size keeps empty files valid.
        image_blocks.append(
            category_images.reshape((n_kept,) + tuple(image_size))
        )
        # One copy of the category's label per kept image.
        label_blocks.append(np.tile(label_one_hot_encoded, (n_kept, 1)))
    data = np.concatenate(image_blocks)
    data = np.expand_dims(data, axis=3)
    target = np.concatenate(label_blocks)
    return (data, target)

# Build datasets for CNN/Random Forests

In [None]:
from sklearn.model_selection import train_test_split


####################################################################
image_size = (28,28)
# Number of samples taken from each category (determines full dataset size)
samples_per_category = 30000

# Fraction of the dataset used for EACH of the validation and test sets
split_size = 0.1
# Fraction of the data to subsample for the random forests classifier
subsample_size = 1
# Seed shared by the subsampling and every split so re-runs are reproducible
random_seed = 42
####################################################################


# Expected dataset size, derived from the loaded category list rather than
# a hard-coded category count.
n_categories = len(categories)
total_samples = samples_per_category*n_categories
print('{} samples selected from every category'.format(samples_per_category))
print('{} categories used for total of {} samples'.format(n_categories, total_samples), end='\n\n')

# Get full dataset
data, target = make_dataset()

# make_dataset() takes *at most* samples_per_category images per file, so
# size everything downstream from the number of rows actually loaded.
total_samples = data.shape[0]

## Subsample dataset
samples = int(total_samples*subsample_size)
# Seeded generator: the subsample is the same on every run.
idx = np.random.RandomState(random_seed).choice(total_samples, samples, replace=False)
# Flatten every image to a 1-D feature vector for the random forest.
data_sampled = data[idx].reshape(samples, data.shape[1]*data.shape[2])
target_sampled = target[idx]


# Create train(80%), validation(10%), and test(10%) for the CNN:
# first hold out 2*split_size, then cut the hold-out in half.
X_train_cnn, X_split_cnn, y_train_cnn, y_split_cnn = train_test_split(
    data,
    target,
    test_size=split_size*2,
    random_state=random_seed,
    stratify=target
)
X_valid_cnn, X_test_cnn, y_valid_cnn, y_test_cnn = train_test_split(
    X_split_cnn,
    y_split_cnn,
    test_size=0.5,
    random_state=random_seed,
    stratify=y_split_cnn
)
print('Convolutional Neural Network datasets')
print('{} total samples'.format(total_samples))
print('{}% -- {} -- Training set'.format((1-split_size*2)*100, X_train_cnn.shape))
print('{}% -- {} -- Validation set'.format(split_size*100, X_valid_cnn.shape))
print('{}% -- {} -- Test set'.format(split_size*100, X_test_cnn.shape), end='\n\n')


# Create train(80%), validation(10%), and test(10%) for the random forest,
# using the same two-stage split on the flattened subsample.
X_train_rf, X_split_rf, y_train_rf, y_split_rf = train_test_split(
    data_sampled,
    target_sampled,
    test_size=split_size*2,
    random_state=random_seed,
    stratify=target_sampled
)
X_valid_rf, X_test_rf, y_valid_rf, y_test_rf = train_test_split(
    X_split_rf,
    y_split_rf,
    test_size=0.5,
    random_state=random_seed,
    stratify=y_split_rf
)
print('Random Forests datasets')
print('{} total samples distilled down to {} ({}%)'.format(total_samples, samples, subsample_size*100))
print('{}% -- {} -- Training set'.format((1-split_size*2)*100, X_train_rf.shape))
print('{}% -- {} -- Validation set'.format(split_size*100, X_valid_rf.shape))
print('{}% -- {} -- Test set'.format(split_size*100, X_test_rf.shape), end='\n\n')

# CNN

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Reshape, Dense, BatchNormalization, Dropout, \
                         Conv2D, MaxPooling2D, Activation, Flatten, Lambda

def _make_convolution_layers(model):
    """Append the convolutional part of the network to `model` in place.

    Reads the module-level `input_shape`, `n_filters`, `layer_activation`
    and `batch_normalize`. Adds, in order: a Reshape to
    (height, width, 1); then for every entry of `n_filters` a 3x3
    'same'-padded Conv2D, an optional BatchNormalization and a 2x2
    MaxPooling2D; and finally a Flatten layer.

    Parameters
    ----------
    model : keras.models.Sequential
        Model the layers are added to. Nothing is returned.
    """
    # input shape is height, width, channels
    image_shape = (input_shape[0], input_shape[1], 1)
    model.add(Reshape(image_shape, input_shape=image_shape))

    for filters in n_filters:
        # Add a single convolution layer. The data format is pinned so the
        # convolution agrees with the channels-last pooling layer below
        # regardless of the global Keras image_data_format setting.
        model.add(
            Conv2D(
                filters=filters,
                kernel_size=(3,3),
                padding='same',
                activation=layer_activation,
                data_format='channels_last'
            )
        )
        # Add batch normalization to the convolution layer.
        # With channels-last feature maps the channel axis is the LAST one;
        # axis=1 would normalise over the height axis instead.
        if batch_normalize:
            model.add(
                BatchNormalization(axis=-1)
            )
        # Pool the layer
        model.add(
            # channels_last is required for coremltools
            MaxPooling2D(pool_size=(2,2), data_format='channels_last')
        )

    model.add(Flatten())

def _make_dense_layers(model):
    """Append the fully connected head of the network to `model` in place.

    One Dense layer (activation `layer_activation`) is added per entry of
    the module-level `dense_sizes`, each followed by a BatchNormalization
    layer when `batch_normalize` is set. The last layer added is a Dense
    layer of `output_size` units using `final_activation`.
    """
    for units in dense_sizes:
        hidden = Dense(units, activation=layer_activation)
        model.add(hidden)
        if not batch_normalize:
            continue
        # Dense output is (batch, features), so axis 1 is the feature axis.
        model.add(BatchNormalization(axis=1))

    output_layer = Dense(output_size, activation=final_activation)
    model.add(output_layer)

def one_hot_output(y_hat_raw):
    """Convert raw per-class scores into one-hot integer rows.

    Parameters
    ----------
    y_hat_raw : numpy.ndarray
        2-D array of scores with one row per sample and one column per
        class (e.g. the float output of `predict()`).

    Returns
    -------
    numpy.ndarray
        Integer array of the same shape with a single 1 per row, placed at
        that row's highest score (first maximum on ties) and 0 elsewhere.
    """
    # Builtin `int` replaces the `np.int` alias, which newer NumPy releases
    # no longer provide; the resulting dtype is the same.
    y_hat = np.zeros(y_hat_raw.shape, dtype=int)
    row_indices = np.arange(y_hat_raw.shape[0])
    y_hat[row_indices, y_hat_raw.argmax(1)] = 1
    return y_hat

In [None]:
import os.path
from keras.models import load_model
import json

def cache_fit(model_name, model, *args, **kwargs):
    """Train `model` once and reuse the saved result on later calls.

    Parameters
    ----------
    model_name : str
        Prefix of the cache files: the model is stored in
        ``<model_name>_model.h5`` and its training history in
        ``<model_name>_history.json``.
    model : object
        Model exposing ``fit(*args, **kwargs)`` and ``save(path)``.
    *args, **kwargs
        Forwarded unchanged to ``model.fit``.

    Returns
    -------
    The freshly trained `model` when no archive existed, otherwise the model
    loaded from the archive with `load_model` (the passed-in `model` is then
    ignored and is not trained).
    """
    archive_name = model_name+'_model.h5'
    history_name = model_name+'_history.json'

    if os.path.isfile(archive_name):
        print('Model found on disk. Reloading.')
        return load_model(archive_name)

    print('Model '+model_name+' not found in archive. Training new model.')
    hist = model.fit(*args, **kwargs)
    model.save(archive_name)
    # The history may hold NumPy float scalars, which `json` cannot
    # serialise; convert every value to a plain Python float first.
    history = dict(
        (metric, [float(value) for value in values])
        for metric, values in hist.history.items()
    )
    with open(history_name, 'w') as f:
        json.dump(history, f)
    return model

In [None]:
# --- Architecture settings (read as globals by the layer builders) ---
input_shape = image_size
n_filters = [64, 64, 128, 128]
dense_sizes = [128, 128]
output_size = len(categories)

batch_normalize = True

layer_activation = 'relu'
final_activation = 'softmax'

# --- Compilation settings ---
loss = 'categorical_crossentropy'
optimizer = 'Nadam'
metrics = ['categorical_accuracy']

#########

# Assemble the network: convolutional stack first, dense head second.
model = Sequential()
_make_convolution_layers(model)
_make_dense_layers(model)

model.compile(optimizer=optimizer, metrics=metrics, loss=loss)
model.summary()

# Prefix of the cache files that cache_fit reads and writes.
model_name = 'cnn_50cat_3000img_4ep_test'

# Arguments forwarded to model.fit when no cached model exists.
fit_options = dict(
    x=X_train_cnn,
    y=y_train_cnn,
    batch_size=32,
    epochs=4,
    verbose=1,
    validation_data=(X_valid_cnn, y_valid_cnn),
)
model = cache_fit(model_name, model, **fit_options)

### CNN Predict

In [None]:
# Collapse one-hot / per-class score rows to category indices.
y_cat_cnn = y_test_cnn.argmax(axis=1)              # ground truth
cnn_test_scores = model.predict(X_test_cnn)
y_hat_cnn = cnn_test_scores.argmax(axis=1)         # predicted category

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt, seaborn as sns

# Confusion matrix with rows = true category, columns = predicted category,
# normalised so that every row sums to 1.
cm = confusion_matrix(y_cat_cnn, y_hat_cnn)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

acc = accuracy_score(y_cat_cnn, y_hat_cnn)

fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(cm, ax=ax)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('Predicted category index')
ax.set_ylabel('True category index')
ax.set_title('CNN Test prediction (Acc:{:.2f}%)'.format(acc*100))
plt.show()

### Create CoreML Model

In [None]:
# Convert the trained Keras model to Core ML. The single input is named
# 'drawing' and is also declared as an image input; the class labels are
# given as the path of the category list file.
conversion_options = dict(
    input_names='drawing',
    image_input_names='drawing',
    class_labels='50-categories.txt',
)
coreml_model = coremltools.converters.keras.convert(model, **conversion_options)

# Save next to the notebook under the same prefix as the cached Keras model.
mlmodel_path = model_name+'.mlmodel'
coreml_model.save(mlmodel_path)

# Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the flattened images and their one-hot targets.
rf = RandomForestClassifier(
    n_estimators=30,
    max_depth=None,
    n_jobs=1,
    verbose=1,
    class_weight=None,
    # Fixed seed, matching the random_state used for the dataset splits,
    # so that re-running the notebook grows the same forest.
    random_state=42
)

rf.fit(X_train_rf, y_train_rf)

### Random Forests Predict -- Validation

In [None]:
# Collapse one-hot rows to category indices.
# NOTE(review): argmax returns the first index on ties, so any row the
# forest predicts as all zeros would be counted as category 0 -- confirm
# whether rf.predict can return such rows here.
y_cat_rf_valid = y_valid_rf.argmax(axis=1)         # ground truth
rf_valid_prediction = rf.predict(X_valid_rf)
y_hat_rf_valid = rf_valid_prediction.argmax(axis=1)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt, seaborn as sns

# Confusion matrix with rows = true category, columns = predicted category,
# normalised so that every row sums to 1.
cm = confusion_matrix(y_cat_rf_valid, y_hat_rf_valid)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

acc = accuracy_score(y_cat_rf_valid, y_hat_rf_valid)

fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(cm, ax=ax)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('Predicted category index')
ax.set_ylabel('True category index')
ax.set_title('Random Forests Validation prediction (Acc:{:.2f}%)'.format(acc*100))
plt.show()

### Random Forests Predict -- Test

In [None]:
# Collapse one-hot rows to category indices.
# NOTE(review): argmax returns the first index on ties, so any row the
# forest predicts as all zeros would be counted as category 0 -- confirm
# whether rf.predict can return such rows here.
y_cat_rf_test = y_test_rf.argmax(axis=1)           # ground truth
rf_test_prediction = rf.predict(X_test_rf)
y_hat_rf_test = rf_test_prediction.argmax(axis=1)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt, seaborn as sns

# Confusion matrix with rows = true category, columns = predicted category,
# normalised so that every row sums to 1.
cm = confusion_matrix(y_cat_rf_test, y_hat_rf_test)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

acc = accuracy_score(y_cat_rf_test, y_hat_rf_test)

fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(cm, ax=ax)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('Predicted category index')
ax.set_ylabel('True category index')
# This cell evaluates the TEST split, so the title says "Test".
ax.set_title('Random Forests Test prediction (Acc:{:.2f}%)'.format(acc*100))
plt.show()