## Setting up NIMA Retraining Pipeline in Keras

In [87]:
import boto3
import botocore

import sys
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import base64
import contextlib

sys.path.append('/srv/data/machine_learning')
import py_models.common.image as im

from PIL import Image
from io import BytesIO
from cStringIO import StringIO
from collections import defaultdict

from keras import backend as K
from keras.models import Model
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input

### Define Custom Earth Mover Loss Function

In [52]:
def earth_mover_loss(y_true, y_pred):
    """Earth Mover's Distance (r = 2) between target and predicted histograms.

    Both tensors are accumulated along their last axis; the root-mean-square
    gap between the two cumulative curves is computed along that axis and the
    resulting distances are averaged into a single scalar loss.
    """
    # Running totals along the last axis turn each histogram into a CDF-like curve.
    true_cumulative = K.cumsum(y_true, axis=-1)
    pred_cumulative = K.cumsum(y_pred, axis=-1)

    # Squared gap between the two cumulative curves, position by position.
    squared_gap = K.square(K.abs(true_cumulative - pred_cumulative))

    # Root of the mean squared gap along the last axis, then a global average.
    per_sample_distance = K.sqrt(K.mean(squared_gap, axis=-1))
    return K.mean(per_sample_distance)

### Initialize Pretrained InceptionResNetV2 Model

In [53]:
# Side length, in pixels, of the square 3-channel input fed to the network.
image_size = 224

# Backbone built without its top layers; 'avg' pooling is requested for the output.
base_model = InceptionResNetV2(
    input_shape=(image_size, image_size, 3),
    include_top=False,
    pooling='avg'
)

# Flag every backbone layer as trainable so the whole network is fine-tuned.
for layer in base_model.layers:
    layer.trainable = True

# New head: dropout followed by a 10-way softmax.
x = Dropout(0.75)(base_model.output)
x = Dense(10, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=x)
model.summary()

optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss=earth_mover_loss)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv2d_813 (Conv2D)             (None, 111, 111, 32) 864         input_5[0][0]                    
__________________________________________________________________________________________________
batch_normalization_813 (BatchN (None, 111, 111, 32) 96          conv2d_813[0][0]                 
__________________________________________________________________________________________________
activation_813 (Activation)     (None, 111, 111, 32) 0           batch_normalization_813[0][0]    
__________________________________________________________________________________________________
conv2d_814

__________________________________________________________________________________________________
conv2d_944 (Conv2D)             (None, 12, 12, 192)  215040      activation_943[0][0]             
__________________________________________________________________________________________________
batch_normalization_941 (BatchN (None, 12, 12, 192)  576         conv2d_941[0][0]                 
__________________________________________________________________________________________________
batch_normalization_944 (BatchN (None, 12, 12, 192)  576         conv2d_944[0][0]                 
__________________________________________________________________________________________________
activation_941 (Activation)     (None, 12, 12, 192)  0           batch_normalization_941[0][0]    
__________________________________________________________________________________________________
activation_944 (Activation)     (None, 12, 12, 192)  0           batch_normalization_944[0][0]    
__________

### Load AVA dataset

In [56]:
# airpy is used in this cell for its Presto query helper.
import airpy as ap

# Pull every column of the AVA test table; the result is used as a pandas
# DataFrame by the cells below.
df = ap.presto("SELECT * FROM plus.dim_ava_dataset_test;")
# Preview the first rows as the cell output.
df.head()

presto: Query: Complete after 2.07 sec on 2018-08-19.                          


Unnamed: 0,ava_image_id,score_1,score_2,score_3,score_4,score_5,score_6,score_7,score_8,score_9,score_10,thumbnail,ds
0,1000,3,2,12,16,19,36,49,38,20,5,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,2018-08-19
1,10000,6,2,12,9,33,59,33,21,8,3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,2018-08-19
2,10002,5,0,14,36,60,35,29,8,2,0,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,2018-08-19
3,10003,0,3,1,4,6,19,28,15,10,2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,2018-08-19
4,10005,1,2,3,4,13,27,26,8,5,1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,2018-08-19


### Split Into Train / Validation / Test set

In [80]:
def generate_train_val_test(
    df
    , input_name='thumbnail'
    , test_size=0.2
    , val_size=0.2
    , random_state=None):
    """Split ``df`` into training, validation and test frames.

    Only the ``input_name`` column and the columns whose names start with
    ``'score'`` are kept. The test set is carved out first; the validation
    set is then carved out of what remains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the input column and the ``score*`` columns.
    input_name : str
        Name of the column that holds the model input.
    test_size : float
        Fraction of ``df`` reserved for the test set.
    val_size : float
        Fraction of the remaining (non-test) rows reserved for validation.
    random_state : int or None
        Seed forwarded to both splits; pass an int for reproducible splits.

    Returns
    -------
    (X_train, X_val, X_test) : tuple of pandas.DataFrame
    """
    # `sklearn.cross_validation` was deprecated in favour of `model_selection`
    # and later removed; fall back to it only on old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    score_cols = df.columns[df.columns.str.startswith('score')].tolist()
    selected = df.loc[:, [input_name] + score_cols]

    # The score columns travel with the input column, so there is no need to
    # split a separate label frame.
    X_rest, X_test = train_test_split(
        selected
        , test_size=test_size
        , random_state=random_state
    )

    X_train, X_val = train_test_split(
        X_rest
        , test_size=val_size
        , random_state=random_state
    )
    return X_train, X_val, X_test


X_train, X_val, X_test = generate_train_val_test(df)

# Report the shape of each split between two banner lines.
for shape_message, split_frame in (
        ("X_train has dimension {}", X_train),
        ("X_val has dimension {}", X_val),
        ("X_test has dimension {}", X_test)):
    print("=============================")
    print(shape_message.format(split_frame.shape))
    print("=============================\n")

X_train has dimension (1280, 11)

X_val has dimension (320, 11)

X_test has dimension (400, 11)



### Thumbnail2vec

In [81]:
def thumbnail2vec(thumbnail, image_size=224):
    """Decode a base64-encoded thumbnail into a preprocessed image array.

    Parameters
    ----------
    thumbnail : str
        Base64-encoded image bytes.
    image_size : int
        Expected height and width of the decoded image, in pixels.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(image_size, image_size, 3)``. An all-zero array of
        that shape is returned when the thumbnail cannot be decoded or does
        not have exactly the expected shape.

    Notes
    -----
    Uses ``base64``, ``Image``, ``BytesIO`` and ``preprocess_input`` from the
    notebook's import cell. ``preprocess_input`` there comes from
    ``keras.applications.inception_resnet_v2``, which matches the
    InceptionResNetV2 backbone; the ResNet50 variant used previously applies
    a different normalisation.
    """
    expected_shape = (image_size, image_size, 3)

    # Only decoding is best-effort: a corrupt thumbnail yields a blank image.
    # `Exception` (not a bare except) lets KeyboardInterrupt through.
    try:
        img_decoded = base64.b64decode(thumbnail)
        img_rgb = Image.open(BytesIO(img_decoded)).convert('RGB')
        pixels = np.asarray(img_rgb, dtype=np.float64)
    except Exception:
        return np.zeros(expected_shape)

    # Compare the full shape; comparing the sum of the dimensions would also
    # accept e.g. (225, 223, 3).
    if pixels.shape != expected_shape:
        return np.zeros(expected_shape)

    # Preprocess as a batch of one, then drop the batch axis again. This is
    # deliberately outside the try block so a misconfigured preprocessing
    # function fails loudly instead of silently producing blank images.
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    return batch[0]

### Define Generator For Training (WIP)

In [82]:
def batch_generator(
        df
        , batch_size=32
        , input_name='thumbnail'
        , output_name='issue_type'
        , normalize_scores=True):
    """Endless random-batch generator for use with ``fit_generator``.

    Each batch is drawn uniformly at random, with replacement, from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``input_name`` column and the ``score*`` columns.
    batch_size : int
        Number of rows per batch.
    input_name : str
        Column holding the base64-encoded thumbnails.
    output_name : str
        Unused; kept so existing callers that pass it keep working.
    normalize_scores : bool
        When True (default) each row of scores is divided by its sum so the
        target is a probability distribution, comparable with the model's
        softmax output. Rows whose scores sum to zero are left all-zero.
        Pass False to get the raw values.

    Yields
    ------
    (X_batch_array, y_batch_array) : tuple of numpy.ndarray
        Stacked outputs of ``thumbnail2vec`` and the float32 score matrix.

    Raises
    ------
    ValueError
        On the first ``next()`` call if ``df`` has no rows.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        raise ValueError("batch_generator needs a non-empty DataFrame")

    # The score columns do not change between batches; look them up once.
    score_cols = df.columns[df.columns.str.startswith('score')]

    while True:
        batch_idx = np.random.randint(0, n_rows, batch_size)
        batch = df.iloc[batch_idx]

        X_batch_array = np.array(list(batch[input_name].apply(thumbnail2vec)))

        y_batch_array = np.array(batch[score_cols].astype('float32'))
        if normalize_scores:
            totals = y_batch_array.sum(axis=1, keepdims=True)
            # Avoid dividing by zero for rows without any votes.
            totals[totals == 0] = 1.0
            y_batch_array = y_batch_array / totals

        yield X_batch_array, y_batch_array

In [83]:
# Smoke-test the generator on the full frame: draw a few batches and print
# the shapes of the inputs and the labels.
for i, (X_batch_array, y_batch_array) in enumerate(batch_generator(df)):
    print "[Batch order: {}".format(i+1) + "]\n=============="
    print "Batch Size {}".format(X_batch_array.shape), "\n", "Label size {}".format(y_batch_array.shape), '\n==============\n'
    # The generator loops forever, so stop explicitly after five batches (i = 0..4).
    if i > 3:
        break

[Batch order: 1]
Batch Size (32, 224, 224, 3) 
Label size (32, 10) 

[Batch order: 2]
Batch Size (32, 224, 224, 3) 
Label size (32, 10) 

[Batch order: 3]
Batch Size (32, 224, 224, 3) 
Label size (32, 10) 

[Batch order: 4]
Batch Size (32, 224, 224, 3) 
Label size (32, 10) 

[Batch order: 5]
Batch Size (32, 224, 224, 3) 
Label size (32, 10) 



### Set up Callback Function

In [84]:
from keras.callbacks import Callback
from IPython.display import clear_output
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Number of epochs; also the x-axis limit of the learning-curve plots.
NUM_ITERATION = 20
BATCH_SIZE = 32
# Floor division keeps the step counts integral under true division
# (Python 3 or `from __future__ import division`) as well as on Python 2.
TRAIN_STEPS_PER_EPOCH = X_train.shape[0] // BATCH_SIZE
VAL_STEPS_PER_EPOCH = X_val.shape[0] // BATCH_SIZE

class PlotLearning(Callback):
    """Keras callback that redraws and saves the learning curves after every epoch.

    After each epoch the loss history (log scale) and the accuracy history are
    plotted side by side, the previous notebook output is cleared, and the
    figure is written to ``<output_dir>/<n>.png`` where ``n`` counts epochs
    starting at 1.

    Parameters
    ----------
    output_dir : str
        Directory that receives the PNG files; created if it does not exist.
    loss_ylim, acc_ylim : sequence of two numbers or None
        Optional fixed y-axis limits for the two panels. ``None`` (default)
        lets matplotlib autoscale so no curve is clipped out of view.
    """

    def __init__(
            self
            , output_dir="/home/robert_chang/redspot_home/resnet50/learning_curve/"
            , loss_ylim=None
            , acc_ylim=None):
        super(PlotLearning, self).__init__()
        self.output_dir = output_dir
        self.loss_ylim = loss_ylim
        self.acc_ylim = acc_ylim

    def on_train_begin(self, logs=None):
        """Reset all recorded histories at the start of a training run."""
        self.i = 0
        self.x = []
        self.losses = []
        self.val_losses = []
        self.acc = []
        self.val_acc = []
        self.logs = []
        # Holds the most recently drawn figure; nothing is drawn before the
        # first epoch ends.
        self.fig = None

    @staticmethod
    def _plot_history(ax, x, values, label):
        """Plot ``values`` against ``x`` on ``ax``, skipping entries that are None."""
        points = [(xi, vi) for xi, vi in zip(x, values) if vi is not None]
        if points:
            xs, ys = zip(*points)
            ax.plot(xs, ys, '-', label=label)

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's metrics, then redraw, save and show the curves."""
        logs = logs or {}

        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        # These are None when the model was compiled without an accuracy metric.
        self.acc.append(logs.get('acc'))
        self.val_acc.append(logs.get('val_acc'))
        self.i += 1

        fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, figsize=(20, 10))
        self.fig = fig

        clear_output(wait=True)

        ax1.set_yscale('log')
        self._plot_history(ax1, self.x, self.losses, "training_loss")
        self._plot_history(ax1, self.x, self.val_losses, "validation_loss")
        ax1.set_xlim([0, NUM_ITERATION])
        if self.loss_ylim is not None:
            ax1.set_ylim(self.loss_ylim)

        self._plot_history(ax2, self.x, self.acc, "training accuracy")
        self._plot_history(ax2, self.x, self.val_acc, "validation accuracy")
        ax2.set_xlim([0, NUM_ITERATION])
        if self.acc_ylim is not None:
            ax2.set_ylim(self.acc_ylim)

        # Only draw a legend on panels that actually have curves.
        for ax in (ax1, ax2):
            if ax.get_legend_handles_labels()[0]:
                ax.legend(fontsize='x-large')

        # Make sure the target directory exists so saving cannot abort training.
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)
        fig.savefig(os.path.join(self.output_dir, str(self.i) + ".png"))
        plt.show()
        # Release the figure; otherwise one figure per epoch stays in memory.
        plt.close(fig)

plot_learning_curve = PlotLearning()

### Model Training

In [None]:
NUM_ITERATION = 20
BATCH_SIZE = 64
# Floor division keeps the step counts integral under true division as well.
TRAIN_STEPS_PER_EPOCH = X_train.shape[0] // BATCH_SIZE
VAL_STEPS_PER_EPOCH = X_val.shape[0] // BATCH_SIZE
FILE_PATH = '/home/robert_chang/redspot_home/resnet50/model_weights/photo_scoring_nima_retraining.h5'

# Make sure the checkpoint directory exists before training starts, so a
# missing directory is reported up front rather than when the first
# checkpoint is written.
checkpoint_dir = os.path.dirname(FILE_PATH)
if not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

callback_lists = [
    # plot the learning curve at the end of each epoch
    plot_learning_curve,

    # save the model at the end of each epoch in which val_loss improved
    ModelCheckpoint(
        filepath=FILE_PATH,
        monitor='val_loss',
        save_best_only=True,
    )
]

# Both generators sample random batches, so the step counts define how many
# batches make up one training epoch and one validation pass.
history = model.fit_generator(
            generator = batch_generator(X_train, BATCH_SIZE)
            , steps_per_epoch = TRAIN_STEPS_PER_EPOCH
            , epochs = NUM_ITERATION
            , validation_data = batch_generator(X_val, BATCH_SIZE)
            , validation_steps = VAL_STEPS_PER_EPOCH
            , callbacks=callback_lists
)

Epoch 1/20