In [1]:
## import libraries
import pandas as pd
import numpy as np
import cv2
import os, sys
from tqdm import tqdm

In [2]:
# Set the working directory that holds the CSV files and the image folders.
# The location can be overridden with the SUPERHERO_DATA_DIR environment
# variable so the notebook is not tied to a single machine; the original
# path is kept as the default.
os.chdir(os.environ.get(
    "SUPERHERO_DATA_DIR",
    "C:/Users/skumarravindran/Desktop/Kaggle/CrowdAnalytixIdentifySuperHeros",
))

In [3]:
## Load the train / validation / test tables (paths are relative to the working directory)
train, validation, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'validation_data.csv', 'test_data.csv')
)

In [4]:
# function to read image
def read_img(img_path, size=(128, 128)):
    """Read an image file in colour mode and resize it.

    Parameters
    ----------
    img_path : str
        Path of the image file to read.
    size : tuple of int, optional
        Target size handed to ``cv2.resize``. Defaults to ``(128, 128)``
        (256 was used first, then 100 which gave bad accuracy, then 128).

    Returns
    -------
    The resized image, as returned by ``cv2.resize``.

    Raises
    ------
    OSError
        If ``cv2.imread`` could not load ``img_path``.
    """
    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    # cv2.imread does not raise on a missing or undecodable file, it returns
    # None; fail here with the offending path instead of letting cv2.resize
    # raise an unrelated-looking error.
    if img is None:
        raise OSError("could not read image (missing or unreadable file): %s" % img_path)
    return cv2.resize(img, size)

In [5]:
## Image folders, relative to the working directory. Keep the trailing slash:
## file names are appended to these by plain string concatenation.
TRAIN_PATH, VALIDATION_PATH, TEST_PATH = 'Train/', 'Validation/', 'CAX_Superhero_Test/'

In [6]:
# load data
def _load_images(filenames, folder):
    """Read every image named in ``filenames`` from ``folder`` via ``read_img``.

    ``folder`` is prepended to each file name by plain concatenation, so it
    must end with a path separator. A tqdm progress bar tracks the loop.
    Returns a list with one ``read_img`` result per file name, in input order.
    """
    return [read_img(folder + name) for name in tqdm(filenames)]


# Same loading order as before: train, then test, then validation.
train_img = _load_images(train['filename'].values, TRAIN_PATH)
test_img = _load_images(test['filename'].values, TEST_PATH)
validation_img = _load_images(validation['filename'].values, VALIDATION_PATH)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4347/4347 [00:10<00:00, 414.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3375/3375 [00:05<00:00, 564.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086/1086 [00:01<00:00, 568.95it/s]


In [7]:
# Stack each image list into one float32 array and divide the pixel values by 255
x_train, x_validation, x_test = (
    np.array(images, dtype=np.float32) / 255.
    for images in (train_img, validation_img, test_img)
)

In [8]:
# Target encoding: map every superhero name to an integer id.
#
# A single mapping, built from the *sorted* training labels, is used for both
# datasets. Building one mapping per dataset from separate set() enumerations
# does not guarantee that a given superhero gets the same id in train and
# validation (and it cannot when validation holds fewer classes), which makes
# validation metrics meaningless. Sorting also keeps the ids stable across
# kernel restarts, so saved weights can still be decoded with Y_train later.
t_label_list = train['Superhero'].tolist()
v_label_list = validation['Superhero'].tolist()

# Ids start at 1 (id 0 stays unused), exactly as before, so the one-hot width
# and the size of the network's output layer do not change.
Y_train = {name: idx + 1 for idx, name in enumerate(sorted(set(t_label_list)))}

# A validation label the model never trains on cannot be encoded consistently.
unseen_labels = sorted(set(v_label_list) - set(Y_train))
if unseen_labels:
    raise ValueError("validation labels missing from the training data: %r" % (unseen_labels,))

# Same ids as the training mapping, restricted to the labels present in validation.
Y_validation = {name: Y_train[name] for name in set(v_label_list)}

y_train = np.array([Y_train[k] for k in t_label_list])
y_validation = np.array([Y_validation[k] for k in v_label_list])

In [9]:
from keras import applications
from keras.models import Model
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.metrics import categorical_accuracy
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau

Using TensorFlow backend.


In [10]:
# One-hot encode the integer targets.
# to_categorical infers the number of columns from the largest id in the array
# it is given, so the two matrices could end up with different widths when the
# validation set lacks the highest class id. Fix the width from the training
# label map (ids run up to max(Y_train.values()), plus column 0).
n_outputs = max(Y_train.values()) + 1
y_train = to_categorical(y_train, num_classes=n_outputs)
y_validation = to_categorical(y_validation, num_classes=n_outputs)

In [11]:
# Transfer learning with VGG16: ImageNet weights, no classifier head
# (include_top=False), input shape set to 128x128 with 3 channels.
base_model = applications.VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(128, 128, 3),
)

In [12]:
# Number of one-hot columns in the train and validation targets
for targets in (y_train, y_validation):
    print(targets.shape[1])

13
13


In [13]:
## Model architecture: a small dense classifier stacked on the VGG16 base
add_model = Sequential([
    Flatten(input_shape=base_model.output_shape[1:]),
    Dense(128, activation='relu'),
    # one softmax unit per one-hot target column
    Dense(y_train.shape[1], activation='softmax'),
])

# Full network: VGG16 input -> VGG16 base -> classifier head
model = Model(inputs=base_model.input, outputs=add_model(base_model.output))

# SGD with momentum; the learning rate was changed from 1e-4 to 1e-3
sgd = optimizers.SGD(lr=1e-3, momentum=0.9)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128, 128, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 128, 128, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 128, 128, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 64, 64, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 64, 64, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 64, 64, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 32, 32, 128)       0         
__________

In [14]:
batch_size = 8   # 32 # tune it
epochs = 100 # init 5 -increase it

# Random augmentation applied on the fly to the training batches.
train_datagen = ImageDataGenerator(
        rotation_range=20, #was 30
        width_shift_range=0.1,
        height_shift_range=0.1,
        zoom_range=0.1, # newly added
        horizontal_flip=True)
# NOTE: train_datagen.fit(x_train) is intentionally not called. fit() only
# computes data statistics for featurewise_center,
# featurewise_std_normalization and zca_whitening; none of them is enabled
# above, so the call had no effect on the generated batches and only cost
# time and memory on the full training array.


In [15]:
## Option 1
# Callbacks of the final version (its prediction missed the submission deadline):
#   - stop when val_loss has not improved by at least 1e-4 for 7 epochs,
#   - halve the learning rate when val_acc has not improved for 5 epochs,
#   - save the model to disk, keeping only the best one (save_best_only=True).
# Earlier attempts used ModelCheckpoint alone with monitor='val_acc', or
# EarlyStopping on the training loss (patience 10) plus ModelCheckpoint.
# Additional documentation: https://keras.io/callbacks/
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=7, mode='auto', verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_acc', patience=5, verbose=1, factor=0.5, min_lr=0)
checkpoint = ModelCheckpoint(
    filepath='C:/Users/skumarravindran/Documents/keras_save_model/he_weights.hdf5',
    verbose=1,
    save_best_only=True,
)

history = model.fit_generator(
    train_datagen.flow(x_train, y_train, batch_size=batch_size),
    steps_per_epoch=x_train.shape[0] // batch_size,
    validation_data=(x_validation, y_validation),
    epochs=epochs,
    callbacks=[early_stop, reduce_lr, checkpoint],
)

Epoch 1/100




Epoch 2/100




Epoch 3/100




Epoch 4/100




Epoch 5/100




Epoch 6/100




Epoch 7/100




Epoch 8/100




Epoch 9/100




Epoch 10/100




Epoch 11/100




Epoch 12/100




Epoch 13/100




Epoch 14/100




Epoch 15/100




Epoch 16/100




Epoch 17/100




Epoch 18/100




Epoch 19/100




Epoch 20/100




Epoch 00019: early stopping


In [25]:
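# Option 2: not working (likely because `validation_split` is not supported when training from a generator); kept for reference only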
#checkpointer = ModelCheckpoint(filepath="C:/Users/skumarravindran/Documents/keras_save_model/super_heros/weights.hdf5", verbose=1, save_best_only=True)
#model.fit(train_datagen.flow(x_train, y_train, batch_size=batch_size), validation_split=0.3, epochs=epochs, callbacks=[checkpointer])

In [16]:

# Restore the checkpoint that ModelCheckpoint (save_best_only=True) wrote
# during training, rather than using the weights left in memory after the
# last epoch.
best_weights_path = 'C:/Users/skumarravindran/Documents/keras_save_model/he_weights.hdf5'
model.load_weights(best_weights_path)

## predict test data
predictions = model.predict(x_test)


In [17]:
# get labels
# Invert the training label map: integer id -> superhero name.
rev_y = {v: k for k, v in Y_train.items()}

# Take the arg-max only over output columns that belong to a real class.
# The network has a column for every index up to the largest id, including
# ids no label maps to (e.g. 0 when ids start at 1); a plain arg-max landing
# on such a column would raise KeyError in the lookup below.
valid_ids = np.array(sorted(rev_y))
predictions = valid_ids[np.argmax(predictions[:, valid_ids], axis=1)]
pred_labels = [rev_y[k] for k in predictions]

In [18]:
## Make the submission file: test file names next to their predicted labels.
## Original note for this version: ~0.59 (presumably the resulting score).
submission_columns = {'filename': test['filename'], 'Superhero': pred_labels}
sub = pd.DataFrame(submission_columns)
sub.to_csv('test_sub24032018_v2.csv', index=False)