# Download raw images provided by NDSC

In [0]:
!mkdir /content/images/
!wget https://www.dropbox.com/s/bhy4b7bwsvvhtf6/beauty_image.tar.gz?dl=0
!tar -zxf beauty_image.tar.gz?dl=0 -C /content/images/ --strip-components=1
!wget https://www.dropbox.com/s/ss2ibxu0k4x9c91/mobile_image.tar.gz?dl=0
!tar -zxf mobile_image.tar.gz?dl=0 -C /content/images/ --strip-components=1
!wget https://www.dropbox.com/s/3jpwfbeilm22vhs/fashion_image.tar.gz?dl=0
!tar -zxf fashion_image.tar.gz?dl=0 -C /content/images/ --strip-components=1

# Import necessary libraries

In [0]:
from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input, decode_predictions
from keras.utils import to_categorical, Sequence, plot_model
from keras.utils.vis_utils import model_to_dot
from keras.models import load_model, Model, Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Input, Concatenate, concatenate
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from IPython.display import SVG
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
import os
import glob
import time
import json
import random
import time
import cv2

# Fix the random seeds for reproducibility. NumPy's global generator is seeded
# exactly as before; Python's built-in `random` module, which this notebook also
# imports, now gets the same seed so it cannot introduce run-to-run variation.
SEED = 7
np.random.seed(SEED)
random.seed(SEED)

# Set up file paths & import other provided data

In [0]:
# Mount Google Drive so the project folder stored on it is reachable
from google.colab import drive
drive.mount('/content/gdrive')

# Every project path is derived from one base folder on the mounted drive
base_folder = '/content/gdrive/My Drive/NDSC/'
data_folder = '{}data/'.format(base_folder)
predict_folder = '{}predict/'.format(base_folder)
train_data_filepath = '{}train_jpg.csv'.format(data_folder)
test_data_filepath = '{}test.csv'.format(data_folder)

In [0]:
# Load the nested category definitions from categories.json
with open(data_folder + 'categories.json') as categories_file:
    categories = json.load(categories_file)

# Invert the nesting: the innermost value becomes the key and maps to its
# (major category, minor category) pair
cat = {
    number: (major_name, minor_name)
    for major_name, minor_group in categories.items()
    for minor_name, number in minor_group.items()
}

print('Categories: \n{}'.format(cat))

In [0]:
# Training table: read, shuffle once with a fixed seed, renumber the rows
df = (
    pd.read_csv(train_data_filepath)
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
)
# Map each listed image path to /content/images/<second path component>
df['true_path'] = ['/content/images/' + rel_path.split('/')[1] for rel_path in df['image_path']]

# Test table: same path mapping, original row order kept
df_test = pd.read_csv(test_data_filepath)
df_test['true_path'] = ['/content/images/' + rel_path.split('/')[1] for rel_path in df_test['image_path']]

# One-hot encode the labels, then recover the integer ids once for the summary below
Y_train = to_categorical(df['Category'])
label_ids = np.argmax(Y_train, axis=1)

# Histogram of label frequencies (one unit-wide bin per id from 0 to 57)
hist = plt.hist(label_ids, bins=58, range=[-0.5, 57.5])

m = len(df)
n = len(np.unique(label_ids))
print('No. of images:',m)
print('Number of categories:',n)

# Quick data exploration/visualisation

In [0]:
# List everything directly under the image folder and report how many entries there are
image_files = glob.glob('/content/images/*')
n_downloaded = len(image_files)
print('A total of {} images have been downloaded.'.format(n_downloaded))

In [0]:
# Draw five random training rows and show their images side by side,
# printing each row's category pair and title as it is plotted
im5 = df.sample(n=5)
fig = plt.figure(figsize=(25,125))
sampled_rows = zip(im5['true_path'], im5['Category'], im5['title'])
for slot, (img_path, category_id, item_title) in enumerate(sampled_rows, start=1):
  im = image.load_img(img_path, target_size=(224,224))
  print('Image: ', cat[category_id], item_title)
  fig.add_subplot(1, 5, slot)
  plt.imshow(im)
plt.show()

# Data Preprocessing

In [0]:
# Build the word bank from the training titles and encode both title sets
print('Encoding all title words...')

# Split every title into its whitespace-separated words
titles = [text.split() for text in df['title']]
titles_test = [text.split() for text in df_test['title']]

# Multi-hot encoding over the unique words: the binarizer is fitted on the
# training titles only and then reused, unchanged, for the test titles
mlb = MultiLabelBinarizer(sparse_output=True)
titles_enc = mlb.fit_transform(titles)
titles_test_enc = mlb.transform(titles_test)

# Size of the word bank; this is the width of the title input of the model
N_dict = len(mlb.classes_)
print('Encoding complete. # unique words:',N_dict)

# Create/Load Model

In [0]:
# Alternative to building a model from scratch: load the one saved at
# models/model.h5 under the base folder
saved_model_path = base_folder + 'models/model.h5'
model = load_model(saved_model_path)

In [0]:
# Create model
# The model takes a list of two inputs: first an image batch, second the
# multi-hot encoded title words.

print('Generating model...')
input_layer_word = Input(shape=(N_dict,))
input_layer_conv = Input(shape=(224,224,3))

# Image branch: ImageNet-pretrained ResNet50 without its classification head
base_model = ResNet50(weights='imagenet',include_top=False,input_tensor=input_layer_conv)

# Freeze every layer of the pretrained network (names are printed as they are frozen)
for frozen_layer in base_model.layers:
    print(frozen_layer.name)
    frozen_layer.trainable = False

image_features = GlobalAveragePooling2D()(base_model.output)
image_features = Dense(512, activation='relu')(image_features)
image_features = Dropout(0.3)(image_features)

# Title branch: a single dense layer on the word vector
# (a Dropout(0.3) after this layer was tried and is currently disabled)
word_features = Dense(2048,activation='relu')(input_layer_word)

# Merge both branches and classify into 58 outputs
merged = concatenate([image_features, word_features])
merged = Dense(58, activation='relu')(merged)
predictions = Dense(58, activation='softmax')(merged)

# Final model
model = Model(inputs=[input_layer_conv,input_layer_word], outputs=predictions)
model.summary()
SVG(model_to_dot(model).create(prog='dot', format='svg'))

# Train & Evaluate Model

In [0]:
# Batch generator for fitting and evaluation
class My_Generator(Sequence):
    """Keras ``Sequence`` that yields ``([images, titles], labels)`` batches.

    ``input_data`` is a two-item list: the image file paths and the encoded
    titles (an object that supports row slicing and ``.todense()``).
    ``labels`` is sliced with the same row ranges as the inputs.
    """

    def __init__(self, input_data, labels, batch_size):
        image_filenames, titles = input_data
        self.image_filenames = image_filenames
        self.titles = titles
        self.labels = labels
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        n_samples = len(self.image_filenames)
        return int(np.ceil(n_samples / float(self.batch_size)))

    def __getitem__(self, idx):
        """Build batch ``idx``: load its images from disk and densify its title rows."""
        rows = slice(idx * self.batch_size, (idx + 1) * self.batch_size)

        # Densify only this batch's title rows
        batch_x_titles = np.array(self.titles[rows].todense())

        # Load every image at 224x224 and apply the ResNet50 preprocessing
        batch_x_image = np.array([
            image.img_to_array(image.load_img(path, target_size=(224, 224)))
            for path in self.image_filenames[rows]
        ])
        batch_x_image = preprocess_input(batch_x_image)

        batch_y = np.array(self.labels[rows])
        return [batch_x_image, batch_x_titles], batch_y
      
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Run settings: fraction of rows used for training, batch size, epoch count
split_ratio = 0.95
batch_size = 64
num_epochs = 1

# Row index where the training part ends and the validation part begins
divide = int(split_ratio*m)

# Image paths as a numpy array so they can be sliced like the other inputs
true_path_np = np.array(df['true_path'])

# Each input is a [image paths, encoded titles] pair covering the same rows
training_filenames = [true_path_np[:divide], titles_enc[:divide]]
validation_filenames = [true_path_np[divide:], titles_enc[divide:]]
GT_training = Y_train[:divide]
GT_validation = Y_train[divide:]

my_training_batch_generator = My_Generator(training_filenames, GT_training, batch_size)
my_validation_batch_generator = My_Generator(validation_filenames, GT_validation, batch_size)

In [0]:
# Start training
print('Training in progres...')
# Each generator supplies its own batch count. Previously both step arguments
# were m // batch_size, where m counts ALL images: the training epoch ran past
# the training split, and validation requested far more batches than the small
# validation split holds, so the same validation data was replayed repeatedly.
history = model.fit_generator(generator=my_training_batch_generator,
                              steps_per_epoch=len(my_training_batch_generator),
                              epochs=num_epochs,
                              verbose=1,
                              validation_data=my_validation_batch_generator,
                              validation_steps=len(my_validation_batch_generator),
                              use_multiprocessing=False,
                              workers=16,
                              max_queue_size=32)

# Save trained model
# NOTE(review): the file name mentions "4epochs" regardless of num_epochs - confirm it is still intended.
model.save(base_folder+'models/ultimate_v2_4epochs.h5')

# Evaluate on the validation generator and report the second metric (index 1) as a percentage
scores = model.evaluate_generator(generator=my_validation_batch_generator, verbose = 1)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [0]:
# Write the current model to models/new_model.h5 under the base folder
new_model_path = base_folder + 'models/new_model.h5'
model.save(new_model_path)

## Visualise accuracy and loss history

In [0]:
def _history_series(history_dict, *candidate_keys):
    """Return the series stored under the first of `candidate_keys` present in `history_dict`.

    Raises KeyError listing the available keys when none of the candidates exists.
    """
    for key in candidate_keys:
        if key in history_dict:
            return history_dict[key]
    raise KeyError('None of {} found in training history (available: {})'.format(
        candidate_keys, sorted(history_dict)))


def _plot_train_vs_validation(train_values, validation_values, title, ylabel):
    """Plot one metric per epoch for the training and the validation run on a fresh figure."""
    plt.plot(train_values)
    plt.plot(validation_values)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    # The second curve is the validation series; legend labels are kept as they were.
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


# Plot training & validation accuracy values.
# Keras releases differ in whether the accuracy metric is logged as 'acc' or
# 'accuracy', so both spellings are accepted instead of failing with a KeyError.
_plot_train_vs_validation(
    _history_series(history.history, 'acc', 'accuracy'),
    _history_series(history.history, 'val_acc', 'val_accuracy'),
    'Model accuracy',
    'Accuracy')

# Plot training & validation loss values
_plot_train_vs_validation(
    history.history['loss'],
    history.history['val_loss'],
    'Model loss',
    'Loss')

# Predict test set for submission

In [0]:
class Predict_Generator(Sequence):
    """Keras ``Sequence`` that yields label-free ``[images, titles]`` batches.

    ``input_data`` is a two-item sequence: the image file paths and the encoded
    titles (an object that supports row slicing and ``.todense()``).
    """

    def __init__(self, input_data, batch_size):
        image_filenames, titles = input_data
        self.image_filenames = image_filenames
        self.titles = titles
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        n_samples = len(self.image_filenames)
        return int(np.ceil(n_samples / float(self.batch_size)))

    def __getitem__(self, idx):
        """Build batch ``idx``: load its images from disk and densify its title rows."""
        start = idx * self.batch_size
        stop = (idx + 1) * self.batch_size

        # Densify only this batch's title rows
        batch_x_titles = np.array(self.titles[start:stop].todense())

        # Load every image at 224x224 and apply the ResNet50 preprocessing
        loaded_images = [
            image.img_to_array(image.load_img(path, target_size=(224, 224)))
            for path in self.image_filenames[start:stop]
        ]
        batch_x_image = preprocess_input(np.array(loaded_images))

        return [batch_x_image, batch_x_titles]

# Wrap the test image paths (as a numpy array) and the encoded test titles
# in a batch generator
batch_size = 256
true_path_np_test = np.array(df_test['true_path'])
prediction_inputs = [true_path_np_test, titles_test_enc]
my_prediction_batch_generator = Predict_Generator(prediction_inputs, batch_size)

# Run the model over every test batch
model_outputs = model.predict_generator(my_prediction_batch_generator,verbose=1)

# Keep the index of the highest-scoring output for each row
predictions = np.argmax(model_outputs, axis=1)

# Write one (itemid, Category) row per test item
output_file = predict_folder + 'ult_v2_2epochs.csv'
output = pd.DataFrame({'itemid':df_test['itemid'],'Category':predictions})
output.to_csv(output_file, index=False)