In [1]:
# Load basic libraries
import numpy as np
import pandas as pd
import os
import sys
%matplotlib inline

# Load Keras Libraries
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Input
from keras import applications
from keras import optimizers
from keras import backend as K

from sklearn.utils import shuffle

Using TensorFlow backend.


In [2]:
# Read the train and test tables from the CSV files one directory up.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv')
)

In [3]:
# Category naming: load the category tree and derive the lookup tables from it.
import json

with open('../categories.json', 'r') as f:
    allCat = json.load(f)
print('The top level categories are: {}'.format(list(allCat.keys())))


print('There are {} categories in Mobile'.format(len(allCat['Mobile'])))
print('There are {} categories in Fashion'.format(len(allCat['Fashion'])))
print('There are {} categories in Beauty'.format(len(allCat['Beauty'])))

# Sorted values stored under each top-level category.
mobCat = sorted(allCat['Mobile'].values())
fasCat = sorted(allCat['Fashion'].values())
beuCat = sorted(allCat['Beauty'].values())

# Map each of those values to the name of its top-level category.
# Later groups overwrite earlier ones if a value occurs twice.
folder_path_dict = {
    **dict.fromkeys(mobCat, 'Mobile'),
    **dict.fromkeys(fasCat, 'Fashion'),
    **dict.fromkeys(beuCat, 'Beauty'),
}

##
labels = allCat

# value -> item name, flattened over all top-level categories (last one wins).
numerical2label = {
    item_idx: item_name
    for master_dict in labels.values()
    for item_name, item_idx in master_dict.items()
}

# item name -> value: the inverse of numerical2label.
label2numerical = {
    item_name: item_idx for item_idx, item_name in numerical2label.items()
}

The top level categories are: ['Mobile', 'Fashion', 'Beauty']
There are 27 categories in Mobile
There are 14 categories in Fashion
There are 17 categories in Beauty


In [None]:
# Work on a copy of train_data so the loaded table stays untouched;
# the path columns are added to train_df further down in this cell.
train_df = train_data.copy()

def update_file_path(inp):
    """Build the training-image path for one ``(image_path, Category)`` pair.

    Parameters
    ----------
    inp : sequence or pandas.Series
        Holds, in this order, the raw image path (``'<prefix>/<file name>'``
        where ``<prefix>`` is one of ``beauty_image``, ``fashion_image``,
        ``mobile_image``) and the category value of the row.

    Returns
    -------
    str
        ``'Train/<Folder>/<category>/<file name>'``; ``'.jpg'`` is appended
        when the file name contains no ``'.'``.

    Raises
    ------
    KeyError
        If the path prefix is not one of the three known image folders.
    IndexError
        If ``inp`` has fewer than two items or the path contains no ``'/'``.
    """
    # Read the two fields by *position* through iteration instead of
    # inp[0] / inp[1]: on a row Series whose index is the column names,
    # integer keys are treated as labels by recent pandas versions.
    values = list(inp)
    x = values[0]
    cat = values[1]
    path_segs = x.split('/')

    path_map = {'beauty_image':'Beauty', 'fashion_image':'Fashion', 'mobile_image':'Mobile'}
    base_path = 'Train/' + path_map[path_segs[0]]
    rel_path = path_segs[1]
    # A file name without any dot has no extension: default to .jpg.
    if '.' not in rel_path:
        rel_path = rel_path + '.jpg'
    return base_path + '/' + str(cat) + '/' + rel_path

# Attach the rewritten image path and the raw top-level folder prefix to every row.
train_df['new_path'] = train_df[['image_path', 'Category']].apply(update_file_path, axis=1)
train_df['meta_cat'] = train_df['image_path'].apply(lambda path: path.partition('/')[0])

In [None]:
# Select the training rows of one top-level category; currently fashion
# (CHANGE ListCat and cur_cat here for other categories)
cur_cat = 'fashion_image'
ListCat = fasCat
Cat_Size = len(ListCat)

train_byCat = train_df.groupby('meta_cat')
cat_train = train_byCat.get_group(cur_cat)
cat_train.shape

In [5]:
# Side lengths the images are resized to.
img_width = img_height = 128

# Network input shape, laid out according to the backend's channel ordering.
channels_first = K.image_data_format() == 'channels_first'
input_shape = (3, img_width, img_height) if channels_first else (img_width, img_height, 3)

In [6]:

# MobileNet initialised with the 'imagenet' weights, without its classification
# top (include_top=False), built for the input_shape chosen above.
base_model = applications.MobileNet(weights='imagenet', include_top=False, 
                                        input_shape=input_shape)


In [7]:
## Model
# Global-average-pool the output of the base network.
last_layer = base_model.output
neck = GlobalAveragePooling2D()(last_layer)

# Feature extractor: base model input -> pooled output ("bottleneck" features).
model = Model(inputs=base_model.input, outputs=neck)

In [None]:
# Image loader that multiplies pixel values by 1/255.
datagen = ImageDataGenerator(rescale=1. / 255)
# Root directory handed to flow_from_dataframe as `directory`.
# NOTE(review): hard-coded absolute path — adjust for your machine.
base_dir = '/mnt/disks/NDSC'

In [None]:
# Training generator: yields image batches only (no labels), in dataframe order.
batch_size = 50
generator = datagen.flow_from_dataframe(
        dataframe=cat_train,
        directory=base_dir,
        x_col="new_path",
        y_col= None,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        seed=42,
        class_mode=None,  # no targets: the generator is only used for prediction
        shuffle=False)  # keep dataframe order so features can be matched back to rows

In [None]:
# Extract bottleneck features for the training images, full batches only.
# NOTE: the step count is num_samples // batch_size, so samples that would fall
# into a final partial batch are NOT predicted here; the following cells
# process that remainder separately.
num_samples = generator.n
generator.reset()

bottleneck_features_fashion = model.predict_generator(
        generator, num_samples // batch_size, verbose=1, use_multiprocessing=False)

In [None]:
# Persist the features of the full-batch pass.
np.save('Fashion_bottleneck_part_1.npy', bottleneck_features_fashion)

In [None]:
# Number of training rows that have no feature vector yet.
len(cat_train)-len(bottleneck_features_fashion)

In [None]:
# Rows left over after the full-batch pass. The generator ran in dataframe
# order (shuffle=False), so the uncovered rows are the last ones of cat_train.
# The count is derived from the data instead of being hard-coded, so this
# still works when the dataset size or batch size changes.
n_fas_rem = len(cat_train) - len(bottleneck_features_fashion)
fas_rem = cat_train.tail(n_fas_rem)
fas_rem

In [None]:
# Generator for the leftover training rows (fas_rem), again without labels
# and in dataframe order.
batch_size = 2
generator_rem = datagen.flow_from_dataframe(
        dataframe=fas_rem,
        directory=base_dir,
        x_col="new_path",
        y_col= None,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        seed=42,
        class_mode=None,
        shuffle=False)

In [None]:
# Extract bottleneck features for the leftover training rows.
num_samples_rem = generator_rem.n
generator_rem.reset()

# Ceiling division: this pass must cover every leftover sample, including a
# final batch smaller than batch_size (floor division would skip it).
steps_rem = -(-num_samples_rem // batch_size)

bottleneck_features_fashion_rem = model.predict_generator(
        generator_rem, steps_rem, verbose=1, use_multiprocessing=False)

In [None]:
# Sanity check: largest value in the second leftover feature row.
max(bottleneck_features_fashion_rem[1])

In [None]:
# Sanity check: largest value in the first feature row of the main pass.
max(bottleneck_features_fashion[0])

In [None]:
# Sanity check: largest value in the last feature row of the main pass.
max(bottleneck_features_fashion[-1])

In [None]:
## Concatenate
# Stack the main-pass features and the leftover features along the sample axis.
X_IMG_FAS_TRAIN = np.concatenate((bottleneck_features_fashion,bottleneck_features_fashion_rem), axis = 0)

In [None]:
# Inspect the shape of the combined training feature array.
X_IMG_FAS_TRAIN.shape

In [None]:
# Persist the combined training features.
np.save('X_IMG_FAS_TRAIN.npy', X_IMG_FAS_TRAIN)

In [None]:
# Sanity check: largest value in the second row of the combined features.
max(X_IMG_FAS_TRAIN[1])

In [None]:
# Number of training feature rows.
n_fas = len(X_IMG_FAS_TRAIN)

In [None]:
# One-hot encode the Category value of every row of cat_train.
# NOTE(review): assumes row i of X_IMG_FAS_TRAIN belongs to row i of
# cat_train — TODO confirm no image was skipped by the generator.
fas_y = cat_train.Category.values
# 58 columns, the same width as the softmax output layer of the classifier.
fas_targets = np.zeros((n_fas, 58))
# Set column fas_y[i] of row i to 1.
fas_targets[np.arange(n_fas), fas_y] = 1

In [None]:
## Model fine-tuning: train a classifier on the bottleneck features

In [None]:
# Fashion model: dense classifier over a 1024-long feature vector.
img_input = Input(shape=(1024,), name='img_input')

# Hidden stack, applied in order: dropout -> dense -> dropout -> dense -> dropout.
head_layers = [
    Dropout(0.2),
    Dense(1024, activation='relu', name='fc-1'),  # dense 1
    Dropout(0.5),
    Dense(512, activation='relu'),  # dense layer 2
    Dropout(0.5),
]
x = img_input
for head_layer in head_layers:
    x = head_layer(x)

# 58-way softmax output.
out = Dense(58, activation='softmax', name='out_layer')(x)

Fas_model = Model(inputs=img_input, outputs=out)

Fas_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
Fas_model.summary()

In [None]:
def NYRS_gen(X, y, batch_size):
    """Yield shuffled ``(X_batch, y_batch)`` mini-batches forever.

    At the start of every pass ``X`` and ``y`` are shuffled together, then
    ``len(X) // batch_size`` consecutive full batches are yielded. Samples
    that do not fill a last full batch are left out of that pass.

    Parameters
    ----------
    X, y : sliceable sequences of equal length
        Inputs and targets; row ``i`` of ``X`` belongs to row ``i`` of ``y``.
    batch_size : int
        Number of samples per yielded batch; must be positive and not larger
        than ``len(X)``.

    Yields
    ------
    tuple
        ``(X_batch, y_batch)``, each holding ``batch_size`` samples.

    Raises
    ------
    ValueError
        On the first ``next()`` call, if ``batch_size`` is not positive or no
        full batch fits into ``X``. Without this check the loop below would
        spin forever without ever yielding.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got {}'.format(batch_size))

    # Integer floor division: same value as math.floor(len(X) / batch_size),
    # without depending on a `math` import from another cell.
    n_batches = len(X) // batch_size
    if n_batches == 0:
        raise ValueError(
            'batch_size ({}) is larger than the number of samples ({})'.format(
                batch_size, len(X)))

    while True:
        X, y = shuffle(X, y)  # Shuffle inputs and targets together.

        for i in range(n_batches):
            start = i * batch_size
            stop = start + batch_size
            yield X[start:stop], y[start:stop]

In [None]:
# Train the fashion classifier on an 80/20 train/validation split.
batch_size = 32
import math

N = len(X_IMG_FAS_TRAIN)

# Shuffle features and targets together before splitting.
X_IMG_FAS_TRAIN, fas_targets = shuffle(X_IMG_FAS_TRAIN, fas_targets)

N_train = int(0.8*N)

X_train = X_IMG_FAS_TRAIN[:N_train]
y_train = fas_targets[:N_train]

X_val = X_IMG_FAS_TRAIN[N_train:]
y_val = fas_targets[N_train:]

# One epoch = one pass over the TRAINING split. Counting steps from the full
# dataset would make every "epoch" run about 1.25 passes over X_train.
n_steps = len(X_train) // batch_size

# Same batch size as used for n_steps, not a second hard-coded value.
batch_gen = NYRS_gen(X_train, y_train, batch_size)

history = Fas_model.fit_generator(batch_gen, epochs=10, 
                              steps_per_epoch=n_steps, 
                              validation_data=(X_val,y_val),
                              verbose=True)

In [None]:
# Shape of the validation features.
X_val.shape

In [None]:
# Shape of the training features.
X_train.shape

In [None]:
# Shape of the one-hot training targets.
y_train.shape

## Extract bottleneck features for the test set

In [8]:
# Work on a copy of test_data; the path columns are added to test_df
# further down in this cell.
test_df = test_data.copy()

def update_test_file_path(inp):
    """Translate a raw test ``image_path`` into its path under ``Test/``.

    ``inp`` is a string ``'<prefix>/<file name>'`` whose prefix is one of
    ``beauty_image``, ``fashion_image`` or ``mobile_image``. The result is
    ``'Test/<Folder>/<file name>'``, with ``'.jpg'`` appended when the file
    name contains no ``'.'``. An unknown prefix raises ``KeyError``; a path
    without ``'/'`` raises ``IndexError``.
    """
    segments = inp.split('/')

    # Resolve the folder first so that an unknown prefix fails before indexing.
    folder = {'beauty_image':'Beauty', 'fashion_image':'Fashion', 'mobile_image':'Mobile'}[segments[0]]
    file_name = segments[1]

    # No dot anywhere in the name means no extension: default to .jpg.
    if '.' not in file_name:
        file_name += '.jpg'

    return 'Test/' + folder + '/' + file_name

# Attach the rewritten image path and the raw top-level folder prefix to every test row.
test_df['new_path'] = test_df['image_path'].apply(update_test_file_path)
test_df['meta_cat'] = test_df['image_path'].apply(lambda path: path.partition('/')[0])

In [9]:
# Select the test rows of one top-level category; currently fashion
# (CHANGE cur_cat here for other categories)
test_byCat = test_df.groupby('meta_cat')
cur_cat = 'fashion_image'
cat_test = test_byCat.get_group(cur_cat)
cat_test.shape

(55440, 5)

In [10]:
# Image loader for the test images; multiplies pixel values by 1/255.
datagen_test = ImageDataGenerator(rescale=1. / 255)
# Root directory handed to flow_from_dataframe as `directory`.
# NOTE(review): hard-coded absolute path — adjust for your machine.
base_dir = '/mnt/disks/NDSC'

In [11]:
# Test generator: yields image batches only (no labels), in dataframe order.
batch_size = 50
generator_test = datagen_test.flow_from_dataframe(
        dataframe=cat_test,
        directory=base_dir,
        x_col="new_path",
        y_col= None,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        seed=42,
        class_mode=None,  # no targets: the generator is only used for prediction
        shuffle=False)  # keep dataframe order so features can be matched back to rows

Found 55440 validated image filenames.


In [13]:
# Extract bottleneck features for the test images, full batches only.
# NOTE: the step count is num_samples // batch_size, so samples that would fall
# into a final partial batch are NOT predicted here; the following cells
# process that remainder separately.
num_samples = generator_test.n
generator_test.reset()

bottleneck_features_fashion_test = model.predict_generator(
        generator_test, num_samples // batch_size, verbose=1, use_multiprocessing=False)



In [14]:
# Persist the test features of the full-batch pass.
np.save('df_TEST_Fashion_bottleneck_part_1.npy', bottleneck_features_fashion_test)

In [15]:
# The remaining part: test samples left over after the full batches above

In [16]:
# Number of test rows that have no feature vector yet.
N_rem = len(cat_test)-len(bottleneck_features_fashion_test)
N_rem

40

In [18]:
# The last N_rem rows of cat_test: the generator ran with shuffle=False, so
# these are the rows the full-batch pass did not reach.
test_fas_rem = cat_test.tail(N_rem)
test_fas_rem.tail()

Unnamed: 0,itemid,title,image_path,new_path,meta_cat
131980,1825226347,stock baru atasan blouse wanita kemeja lengan ...,fashion_image/4bda1e64d3b4738cd2fcf67d52fd7b1f...,Test/Fashion/4bda1e64d3b4738cd2fcf67d52fd7b1f.jpg,fashion_image
131981,1826413179,termurah kemeja wanita atasan baju kerja kanto...,fashion_image/85f1b1bc152062b1cc87c4e53b7af66b...,Test/Fashion/85f1b1bc152062b1cc87c4e53b7af66b.jpg,fashion_image
131982,1827137407,rl women s summer chiffon shirt trumpet sleeve...,fashion_image/b39777140d57e04aadd0b4b07a86db8b...,Test/Fashion/b39777140d57e04aadd0b4b07a86db8b.jpg,fashion_image
131983,1828540009,women casual 3 4 sleeve lace patchwork t shirt...,fashion_image/3c9f9b654b70b2c80e9f59acdc15a473...,Test/Fashion/3c9f9b654b70b2c80e9f59acdc15a473.jpg,fashion_image
131984,1829477358,wanita s1017 kaos o neck dkny lengan pendek sp...,fashion_image/c61ba632459162c5bdc70ea1e6ba3ca1...,Test/Fashion/c61ba632459162c5bdc70ea1e6ba3ca1.jpg,fashion_image


In [None]:
# Generator for the leftover test rows (test_fas_rem), without labels and in
# dataframe order. batch_size = 1 means every leftover sample is its own batch.
datagen_test_rem = ImageDataGenerator(rescale=1. / 255)
batch_size = 1
test_generator_rem = datagen_test_rem.flow_from_dataframe(
        dataframe=test_fas_rem,
        directory=base_dir,
        x_col="new_path",
        y_col= None,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        seed=42,
        class_mode=None,
        shuffle=False)

In [None]:
# Extract bottleneck features for the leftover test rows.
num_samples_rem = test_generator_rem.n
test_generator_rem.reset()

# With the batch size of 1 set above, num_samples_rem // batch_size steps
# cover every leftover sample.
test_bottleneck_features_fashion_rem = model.predict_generator(
        test_generator_rem, num_samples_rem // batch_size, verbose=1, use_multiprocessing=False)

In [None]:
# Stack the main-pass test features and the leftover test features along the sample axis.
X_IMG_FAS_TEST = np.concatenate((bottleneck_features_fashion_test,test_bottleneck_features_fashion_rem), axis = 0)

In [None]:
# Persist the combined test features.
np.save('X_IMG_FAS_TEST.npy', X_IMG_FAS_TEST)

In [None]:
#### ----- ###