In [1]:
# Load basic libraries
import numpy as np
import pandas as pd
import os
import sys
%matplotlib inline

# Load Keras Libraries
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Input
from keras import applications
from keras import optimizers
from keras import backend as K
from keras.layers.normalization import BatchNormalization

from sklearn.utils import shuffle

Using TensorFlow backend.


In [2]:
# Read the train and test tables from the parent directory
train_data, test_data = [
    pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv')
]

In [3]:
# Category naming: load the name -> id tables and derive lookup dictionaries
import json

with open('../categories.json','r') as f:
    allCat = json.load(f)
print('The top level categories are: {}'.format(list(allCat.keys())))

print('There are {} categories in Mobile'.format(len(allCat['Mobile'])))
print('There are {} categories in Fashion'.format(len(allCat['Fashion'])))
print('There are {} categories in Beauty'.format(len(allCat['Beauty'])))

# Sorted category ids for each top-level group
mobCat, fasCat, beuCat = (
    sorted(allCat[group].values()) for group in ('Mobile', 'Fashion', 'Beauty')
)

# category id -> name of the top-level folder it belongs to
folder_path_dict = {}
for folder, cat_ids in (('Mobile', mobCat), ('Fashion', fasCat), ('Beauty', beuCat)):
    folder_path_dict.update(dict.fromkeys(cat_ids, folder))

# category id -> category name, across all top-level groups
labels = allCat
numerical2label = {
    idx: name
    for group in labels.values()
    for name, idx in group.items()
}

# category name -> category id (inverse of the mapping above)
label2numerical = {name: idx for idx, name in numerical2label.items()}

The top level categories are: ['Mobile', 'Fashion', 'Beauty']
There are 27 categories in Mobile
There are 14 categories in Fashion
There are 17 categories in Beauty


In [4]:
# Work on a copy so the raw train table stays untouched
train_df = train_data.copy(deep=True)

def update_file_path(inp):
    """Build the relative on-disk path of a training image.

    Parameters
    ----------
    inp : sequence
        ``(image_path, category)`` in that order: a tuple/list, or a
        DataFrame row as passed by ``DataFrame.apply(..., axis=1)``.
        ``image_path`` has the form ``'<prefix>_image/<file name>'``.

    Returns
    -------
    str
        ``'Train/<Folder>/<category>/<file name>'``. ``'.jpg'`` is appended
        when the file name contains no dot.

    Raises
    ------
    KeyError
        If the leading folder is not ``beauty_image``, ``fashion_image`` or
        ``mobile_image``.
    IndexError
        If ``image_path`` contains no ``'/'``.
    """
    # Read the two fields by iteration rather than ``inp[0]`` / ``inp[1]``:
    # integer keys on a label-indexed pandas row are a deprecated positional
    # fallback, while iterating a row always yields its values in order.
    x, cat = list(inp)[:2]
    path_segs = x.split('/')

    path_map = {'beauty_image':'Beauty', 'fashion_image':'Fashion', 'mobile_image':'Mobile'}
    base_path = 'Train/' + path_map[path_segs[0]]
    rel_path = path_segs[1]
    # Some entries come without an extension; the files on disk are .jpg
    if '.' not in rel_path:
        rel_path = rel_path + '.jpg'
    return base_path + '/' + str(cat) + '/' + rel_path

# Row-wise: (image_path, Category) -> path under Train/<Folder>/<Category>/
train_df['new_path'] = train_df[['image_path', 'Category']].apply(update_file_path, axis=1)
# Leading folder of the raw path identifies the top-level group
train_df['meta_cat'] = train_df['image_path'].map(lambda p: p.split('/')[0])

In [5]:
# Load data for mobile categories (CHANGE here for other categories)
cur_cat = 'mobile_image'
train_byCat = train_df.groupby('meta_cat')
cat_train = train_byCat.get_group(cur_cat)
cat_train.shape

(160330, 6)

In [6]:
# Image dimensions fed to the network
img_width, img_height = 128, 128

# Channel axis position depends on the backend's image data format
input_shape = (
    (3, img_width, img_height)
    if K.image_data_format() == 'channels_first'
    else (img_width, img_height, 3)
)

In [12]:
# Mobile-Net Model

In [7]:
# ImageNet-pretrained MobileNet without its classification top
base_model = applications.MobileNet(
    input_shape=input_shape,
    include_top=False,
    weights='imagenet',
)

In [8]:
## Feature extractor: MobileNet output followed by global average pooling
last_layer = base_model.output
pooling_layer = GlobalAveragePooling2D()
neck = pooling_layer(last_layer)

# Model mapping an input image to its pooled bottleneck vector
model = Model(inputs=base_model.input, outputs=neck)

In [9]:
# Root directory that the relative 'new_path' entries are resolved against
base_dir = '/mnt/disks/NDSC'
# Only preprocessing: scale pixel values by 1/255
datagen = ImageDataGenerator(rescale=1.0 / 255)

In [10]:
# Generator-1: mobile training images, in dataframe order, without labels
batch_size = 50
train_flow_options = dict(
    dataframe=cat_train,
    directory=base_dir,
    x_col="new_path",
    y_col=None,
    class_mode=None,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    shuffle=False,
    seed=42,
)
generator = datagen.flow_from_dataframe(**train_flow_options)

Found 160330 validated image filenames.


In [11]:
# Generate features for train samples
num_samples = generator.n
generator.reset()

# Floor division: only whole batches are predicted here, so the trailing
# num_samples % batch_size images are NOT included in this array.
n_full_batches = num_samples // batch_size
bottleneck_features_mobile = model.predict_generator(
    generator,
    steps=n_full_batches,
    verbose=1,
    use_multiprocessing=False,
)



In [12]:
# Persist the batched part of the train features
np.save('Mobile_bottleneck_part_1.npy', bottleneck_features_mobile)

In [13]:
# Number of rows that did not fit into a whole batch above
N_rem = cat_train.shape[0] - bottleneck_features_mobile.shape[0]
N_rem

30

In [14]:
# Last N_rem rows of the mobile train table: the images not yet featurised
mob_rem = cat_train.tail(N_rem)

In [15]:
# Remainder generator: leftover training images, one per batch, in order
batch_size = 1
rem_flow_options = dict(
    dataframe=mob_rem,
    directory=base_dir,
    x_col="new_path",
    y_col=None,
    class_mode=None,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    shuffle=False,
    seed=42,
)
generator_rem = datagen.flow_from_dataframe(**rem_flow_options)

Found 30 validated image filenames.


In [16]:
# Features for the leftover training images
num_samples_rem = generator_rem.n
generator_rem.reset()

n_rem_steps = num_samples_rem // batch_size
bottleneck_features_mobile_rem = model.predict_generator(
    generator_rem,
    steps=n_rem_steps,
    verbose=1,
    use_multiprocessing=False,
)



In [17]:
# Spot check: largest value in the second remainder feature vector
max(bottleneck_features_mobile_rem[1])

5.0132027

In [18]:
# Spot check: largest value in the first batched feature vector
max(bottleneck_features_mobile[0])

5.828734

In [19]:
# Spot check: largest value in the last batched feature vector
max(bottleneck_features_mobile[-1])

5.4925256

In [20]:
## Stack batched features and remainder features row-wise
feature_parts = [bottleneck_features_mobile, bottleneck_features_mobile_rem]
X_IMG_MOB_TRAIN = np.concatenate(feature_parts, axis=0)

In [21]:
# (rows, feature width) of the full train feature matrix
np.shape(X_IMG_MOB_TRAIN)

(160330, 1024)

In [22]:
# Persist the complete train feature matrix
np.save('X_IMG_MOB_TRAIN.npy', X_IMG_MOB_TRAIN)

In [23]:
# Reload the train feature matrix from disk
X_IMG_MOB_TRAIN = np.load('X_IMG_MOB_TRAIN.npy')

In [29]:
# Spot check after reload: 30 remainder rows were appended, so row -31 is the
# last row of the batched part and should match max(bottleneck_features_mobile[-1])
max(X_IMG_MOB_TRAIN[-31])

5.4925256

In [30]:
# Number of mobile training samples (rows of the feature matrix)
n_mob = len(X_IMG_MOB_TRAIN)

In [31]:
# One-hot encode the category ids into an (n_mob, 58) target matrix
mob_y = cat_train['Category'].values
mob_targets = np.zeros((n_mob, 58))
sample_rows = np.arange(n_mob)
mob_targets[sample_rows, mob_y] = 1

In [None]:
## Model Fine tune

In [32]:
# Mobile classifier head: a small dense network on top of the 1024-wide
# bottleneck feature vectors (not on raw images)
img_input = Input(shape=(1024,), name='img_input')
x = BatchNormalization()(img_input)
x = Dropout(0.2)(x)
x = Dense(1024, activation='relu', name= 'fc-1')(x) # dense 1
x = Dropout(0.5)(x)
x = Dense(512,activation='relu')(x) #dense layer 2
x = Dropout(0.5)(x)
# 58 softmax outputs, matching the width of the one-hot target matrix
out = Dense(58, activation = 'softmax', name = 'out_layer')(x)

# Classifier model: bottleneck features in, category probabilities out

Mob_model = Model(inputs=img_input, outputs=out)

Mob_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  
Mob_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
img_input (InputLayer)       (None, 1024)              0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
fc-1 (Dense)                 (None, 1024)              1049600   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
__________

In [33]:
def NYRS_gen(X, y, batch_size):
    """Endlessly yield shuffled ``(X_batch, y_batch)`` mini-batches.

    Before every pass ``X`` and ``y`` are shuffled together, then cut into
    ``len(X) // batch_size`` consecutive batches of exactly ``batch_size``
    rows. Rows beyond the last whole batch are skipped for that pass.

    Parameters
    ----------
    X, y : indexable, same length
        Features and targets; must support slicing and ``shuffle``.
    batch_size : int
        Number of rows per yielded batch; must be positive.

    Yields
    ------
    tuple
        ``(X_batch, y_batch)`` slices of the shuffled data.

    Raises
    ------
    ValueError
        On the first ``next()`` call, if ``batch_size`` is not positive or
        ``X`` holds fewer than ``batch_size`` rows. Without this check the
        loop below would spin forever without yielding anything.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got {}'.format(batch_size))

    # Integer floor division; avoids depending on ``math`` being imported
    n_batches = len(X) // batch_size
    if n_batches == 0:
        raise ValueError(
            'need at least batch_size={} samples, got {}'.format(batch_size, len(X)))

    while True:
        X, y = shuffle(X, y)  # new joint permutation for every pass

        for i in range(n_batches):
            start = i * batch_size
            stop = start + batch_size
            yield X[start:stop], y[start:stop]

In [35]:
batch_size = 32
import math

N = len(X_IMG_MOB_TRAIN)

# Shuffle features and targets together before splitting; a fixed seed keeps
# the permutation applied here the same from run to run.
X_IMG_MOB_TRAIN, mob_targets = shuffle(X_IMG_MOB_TRAIN, mob_targets, random_state=42)

# 80 % of the rows for training, the remaining 20 % for validation
N_train = int(0.8*N)

X_train = X_IMG_MOB_TRAIN[:N_train]
y_train = mob_targets[:N_train]

X_val = X_IMG_MOB_TRAIN[N_train:]
y_val = mob_targets[N_train:]

# One epoch = one pass over the TRAINING split. Counting steps from the full
# dataset would make every epoch run about 1.25 passes over the training rows.
n_steps = len(X_train) // batch_size

# Use the same batch_size that n_steps was derived from
batch_gen = NYRS_gen(X_train, y_train, batch_size)

history = Mob_model.fit_generator(batch_gen, epochs=10, 
                              steps_per_epoch=n_steps, 
                              validation_data=(X_val,y_val),
                              verbose=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Extract for test set

In [36]:
# Work on a copy so the raw test table stays untouched
test_df = test_data.copy(deep=True)

def update_test_file_path(inp):
    """Map a raw test ``image_path`` to its ``Test/<Folder>/<file name>`` path.

    ``inp`` is a string of the form ``'<prefix>_image/<file name>'``. The
    prefix selects the folder (Beauty, Fashion or Mobile), and ``'.jpg'`` is
    appended when the file name contains no dot.
    """
    folder_by_prefix = {
        'beauty_image': 'Beauty',
        'fashion_image': 'Fashion',
        'mobile_image': 'Mobile',
    }
    pieces = inp.split('/')
    target_dir = 'Test/' + folder_by_prefix[pieces[0]]

    file_name = pieces[1]
    if '.' not in file_name:
        # no extension given
        file_name += '.jpg'

    return '/'.join((target_dir, file_name))

raw_test_paths = test_df['image_path']
# Path under Test/<Folder>/ for every test image
test_df['new_path'] = raw_test_paths.map(update_test_file_path)
# Leading folder of the raw path identifies the top-level group
test_df['meta_cat'] = raw_test_paths.map(lambda p: p.split('/')[0])

In [37]:
# Load data for mobile categories (CHANGE here for other categories)
cur_cat = 'mobile_image'
test_byCat = test_df.groupby('meta_cat')
cat_test = test_byCat.get_group(cur_cat)
cat_test.shape

(40417, 5)

In [38]:
# Root directory that the relative 'new_path' entries are resolved against
base_dir = '/mnt/disks/NDSC'
# Only preprocessing for test images: scale pixel values by 1/255
datagen_test = ImageDataGenerator(rescale=1.0 / 255)

In [39]:
# Generator-test: mobile test images, in dataframe order, without labels
batch_size = 50
test_flow_options = dict(
    dataframe=cat_test,
    directory=base_dir,
    x_col="new_path",
    y_col=None,
    class_mode=None,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    shuffle=False,
    seed=42,
)
generator_test = datagen_test.flow_from_dataframe(**test_flow_options)

Found 40417 validated image filenames.


In [40]:
# Generate features for test samples
num_samples = generator_test.n
generator_test.reset()

# Floor division: only whole batches are predicted here, so the trailing
# num_samples % batch_size images are NOT included in this array.
n_full_test_batches = num_samples // batch_size
bottleneck_features_mobile_test = model.predict_generator(
    generator_test,
    steps=n_full_test_batches,
    verbose=1,
    use_multiprocessing=False,
)



In [41]:
# Persist the batched part of the test features
np.save('df_TEST_Mobile_bottleneck_part_1.npy', bottleneck_features_mobile_test)

In [15]:
# The remaining part

In [42]:
# Number of test rows that did not fit into a whole batch above
N_rem = cat_test.shape[0] - bottleneck_features_mobile_test.shape[0]
N_rem

17

In [43]:
# Last N_rem rows of the mobile test table: the images not yet featurised
test_mob_rem = cat_test.tail(N_rem)
# Preview the final rows of that remainder
test_mob_rem.tail()

Unnamed: 0,itemid,title,image_path,new_path,meta_cat
172397,1781957365,nokia 5.1 plus ram 3gb 32gb garansi resmi 1 ta...,mobile_image/3dbd99b9d999d326d8ae57f7ad1f1b3e.jpg,Test/Mobile/3dbd99b9d999d326d8ae57f7ad1f1b3e.jpg,mobile_image
172398,1839851276,big promo add whatshap 0821 9127 5399 iphone 7...,mobile_image/6d45e5c7e36ac897f58a9f72ff4bf0b8.jpg,Test/Mobile/6d45e5c7e36ac897f58a9f72ff4bf0b8.jpg,mobile_image
172399,955369303,datang lagi sharp r1 ram 3gb 32gb gratis silic...,mobile_image/08f68bb1cc3f381364776ac5cfd9e45e.jpg,Test/Mobile/08f68bb1cc3f381364776ac5cfd9e45e.jpg,mobile_image
172400,1638035772,sony xperia z5 premium au ram 3gb rom 32gb sec...,mobile_image/1d0610ea0f43d75ecc3ff951f6c647d4.jpg,Test/Mobile/1d0610ea0f43d75ecc3ff951f6c647d4.jpg,mobile_image
172401,1498091427,xiaomi mi 8 ram 6 128gb black,mobile_image/6649fa043a7b2eebda6ed904c966a14b.jpg,Test/Mobile/6649fa043a7b2eebda6ed904c966a14b.jpg,mobile_image


In [44]:
# Remainder generator: leftover test images, one per batch, in order
datagen_test_rem = ImageDataGenerator(rescale=1.0 / 255)
batch_size = 1
test_rem_flow_options = dict(
    dataframe=test_mob_rem,
    directory=base_dir,
    x_col="new_path",
    y_col=None,
    class_mode=None,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    shuffle=False,
    seed=42,
)
test_generator_rem = datagen_test_rem.flow_from_dataframe(**test_rem_flow_options)

Found 17 validated image filenames.


In [45]:
# Features for the leftover test images
num_samples_rem = test_generator_rem.n
test_generator_rem.reset()

n_test_rem_steps = num_samples_rem // batch_size
test_bottleneck_features_mobile_rem = model.predict_generator(
    test_generator_rem,
    steps=n_test_rem_steps,
    verbose=1,
    use_multiprocessing=False,
)



In [46]:
# Stack batched test features and remainder test features row-wise
test_feature_parts = [bottleneck_features_mobile_test, test_bottleneck_features_mobile_rem]
X_IMG_MOB_TEST = np.concatenate(test_feature_parts, axis=0)

In [47]:
# Persist the complete test feature matrix
np.save('X_IMG_MOB_TEST.npy', X_IMG_MOB_TEST)

In [None]:
#### ----- ###