In [18]:
# Load basic libraries
import numpy as np
import pandas as pd
import os
import sys
%matplotlib inline

# Load Keras Libraries
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import Input
from keras import applications
from keras import optimizers
from keras import backend as K
from keras.layers.normalization import BatchNormalization

from sklearn.utils import shuffle

In [2]:
# Load train-test data
train_data = pd.read_csv('../train.csv')
test_data = pd.read_csv('../test.csv')

In [3]:
# categories: naming 
import json
with open('../categories.json','r') as f:
    allCat = json.load(f)
print('The top level categories are: {}'.format(list(allCat.keys())))


print('There are {} categories in Mobile'.format(len(allCat['Mobile'])))
print('There are {} categories in Fashion'.format(len(allCat['Fashion'])))
print('There are {} categories in Beauty'.format(len(allCat['Beauty'])))

mobCat = sorted(list(allCat['Mobile'].values()))
fasCat = sorted(list(allCat['Fashion'].values()))
beuCat = sorted(list(allCat['Beauty'].values()))

folder_path_dict = {i:'Mobile' for i in mobCat}
folder_path_dict.update({i:'Fashion' for i in fasCat})
folder_path_dict.update({i:'Beauty' for i in beuCat})

##
numerical2label = {}
labels = allCat

for master_label in labels.keys():
    master_dict = labels[master_label]
    for item_name, item_idx in master_dict.items():
        numerical2label[item_idx] = item_name
        
label2numerical = {}
for item_idx, item_name in numerical2label.items():
    label2numerical[item_name] = item_idx

The top level categories are: ['Mobile', 'Fashion', 'Beauty']
There are 27 categories in Mobile
There are 14 categories in Fashion
There are 17 categories in Beauty


In [4]:
# update file paths accordingly in train_df
train_df = train_data.copy()

def update_file_path(inp):
    #print(inp)
    x = inp[0]
    cat = inp[1]
    path_segs = x.split('/')
    
    path_map = {'beauty_image':'Beauty', 'fashion_image':'Fashion', 'mobile_image':'Mobile'}
    base_path = 'Train/' + path_map[path_segs[0]]
    rel_path = path_segs[1]
    rel_segs = rel_path.split('.')
    if len(rel_segs) == 1:
        rel_path = rel_path + '.jpg'
    return base_path + '/' + str(cat)+ '/' + rel_path

train_df['new_path'] = train_df.loc[:,['image_path','Category']].apply(lambda x: update_file_path(x),axis=1)
train_df['meta_cat'] = train_df.loc[:,'image_path'].apply(lambda x: x.split('/')[0])  

In [5]:
# Lood data for mobile categories (CHANGE here for other categories)
train_byCat = train_df.groupby('meta_cat')
cur_cat = 'beauty_image'
cat_train = train_byCat.get_group(cur_cat)
cat_train.shape

(286583, 6)

In [6]:
# dimensions of our images.
img_width, img_height = 128, 128

# input shape
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)

In [8]:
# Mobile-Net Model

In [7]:
base_model = applications.MobileNet(weights='imagenet', include_top=False, 
                                        input_shape=input_shape)

In [8]:
## Model
last_layer = base_model.output
neck = GlobalAveragePooling2D()(last_layer)

# Base Mobilenet   Model
model = Model(inputs=base_model.input, outputs=neck)

In [9]:
datagen = ImageDataGenerator(rescale=1. / 255)
base_dir = '/mnt/disks/NDSC'

In [10]:
# Generator-1
batch_size = 50
generator = datagen.flow_from_dataframe(
        dataframe=cat_train,
        directory=base_dir,
        x_col="new_path",
        y_col= None,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        seed=42,
        class_mode=None,
        shuffle=False)

Found 286583 validated image filenames.


In [11]:
# Generate features for train samples
num_samples = generator.n
generator.reset()

bottleneck_features_beauty = model.predict_generator(
        generator, num_samples // batch_size, verbose=1, use_multiprocessing=False)



In [12]:
with open('Beauty_bottleneck_part_1.npy', 'wb') as f:
    np.save(f, bottleneck_features_beauty)

In [13]:
N_rem = len(cat_train)-len(bottleneck_features_beauty)
N_rem

33

In [14]:
beu_rem = cat_train.tail(N_rem)
beu_rem

Unnamed: 0,itemid,title,Category,image_path,new_path,meta_cat
286550,2544385,nyx soft matte lip cream smlc 18 prague,12,beauty_image/d543c41f22223eb568b44f1793346ca6.jpg,Train/Beauty/12/d543c41f22223eb568b44f1793346c...,beauty_image
286551,3307364,tony moly magic food banana cream foam cleanser,13,beauty_image/d51c8d3f9164072f359bc87223b1ede3.jpg,Train/Beauty/13/d51c8d3f9164072f359bc87223b1ed...,beauty_image
286552,3458065,nyx soft matte lip cream original usa 100,12,beauty_image/2ed542b380c88ae5dd210b15a5ae0107.jpg,Train/Beauty/12/2ed542b380c88ae5dd210b15a5ae01...,beauty_image
286553,5266506,the balm how bout them apples,12,beauty_image/2927984662957ce774bdbff0686c5965.jpg,Train/Beauty/12/2927984662957ce774bdbff0686c59...,beauty_image
286554,5404353,skinfood water color tint pilih warna,12,beauty_image/836acff8f697a00dd5e411a1fa70217e.jpg,Train/Beauty/12/836acff8f697a00dd5e411a1fa7021...,beauty_image
286555,5534803,tony moly natural green handcream moringa,13,beauty_image/66ff3cb3a885969c4d5237af7f398c27.jpg,Train/Beauty/13/66ff3cb3a885969c4d5237af7f398c...,beauty_image
286556,5708831,nyx soft matte lip cream,12,beauty_image/b10f28a3c64ff40e3cbef905c80c5062.jpg,Train/Beauty/12/b10f28a3c64ff40e3cbef905c80c50...,beauty_image
286557,5756426,nyx soft matte lip cream smlc21 transylvania 1...,13,beauty_image/63267c45822a6b8149546b50d360e276.jpg,Train/Beauty/13/63267c45822a6b8149546b50d360e2...,beauty_image
286558,6620800,nyx lip cream,12,beauty_image/6778bf2fec23be0052d1da20845745b0.jpg,Train/Beauty/12/6778bf2fec23be0052d1da20845745...,beauty_image
286559,8098574,clearance revlon lacquer balm,12,beauty_image/5188ccda182ccbb03436d4bc6a889a2c.jpg,Train/Beauty/12/5188ccda182ccbb03436d4bc6a889a...,beauty_image


In [15]:
# Generator-1
batch_size = 1
generator_rem = datagen.flow_from_dataframe(
        dataframe=beu_rem,
        directory=base_dir,
        x_col="new_path",
        y_col= None,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        seed=42,
        class_mode=None,
        shuffle=False)

Found 33 validated image filenames.


In [16]:
num_samples_rem = generator_rem.n
generator_rem.reset()

bottleneck_features_beauty_rem = model.predict_generator(
        generator_rem, num_samples_rem // batch_size, verbose=1, use_multiprocessing=False)



In [18]:
max(bottleneck_features_beauty_rem[1])

5.3103123

In [19]:
max(bottleneck_features_beauty[0])

5.607791

In [20]:
max(bottleneck_features_beauty[-1])

5.340643

In [21]:
## Concatenate
X_IMG_BEU_TRAIN = np.concatenate((bottleneck_features_beauty,bottleneck_features_beauty_rem), axis = 0)

In [22]:
X_IMG_BEU_TRAIN.shape

(286583, 1024)

In [23]:
with open('X_IMG_BEU_TRAIN.npy', 'wb') as f:
    np.save(f, X_IMG_BEU_TRAIN)

In [9]:
with open('X_IMG_BEU_TRAIN.npy', 'rb') as f:
    X_IMG_BEU_TRAIN= np.load(f)

In [10]:
max(X_IMG_BEU_TRAIN[1])

5.954384

In [11]:
n_beu = X_IMG_BEU_TRAIN.shape[0]

In [13]:
beu_y = cat_train.Category.values
beu_targets = np.zeros((n_beu, 58))
beu_targets[np.arange(n_beu), beu_y] = 1

In [None]:
## Model Fine tune

In [19]:
# Beauty model (Quick Check)
img_input = Input(shape=(1024,), name='img_input')
x = BatchNormalization()(img_input)
x = Dropout(0.2)(x)
x = Dense(1024, activation='relu', name= 'fc-1')(x) # dense 1
x = Dropout(0.5)(x)
x = Dense(512,activation='relu')(x) #dense layer 2
x = Dropout(0.5)(x)
out = Dense(58, activation = 'softmax', name = 'out_layer')(x)

# Base Mobilenet   Model

Beu_model = Model(inputs=img_input, outputs=out)

Beu_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  
Beu_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
img_input (InputLayer)       (None, 1024)              0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
fc-1 (Dense)                 (None, 1024)              1049600   
_________________________________________________________________
dropout_5 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)               0         
__________

In [20]:
def NYRS_gen(X, y, batch_size):
    
    n_batches = math.floor(len(X) / batch_size)
    
    while True: 
        X,y = shuffle(X,y) # Shuffle the index.
        
        for i in range(n_batches):
            
            X_batch = X[i*batch_size:(i+1)*batch_size]
            y_batch = y[i*batch_size:(i+1)*batch_size]
            
            yield X_batch, y_batch

In [22]:
batch_size = 32
import math

N = len(X_IMG_BEU_TRAIN)

X_IMG_BEU_TRAIN, beu_targets = shuffle(X_IMG_BEU_TRAIN, beu_targets)

N_train = int(0.8*N)

X_train = X_IMG_BEU_TRAIN[:N_train]
y_train = beu_targets[:N_train]

X_val = X_IMG_BEU_TRAIN[N_train:]
y_val = beu_targets[N_train:]

n_steps = len(X_IMG_BEU_TRAIN) // batch_size

batch_gen = NYRS_gen(X_train, y_train, 32)

history = Beu_model.fit_generator(batch_gen, epochs=10, 
                              steps_per_epoch=n_steps, 
                              validation_data=(X_val,y_val),
                              verbose=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
1790/8955 [====>.........................] - ETA: 51s - loss: 1.3536 - acc: 0.5841

KeyboardInterrupt: 

## Extract for test set

In [23]:
# update file paths accordingly in train_df
test_df = test_data.copy()

def update_test_file_path(inp):

    path_segs = inp.split('/')
    
    path_map = {'beauty_image':'Beauty', 'fashion_image':'Fashion', 'mobile_image':'Mobile'}
    base_path = 'Test/' + path_map[path_segs[0]]
    rel_path = path_segs[1]
    
    rel_segs = rel_path.split('.')
    if len(rel_segs) == 1:
        rel_path = rel_path + '.jpg'
        
    return base_path + '/'  + rel_path

test_df['new_path'] = test_df.loc[:,'image_path'].apply(lambda x: update_test_file_path(x))
test_df['meta_cat'] = test_df.loc[:,'image_path'].apply(lambda x: x.split('/')[0])

In [24]:
# Lood data for mobile categories (CHANGE here for other categories)
test_byCat = test_df.groupby('meta_cat')
cur_cat = 'beauty_image'
cat_test = test_byCat.get_group(cur_cat)
cat_test.shape

(76545, 5)

In [25]:
datagen_test = ImageDataGenerator(rescale=1. / 255)
base_dir = '/mnt/disks/NDSC'

In [26]:
# Generator-test
batch_size = 50
generator_test = datagen_test.flow_from_dataframe(
        dataframe=cat_test,
        directory=base_dir,
        x_col="new_path",
        y_col= None,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        seed=42,
        class_mode=None,
        shuffle=False)

Found 76545 validated image filenames.


In [27]:
# Generate features for  test samples
num_samples = generator_test.n
generator_test.reset()

bottleneck_features_beauty_test = model.predict_generator(
        generator_test, num_samples // batch_size, verbose=1, use_multiprocessing=False)



In [28]:
with open('df_TEST_Beauty_bottleneck_part_1.npy', 'wb') as f:
    np.save(f, bottleneck_features_beauty_test)

In [15]:
# The remaining part

In [29]:
N_rem = len(cat_test)-len(bottleneck_features_beauty_test)
N_rem

45

In [30]:
test_beu_rem = cat_test.tail(N_rem)
test_beu_rem.tail()

Unnamed: 0,itemid,title,image_path,new_path,meta_cat
76540,1826670507,lip cream wardah,beauty_image/5910b3379e4ad845a7be914b6cb02527.jpg,Test/Beauty/5910b3379e4ad845a7be914b6cb02527.jpg,beauty_image
76541,1832111687,la tulipe stay matte lip cream 08,beauty_image/e8022625c508a528dbd6c90869a0bc2a.jpg,Test/Beauty/e8022625c508a528dbd6c90869a0bc2a.jpg,beauty_image
76542,1832269907,etude house berry delicious color in liquid li...,beauty_image/3af512eb5a3c8b0632d787110bcb796b.jpg,Test/Beauty/3af512eb5a3c8b0632d787110bcb796b.jpg,beauty_image
76543,1832929407,goban matte liquid lipstick,beauty_image/e07d7b1400f51a09260040143b969306.jpg,Test/Beauty/e07d7b1400f51a09260040143b969306.jpg,beauty_image
76544,1833165562,nivea lip balm,beauty_image/d592906ac8f906bc36664e906f635619.jpg,Test/Beauty/d592906ac8f906bc36664e906f635619.jpg,beauty_image


In [31]:
# Generator-1
datagen_test_rem = ImageDataGenerator(rescale=1. / 255)
batch_size = 1
test_generator_rem = datagen_test_rem.flow_from_dataframe(
        dataframe=test_beu_rem,
        directory=base_dir,
        x_col="new_path",
        y_col= None,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        seed=42,
        class_mode=None,
        shuffle=False)

Found 45 validated image filenames.


In [32]:
num_samples_rem = test_generator_rem.n
test_generator_rem.reset()

test_bottleneck_features_beauty_rem = model.predict_generator(
        test_generator_rem, num_samples_rem // batch_size, verbose=1, use_multiprocessing=False)



In [33]:
X_IMG_BEU_TEST = np.concatenate((bottleneck_features_beauty_test,test_bottleneck_features_beauty_rem), axis = 0)

In [34]:
with open('X_IMG_BEU_TEST.npy', 'wb') as f:
    np.save(f, X_IMG_BEU_TEST)

In [None]:
#### ----- ###