In [None]:
# Basic Libraries for data analysis 
import pandas as pd
import numpy as np
import random as rnd
import math
import gc # garbage collection
from tqdm import tqdm # check eta
tqdm.pandas()

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Deep learning
import keras
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, Model
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Dropout, Embedding 
from keras.layers import LSTM, Flatten, SpatialDropout1D, Bidirectional, CuDNNLSTM
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers.normalization import BatchNormalization

# NLP related LSTM
import re
from gensim.models import Word2Vec  # Word embeddings

# Sci-kit Learn
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

In [None]:
# Category metadata: build the id <-> name lookup tables used throughout the notebook

import json

with open('../categories.json','r') as f:
    allCat = json.load(f)

print('The top level categories are: {}'.format(list(allCat.keys())))

print('There are {} categories in Mobile'.format(len(allCat['Mobile'])))
print('There are {} categories in Fashion'.format(len(allCat['Fashion'])))
print('There are {} categories in Beauty'.format(len(allCat['Beauty'])))

# Sorted values (the numeric ids) of each top-level category.
mobCat = sorted(allCat['Mobile'].values())
fasCat = sorted(allCat['Fashion'].values())
beuCat = sorted(allCat['Beauty'].values())

# id -> top-level folder name; later entries win on a duplicate id, as with dict.update.
folder_path_dict = {
    **dict.fromkeys(mobCat, 'Mobile'),
    **dict.fromkeys(fasCat, 'Fashion'),
    **dict.fromkeys(beuCat, 'Beauty'),
}

# id -> category name, flattened over all top-level categories
labels = allCat
numerical2label = {
    item_idx: item_name
    for master_dict in labels.values()
    for item_name, item_idx in master_dict.items()
}

# inverse map: category name -> id
label2numerical = {item_name: item_idx for item_idx, item_name in numerical2label.items()}

## Load Train and Test Data

In [None]:
# Load in the train and test datasets
df_train = pd.read_csv('../train.csv')
df_test = pd.read_csv('../test.csv')


In [None]:
# Work on copies and tag every row with its meta category,
# i.e. the first '/'-separated segment of image_path.
train_df, test_df = df_train.copy(), df_test.copy()

for frame in (train_df, test_df):
    frame['meta_cat'] = [path.split('/')[0] for path in frame['image_path']]

## Fashion

In [None]:
# Restrict both tables to the rows whose meta category is 'fashion_image'
train_gr, test_gr = train_df.groupby('meta_cat'), test_df.groupby('meta_cat')

fashion, fashion_test = (grouped.get_group('fashion_image') for grouped in (train_gr, test_gr))

print('Fashion train shape = {}'.format(fashion.shape))
print('Fashion test shape = {}'.format(fashion_test.shape))

In [None]:
# Distinct categories found in the first / last `cut_off` fashion rows
cut_off = 10000
fashion_categories = fashion['Category'].values
Zone1 = np.unique(fashion_categories[:cut_off])
Zone2 = np.unique(fashion_categories[-cut_off:])

def determine_zone(cat):
    """Return 1 when `cat` is one of the categories in the module-level `Zone1`, else 0."""
    return 1 if cat in Zone1 else 0

In [None]:
# Add zone feature
fashion = fashion.copy()
fashion['Zone'] = fashion['Category'].apply(lambda x: determine_zone(x))

In [None]:
# Compute length of descriptions
# fashion['length'] = fashion['title'].apply(lambda x: len(x.split()))

In [None]:
def preprocessing(titles_array):
    """Clean every title and return the cleaned titles as a list, in input order.

    For each title:
      1. every character that is not an ASCII letter or a space is deleted
         (deleted outright, not replaced by a space, so "t-shirt" becomes "tshirt");
      2. the result is split on whitespace and words of length 1 are dropped;
      3. the remaining words are re-joined with single spaces.

    Progress is reported through tqdm while iterating.
    """
    def _clean(title):
        letters_only = re.sub('[^a-zA-Z ]', '', title)
        return ' '.join(token for token in letters_only.split() if len(token) > 1)

    return [_clean(title) for title in tqdm(titles_array)]

In [None]:
fashion['processed'] = preprocessing(fashion['title'])
fashion['length_p'] = fashion['processed'].apply(lambda x: len(x.split()))

In [None]:
## Group by category and report mean / standard deviation of the processed title length
fas_gr = fashion.groupby('Category')
for i in range(17,31):
    # Skip ids with no rows instead of failing with KeyError in get_group.
    if i not in fas_gr.groups:
        continue
    cur_cat = fas_gr.get_group(i)
    lens = np.mean(cur_cat.length_p.values)
    sd = np.std(cur_cat.length_p.values)  # population SD (numpy default ddof=0)
    print('Category = {}'.format(numerical2label[i]))
    print('len = {}, SD = {}'.format(lens,sd))

#### Three base-level models are trained below (image embeddings, LSTM on titles, TF-IDF on titles). Their out-of-fold predictions are later combined by an XGBoost meta-learner.

## Model 1 : MobileNet for Image Data

In [None]:
# Pre-computed image embeddings for the fashion train and test rows
X_IMG_FAS_TRAIN = np.load('X_IMG_FAS_TRAIN.npy')
X_IMG_FAS_TEST = np.load('X_IMG_FAS_TEST.npy')

print('Shape of train image embeddings:{}'.format(X_IMG_FAS_TRAIN.shape))
print('Shape of test image embeddings:{}'.format(X_IMG_FAS_TEST.shape))

### Train - Validation - Test Split

In [None]:
# Make the train / validation split (2% validation, stratified by Category).
# The image embeddings are split together with the rows so they stay aligned.
# NOTE: this rebinds `train_df` and `test_df`, which until here held the
# full multi-category tables.
SEED = 8 # for reproducibility
NFOLDS = 5 # folds for out-of-fold prediction; same value every K-fold loop in this notebook uses

train_df, cv_df, train_img, cv_img = train_test_split(
    fashion, X_IMG_FAS_TRAIN, test_size=0.02, random_state=SEED,
    shuffle=True, stratify=fashion['Category'])
test_df = fashion_test.copy()

ntrain = train_df.shape[0]
ncv = cv_df.shape[0]
ntest = len(test_df)

print('Number of observations in train set: %d' % ntrain)
print('Number of observations in validation set: %d' % ncv)
print('Number of observations in test set: %d' % ntest)

# K-fold cross validation buffers
num_cat = 58

oof_train = np.zeros((ntrain,num_cat)) # out-of-fold predictions for the train rows
oof_cv = np.zeros((ncv,num_cat))
oof_cv_skf = np.zeros((NFOLDS, ncv, num_cat)) # one slice of validation predictions per fold

In [None]:
# Image model: build on mobilenet

def image_model(input_dim=1024, num_classes=58):
    """Build and compile the dense classifier applied to pre-computed image embeddings.

    Architecture: BatchNorm -> Dropout(0.2) -> Dense(1024, relu) -> BatchNorm ->
    Dropout(0.5) -> Dense(512, relu) -> BatchNorm -> Dropout(0.5) -> softmax.

    Parameters
    ----------
    input_dim : int, default 1024
        Length of one input embedding vector.
    num_classes : int, default 58
        Number of units in the softmax output layer.

    Returns
    -------
    A Keras `Model` compiled with categorical cross-entropy, the Adam optimizer
    and the accuracy metric.
    """
    img_input = Input(shape=(input_dim,), name='img_input')
    x = BatchNormalization()(img_input)
    x = Dropout(0.2)(x)
    x = Dense(1024, activation='relu', name= 'fc-1')(x) # dense 1
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(512,activation='relu')(x) # dense 2
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    out = Dense(num_classes, activation = 'softmax', name = 'out_layer')(x)

    # Build the Model
    img_model = Model(inputs=img_input, outputs=out)

    # Compile the Model
    img_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return img_model

In [None]:
# Write a generate function to train the image model

def img_gen(X, y, batch_size):
    """Yield shuffled (X_batch, y_batch) mini-batches forever.

    X and y are reshuffled together at the start of every pass. Only full
    batches are produced: the trailing len(X) % batch_size samples of each
    pass are skipped.

    Raises
    ------
    ValueError
        On the first `next()` call, when batch_size is negative or larger than
        len(X). Without this check no batch could ever be produced and the
        generator would loop forever without yielding.
    """
    n_batches = len(X) // batch_size
    if n_batches < 1:
        raise ValueError(
            'batch_size={} yields no full batch for {} samples'.format(batch_size, len(X)))

    while True:
        X, y = shuffle(X, y)  # shuffle features and targets consistently

        for i in range(n_batches):
            start = i * batch_size
            stop = start + batch_size
            yield X[start:stop], y[start:stop]

In [None]:
# K-fold bookkeeping for the image model

ntrain, ncv, ntest = len(train_df), len(cv_df), len(test_df)

SEED = 8    # for reproducibility
NFOLDS = 5  # folds for out-of-fold (OOF) predictions
num_cat = 58

# OOF predictions for the training rows
oof_train = np.zeros((ntrain, num_cat))

# Per-fold predictions for the validation / test rows, plus their fold average
oof_cv_skf = np.zeros((NFOLDS, ncv, num_cat))
oof_cv = np.zeros((ncv, num_cat))

oof_test_skf = np.zeros((NFOLDS, ntest, num_cat))
oof_test = np.zeros((ntest, num_cat))

# Image encodings for the validation and test rows
cv_img = cv_img.copy()
test_img = X_IMG_FAS_TEST.copy()

In [None]:
# K-fold loop: train a fresh image model per fold and collect its predictions

kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
kf_splits = kf.split(train_df)

for i, (train_index, valid_index) in enumerate(kf_splits):

    print('======== CV {} =========='.format(i+1))

    X_tr, X_val = train_df.iloc[train_index], train_df.iloc[valid_index]
    print('Shape of oof valid = {}'.format(X_val.shape))

    # Embeddings and one-hot targets for the training part of the fold
    train_enc = train_img[train_index,:]
    y_tr = X_tr.Category.values
    tr_target = np.eye(num_cat)[y_tr]

    # Same for the held-out part
    val_enc = train_img[valid_index,:]
    y_val = X_val.Category.values
    val_target = np.eye(num_cat)[y_val]

    model = image_model()

    batch_size = 32
    n_steps = len(X_tr) // batch_size
    data_gen = img_gen(train_enc, tr_target, batch_size)

    history = model.fit_generator(data_gen, epochs=12,
                                  steps_per_epoch=n_steps,
                                  validation_data=(val_enc, val_target),
                                  verbose=True)

    # The held-out fold fills its own rows of oof_train; validation and test
    # predictions are stored per fold and averaged afterwards.
    oof_train[valid_index] = model.predict(val_enc)
    oof_cv_skf[i, :] = model.predict(cv_img)
    oof_test_skf[i, :] = model.predict(test_img)

    del model
    gc.collect()

In [None]:
# TODO: maybe run a few more epochs for the image model?

# Average the per-fold validation / test predictions
oof_cv[:] = oof_cv_skf.mean(axis=0)
oof_test[:] = oof_test_skf.mean(axis=0)

OOF_train_IMG, OOF_cv_IMG, OOF_test_IMG = oof_train, oof_cv, oof_test

# Save variables for potential later use
for npy_path, preds in (('OOF_train_IMG.npy', OOF_train_IMG),
                        ('OOF_cv_IMG.npy', OOF_cv_IMG),
                        ('OOF_test_IMG.npy', OOF_test_IMG)):
    with open(npy_path, 'wb') as f:
        np.save(f, preds)

In [None]:
## cv prediction
cv_pred = np.argmax(OOF_cv_IMG, axis=1)
cv_label = cv_df.Category.values

In [None]:
# Accuracy on the validation set: share of rows whose arg-max class equals the label
count = int(np.sum(cv_pred == cv_label))
acc = count/len(cv_pred)*100
print('CV accuracy from image = {}'.format(acc))

## Model-2: LSTM + word2vec

In [None]:
# Consider doing some pre-processing for title test
sentences = pd.concat([train_df['title'], cv_df['title'],test_df['title']],axis=0)
train_sentences = list(sentences.progress_apply(str.split).values)

In [None]:
# Build model for custom word embeddings

import time
start_time = time.time()

# NOTE(review): this cell used to first construct a separately configured Word2Vec
# (min_count=5, window=3, size=300, sample=6e-5, alpha=0.03, min_alpha=0.0007,
# negative=20) and scan the vocabulary with it, but that object was immediately
# replaced by the model below, so those settings never took effect. The discarded
# construction and the extra vocabulary pass are removed; the resulting
# `text_model` is unchanged. Add the settings to this call if they were intended.
text_model = Word2Vec(sentences=train_sentences, sg=1, window=3, size=300)

print('The number of word for which embeddings will be computed: %d' %len(text_model.wv.vocab))

print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')

In [None]:
# Train the word2vec model
# NOTE(review): `text_model` was constructed with `sentences=...`, which per the
# gensim documentation already trains it; this call appears to continue training
# for 60 further epochs — confirm that is intended.

start_time = time.time()

text_model.train(sentences = train_sentences, total_examples=text_model.corpus_count, epochs=60, report_delay=1)

print(f'Time taken : {(time.time() - start_time) / 60:.2f} mins')

In [None]:
# Length (in tokens) of the longest title and the first position where it occurs

sentence_lengths = [len(tokens) for tokens in train_sentences]
max_len = max(sentence_lengths, default=0)
# Stays 0 when there are no sentences or every sentence is empty.
idx = sentence_lengths.index(max_len) if max_len > 0 else 0
print('Maximum sentence length = {}'.format(max_len))
# print(train_sentences[idx])

In [None]:
# Replace the in-memory `text_model` with a previously saved Word2Vec model loaded from disk.
text_model = Word2Vec.load("w2v_300d_100epoch.model")

In [None]:
# All titles contain less than 32 words
max_length = 32      # titles are padded / truncated to this many tokens
max_features = 8500  # this is the number of words we care about

train_titles, cv_titles, test_titles = (
    frame.title.values for frame in (train_df, cv_df, test_df))

# The tokenizer vocabulary is fitted on train + validation + test titles.
frms = [train_titles, cv_titles, test_titles]
all_titles = np.concatenate(frms, axis=0)

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(all_titles)

# Integer sequences for the validation / test titles, padded to max_length
cv_seq, test_seq = (tokenizer.texts_to_sequences(titles) for titles in (cv_titles, test_titles))
cv_seq_padded, test_seq_padded = (
    pad_sequences(seq, maxlen=max_length) for seq in (cv_seq, test_seq))

In [None]:
num_cat = 58
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Preparing the Embedding layer weights
embedding_dim = 300
num_words = min(max_features, vocab_size) + 1

# Start from zeros; row i will hold the vector of the word with tokenizer index i.
embedding_matrix = np.zeros((num_words, embedding_dim))

count = 0  # words whose vector was found in the word2vec model
w2v_vocab = text_model.wv.vocab

for word, i in word_index.items():
    if i > max_features:
        continue

    if word in w2v_vocab:
        # known word: copy its word2vec vector
        embedding_matrix[i] = text_model.wv.get_vector(word)
        count += 1
    else:
        # unknown word: assign a random vector
        embedding_matrix[i] = np.random.randn(embedding_dim)

print('Total number of words considered = %s.'% num_words)
print('No of embeddings found in text model = %s.'% count)
print('The shape of embedding matrix: {}'.format(embedding_matrix.shape))

In [None]:
# Define the LSTM model
# Perform grid search for tuning hyperparameters

def LSTM_model():
    """Build and compile the title classifier.

    Layers: Embedding (initialised from the module-level `embedding_matrix`,
    trainable) -> SpatialDropout1D(0.3) -> two bidirectional CuDNNLSTM(128)
    layers -> Dropout(0.5) -> 58-way softmax. Compiled with categorical
    cross-entropy, Adam and the accuracy metric.

    Reads the module-level `num_words`, `embedding_dim`, `embedding_matrix`
    and `max_length`.
    """
    layers = [
        Embedding(num_words,
                  embedding_dim,
                  embeddings_initializer=Constant(embedding_matrix),
                  input_length=max_length,
                  trainable=True),
        SpatialDropout1D(0.3),
        Bidirectional(CuDNNLSTM(128, return_sequences=True)),
        Bidirectional(CuDNNLSTM(128)),
        Dropout(0.5),
        Dense(units=58, activation='softmax'),
    ]

    net = Sequential()
    for layer in layers:
        net.add(layer)

    net.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return net

In [None]:
# Generator for training LSTM model

def batch_gen(train_df, batch_size):
    """Yield (padded_title_sequences, one_hot_targets) mini-batches forever.

    The frame is reshuffled at the start of every pass and only full batches
    are produced (the trailing len(train_df) % batch_size rows of each pass are
    skipped). Titles are tokenised with the module-level `tokenizer` and padded
    to `max_length`; targets are one-hot rows of width `num_cat`. These three
    names are looked up each time a batch is built, so rebinding them (e.g.
    `num_cat`) while a generator is in use changes its output.

    Raises
    ------
    ValueError
        On the first `next()` call, when batch_size is negative or larger than
        len(train_df). Without this check no batch could ever be produced and
        the generator would loop forever without yielding.
    """
    n_batches = len(train_df) // batch_size
    if n_batches < 1:
        raise ValueError(
            'batch_size={} yields no full batch for {} rows'.format(batch_size, len(train_df)))

    while True:
        train_df = train_df.sample(frac=1.)  # Shuffle the data.

        for i in range(n_batches):
            batch_df = train_df.iloc[i*batch_size:(i+1)*batch_size]

            batch_seq = tokenizer.texts_to_sequences(batch_df['title'])
            batch_seq_padded = pad_sequences(batch_seq, maxlen=max_length)

            batch_labels = batch_df.Category.values
            batch_targets = np.zeros((batch_size, num_cat))
            batch_targets[np.arange(batch_size), batch_labels] = 1

            yield batch_seq_padded, batch_targets

In [None]:
# K-fold bookkeeping for the LSTM model
ntrain, ncv, ntest = len(train_df), len(cv_df), len(test_df)

SEED = 8    # for reproducibility
NFOLDS = 5  # folds for out-of-fold prediction

# OOF predictions for the training rows
oof_train = np.zeros((ntrain, num_cat))

# Per-fold predictions for the validation / test rows, plus their fold average
oof_cv_skf = np.zeros((NFOLDS, ncv, num_cat))
oof_cv = np.zeros((ncv, num_cat))

oof_test_skf = np.zeros((NFOLDS, ntest, num_cat))
oof_test = np.zeros((ntest, num_cat))

In [None]:
# K-fold loop: train a fresh LSTM per fold and collect its predictions

kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=8)
kf_splits = kf.split(train_df)

for i, (train_index, valid_index) in enumerate(kf_splits):

    print('======== CV {} =========='.format(i+1))
    X_tr, X_val = train_df.iloc[train_index], train_df.iloc[valid_index]

    print('Shape of valid = {}'.format(X_val.shape))
    #print(X_val.head(3).title)

    # Tokenise and pad the held-out titles
    val_titles = X_val.title.values
    val_seq = tokenizer.texts_to_sequences(val_titles)
    val_seq_padded = pad_sequences(val_seq, maxlen=max_length)

    # One-hot targets for the held-out rows
    y_val = X_val.Category.values
    val_target = np.eye(num_cat)[y_val]

    model = LSTM_model()

    batch_size = 64
    # Steps per epoch are half of len(train_df) // batch_size (the full frame, not the fold).
    n_steps = int(0.5*(len(train_df)//batch_size))
    data_gen = batch_gen(X_tr,batch_size)

    history = model.fit_generator(data_gen, epochs=7,
                                  steps_per_epoch=n_steps,
                                  validation_data=(val_seq_padded, val_target),
                                  verbose=True)

    # The held-out fold fills its own rows of oof_train; validation and test
    # predictions are stored per fold and averaged afterwards.
    oof_train[valid_index] = model.predict(val_seq_padded)
    oof_cv_skf[i, :] = model.predict(cv_seq_padded)
    oof_test_skf[i, :] = model.predict(test_seq_padded)

    del model
    gc.collect()

In [None]:
# Average the per-fold validation / test predictions and save all three sets
oof_cv[:] = oof_cv_skf.mean(axis=0)
oof_test[:] = oof_test_skf.mean(axis=0)

OOF_train_LSTM, OOF_cv_LSTM, OOF_test_LSTM = oof_train, oof_cv, oof_test

for npy_path, preds in (('OOF_train_LSTM.npy', OOF_train_LSTM),
                        ('OOF_cv_LSTM.npy', OOF_cv_LSTM),
                        ('OOF_test_LSTM.npy', OOF_test_LSTM)):
    with open(npy_path, 'wb') as f:
        np.save(f, preds)

In [None]:
## Validation prediction = arg-max class of the averaged LSTM probabilities
cv_label = cv_df.Category.values
cv_pred = np.argmax(OOF_cv_LSTM, axis=1)

## Accuracy: share of validation rows predicted correctly
count = int(np.sum(cv_pred == cv_label))
acc = count/len(cv_pred)*100
print('CV accuracy from text LSTM = {}'.format(acc))

In [None]:
truth = cv_df.Category.values


In [None]:
from sklearn.metrics import confusion_matrix

# Category names in id order, used as row / column labels
labels = [numerical2label[i] for i in range(58)]

# Rows = true category, columns = predicted category
CF_TXT = pd.DataFrame(
    confusion_matrix(truth, cv_pred, labels=np.arange(58)),
    columns = labels, index = labels)

In [None]:
CF_TXT.iloc[17:22,17:31]

## Model 3: TF-IDF + SVD

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over 1- to 3-grams. The on/off options are passed as real
# booleans: recent scikit-learn releases validate parameter types and reject
# the integer 1 that was used here before.
tfv = TfidfVectorizer(min_df=3,  max_features=None,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True,smooth_idf=True,sublinear_tf=True,
            stop_words = 'english')

# Vocabulary and IDF weights come from the training titles only.
tfv.fit(train_df.title.values)

xtrain_tfv =  tfv.transform(train_df.title.values)
xcv_tfv =  tfv.transform(cv_df.title.values)
xtest_tfv =  tfv.transform(test_df.title.values)

In [None]:
xtrain_tfv.shape

In [None]:
# Take Singular Value Decomposition: SVD

from sklearn.decomposition import TruncatedSVD
# NOTE: from here on the name `preprocessing` refers to sklearn's module and no
# longer to the title-cleaning function defined earlier in this notebook.
from sklearn import preprocessing, decomposition

# Apply SVD, I chose 300 components.
# The solver is randomized, so it is seeded to make the features reproducible.
svd = decomposition.TruncatedSVD(n_components=300, random_state=SEED)
svd.fit(xtrain_tfv)

xtrain_svd = svd.transform(xtrain_tfv)
xcv_svd = svd.transform(xcv_tfv)
xtest_svd = svd.transform(xtest_tfv)

# Scale the data obtained from SVD (scaler statistics come from the train rows only)

scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)

xtrain_svd_scl = scl.transform(xtrain_svd)
xcv_svd_scl = scl.transform(xcv_svd)
xtest_svd_scl = scl.transform(xtest_svd)

In [None]:
## TF-iDF Model
# Build the model
def TF_model():
    """Build and compile the classifier for the 300-d TF-IDF/SVD features.

    Layers: Dense(512, relu) on a 300-wide input -> Dropout(0.4) -> 58-way
    softmax. Compiled with categorical cross-entropy, Adam and accuracy.
    """
    net = Sequential()
    for layer in (
        Dense(units = 512, input_shape=(300,),activation="relu"),
        Dropout(0.4),
        Dense(units = 58,activation="softmax"),
    ):
        net.add(layer)

    net.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    return net

In [None]:
# Generator for TF-iDF model

def TF_batch_gen(X_tr,data_tr,batch_size,num_classes=58):
    """Yield (feature_batch, one_hot_targets) mini-batches forever.

    Parameters
    ----------
    X_tr : 2-D array
        Feature rows; sliced as X_tr[start:stop, :].
    data_tr : DataFrame
        Row-aligned with X_tr; its `Category` column supplies the labels.
    batch_size : int
        Rows per batch. Only full batches are produced; the trailing
        len(data_tr) % batch_size rows of each pass are skipped.
    num_classes : int, default 58
        Width of the one-hot target rows.

    X_tr and data_tr are reshuffled together at the start of every pass.

    Raises
    ------
    ValueError
        On the first `next()` call, when batch_size is negative or larger than
        len(data_tr). Without this check no batch could ever be produced and
        the generator would loop forever without yielding.
    """
    n_batches = len(data_tr) // batch_size
    if n_batches < 1:
        raise ValueError(
            'batch_size={} yields no full batch for {} rows'.format(batch_size, len(data_tr)))

    while True:
        X_tr, data_tr = shuffle(X_tr,data_tr)  # Shuffle the data.
        for i in range(n_batches):
            rows = slice(i*batch_size, (i+1)*batch_size)
            X_train = X_tr[rows,:]
            y_train = data_tr.iloc[rows].Category.values

            batch_targets = np.zeros((batch_size, num_classes))
            batch_targets[np.arange(batch_size), y_train] = 1

            yield X_train, batch_targets

In [None]:
# K-fold bookkeeping for the TF-IDF model

ntrain, ncv, ntest = len(train_df), len(cv_df), len(test_df)
SEED = 8    # for reproducibility
NFOLDS = 5  # folds for out-of-fold prediction
num_cat = 58

# OOF predictions for the training rows
oof_train = np.zeros((ntrain, num_cat))

# Per-fold predictions for the validation / test rows, plus their fold average
oof_cv_skf = np.zeros((NFOLDS, ncv, num_cat))
oof_cv = np.zeros((ncv, num_cat))

oof_test_skf = np.zeros((NFOLDS, ntest, num_cat))
oof_test = np.zeros((ntest, num_cat))

In [None]:
# K-fold loop: train a fresh TF-IDF model per fold and collect its predictions

kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=8)
kf_splits = kf.split(train_df)

for i, (train_index, valid_index) in enumerate(kf_splits):

    print('======== CV {} =========='.format(i+1))

    # Scaled SVD features and the matching rows of train_df for each part of the fold
    X_tr, X_val = xtrain_svd_scl[train_index], xtrain_svd_scl[valid_index]
    data_tr, data_val = train_df.iloc[train_index], train_df.iloc[valid_index]

    print('Shape of valid = {}'.format(X_val.shape))

    # One-hot targets for the held-out rows
    y_val = data_val.Category.values
    val_target = np.eye(num_cat)[y_val]

    model = TF_model()
    batch_size = 32

    # Steps per epoch are 75% of len(train_df) // batch_size (the full frame, not the fold).
    steps_per_epoch = int(0.75*(len(train_df)//batch_size))
    data_gen = TF_batch_gen(X_tr,data_tr,batch_size)

    history = model.fit_generator(data_gen, epochs=4,
                                  steps_per_epoch=steps_per_epoch,
                                  validation_data=(X_val, val_target),
                                  verbose=True)

    # The held-out fold fills its own rows of oof_train; validation and test
    # predictions are stored per fold and averaged afterwards.
    oof_train[valid_index] = model.predict(X_val)
    oof_cv_skf[i, :] = model.predict(xcv_svd_scl)
    oof_test_skf[i, :] = model.predict(xtest_svd_scl)

    del model
    gc.collect()

In [None]:
# Average the per-fold validation / test predictions and save all three sets

oof_cv[:] = oof_cv_skf.mean(axis=0)
oof_test[:] = oof_test_skf.mean(axis=0)

OOF_train_TF, OOF_cv_TF, OOF_test_TF = oof_train, oof_cv, oof_test

for npy_path, preds in (('OOF_train_TF.npy', OOF_train_TF),
                        ('OOF_cv_TF.npy', OOF_cv_TF),
                        ('OOF_test_TF.npy', OOF_test_TF)):
    with open(npy_path, 'wb') as f:
        np.save(f, preds)

In [None]:
## Validation prediction = arg-max class of the averaged TF-IDF model probabilities
cv_label = cv_df.Category.values
cv_pred = np.argmax(OOF_cv_TF, axis=1)

## Accuracy: share of validation rows predicted correctly
count = int(np.sum(cv_pred == cv_label))
acc = count/len(cv_pred)*100
print('CV accuracy from text TFiDF = {}'.format(acc))

## Model 4: Image embeddings + XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Take Singular Value Decomposition: SVD
# Apply SVD to the image embeddings, keeping 96 components.
# The solver is randomized, so it is seeded to make the features reproducible.
# NOTE: this cell rebinds xtrain_svd / xcv_svd / xtest_svd, scl and the *_svd_scl
# arrays, which previously held the TF-IDF features.

svd_img = decomposition.TruncatedSVD(n_components=96, random_state=SEED)
svd_img.fit(train_img)

xtrain_svd = svd_img.transform(train_img)
xcv_svd = svd_img.transform(cv_img)
xtest_svd = svd_img.transform(test_img)

# Scale the data obtained from SVD (scaler statistics come from the train rows only)

scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)

xtrain_svd_scl = scl.transform(xtrain_svd)
xcv_svd_scl = scl.transform(xcv_svd)
xtest_svd_scl = scl.transform(xtest_svd)

In [None]:
# K-fold bookkeeping for the XGBoost image model

ntrain, ncv, ntest = len(train_df), len(cv_df), len(test_df)
SEED = 8      # for reproducibility
NFOLDS = 5    # folds for out-of-fold prediction
num_cat = 14  # only 14 for xgboost

# OOF predictions for the training rows
oof_train = np.zeros((ntrain, num_cat))

# Per-fold predictions for the validation / test rows, plus their fold average
oof_cv_skf = np.zeros((NFOLDS, ncv, num_cat))
oof_cv = np.zeros((ncv, num_cat))

oof_test_skf = np.zeros((NFOLDS, ntest, num_cat))
oof_test = np.zeros((ntest, num_cat))

In [None]:
# K-fold loop: fit an XGBoost classifier per fold on the scaled image SVD features

kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=8)
kf_splits = kf.split(train_df)

for i, (train_index, valid_index) in enumerate(kf_splits):

    print('======== CV {} =========='.format(i+1))

    X_tr, X_val = xtrain_svd_scl[train_index], xtrain_svd_scl[valid_index]
    data_tr, data_val = train_df.iloc[train_index], train_df.iloc[valid_index]

    print('Shape of valid = {}'.format(X_val.shape))

    y_tr = data_tr.Category.values
    y_val = data_val.Category.values

    # Both fold parts are passed as eval_set so their metric is logged every 5 rounds.
    classifier = xgb.XGBClassifier(
        n_estimators= 25, max_depth= 4, min_child_weight= 1,
        gamma=0.1, subsample=0.7, colsample_bytree=1.0, objective= 'multi:softmax',
        nthread= -1, verbosity=2,
        scale_pos_weight=1)
    model = classifier.fit(X_tr, y_tr, eval_set = [(X_tr,y_tr),(X_val,y_val)],verbose=5)

    # The held-out fold fills its own rows of oof_train; validation and test
    # probabilities are stored per fold and averaged afterwards.
    oof_train[valid_index] = model.predict_proba(X_val)
    oof_cv_skf[i, :] = model.predict_proba(xcv_svd_scl)
    oof_test_skf[i, :] = model.predict_proba(xtest_svd_scl)

    del model
    gc.collect()

In [None]:
# Average the per-fold validation / test predictions and save all three sets

oof_cv[:] = oof_cv_skf.mean(axis=0)
oof_test[:] = oof_test_skf.mean(axis=0)

OOF_train_XGB, OOF_cv_XGB, OOF_test_XGB = oof_train, oof_cv, oof_test

for npy_path, preds in (('OOF_train_XGB.npy', OOF_train_XGB),
                        ('OOF_cv_XGB.npy', OOF_cv_XGB),
                        ('OOF_test_XGB.npy', OOF_test_XGB)):
    with open(npy_path, 'wb') as f:
        np.save(f, preds)

In [None]:
## cv prediction
# The XGBoost probabilities have 14 columns; adding 17 maps the arg-max column
# index back onto the category ids compared against cv_label.
cv_pred = np.argmax(OOF_cv_XGB, axis=1) + 17
cv_label = cv_df.Category.values

## Accuracy: share of validation rows predicted correctly
count = int(np.sum(cv_pred == cv_label))
acc = count/len(cv_pred)*100
# This model is trained on image embeddings, so the message says "image" rather than "text".
print('CV accuracy from image XGB = {}'.format(acc))

In [None]:
# Confusion matrix of the XGBoost image model on the validation set
from sklearn.metrics import confusion_matrix

# Category names in id order, used as row / column labels
labels = [numerical2label[i] for i in range(58)]

# Rows = true category, columns = predicted category
CF_XGB = pd.DataFrame(
    confusion_matrix(cv_label, cv_pred, labels=np.arange(58)),
    columns = labels, index = labels)

In [None]:
CF_XGB.iloc[17:31,17:31]

## Stacking - Meta Learner: xgBoost

In [None]:
# Reload the saved out-of-fold predictions of the three base models

# Train rows
OOF_train_LSTM = np.load('OOF_train_LSTM.npy')
OOF_train_TF = np.load('OOF_train_TF.npy')
OOF_train_IMG = np.load('OOF_train_IMG.npy')

# Validation rows
OOF_cv_LSTM = np.load('OOF_cv_LSTM.npy')
OOF_cv_TF = np.load('OOF_cv_TF.npy')
OOF_cv_IMG = np.load('OOF_cv_IMG.npy')

# Test rows
OOF_test_LSTM = np.load('OOF_test_LSTM.npy')
OOF_test_TF = np.load('OOF_test_TF.npy')
OOF_test_IMG = np.load('OOF_test_IMG.npy')

## Feature Engineering


In [None]:
cols = ['Cat_' + str(i) for i in range(58)]
train_pred_df = pd.DataFrame(OOF_train_LSTM, columns = cols)
cv_pred_df = pd.DataFrame(OOF_cv_LSTM, columns = cols)
test_pred_df = pd.DataFrame(OOF_test_LSTM, columns = cols)

In [None]:
zone1_cols = ['Cat_' + str(i) for i in Zone1]

In [None]:
## Augment train with probability
# Remember each row's original index label in 'inDex', then attach columns
# 17..30 of the LSTM prediction frame position by position.

df1 = train_df.copy()
df1['inDex'] = df1.index
df1 = df1.reset_index(drop=True)

df2 = train_pred_df.iloc[:,17:31].reset_index(drop=True)

frms = [df1,df2]
train_df_aug = pd.concat(frms, axis = 1)

In [None]:
## Augment validation with probability
# Remember each row's original index label in 'inDex', then attach columns
# 17..30 of the LSTM prediction frame position by position.

df1 = cv_df.copy()
df1['inDex'] = cv_df.index
df1 = df1.reset_index(drop=True)

df2 = cv_pred_df.iloc[:,17:31].reset_index(drop=True)

frms = [df1,df2]
cv_df_aug = pd.concat(frms, axis = 1)

In [None]:
## Augment test with probability

In [None]:
# Remember each test row's original index label in 'inDex', then attach columns
# 17..30 of the LSTM prediction frame position by position.
df1 = test_df.copy()
df1['inDex'] = test_df.index
df1 = df1.reset_index(drop=True)

df2 = test_pred_df.iloc[:,17:31].reset_index(drop=True)

frms = [df1,df2]
test_df_aug = pd.concat(frms, axis = 1)
ALL_TEST = test_df_aug.copy()

# Zone_P = row-wise total of the probability columns listed in zone1_cols
ALL_TEST['Zone_P'] = ALL_TEST[zone1_cols].apply(lambda row: sum(row), axis= 1)

In [None]:
# Stack the augmented train and validation rows and order them by 'inDex'
ALL_FASHION = pd.concat([train_df_aug,cv_df_aug], axis = 0)
ALL_FASHION_SORTED = ALL_FASHION.sort_values(by = 'inDex', axis=0)

# Zone_P = row-wise total of the probability columns listed in zone1_cols
ALL_FASHION_SORTED['Zone_P'] = ALL_FASHION_SORTED[zone1_cols].apply(lambda row: sum(row), axis= 1)

ALL_F = ALL_FASHION_SORTED.set_index('inDex')
zone_proba = ALL_F['Zone_P'].values

In [None]:
## Magic Feature
# For row i the feature is the mean of the `window` zone probabilities that
# precede it (row i itself is excluded). The first `window` rows have too few
# predecessors and keep their own value. `window` now drives both the
# threshold and the slice, which previously hard-coded 5.

Ns = len(zone_proba)
window = 5

magic_f = np.zeros((Ns,1))
for i in range(Ns):
    if i < window:
        magic_f[i] = zone_proba[i]
    else:
        magic_f[i] = sum(zone_proba[i-window:i])/window
        


In [None]:
## Magic Feature for test df
zone_proba_test = ALL_TEST.Zone_P.values



In [None]:
# Same trailing-mean feature for the test rows: mean of the `window` preceding
# zone probabilities (current row excluded); the first `window` rows keep their
# own value. `window` now drives both the threshold and the slice.
Ns = len(zone_proba_test)
window = 5

magic_f_test = np.zeros((Ns,1))
for i in range(Ns):
    if i < window:
        magic_f_test[i] = zone_proba_test[i]
    else:
        magic_f_test[i] = sum(zone_proba_test[i-window:i])/window

In [None]:
ALL_TEST['magic_f'] = magic_f_test
test_df_mod = ALL_TEST.copy()
ALL_F['magic_f'] = magic_f
train_df_mod = ALL_F.loc[train_df.index,:]
cv_df_mod = ALL_F.loc[cv_df.index,:]

In [None]:
# N_tr = train_MF.shape[0]
# N_cv = cv_MF.shape[0]
# N_test = test_MF.shape[0]

In [None]:
# train_MF = train_MF.reshape(N_tr,1)
# cv_MF = cv_MF.reshape(N_cv,1)
# test_MF = test_MF.reshape(N_test,1)

## Final xgBoost with Magic Feature

In [None]:
# Training labels and the magic feature of each split as a single-column 2-D array
y_train = train_df.Category.values
magic_train = train_df_mod.magic_f.values.reshape(-1,1)
magic_cv = cv_df_mod.magic_f.values.reshape(-1,1)
magic_test = test_df_mod.magic_f.values.reshape(-1,1)

In [None]:
# Meta-learner inputs: columns 17..30 of the LSTM, TF-IDF and image predictions,
# followed by the magic feature column.
fashion_cols = slice(17, 31)

x_train = np.concatenate(
    [preds[:, fashion_cols] for preds in (OOF_train_LSTM, OOF_train_TF, OOF_train_IMG)]
    + [magic_train], axis=1)
x_cv = np.concatenate(
    [preds[:, fashion_cols] for preds in (OOF_cv_LSTM, OOF_cv_TF, OOF_cv_IMG)]
    + [magic_cv], axis=1)
x_test = np.concatenate(
    [preds[:, fashion_cols] for preds in (OOF_test_LSTM, OOF_test_TF, OOF_test_IMG)]
    + [magic_test], axis=1)

In [None]:
import xgboost as xgb

X_tr, X_val, Y_tr, Y_val = train_test_split(x_train, y_train, stratify=y_train, random_state=42, 
                                                  test_size=0.08, shuffle=True)

In [None]:
# Meta-learner.
# Fit on the X_tr / Y_tr part only. The model used to be fitted on all of
# x_train, which contains the X_val rows, so the early-stopping metric was
# computed on data the model had already seen. XGBoost uses the last eval_set
# entry for early stopping, so (X_val, Y_val) must stay held out.
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
    n_estimators= 400,
    max_depth= 6,
    min_child_weight= 1,
    #gamma=1,
    gamma=0.1,
    subsample=0.7,
    colsample_bytree=0.9,
    objective= 'multi:softmax',
    verbosity=2,
    nthread =16,
    scale_pos_weight=1,
).fit(X_tr, Y_tr, eval_set = [(X_tr,Y_tr),(X_val,Y_val)],verbose=1, early_stopping_rounds=10)

In [None]:
## Neural Net as meta learner?

In [None]:
# save xgb
import pickle
# The context manager closes (and flushes) the file even if pickling fails.
with open("xgb_fashion_mf_deeper_tree_400.dat", "wb") as model_file:
    pickle.dump(gbm, model_file)

In [None]:
# Make predictions for cv and test set
predictions_cv = gbm.predict(x_cv)
predictions_test = gbm.predict(x_test)

In [None]:
# Expected accuracy: share of validation rows the meta-learner predicts correctly
# (note: this rebinds Y_val to the validation labels)

Y_val = cv_df.Category.values

count = int(np.sum(predictions_cv == Y_val))
acc = count/len(predictions_cv)*100
print('Expected Accuracy after xgBoost = {}'.format(acc))

In [None]:
# Store test predictions
test_df['Category'] = predictions_test
test_df['CatName'] = test_df['Category'].apply(lambda x: numerical2label[x]) 

In [None]:
FAS_df_submit = test_df[['itemid', 'Category']].copy()
FAS_df_submit.to_csv('Fashion_submission_mf_v3.csv', index=False)

In [None]:
FAS_df_submit.head()

In [None]:
### --- end of modelling for fashion part --- ###

In [None]:
sub1 = pd.read_csv('Fashion_submission.csv')
sub2 = pd.read_csv('Fashion_submission_mf.csv')

In [None]:
y1 = sub1.Category.values
y2 = sub2.Category.values

In [None]:
len(y1[y1==y2])/len(y1)

## Debugging

In [None]:
# load model from file
import pickle
xgb = pickle.load(open("xgb_fashion_mf_deeper_tree_400.dat", "rb"))

In [None]:
# Make predictions for cv and test set
#predictions_cv_before = xgb.predict(x_cv)
#predictions_test = xgb.predict(x_test)
#test_prob = xgb.predict_proba(x_test)

In [None]:
sub = cv_df.copy()
sub['pred'] = predictions_cv
sub['pred_cat'] = sub['pred'].apply(lambda x: numerical2label[x])

In [None]:
sub.head()

In [None]:
truth = cv_df.Category.values


In [None]:
predictions_cv = xgb.predict(x_cv)

In [None]:
df = cv_df.copy()
df['pred'] = predictions_cv

In [None]:
from sklearn.metrics import confusion_matrix

# Category names in id order, used as row / column labels
labels = [numerical2label[i] for i in range(58)]

# Rows = true category, columns = predicted category
CF_TXT = pd.DataFrame(
    confusion_matrix(truth, predictions_cv, labels=np.arange(58)),
    columns = labels, index = labels)

In [None]:
CF_TXT.iloc[17:31,17:31]

In [None]:
idx = (predictions_cv != truth)
idx_cur = (predictions_cv == truth)
df_wrong = df[idx]
df_cur= df[idx_cur]

In [None]:
# update file paths accordingly in train_df
focus_df = df_wrong.copy()
focus_df_c = df_cur.copy()
def update_file_path(inp):
    """Build the training-image path for one (image_path, category) pair.

    Parameters
    ----------
    inp : sequence
        Its first two items are the image path ('<folder>/<file name>') and the
        category. A tuple, list or pandas row all work: the items are taken by
        iteration, which avoids the deprecated positional `inp[0]` lookup on a
        label-indexed Series.

    Returns
    -------
    str
        'Train/<Beauty|Fashion|Mobile>/<category>/<file name>'; '.jpg' is
        appended when the file name contains no '.'.

    Raises
    ------
    KeyError
        If the folder is not 'beauty_image', 'fashion_image' or 'mobile_image'.
    """
    image_path, cat = list(inp)[:2]
    path_segs = image_path.split('/')

    path_map = {'beauty_image':'Beauty', 'fashion_image':'Fashion', 'mobile_image':'Mobile'}
    base_path = 'Train/' + path_map[path_segs[0]]

    rel_path = path_segs[1]
    if '.' not in rel_path:
        # no extension present
        rel_path = rel_path + '.jpg'

    return base_path + '/' + str(cat)+ '/' + rel_path

focus_df['new_path'] = focus_df.loc[:,['image_path','Category']].apply(lambda x: update_file_path(x),axis=1)

focus_df_c['new_path'] = focus_df_c.loc[:,['image_path','Category']].apply(lambda x: update_file_path(x),axis=1)

In [None]:
## Show wrong results for debugging

def debug_helper(df, root_path='/mnt/disks/NDSC/'):
    """Print the title, show the image, and print the actual and predicted
    category names for every row of `df`.

    Parameters
    ----------
    df : DataFrame
        Must have the columns 'title', 'Category', 'pred' and 'new_path'.
        Columns are read by name; the previous positional lookup (positions
        1, 2, 6, 7 of `df.values`) depended on the exact column order and
        picks the wrong columns once extra feature columns are present.
    root_path : str, default '/mnt/disks/NDSC/'
        Prefix prepended to each row's 'new_path' to locate the image file.
    """
    rows = zip(df['title'], df['Category'], df['pred'], df['new_path'])

    for title, cat, pred, path in rows:

        print('Title: {}'.format(title))
        #print('Image:\n')
        x=plt.imread(root_path + path)
        plt.imshow(x)
        plt.show()
        print('Actual Category: {}'. format(numerical2label[cat]))

        print('Predicted from model: {}'. format(numerical2label[pred]))
        print('\n ============= \n')

In [None]:
cat = 23

# Wrongly predicted validation rows of this category. The report used to read
# focus_df_c (the correctly predicted rows), which contradicted the
# "Error frac" and "Typical Mistakes" labels below. A boolean filter is used so
# that a category with no mistakes gives an empty frame instead of a KeyError.
groups = focus_df.groupby('Category')
wgr = focus_df[focus_df['Category'] == cat]

cvGrp = cv_df.groupby('Category')
gr = cvGrp.get_group(cat)

N_cat = len(gr)
N_wr = len(wgr)

print('========Name==========')
print(numerical2label[cat])
print('========INFO==========')
print('Group Strength = {}'.format(N_cat/len(cv_df)*100))
print('Error frac = {}'.format(N_wr/N_cat))
print('========Confusion TEXT ==========')
print(CF_TXT.iloc[cat,17:31])

print ('========Typical Mistakes==========')
ns = min(10,len(wgr))
df = wgr.sample(ns)
debug_helper(df)

In [None]:
df

In [None]:
## Some Fix with Feature Engineering

In [None]:
all_ex = fashion.copy()
all_ex['name_cat'] = all_ex['Category'].apply(lambda x:numerical2label[x])

In [None]:
all_ex = fashion.copy()
all_ex['name_cat'] = all_ex['Category'].apply(lambda x:numerical2label[x])

cat = 30
print('Given category = {}'.format(numerical2label[cat]))
frac_train = len(fashion[fashion['Category'] == cat])/len(fashion)
frac_test = len(sub1[sub1['Category'] == cat])/len(sub1)

print('Frac Train = {}'.format(frac_train))
print('Frac Test = {}'.format(frac_test))

In [None]:
frac_train

In [None]:
sub1[sub1['Category']==26]

In [None]:
##Test
# Store test predictions
test_df['Category'] = predictions_test
test_df['CatName'] = test_df['Category'].apply(lambda x: numerical2label[x]) 

In [None]:
tbag = test_df.copy()
tbag = tbag.drop(['image_path', 'meta_cat'],axis = 1)
tbag = tbag.reset_index()

In [None]:
st =1000
en = st + 15
tbag.iloc[st:en].CatName

In [None]:
cat = 18
chk = tbag[tbag['Category'] == cat]
start = 0
end = start + 10

chk.iloc[start:end]

In [None]:
pos =1068

# `test_prob` was only ever assigned in a commented-out line, so this cell
# failed with NameError on a fresh run. Compute it from the fitted meta-learner.
test_prob = gbm.predict_proba(x_test)

# Class probabilities of the chosen test row, in percent
P = test_prob[pos,:]*100

print(tbag.iloc[pos].title)
print ('\n=============\n')
# NOTE(review): assumes the 14 probability columns correspond to category ids 17..30 in order — confirm.
for i in range(14):
    print('Cat = {}, P = {}'.format(numerical2label[i+17],P[i]))

In [None]:
test_prob.shape