In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from keras.models import Model
from keras import optimizers
from keras.models import Sequential
from keras.layers import LSTM, Input, Embedding, Lambda, Dense, concatenate, Flatten, Dropout
from keras.layers.normalization import BatchNormalization
import keras.backend as K
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import StratifiedKFold, KFold,cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import mean_squared_error
from math import sqrt
from xgboost import XGBRegressor
import lightgbm as lgb

import datetime
from time import time
import gc

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Read the data

In [None]:
# Load the competition's train and test splits; both files are decoded as ISO-8859-1.
train = pd.read_csv(
    '../input/home-depot-product-search-relevance/train.csv',
    encoding="ISO-8859-1",
)
test = pd.read_csv(
    '../input/home-depot-product-search-relevance/test.csv',
    encoding="ISO-8859-1",
)
# prod = pd.read_csv('../input/home-depot-product-search-relevance/product_descriptions.csv')

# Report the (rows, columns) shape of each frame as a quick sanity check.
print('train size', train.shape)
print('test size', test.shape)
# print('prod size', prod.shape)

In [None]:
# Stack train on top of test with a fresh 0..n-1 index, so rows [0, train_size)
# of df_all are the training rows and the remainder are the test rows.
train_size = len(train)
df_all = pd.concat([train, test], axis=0, ignore_index=True)
# df_all = pd.merge(df_all, prod, on='product_uid', how='left')
# df_all['product_title'] = (df_all['product_title'] + df_all['product_description'])
# df_all.drop(['product_description', 'product_uid', 'id'], axis=1, inplace=True)

In [None]:
# Preview the first five rows of the combined train + test frame.
df_all.head(n=5)

# Define labels

In [None]:
# Regression target: the relevance score of every training row.
y_data = train['relevance']

# Encode each distinct relevance value as an integer class id. These ids are
# only used as stratification labels for the cross-validation split.
le_labels = LabelEncoder().fit(y_data.unique())
y_data_labels = le_labels.transform(y_data)

# Check the maximum product title and search term lengths (in words) to determine how much padding is needed

In [None]:
def get_max_length(data):
    """Return the largest whitespace-delimited word count among the entries of ``data``.

    ``data`` is read positionally through ``.iloc`` and every entry must
    support ``.split()``. Returns 0 when ``data`` is empty.
    """
    word_counts = (len(data.iloc[pos].split()) for pos in range(len(data)))
    return max(word_counts, default=0)

# Longest product title and longest search term (in words) across train + test.
max_length_prod, max_length_search = (
    get_max_length(df_all[column]) for column in ('product_title', 'search_term')
)

print('max_length_prod', max_length_prod)
print('max_length_search', max_length_search)

# Word embedding

In [None]:
embed_size = 300      # dimensionality of the embedding vectors
max_features = 50000  # vocabulary cap handed to the tokenizer

# Tokenize the sentences.
# The backslash in the filter set is written as an explicit `\\` so the literal
# keeps the exact same characters without relying on an invalid `\]` escape.
tokenizer = Tokenizer(num_words=max_features, filters='!"$&()*+,-.:;<=>?[\\]^_`{|}~')

# Fit one shared vocabulary on product titles and search terms (train + test).
# pd.concat replaces Series.append, which no longer exists in pandas >= 2.0.
tokenizer.fit_on_texts(
    pd.concat([df_all['product_title'], df_all['search_term']], ignore_index=True).tolist()
)

# Convert each text column into lists of integer token ids.
X_train_prod = tokenizer.texts_to_sequences(train['product_title'].values)
X_train_search = tokenizer.texts_to_sequences(train['search_term'].values)
X_test_prod = tokenizer.texts_to_sequences(test['product_title'].values)
X_test_search = tokenizer.texts_to_sequences(test['search_term'].values)

# Pad every sequence to the larger of max_length_prod and max_length_search

In [None]:
# A single common length is used for both inputs: the longer of the two maxima.
max_length = max(max_length_prod, max_length_search)

# Pad all four token-id collections to that common length.
X_train_prod, X_train_search, X_test_prod, X_test_search = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train_prod, X_train_search, X_test_prod, X_test_search)
)

# Assign pretrained GoogleNews-vectors-negative300 embeddings to the vocabulary words

In [None]:
EMBEDDING_FILE = '../input/gnewsvector/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

word_index = tokenizer.word_index
# +1 because row 0 is reserved below, so valid word rows start at 1.
nb_words = min(max_features, len(word_index)) + 1

# Start from small random vectors in [-0.1, 0.1) so words missing from the
# pretrained vocabulary still get a usable row.
embedding_matrix = (np.random.rand(nb_words, embed_size) - 0.5) / 5.0
# Row 0 is zeroed (index 0 is presumably the padding id -- confirm against pad_sequences).
embedding_matrix[0] = 0
for word, i in word_index.items():
    # Skip ids at or beyond the vocabulary cap.
    if i >= max_features:
        continue
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index.get_vector(word)

# Release the pretrained vectors themselves, not just the path string:
# they are no longer needed once embedding_matrix has been filled.
del EMBEDDING_FILE, embeddings_index
gc.collect()

# Build the model

In [None]:
# Hyperparameters shared by every training run
n_lstm_hidden = 25   # units in the shared LSTM
batch_size = 256
n_epoch = 5

def get_model():
    """Build and compile the siamese network for (product title, search term) pairs.

    Both inputs are integer sequences of length ``max_length``. They go through
    the same Embedding layer (initialised from ``embedding_matrix``) and the
    same LSTM, the two LSTM outputs are concatenated, and a small dense head
    produces one value per pair. The model is compiled with MSE loss and the
    Adam optimizer.

    Returns:
        The compiled Keras ``Model`` taking ``[left_input, right_input]``.
    """
    # One input per branch: left = product title, right = search term
    title_tokens, query_tokens = (Input(shape=(max_length, )) for _ in range(2))

    # A single embedding layer is applied to both branches, so its weights are shared
    shared_embedding = Embedding(nb_words, output_dim=embed_size, input_length=max_length, weights=[embedding_matrix])
    title_embedded = shared_embedding(title_tokens)
    query_embedded = shared_embedding(query_tokens)

    # Likewise one LSTM encodes both branches
    shared_lstm = LSTM(n_lstm_hidden)
    title_encoded = shared_lstm(title_embedded)
    query_encoded = shared_lstm(query_embedded)

    # Join the two encodings into one feature vector
    merged = concatenate([title_encoded, query_encoded])

    hidden = Dense(32, activation="relu")(merged)
    hidden = Dropout(0.3)(hidden)

    # Single-unit head scoring the similarity between product title and search term.
    # NOTE(review): ReLU on the output clamps predictions at >= 0 -- confirm this is intended.
    score = Dense(1, activation="relu")(hidden)

    # Assemble and compile the two-input model
    model = Model([title_tokens, query_tokens], score)
    model.compile(loss='mse', optimizer='adam')
    return model

In [None]:
def split_to_train_val(data_x, train_index, val_index):
    """Return ``(data_x[train_index], data_x[val_index])`` as a (train, validation) pair."""
    return data_x[train_index], data_x[val_index]

In [None]:
# Plot loss
def plot_loss(history):
    """Plot the training and validation loss curves stored in ``history.history``.

    Reads the ``'loss'`` and ``'val_loss'`` entries and shows them on one figure.
    """
    for curve_key in ('loss', 'val_loss'):
        plt.plot(history.history[curve_key])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')
    plt.show()

In [None]:
# Accumulators: fold-averaged test predictions and fold-averaged validation RMSE.
pred_test = np.zeros(X_test_prod.shape[0])
mean_rmse = 0

n_splits = 5
# kfold = KFold(n_splits=n_splits, shuffle=True, random_state=24)
# Stratify on the encoded relevance labels.
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=24)

for train_index, val_index in kfold.split(X_train_prod, y_data_labels):
    # kfold.split yields positional indices, so index the underlying numpy array
    # rather than relying on the Series' labels matching row positions.
    y_train_kfold = y_data.values[train_index]
    y_val_kfold = y_data.values[val_index]
    X_train_left_kfold, X_val_left_kfold = split_to_train_val(X_train_prod, train_index, val_index)
    X_train_right_kfold, X_val_right_kfold = split_to_train_val(X_train_search, train_index, val_index)

    # A fresh model is built for every fold.
    siamese_model = get_model()
    history = siamese_model.fit([X_train_left_kfold, X_train_right_kfold], y_train_kfold,
                            batch_size=batch_size, epochs=n_epoch,
                            validation_data=([X_val_left_kfold, X_val_right_kfold], y_val_kfold))

    pred_val = siamese_model.predict([X_val_left_kfold, X_val_right_kfold])
    rmse = sqrt(mean_squared_error(y_val_kfold, pred_val))
    mean_rmse += (rmse / n_splits)

    # Each fold contributes 1/n_splits of its flattened test prediction.
    pred_test += siamese_model.predict([X_test_prod, X_test_search]).ravel() / n_splits

    print('rmse', rmse)
    plot_loss(history)

print('mean rmse', mean_rmse)

# Use the trained siamese model as a feature extractor
1. for XGBoost model
2. for lightgbm model

In [None]:
# Train the model on all the training data (the padded arrays built earlier in
# this notebook; the previous `X_data` / `X_test` names were never defined).
siamese_model = get_model()
history = siamese_model.fit([X_train_prod, X_train_search], y_data.values, batch_size=batch_size, epochs=n_epoch)

# Get the output of the concat layer and use it as features for the ML models.
# The layer is looked up by type rather than by position: index 3 is the shared
# LSTM, which is applied twice and therefore has no single `.output`.
concat_layer = next(
    layer for layer in siamese_model.layers
    if layer.__class__.__name__ == 'Concatenate'
).output
feature_model = Model(siamese_model.input, concat_layer)
feature_model.compile(loss='mse', optimizer='adam')
# summary() prints the table itself; wrapping it in print() only adds a stray "None".
feature_model.summary()

# The concat-layer activations of the training pairs are the inputs to the xgb and lgb models.
featurs = feature_model.predict([X_train_prod, X_train_search])

# The same features are extracted for the test pairs, so the tree models can predict on them.
features_test = feature_model.predict([X_test_prod, X_test_search])

In [None]:
# # get the output of the concat layer and use it as features to the ml models
# concat_layer = siamese_model.layers[3].output
# feature_model = Model(siamese_model.input, concat_layer)
# feature_model.compile(loss='mse', optimizer='adam')
# print(feature_model.summary())

# # we use the output of the concat layer as fetures so they will be the input to the xgb and lgb models
# featurs = feature_model.predict([X_train_left_kfold, X_train_right_kfold])

# # we preform the prediction also on the test set to evaluate the mse on test set
# features_test = feature_model.predict([X_val_left_kfold, X_val_right_kfold])

In [None]:
# XGBoost regressor trained on the siamese features
xgb_params = dict(n_estimators=100, learning_rate=0.01, gamma=0,
                  subsample=0.8, colsample_bytree=1, max_depth=7)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(featurs, y_data)
xgb_pred = xgb_model.predict(features_test)

# LightGBM regressor trained on the same features
lgb_params = dict(is_unbalance=True, learning_rate=0.01, subsample=0.8,
                  colsample_bytree=0.6, max_depth=7)
lgb_model = lgb.LGBMRegressor(**lgb_params)
lgb_model.fit(featurs, y_data)
lgb_pred = lgb_model.predict(features_test)


# xgb_rmse = np.sqrt(mean_squared_error(y_val_kfold, xgb_pred))
# lgb_rmse = np.sqrt(mean_squared_error(y_val_kfold, lgb_pred))

In [None]:
def round_pred(pred):
    """Clamp every entry of ``pred`` into the range [1, 3], in place.

    Despite the name, no rounding happens: entries below 1 become 1, entries
    above 3 become 3, and everything else is left untouched. The same
    (mutated) ``pred`` object is returned.
    """
    low, high = 1, 3
    for pos in range(len(pred)):
        current = pred[pos]
        if current < low:
            pred[pos] = low
        elif current > high:
            pred[pos] = high
    return pred

# Clamp both tree-model predictions to the valid [1, 3] range.
# astype(float) returns a copy, so round_pred's in-place edits do not touch the raw predictions.
xgb_pred = round_pred(xgb_pred.astype(float))
lgb_pred = round_pred(lgb_pred.astype(float))  # fixed: the missing "." made this a NameError

In [None]:
# print(xgb_rmse, lgb_rmse)

# Write predictions to sample submission file

In [None]:
# Submission frame: the test ids plus a 'relevance' column that is overwritten
# before each file is written.
sample_sub = pd.DataFrame({'id': test['id']})

# 1) fold-averaged siamese network predictions
sample_sub['relevance'] = pred_test
sample_sub.to_csv('sample_submmision_char_siamese.csv', index=False)

# 2) XGBoost on the siamese features
sample_sub['relevance'] = xgb_pred
sample_sub.to_csv('sample_submmision_char_xgb.csv', index=False)

# 3) LightGBM on the siamese features
sample_sub['relevance'] = lgb_pred
sample_sub.to_csv('sample_submmision_char_lgb.csv', index=False)