In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from keras.models import Model
from keras import optimizers
from keras.models import Sequential
from keras.layers import LSTM, Input, Embedding, Lambda
from keras.layers.normalization import BatchNormalization
import keras.backend as K

import datetime
from time import time

from keras.optimizers import Adadelta
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Read the data

In [None]:
# Load the three competition CSV files; all of them are decoded as ISO-8859-1.
csv_encoding = "ISO-8859-1"
train = pd.read_csv('../input/train.csv', encoding=csv_encoding)
test = pd.read_csv('../input/test.csv', encoding=csv_encoding)
prod_desc = pd.read_csv('../input/product_descriptions.csv', encoding=csv_encoding)

# Report the (rows, columns) shape of every loaded frame.
for label, frame in (('train size', train),
                     ('test size', test),
                     ('product description size', prod_desc)):
    print(label, frame.shape)

In [None]:
# Stack train and test so both go through identical preprocessing.
# The first `train_size` rows of the stacked frame are the training rows.
train_size = len(train)
df_all = pd.concat([train, test], axis=0, ignore_index=True)

# Left join: every train/test row is kept and gets the description of its product_uid.
df_all = df_all.merge(prod_desc, on='product_uid', how='left')

# One text field per row: "<title> <description>"; the two source columns are then dropped.
df_all['product_full_info'] = df_all['product_title'] + ' ' + df_all['product_description']
df_all = df_all.drop(columns=['product_title', 'product_description'])

In [None]:
# Preview the first rows of the stacked train+test frame after the merge.
df_all.head()

# Split product_full_info and search_term into lists of characters

In [None]:
# Characters that are dropped from the text before it is encoded: whitespace,
# punctuation, and a set of control / non-ASCII characters.
# NOTE(review): the non-ASCII entries are presumably artifacts of reading the
# CSVs as ISO-8859-1 — confirm against the raw data.
char_to_remove = [' ', '-', '{', '}', '"', '(', ')', '.', ',', ':', '&','[',']','`','_','\'', '~', 
                  '\x80', '\x81', '\x84', '\x89', '\x8b', '\x90', '\x93', '\x95', '\x99', '\x9a', 
                  '\x9d', '\xa0', '¡', '¢', 'ª', '°', '²', 'À', 'Â', 'È', 'Ê', 'Ë', 'Ï', 'Ò', 'Û', 
                  'Ü', 'â', 'ã', 'å', 'è', '÷']

def lower_char(char):
    """Lower-case `char` when it lies in the range 'A'..'Z'; return it unchanged otherwise.

    Only the ASCII capitals are affected: any other character (digits, symbols,
    accented letters) is passed through as-is.
    """
    return char.lower() if 'A' <= char <= 'Z' else char

def _to_clean_chars(sent):
    """Turn a string into a list of its characters, skipping those in
    `char_to_remove` and passing every kept character through `lower_char`."""
    return [lower_char(character) for character in sent if character not in char_to_remove]

# Both text columns get the same treatment: string -> list of cleaned characters.
for text_column in ('product_full_info', 'search_term'):
    df_all[text_column] = df_all[text_column].apply(_to_clean_chars)

In [None]:
# Preview: product_full_info and search_term now hold lists of characters.
df_all.head()

# Get all unique chars to perform label encoding

In [None]:
# Collect every distinct character that occurs in either text column.
prod_all_sentences = df_all['product_full_info']
search_all_sentences = df_all['search_term']

# A running set is used rather than Series.append + np.concatenate:
# Series.append no longer exists in pandas >= 2.0, and concatenating every
# character list into a single array needs memory proportional to the whole corpus.
unique_char_set = set()
for sentences in (prod_all_sentences, search_all_sentences):
    for char_list in sentences:
        unique_char_set.update(char_list)

# Same content and order np.unique would produce (sorted by code point),
# with the '<uniq>' placeholder token prepended as the first entry.
all_unique_chars = np.append(['<uniq>'], np.array(sorted(unique_char_set), dtype=str))

In [None]:
# Display the vocabulary: the '<uniq>' placeholder followed by every distinct character.
all_unique_chars

# Replace tokens with integer ids (label encoding)

In [None]:
def token_to_num(data, unique_valus):
    """Replace every token in the two text columns with its integer id.

    The id of a token is its position in `unique_valus` (duplicates are ignored
    after their first occurrence). Ids are assigned by position rather than by
    sorted order so that a placeholder the caller puts first (here '<uniq>')
    really receives id 0. Sequences are later zero-padded, and with a sorted
    encoding any character that sorts before '<' (e.g. a digit) would take
    id 0 and become indistinguishable from padding.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'product_full_info' and 'search_term' columns, each cell
        holding a list of tokens. The frame is modified in place.
    unique_valus : iterable of str
        Vocabulary; every token found in `data` must be contained in it.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with each cell replaced by a numpy integer array.

    Raises
    ------
    ValueError
        If a token that is not part of `unique_valus` is encountered.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    token_ids = {token: idx for idx, token in enumerate(dict.fromkeys(unique_valus))}

    def encode(char_list):
        try:
            return np.array([token_ids[token] for token in char_list], dtype=np.int64)
        except KeyError as err:
            raise ValueError('token_to_num: unseen token %r' % (err.args[0],)) from err

    for column in ('product_full_info', 'search_term'):
        data[column] = data[column].apply(encode)

    return data

# Encode both character-list columns as integer ids
# (token_to_num modifies the frame it is given and returns that same frame).
df_all = token_to_num(df_all, all_unique_chars)

In [None]:
# Preview: the two text columns now hold integer id sequences.
df_all.head()

# Check the maximum product description and search term lengths to know how much padding is needed

In [None]:
def get_max_length(data):
    """Return the length of the longest element of `data`, or 0 if `data` is empty.

    `data` is accessed positionally through `.iloc`, so it is expected to be a
    pandas Series whose elements support `len()`.
    """
    lengths = (len(data.iloc[position]) for position in range(len(data)))
    return max(lengths, default=0)

# Longest id sequence on each side (product text, search term).
max_length_prod, max_length_search = (
    get_max_length(df_all[column]) for column in ('product_full_info', 'search_term')
)

In [None]:
# Show the two sequence lengths computed above.
for label, value in (('max_length_prod', max_length_prod),
                     ('max_length_search', max_length_search)):
    print(label, value)

# Split back to train and test sets

In [None]:
# Rows [0, train_size) of df_all are the training rows; the remainder are the test rows.
df_train = df_all.iloc[:train_size]
df_test = df_all.iloc[train_size:].reset_index(drop=True)

# Target is 'relevance'; the model inputs exclude both 'id' and 'relevance'.
non_input_columns = ['id', 'relevance']
y = df_train['relevance']
X = df_train.drop(columns=non_input_columns)
X_test = df_test.drop(columns=non_input_columns)

# Split to validation set

In [None]:
# Hold out 20% of the training rows as a validation set.
# The seed is fixed so the split — and every validation number computed from
# it — is identical after a kernel restart.
split_seed = 42
validation_size = int(0.2 * X.shape[0])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=validation_size,
                                                  random_state=split_seed)

In [None]:
# Preview the training features after the train/validation split.
X_train.head()

# Split into two sides:
1. search_term
2. product_description + title

In [None]:
def _split_sides(frame):
    """Return the two model inputs taken from `frame`:
    'left' is the product_full_info column, 'right' is the search_term column."""
    return {'left': frame['product_full_info'], 'right': frame['search_term']}

X_train = _split_sides(X_train)
X_val = _split_sides(X_val)
X_test = _split_sides(df_test)

In [None]:
# Swap the label Series for their underlying numpy arrays
y_train, y_val = y_train.values, y_val.values

# Zero-pad each char list to the maximum length of its side

In [None]:
# Bring every sequence to one fixed length per side:
# product text -> max_length_prod, search term -> max_length_search.
side_widths = (('left', max_length_prod), ('right', max_length_search))
for dataset in (X_train, X_val, X_test):
    for side, width in side_widths:
        dataset[side] = pad_sequences(dataset[side], maxlen=width)

In [None]:
# Shapes of the padded training inputs, followed by the label array.
for array in (X_train['left'], X_train['right'], y_train):
    print(array.shape)

# Build the model

In [None]:
# Model variables
n_hidden = 50  # number of units of the LSTM layer shared by both sides
gradient_clipping_norm = 1.25  # passed to Adadelta as `clipnorm`
batch_size = 64  # mini-batch size passed to fit()
n_epoch = 2  # number of epochs passed to fit()
embedding_dim = 5  # output dimension of the character embedding layers

# Similarity score computed from the two LSTM outputs
def exponent_neg_manhattan_distance(left, right):
    """Map the L1 distance between `left` and `right` to a score in (1, 3].

    The distance is summed over axis 1 (keeping that axis with size 1), turned
    into exp(-distance), which lies in (0, 1], and then rescaled as 2 * s + 1.
    Identical inputs therefore score 3; the score approaches 1 as they diverge.
    NOTE(review): the (1, 3] range presumably mirrors the range of the
    relevance labels — confirm against the training data.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    similarity = K.exp(-l1_distance)
    return (similarity * 2) + 1

# Input layers: one fixed-length int32 id sequence per side
# (left = product text, right = search term).
left_input = Input(shape=(max_length_prod,), dtype='int32')
right_input = Input(shape=(max_length_search,), dtype='int32')

# Embedding layers: each side has its own layer (the weights are not shared),
# both sized for the full vocabulary and producing embedding_dim-sized vectors.
embedding_layer_prod = Embedding(input_dim=len(all_unique_chars), output_dim=embedding_dim, 
                                 input_length=max_length_prod)
embedding_layer_search = Embedding(input_dim=len(all_unique_chars), output_dim=embedding_dim, 
                                   input_length=max_length_search)

# Embedded version of the inputs
encoded_left = embedding_layer_prod(left_input)
encoded_right = embedding_layer_search(right_input)

# LSTM layer: a single instance is applied to both sides, so both sides use the same weights
shared_lstm = LSTM(n_hidden)
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model;
# the declared output shape is (batch, 1).
malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
                         output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model: two inputs (left, right), one similarity output
siamese_model = Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

# Compile with mean squared error as the loss.
# NOTE(review): 'accuracy' is tracked although the loss is MSE on a numeric
# target — confirm that this metric is meaningful for this task.
siamese_model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Train on the (left, right) training arrays, validate on the held-out arrays,
# and print the elapsed wall-clock time. `history` keeps the object returned by fit().
training_start_time = time()
history = siamese_model.fit([X_train['left'], X_train['right']], y_train, batch_size=batch_size, epochs=n_epoch,
                            validation_data=([X_val['left'], X_val['right']], y_val))
print(datetime.timedelta(seconds=time()-training_start_time))

In [None]:
# The per-epoch metrics live on `history`, the object returned by siamese_model.fit(...).
training_log = history.history

# Depending on the Keras version the accuracy metric is logged as 'acc' or 'accuracy'.
acc_key = 'acc' if 'acc' in training_log else 'accuracy'


def _plot_curves(metric_key, title, ylabel, legend_loc):
    """Plot the train and validation curves of one logged metric against the epoch index."""
    plt.plot(training_log[metric_key])
    plt.plot(training_log['val_' + metric_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc=legend_loc)
    plt.show()


# Plot accuracy
_plot_curves(acc_key, 'Model Accuracy', 'Accuracy', 'upper left')

# Plot loss
_plot_curves('loss', 'Model Loss', 'Loss', 'upper right')

In [None]:
# X_test is a dict keyed 'left'/'right', which does not match the model's input
# names; pass the arrays as a list in the model's input order (left, right),
# exactly as done for fit().
pred = siamese_model.predict([X_test['left'], X_test['right']])

In [None]:
# Submission frame: the test ids next to the predicted relevance.
sample_sub = pd.DataFrame({'id': test['id']})
sample_sub['relevance'] = pred