### Using LSTM Embedding

In [8]:
# importing libraries
import numpy as np
import pandas as pd
import tensorflow

In [9]:
# Load the training pairs and store the `id` column as Python strings.
df_train = pd.read_csv('train.csv', encoding='utf-8')
df_train['id'] = df_train['id'].map(str)

In [10]:
# Load the test pairs and store the `test_id` column as Python strings.
df_test = pd.read_csv('test.csv', encoding='utf-8')
df_test['test_id'] = df_test['test_id'].map(str)

In [11]:
# Stack the train and test rows into one frame so that the vocabulary and the
# text preprocessing are shared by both sets. Later cells tell the two sets
# apart with `notnull()` on the `id` / `test_id` columns.
#
# Missing questions become empty strings. The result is assigned back rather
# than calling `fillna(..., inplace=True)` on a column selection: that form
# operates on an intermediate object, is deprecated in recent pandas, and
# leaves `df_all` untouched when copy-on-write is enabled -- which would let
# NaN reach `s.lower()` in create_padded_seqs.
df_all = pd.concat([df_train, df_test]).fillna({'question1': '', 'question2': ''})

### CREATE VOCABULARY

In [12]:
from sklearn.feature_extraction.text import CountVectorizer  #Convert a collection of text documents to a matrix of token counts
import itertools

In [13]:
# Learn the vocabulary from both question columns in a single pass, capped at
# 10000 - 1 terms through CountVectorizer's `max_features`.
counts_vectorizer = CountVectorizer(max_features=10000 - 1)
counts_vectorizer.fit(itertools.chain(df_all['question1'], df_all['question2']))

# One past the largest vocabulary index; create_padded_seqs assigns this index
# to every token that is not in the vocabulary.
other_index = len(counts_vectorizer.vocabulary_)

### PREPARING DATA

In [14]:
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [15]:
# Compile the vectorizer's own token regex so that create_padded_seqs splits
# text into words with the same pattern the vocabulary was built with.
words_tokenizer = re.compile(counts_vectorizer.token_pattern)

In [16]:
def create_padded_seqs(texts, max_len=10):
    """Encode each text as a fixed-length sequence of vocabulary indices.

    Every text is lower-cased and split with `words_tokenizer`. Each token is
    replaced by its index in `counts_vectorizer.vocabulary_`, or by
    `other_index` when the token is not in the vocabulary. The per-text index
    lists are then handed to Keras' `pad_sequences` with `maxlen=max_len`
    (all other `pad_sequences` arguments are left at their defaults).

    Args:
        texts: pandas Series of strings (elements must support `.lower()`).
        max_len: target sequence length, forwarded as `maxlen`.

    Returns:
        The array produced by `pad_sequences`, one row per input text.

    NOTE(review): the padding value is presumably 0, which is also a valid
    vocabulary index -- confirm whether that collision matters for the model.
    """
    vocab = counts_vectorizer.vocabulary_

    def encode(text):
        # Map tokens to indices; unknown tokens share the `other_index` bucket.
        return [vocab.get(token, other_index)
                for token in words_tokenizer.findall(text.lower())]

    return pad_sequences(texts.apply(encode), maxlen=max_len)


In [17]:
# Work on a random subsample of at most 1000 rows to keep the notebook fast.
# `random_state` makes the draw repeatable (the original unseeded call gave
# different rows, and therefore different results, on every run), and `min`
# avoids the ValueError that `sample` raises when asked for more rows than
# the frame has.
# NOTE(review): the sample is drawn from train and test rows combined, so only
# part of it is labelled (the recorded fit output shows 95 + 42 labelled rows).
df_all = df_all.sample(n=min(1000, len(df_all)), random_state=1989)


### Padding the sequences with zeros so both questions have equal length, then splitting into training and validation sets

In [18]:
# Labelled (training) rows are the ones with a non-null `id`. Filter once and
# reuse the result instead of recomputing the same mask for every argument.
df_labeled = df_all[df_all['id'].notnull()]
y_labeled = df_labeled['is_duplicate'].values

# 70/30 split, stratified on the label so both parts keep the same class
# balance. The three arrays are split together, so rows stay aligned.
# Parentheses replace the backslash continuation, which breaks with a
# SyntaxError as soon as any whitespace follows the backslash.
(X1_train, X1_val,
 X2_train, X2_val,
 y_train, y_val) = train_test_split(
    create_padded_seqs(df_labeled['question1']),
    create_padded_seqs(df_labeled['question2']),
    y_labeled,
    stratify=y_labeled,
    test_size=0.3,
    random_state=1989,
)

### TRAINING

In [19]:
import keras.layers as lyr
from keras.models import Model

In [20]:
# One input per question; each sample is a padded sequence of token indices.
input1_tensor = lyr.Input(X1_train.shape[1:])
input2_tensor = lyr.Input(X2_train.shape[1:])

# Token indices produced by create_padded_seqs range from 0 up to and
# including `other_index`, so the embedding table needs `other_index + 1`
# rows. Sizing it from `X1_train.max() + 1` (as before) is only correct when
# the first-question training split happens to contain the largest index;
# otherwise second-question, validation or test sequences index past the end
# of the table.
words_embedding_layer = lyr.Embedding(other_index + 1, 100)
seq_embedding_layer = lyr.LSTM(256, activation='tanh')


def seq_embedding(tensor):
    """Embed a token-index tensor and encode it with the shared LSTM.

    Both questions go through the same embedding and LSTM layer objects, so
    the two branches share their weights.
    """
    return seq_embedding_layer(words_embedding_layer(tensor))


# Element-wise product of the two question encodings.
merge_layer = lyr.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])

dense1_layer = lyr.Dense(16, activation='sigmoid')(merge_layer)
output_layer = lyr.Dense(1, activation='sigmoid')(dense1_layer)
ouput_layer = output_layer  # backward-compatible alias for the original misspelled name

model = Model([input1_tensor, input2_tensor], output_layer)

# Single sigmoid output trained with binary cross-entropy (i.e. log loss).
model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10, 100)      1000000     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 256)          365568      embedding_1[0][0]                
          

### Fitting the model and calculating log loss over 6 epochs

In [21]:
# Train for 6 epochs in batches of 128, reporting the loss on the held-out
# validation pairs after every epoch (verbose=2 prints one line per epoch).
model.fit(
    x=[X1_train, X2_train],
    y=y_train,
    batch_size=128,
    epochs=6,
    verbose=2,
    validation_data=([X1_val, X2_val], y_val),
)

Train on 95 samples, validate on 42 samples
Epoch 1/6
 - 42s - loss: 0.8073 - val_loss: 0.8098
Epoch 2/6
 - 0s - loss: 0.8046 - val_loss: 0.8071
Epoch 3/6
 - 0s - loss: 0.8020 - val_loss: 0.8044
Epoch 4/6
 - 0s - loss: 0.7994 - val_loss: 0.8017
Epoch 5/6
 - 0s - loss: 0.7968 - val_loss: 0.7991
Epoch 6/6
 - 0s - loss: 0.7943 - val_loss: 0.7964


<keras.callbacks.History at 0x2a9025b0e10>

### Extract Features From Model

In [22]:
# Second model over the same input tensors that stops at `merge_layer`, so it
# outputs the merged question encoding instead of the final prediction. It is
# built from the same layer objects as `model`, hence it uses their weights.
features_model = Model(inputs=[input1_tensor, input2_tensor], outputs=merge_layer)
features_model.compile(optimizer='adam', loss='mse')

In [23]:
# Run the training and validation pairs through the feature extractor to get
# one feature vector (the output of `merge_layer`) per question pair.
F_train = features_model.predict([X1_train, X2_train], batch_size=128)
F_val = features_model.predict([X1_val, X2_val], batch_size=128)

In [24]:
import xgboost as xgb



In [25]:
# Wrap the extracted features and their labels in XGBoost's DMatrix container.
dTrain = xgb.DMatrix(F_train, label=y_train)
dVal = xgb.DMatrix(F_val, label=y_val)

In [26]:
# Booster configuration: gradient-boosted trees on a binary logistic
# objective, evaluated with log loss.
xgb_params = dict(
    objective='binary:logistic',
    booster='gbtree',
    eval_metric='logloss',
    eta=0.1,
    max_depth=9,
    subsample=0.9,
    # Column-sampling ratio of 1 / sqrt(number of features).
    colsample_bytree=1 / F_train.shape[1]**0.5,
    min_child_weight=5,
    silent=1,
)

# Evaluation sets reported during training. The recorded output shows that
# the last entry ('val') is the one used for early stopping.
watchlist = [(dTrain, 'train'), (dVal, 'val')]

# Up to 1000 boosting rounds, logging every 10th round and stopping once the
# validation log loss has not improved for 10 rounds.
bst = xgb.train(
    params=xgb_params,
    dtrain=dTrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=10,
)

[0]	train-logloss:0.677829	val-logloss:0.6942
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 10 rounds.
[10]	train-logloss:0.556592	val-logloss:0.676909
[20]	train-logloss:0.468888	val-logloss:0.653811
[30]	train-logloss:0.393451	val-logloss:0.634305
[40]	train-logloss:0.34712	val-logloss:0.619389
[50]	train-logloss:0.307175	val-logloss:0.615066
[60]	train-logloss:0.280667	val-logloss:0.618676
Stopping. Best iteration:
[51]	train-logloss:0.304577	val-logloss:0.613718



In [27]:
# Test rows are the ones with a non-null `test_id`; select them once and
# encode both question columns the same way as the training data.
df_test_rows = df_all[df_all['test_id'].notnull()]
X1_test = create_padded_seqs(df_test_rows['question1'])
X2_test = create_padded_seqs(df_test_rows['question2'])

In [28]:
# Extract the `merge_layer` features for the test pairs.
F_test = features_model.predict([X1_test, X2_test], batch_size=128)

In [29]:
# Unlabelled DMatrix holding the test features, used only for prediction.
dTest = xgb.DMatrix(F_test)

In [30]:
# Build the submission table: one predicted duplicate probability per test
# row, indexed by `test_id`. Predictions use the tree limit recorded by early
# stopping (`best_ntree_limit`).
test_rows = df_all['test_id'].notnull()
test_predictions = bst.predict(dTest, ntree_limit=bst.best_ntree_limit)

df_sub = pd.DataFrame({
    'test_id': df_all.loc[test_rows, 'test_id'].values,
    'is_duplicate': test_predictions,
}).set_index('test_id')

In [31]:
# Preview the first rows of the submission table.
df_sub.head()

Unnamed: 0_level_0,is_duplicate
test_id,Unnamed: 1_level_1
1086363,0.181149
1870076,0.46009
870186,0.386014
225585,0.275356
733418,0.379834


In [32]:
# Histogram (100 bins) of the predicted `is_duplicate` values for the test rows.
df_sub['is_duplicate'].hist(bins=100)

<matplotlib.axes._subplots.AxesSubplot at 0x2a90cb77f60>

The text in the document by <Shireen Rabbani, Rohit Ulhe> is licensed under CC BY 3.0 https://creativecommons.org/licenses/by/3.0/us/
The code in the document by <Shireen Rabbani, Rohit Ulhe> is licensed under the MIT License https://opensource.org/licenses/MIT