In [None]:
import datetime
import itertools
import re
import string
from time import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import keras.backend as K
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Embedding, LSTM, Merge, Bidirectional
from keras.models import Model
from keras.optimizers import Adadelta
from keras.preprocessing.sequence import pad_sequences

In [None]:
# File paths
# CSVs read with pd.read_csv; the code below uses their 'question1' and
# 'question2' columns, plus 'is_duplicate' from the training file.
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'
# Pre-trained word vectors, loaded through KeyedVectors.load_word2vec_format(..., binary=True)
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
# Target of the ModelCheckpoint callback and source for load_weights() in the last cell
MODEL_WEIGHTS_FILE = 'model_weights.h5'

In [None]:
# Hyper parameters
# Number of columns of the embedding matrix and output size of the Embedding layer
# (presumably equal to the dimensionality of the vectors in EMBEDDING_FILE - confirm)
embedding_dim = 300
# Number of training rows held out by train_test_split as the validation split
validation_size = 40000
# Model variables
# Units of the LSTM that is wrapped in Bidirectional
hidden = 50
# NOTE(review): gradient_clipping_norm and learning_rate are not referenced by the
# visible code - the model is compiled with optimizer='adam' and default settings.
gradient_clipping_norm = 1.25
learning_rate = 0.01
# Mini-batch size passed to fit()
batch_size = 64
# Number of training epochs passed to fit()
epoch = 20

In [None]:
# Read the training and test CSVs into DataFrames
train_df, test_df = [pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV)]

# NLTK's English stop words, held in a set for constant-time membership tests
stops = set()
stops.update(stopwords.words('english'))

# Ordered (compiled pattern, replacement) rules applied one after another by
# text_to_word_list. Order matters: every rule sees the output of the previous one.
_CLEANUP_RULES = tuple(
    (re.compile(pattern), replacement) for pattern, replacement in (
        # Clean the text: blank out every character outside the allowed set.
        # Inside the class, `+-=` is a range from '+' to '=', so ':', ';' and '<'
        # also survive this step.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"can't", "cannot "),
        (r"haven't", "have not "),
        (r"i'm", "i am "),
        (r"\'s", " "),
        (r"\'ll", " will "),
        (r"i'll", "i will "),
        (r"doesn't", "does not"),
        (r"hasn't", "has not"),
        (r"you've", "you have"),
        (r"don't", "do not"),
        (r"aren't", "are not"),
        (r"wasn't", "was not"),

        # Expand the remaining contractions
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"n't", " not "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),

        # Space out special tokens
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # `\0` is the NUL character in regex syntax, so this matches NUL followed by 's'.
        # NOTE(review): possibly meant to turn a literal "0s" into "0" - confirm before changing.
        (r"\0s", "0"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        # Collapse runs of whitespace
        (r"\s{2,}", " "),
    )
)

# Matches any single character of string.punctuation; compiled once instead of on every call
_PUNCT_RE = re.compile('[{0!s}]'.format(re.escape(string.punctuation)))


def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words

    The input is converted with str() (so non-string cells such as NaN become
    their string form), lower-cased, passed through _CLEANUP_RULES in order,
    tokenised with nltk.word_tokenize, and stripped of punctuation characters.

    :param text: value to tokenise; anything accepted by str()
    :return: list of non-empty tokens free of string.punctuation characters
    '''
    text = str(text).lower()
    for pattern, replacement in _CLEANUP_RULES:
        text = pattern.sub(replacement, text)

    # Remove punctuation from each token and drop tokens that end up empty
    stripped_tokens = (_PUNCT_RE.sub(u'', token) for token in nltk.word_tokenize(text))
    return [token for token in stripped_tokens if token != u'']


# Prepare embedding
vocabulary = dict()  # word -> integer id; ids start at 1 because index 0 is the placeholder row
unk_vocab = ['<unk>']  # '<unk>' will never be used, it is only a placeholder for the [0, 0, ....0] embedding
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

questions_cols = ['question1', 'question2']


def _encode_question(text):
    ''' Convert one question into a list of vocabulary ids

    New words are registered in `vocabulary` / `unk_vocab` as they are met.
    Stop words that have no pre-trained vector are skipped. Relies on the
    module-level `word2vec`, so it is only usable before `del word2vec` below.
    '''
    qn = []  # qn -> question numbers representation
    for word in text_to_word_list(text):
        # `in` on KeyedVectors tests vocabulary membership without touching the
        # version-specific `.vocab` attribute
        if word in stops and word not in word2vec:
            continue

        if word not in vocabulary:
            vocabulary[word] = len(unk_vocab)
            unk_vocab.append(word)
        qn.append(vocabulary[word])
    return qn


# Iterate over the questions only of both training and test datasets
for dataset in [train_df, test_df]:
    encoded = {question: [] for question in questions_cols}

    # Walk the rows in order, visiting question1 then question2 of each row, so that
    # ids are handed out in the same order as a row-by-row pass over the frame
    for texts in zip(*(dataset[question] for question in questions_cols)):
        for question, text in zip(questions_cols, texts):
            encoded[question].append(_encode_question(text))

    # Replace questions as word to question as number representation.
    # One column assignment per question replaces the per-cell DataFrame.set_value
    # calls (set_value no longer exists in current pandas).
    for question in questions_cols:
        dataset[question] = pd.Series(encoded[question], index=dataset.index)


embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
embeddings[0] = 0  # So that the padding will be ignored

# Build the embedding matrix: words with a pre-trained vector get it, the rest keep
# their random initialisation
for word, index in vocabulary.items():
    if word in word2vec:
        embeddings[index] = word2vec[word]

del word2vec

In [None]:
# Longest encoded question over both columns of both datasets
max_seq_length = max(frame[column].map(len).max()
                     for frame in (train_df, test_df)
                     for column in ('question1', 'question2'))

print(max_seq_length)
# Number of rows left for training once the validation rows are held out
training_size = len(train_df) - validation_size

# Features are the two encoded question columns, the label is the duplicate flag
X = train_df[questions_cols]
y = train_df['is_duplicate']

# Hold out `validation_size` rows for validation
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=validation_size)

# One entry per side of the siamese network
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_validation = {'left': X_validation.question1, 'right': X_validation.question2}
X_test = {'left': test_df.question1, 'right': test_df.question2}

# Labels as plain numpy arrays
y_train = y_train.values
y_validation = y_validation.values

# Zero padding of the train and validation inputs up to max_seq_length
for dataset in (X_train, X_validation):
    for side in ('left', 'right'):
        dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

# Sanity checks: both sides have the same shape and line up with the labels
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(y_train)

In [None]:
# Similarity measure built on the Manhattan (L1) distance
def exponent_neg_manhattan_distance(left, right):
    ''' Return exp(-sum(|left - right|)) taken over axis 1, keeping that axis with size 1 '''
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

# The visible layer: one integer-id sequence per question
question_1 = Input(shape=(max_seq_length,), dtype='int32')
question_2 = Input(shape=(max_seq_length,), dtype='int32')

# Embedding lookup initialised from the pre-built matrix and kept frozen (trainable=False)
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(question_1)
encoded_right = embedding_layer(question_2)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = Bidirectional(LSTM(hidden))

question_1_output = shared_lstm(encoded_left)
question_2_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model; the output is one value per sample.
# NOTE(review): `Merge(mode=...)` and `nb_epoch` below are legacy Keras names; newer Keras
# releases use a `Lambda` layer and `epochs` - confirm the Keras version this notebook targets.
malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]), output_shape=lambda x: (x[0][0], 1))([question_1_output, question_2_output])

# Create a Model
malstm = Model([question_1, question_2], [malstm_distance])

# Save the weights file whenever validation accuracy improves
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]

malstm.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

# Start training
training_start_time = time()

# The keyword is `callbacks` (plural); with `callback=` the checkpoint list is not
# accepted by fit() and no weights file is written.
malstm_trained = malstm.fit([X_train['left'], X_train['right']], y_train, batch_size=batch_size, nb_epoch=epoch,
                            validation_data=([X_validation['left'], X_validation['right']], y_validation),
                            callbacks=callbacks)

print("Training time finished.\n{} epochs in {}".format(epoch, datetime.timedelta(seconds=time()-training_start_time)))

In [None]:
# Restore the best checkpoint saved during training and score it on the held-out
# validation split. The validation inputs are the ones that were padded and that
# y_validation belongs to; X_test has no labels in this notebook and is not padded.
malstm.load_weights(MODEL_WEIGHTS_FILE)
loss, accuracy = malstm.evaluate([X_validation['left'], X_validation['right']], y_validation, verbose=0)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))