In [20]:
from __future__ import print_function

import numpy as np
import pandas as pd
import csv, json
import os
import nltk
import re
from nltk.corpus import stopwords
from zipfile import ZipFile
from os.path import expanduser, exists

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file
import datetime, time, json
from time import time
from keras.models import Model
from keras.models import Sequential
from keras.layers import Activation, LSTM, Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers import Flatten, Bidirectional, Merge
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from sklearn.model_selection import train_test_split

In [16]:
question1 = []
question2 = []
is_duplicate = []
is_dup=[]

TRAIN_XL = 'Data/quora_duplicate_questions.xlsx'

KERAS_DATASETS_DIR = os.getcwd()
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
GLOVE_FILE = 'glove.840B.300d.txt'
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'
HIDDEN = 50
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 100000
NB_EPOCHS = 15
DROPOUT = 0.1
BATCH_SIZE = 64

In [3]:
def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text


xl = pd.ExcelFile(TRAIN_XL)
    
df1 = xl.parse("quora_duplicate_questions")
q1 = df1['question1']
q2 = df1['question2']
is_duplicate = df1['is_duplicate']


for ques in q1:
    question1.append(text_to_word_list(ques))    
for ques in q2:
    question2.append(text_to_word_list(ques))
    

print('Question pairs1: %d' % len(question1))
print('Question pairs2: %d' % len(question2))
print('dup: %d' % len(is_duplicate))

Question pairs1: 404290
Question pairs2: 404290
dup: 404290


In [4]:
print(len(is_duplicate))

404290


In [5]:
questions = question1 + question2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

In [6]:
if not exists(GLOVE_ZIP_FILE):
    zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

with open('glove.6B.300d.txt', encoding="utf8" ) as f:
    content = f.readlines()
model = {}
for line in content:
    splitLine = line.split()
    word = splitLine[0]
    embedding = np.array([float(val) for val in splitLine[1:]])
    model[word] = embedding
print ("Done.",len(model)," words loaded!")

Done. 400000  words loaded!


In [7]:
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = model.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

Null word embeddings: 24781


In [8]:
q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
#labels = np.array(is_duplicate)
labels = np.array(is_duplicate, dtype=int)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)
print(labels)

Shape of question1 data tensor: (404290, 25)
Shape of question2 data tensor: (404290, 25)
Shape of label tensor: (404290,)
[0 0 0 ..., 0 0 0]


In [9]:
X = np.stack((q1_data, q2_data), axis=1)
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]

In [10]:
max_seq_length = min(len(Q1_train),len(Q2_train),
                     len(Q1_test),len(Q2_test))
print(MAX_SEQUENCE_LENGTH)
print(nb_words)
print(word_embedding_matrix)


25
85519
[[ 0.         0.         0.        ...,  0.         0.         0.       ]
 [ 0.04656    0.21318   -0.0074364 ...,  0.0090611 -0.20989    0.053913 ]
 [-0.20017    0.14302    0.052055  ...,  0.034939  -0.12599    0.21863  ]
 ..., 
 [-0.020654   0.051946  -0.19756   ..., -0.1902     0.27514    0.45159  ]
 [ 0.31079    0.5725     0.10701   ...,  0.14536    0.5736     0.59401  ]
 [-0.43546   -0.14073   -0.26553   ...,  0.42638   -0.03747    0.2603   ]]


In [17]:
def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

question11 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question22 = Input(shape=(MAX_SEQUENCE_LENGTH,))

embed_layer = Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], input_length=MAX_SEQUENCE_LENGTH,trainable=False)

encoded_q1 = embed_layer(question11)
encoded_q2 = embed_layer(question22)


lstm_layer = Bidirectional(LSTM(HIDDEN))

question11_op = lstm_layer(encoded_q1)
question22_op = lstm_layer(encoded_q2)

malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]), output_shape=lambda x: (x[0][0], 1))([question11_op, question22_op])

model = Model([question11,question22], output=[malstm_distance])


callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])





In [18]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 25, 300)      25656000    input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 100)          140400      embedding_4[0][0]                
          

In [25]:
training_start_time = time()

malstm_trained = model.fit([Q1_train, Q2_train],
                    y_train,
                    nb_epoch=NB_EPOCHS,
                    validation_split=VALIDATION_SPLIT,
                    verbose=2,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)

print("Training time finished.\n{} epochs in {}".format(NB_EPOCHS, datetime.timedelta(seconds=time()-training_start_time)))

  if __name__ == '__main__':


Train on 327474 samples, validate on 36387 samples
Epoch 1/15
 - 634s - loss: 0.5080 - acc: 0.7730 - val_loss: 0.4721 - val_acc: 0.7946
Epoch 2/15
 - 606s - loss: 0.4652 - acc: 0.8024 - val_loss: 0.4510 - val_acc: 0.8089
Epoch 3/15
 - 521s - loss: 0.4443 - acc: 0.8154 - val_loss: 0.4427 - val_acc: 0.8127
Epoch 4/15
 - 678s - loss: 0.4297 - acc: 0.8240 - val_loss: 0.4353 - val_acc: 0.8179
Epoch 5/15
 - 614s - loss: 0.4182 - acc: 0.8292 - val_loss: 0.4306 - val_acc: 0.8224
Epoch 6/15
 - 588s - loss: 0.4075 - acc: 0.8354 - val_loss: 0.4278 - val_acc: 0.8228
Epoch 7/15
 - 536s - loss: 0.3994 - acc: 0.8402 - val_loss: 0.4247 - val_acc: 0.8250
Epoch 8/15
 - 571s - loss: 0.3922 - acc: 0.8445 - val_loss: 0.4233 - val_acc: 0.8244
Epoch 9/15
 - 570s - loss: 0.3856 - acc: 0.8480 - val_loss: 0.4236 - val_acc: 0.8275
Epoch 10/15
 - 565s - loss: 0.3794 - acc: 0.8512 - val_loss: 0.4221 - val_acc: 0.8274
Epoch 11/15
 - 565s - loss: 0.3742 - acc: 0.8540 - val_loss: 0.4208 - val_acc: 0.8293
Epoch 12/15


NameError: name 'epoch' is not defined

In [27]:
max_val_acc, idx = max((val, idx) for (idx, val) in enumerate(malstm_trained.history['val_acc']))
print('Maximum accuracy at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(max_val_acc))

Maximum accuracy at epoch 13 = 0.8308


In [29]:
model.load_weights(MODEL_WEIGHTS_FILE)
loss, accuracy = model.evaluate([Q1_test, Q2_test], y_test, verbose=0)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 0.4361, accuracy = 0.8239
