In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import Merge, Input, concatenate
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text

import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"

Using TensorFlow backend.


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from string import punctuation

In [3]:
data = pd.read_csv("data/model_train.csv")
print(data.shape)
data.head()

(327474, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,11881,22926,22927,Who to download GTA San Andreas without net?,How long to become air force colonel?,0
1,75170,128697,128698,How does ito integral represent a Brownian mot...,Why are Ito integrals important?,0
2,175257,76887,4072,How can I control emotional stress?,How do I gain emotional intelligence and contr...,1
3,61861,107933,44287,How did NASA get the Voyager spacecraft to int...,"In Interstellar, how did Cooper (on Earth) get...",0
4,206912,18163,17607,How do you treat canker sores or mouth ulcers?,How do you treat inflammation of the mouth wit...,1


In [4]:
data["question1"] = data["question1"].apply(lambda x: str(x))
data["question2"] = data["question2"].apply(lambda x: str(x))
data["question1"] = data["question1"].apply(lambda x: x.replace("'", ""))
data["question2"] = data["question2"].apply(lambda x: x.replace("'", ""))

### Split the dataset into train and valid

In [5]:
from sklearn.model_selection import train_test_split 
x_train, x_valid = train_test_split(data, test_size=0.1, random_state=1992)
print(x_train.shape, x_valid.shape)

(294726, 6) (32748, 6)


In [6]:
from utils import *

In [7]:
x1_train = []
process_questions(x1_train, x_train.question1.values, 'x1_train', x_train)

x2_train = []
process_questions(x2_train, x_train.question2.values, 'x2_train', x_train)

x1_valid = []
process_questions(x1_valid, x_valid.question1.values, 'x1_valid', x_valid)

x2_valid = []
process_questions(x2_valid, x_valid.question2.values, 'x2_valid', x_valid)

x1_train is 33.9% complete.
x1_train is 67.9% complete.
x2_train is 33.9% complete.
x2_train is 67.9% complete.


In [8]:
tk_train = text.Tokenizer(num_words=200000)
tk_train.fit_on_texts(x1_train+x2_train)

In [9]:
max_len = 25

x1_train = tk_train.texts_to_sequences(x1_train)
x1_train = sequence.pad_sequences(x1_train, maxlen=max_len)

x2_train = tk_train.texts_to_sequences(x2_train)
x2_train = sequence.pad_sequences(x2_train, maxlen=max_len)

x1_valid = tk_train.texts_to_sequences(x1_valid)
x1_valid = sequence.pad_sequences(x1_valid, maxlen=max_len)

x2_valid = tk_train.texts_to_sequences(x2_valid)
x2_valid = sequence.pad_sequences(x2_valid, maxlen=max_len)

In [10]:
word_index = tk_train.word_index
print(len(word_index))

76057


In [11]:
x1_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     1,
          40,  1617, 26356,  1676,   244,   251, 13509], dtype=int32)

In [12]:
x_train.question1.values[0]

'How did King Leopold II come to own  Congo?'

## Embeddings 

In [13]:
embeddings_index = {}
f = open('data/glove.840B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

2196017it [03:59, 9185.46it/s]


In [14]:
print('Found %s word vectors.' % len(embeddings_index))

Found 2195884 word vectors.


In [15]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
not_present_words = []
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    if embedding_vector is None:
        not_present_words.append(word)
        

100%|██████████| 76057/76057 [00:00<00:00, 135993.55it/s]


In [16]:
len(not_present_words)

15613

In [17]:
DROPOUT=0.1

In [20]:
question1 = Input(shape=(max_len,))
question2 = Input(shape=(max_len,))

q1 = Embedding(len(word_index) + 1, 
                 300, 
                 weights=[embedding_matrix], 
                 input_length=max_len, 
                 trainable=False)(question1)
q1 = TimeDistributed(Dense(300, activation='relu'))(q1)
q1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(300, ))(q1)

q2 = Embedding(len(word_index) + 1, 
                 300, 
                 weights=[embedding_matrix], 
                 input_length=max_len, 
                 trainable=False)(question2)
q2 = TimeDistributed(Dense(300, activation='relu'))(q2)
q2 = Lambda(lambda x: K.max(x, axis=1), output_shape=(300, ))(q2)

merged = concatenate([q1,q2])
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

In [21]:
callbacks = [ModelCheckpoint("base_model4", monitor='val_acc', save_best_only=True)]
history = model.fit([x1_train, x2_train],
                    x_train.is_duplicate.values,
                    epochs=25,
                    validation_data=([x1_valid, x2_valid], x_valid.is_duplicate.values),
                    verbose=2,
                    batch_size=32,
                    callbacks=callbacks)

Train on 294726 samples, validate on 32748 samples
Epoch 1/25
 - 137s - loss: 0.5404 - acc: 0.7259 - val_loss: 0.5185 - val_acc: 0.7338
Epoch 2/25
 - 135s - loss: 0.4886 - acc: 0.7605 - val_loss: 0.4652 - val_acc: 0.7701
Epoch 3/25
 - 135s - loss: 0.4626 - acc: 0.7770 - val_loss: 0.4590 - val_acc: 0.7753
Epoch 4/25
 - 135s - loss: 0.4397 - acc: 0.7905 - val_loss: 0.4423 - val_acc: 0.7847
Epoch 5/25
 - 135s - loss: 0.4222 - acc: 0.8012 - val_loss: 0.4367 - val_acc: 0.7879
Epoch 6/25
 - 132s - loss: 0.4077 - acc: 0.8106 - val_loss: 0.4607 - val_acc: 0.7703
Epoch 7/25
 - 137s - loss: 0.3919 - acc: 0.8193 - val_loss: 0.4348 - val_acc: 0.7929
Epoch 8/25
 - 134s - loss: 0.3779 - acc: 0.8276 - val_loss: 0.4354 - val_acc: 0.7909
Epoch 9/25
 - 136s - loss: 0.3664 - acc: 0.8344 - val_loss: 0.4218 - val_acc: 0.7971
Epoch 10/25
 - 135s - loss: 0.3537 - acc: 0.8417 - val_loss: 0.4241 - val_acc: 0.8049
Epoch 11/25
 - 135s - loss: 0.3456 - acc: 0.8468 - val_loss: 0.4142 - val_acc: 0.8052
Epoch 12/25


In [22]:
# Print best validation accuracy and epoch
max_val_acc, idx = max((val, idx) for (idx, val) in enumerate(history.history['val_acc']))
print('Maximum validation accuracy = {0:.4f} (epoch {1:d})'.format(max_val_acc, idx+1))

Maximum validation accuracy = 0.8129 (epoch 23)


In [23]:
test = pd.read_csv("data/model_test.csv")
test["question1"] = test["question1"].apply(lambda x: str(x))
test["question2"] = test["question2"].apply(lambda x: str(x))
test["question1"] = test["question1"].apply(lambda x: x.replace("'", ""))
test["question2"] = test["question2"].apply(lambda x: x.replace("'", ""))


x1_test = []
process_questions(x1_test, test.question1.values, 'x1_test', test)

x2_test = []
process_questions(x2_test, test.question2.values, 'test', test)

x1_test = tk_train.texts_to_sequences(x1_test)
x1_test = sequence.pad_sequences(x1_test, maxlen=max_len)

x2_test = tk_train.texts_to_sequences(x2_test)
x2_test = sequence.pad_sequences(x2_test, maxlen=max_len)

In [24]:
model.load_weights("base_model4")
loss, accuracy = model.evaluate([x1_test, x2_test], test.is_duplicate.values, verbose=0)
print("Test Accuracy:",round(accuracy*100))

Test Accuracy: 81.0
