In [22]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import Merge, Input, concatenate, dot, Flatten, Reshape, Bidirectional, add
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras.layers.advanced_activations import PReLU
from keras.preprocessing import sequence, text

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
data = pd.read_csv("data/model_train.csv")
print(data.shape)
data.head()

(327474, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,11881,22926,22927,Who to download GTA San Andreas without net?,How long to become air force colonel?,0
1,75170,128697,128698,How does ito integral represent a Brownian mot...,Why are Ito integrals important?,0
2,175257,76887,4072,How can I control emotional stress?,How do I gain emotional intelligence and contr...,1
3,61861,107933,44287,How did NASA get the Voyager spacecraft to int...,"In Interstellar, how did Cooper (on Earth) get...",0
4,206912,18163,17607,How do you treat canker sores or mouth ulcers?,How do you treat inflammation of the mouth wit...,1


In [3]:
data["question1"] = data["question1"].apply(lambda x: str(x))
data["question2"] = data["question2"].apply(lambda x: str(x))
data["question1"] = data["question1"].apply(lambda x: x.replace("'", ""))
data["question2"] = data["question2"].apply(lambda x: x.replace("'", ""))

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327474 entries, 0 to 327473
Data columns (total 6 columns):
id              327474 non-null int64
qid1            327474 non-null int64
qid2            327474 non-null int64
question1       327474 non-null object
question2       327474 non-null object
is_duplicate    327474 non-null int64
dtypes: int64(4), object(2)
memory usage: 15.0+ MB


In [5]:
data["is_duplicate"].value_counts()/data.shape[0]

0    0.628557
1    0.371443
Name: is_duplicate, dtype: float64

### Split the dataset into train and valid

In [6]:
from sklearn.model_selection import train_test_split 
x_train, x_valid = train_test_split(data, test_size=0.1, random_state=1992)
print(x_train.shape, x_valid.shape)

(294726, 6) (32748, 6)


In [7]:
tk_train = text.Tokenizer(num_words=200000)
tk_train.fit_on_texts(list(x_train.question1.values.astype(str))+list(x_train.question2.values.astype(str)))

In [8]:
max_len = 25

x1_train = tk_train.texts_to_sequences(x_train.question1.values)
x1_train = sequence.pad_sequences(x1_train, maxlen=max_len)

x2_train = tk_train.texts_to_sequences(x_train.question2.values.astype(str))
x2_train = sequence.pad_sequences(x2_train, maxlen=max_len)

x1_valid = tk_train.texts_to_sequences(x_valid.question1.values)
x1_valid = sequence.pad_sequences(x1_valid, maxlen=max_len)

x2_valid = tk_train.texts_to_sequences(x_valid.question2.values.astype(str))
x2_valid = sequence.pad_sequences(x2_valid, maxlen=max_len)

In [9]:
word_index = tk_train.word_index
print(len(word_index))

78990


In [10]:
x1_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     4,    55,
        1646, 26488,  1697,   266,     7,   274, 13550], dtype=int32)

In [11]:
x_train.question1.values[0]

'How did King Leopold II come to own  Congo?'

## Embeddings 

In [12]:
embeddings_index = {}
f = open('data/glove.840B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

2196017it [04:09, 8798.75it/s]


In [13]:
print('Found %s word vectors.' % len(embeddings_index))

Found 2195884 word vectors.


In [14]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
not_present_words = []
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    if embedding_vector is None:
        not_present_words.append(word)
        

100%|██████████| 78990/78990 [00:00<00:00, 130150.28it/s]


In [15]:
len(not_present_words)

18543

In [16]:
DROPOUT=0.1

In [23]:
question1 = Input(shape=(max_len,))
question2 = Input(shape=(max_len,))

q1 = Embedding(len(word_index) + 1, 
                 300, 
                 weights=[embedding_matrix], 
                 input_length=max_len, 
                 trainable=False)(question1)
q1 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q1)

q2 = Embedding(len(word_index) + 1, 
                 300, 
                 weights=[embedding_matrix], 
                 input_length=max_len, 
                 trainable=False)(question2)
q2 = Bidirectional(LSTM(128, return_sequences=True), merge_mode="sum")(q2)

attention = dot([q1,q2], [1,1])
attention = Flatten()(attention)
attention = Dense((max_len*128))(attention)
attention = Reshape((max_len, 128))(attention)

merged = add([q1,attention])
merged = Flatten()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [25]:
callbacks = [ModelCheckpoint("base_model5", monitor='val_acc', save_best_only=True)]

In [None]:
history = model.fit([x1_train, x2_train],
                    x_train.is_duplicate.values,
                    epochs=25,
                    validation_data=([x1_valid, x2_valid], x_valid.is_duplicate.values),
                    batch_size=32,
                    callbacks=callbacks)

Train on 294726 samples, validate on 32748 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25