I plan to use Word2Vec to convert each question into a single vector (the average of its word vectors). Then I will use a Siamese neural network to detect whether the two questions in a pair are duplicates.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pal = sns.color_palette()

# Report the size (in MB) of every non-zip entry in the input directory.
print('# File sizes')
for entry in os.listdir('../input'):
    if 'zip' in entry:
        continue
    size_mb = round(os.path.getsize('../input/' + entry) / 1000000, 2)
    print(entry.ljust(30) + str(size_mb) + 'MB')

In [None]:
# Load the training question pairs from the input directory and preview
# the first rows.
df_train = pd.read_csv('../input/train.csv')
df_train.head()

We are given a minimal number of data fields here, consisting of:

* **id:** Looks like a simple rowID
* **qid{1, 2}:** The unique ID of each question in the pair
* **question{1, 2}:** The actual textual contents of the questions.
* **is_duplicate:** The label that we are trying to predict - whether the two questions are duplicates of each other.


In [None]:
# Headline statistics for the training pairs.
pair_count = len(df_train)
duplicate_pct = round(df_train['is_duplicate'].mean()*100, 2)
print('Total number of question pairs for training: {}'.format(pair_count))
print('Duplicate pairs: {}%'.format(duplicate_pct))

# Pool both qid columns so that questions, not pairs, are counted.
qids = pd.Series(df_train['qid1'].tolist() + df_train['qid2'].tolist())
unique_question_count = len(np.unique(qids))
repeated_question_count = np.sum(qids.value_counts() > 1)
print('Total number of questions in the training data: {}'.format(unique_question_count))
print('Number of questions that appear multiple times: {}'.format(repeated_question_count))

In [None]:
# Load the test question pairs from the input directory and preview the
# first rows.
df_test = pd.read_csv('../input/test.csv')
df_test.head()

Encode the questions as UTF-8

In [None]:
# Coerce every question to str (so a missing value becomes the text 'nan')
# and encode it as UTF-8. On Python 3 the columns hold bytes afterwards.
def _to_utf8(value):
    """Return the UTF-8 encoding of str(value)."""
    return str(value).encode("utf-8")

for frame in (df_train, df_test):
    for column in ('question1', 'question2'):
        frame[column] = frame[column].apply(_to_utf8)

In [None]:
import gensim
import sys
from tqdm import tqdm

In [None]:
### Train a Word2Vec model using Gensim

# Pool both question columns of the training set into one corpus.
raw_questions = list(df_train['question1']) + list(df_train['question2'])

# Tokenize every question (accents stripped, lower-cased), with a progress bar.
questions = [
    list(gensim.utils.tokenize(text, deacc=True, lower=True))
    for text in tqdm(raw_questions)
]

# Train 300-dimensional vectors on the tokenized corpus.
model = gensim.models.Word2Vec(
    questions,
    size=300,
    workers=16,
    iter=10,
    negative=20,
)

# Trim memory (replace=True asks gensim to overwrite the stored vectors).
model.init_sims(replace=True)

# Build a plain token -> vector dictionary from the trained model.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))
print("Number of tokens in Word2Vec:", len(w2v))


In [None]:
# save model: write the full gensim model and a binary word2vec-format copy
# of the vectors, then reload the model from disk.
model.save('3_word2vec.mdl')
model.wv.save_word2vec_format('3_word2vec.bin', binary=True)
model = gensim.models.Word2Vec.load('3_word2vec.mdl')  # NOTE(review): init_sims(replace=True) was called before saving; gensim documents that training cannot continue after that - confirm before retraining this model
# Sanity check: display the vector stored for the word 'computer'.
model.wv['computer']

In [None]:
def make_question_vectors(model, sentence):
    """Average the word vectors of a tokenized question into one vector.

    Parameters
    ----------
    model : trained embedding model
        Either a gensim ``Word2Vec`` (its ``.wv`` keyed vectors are used for
        lookups) or any mapping-like object that raises ``KeyError`` for
        unknown words.
    sentence : iterable of str or bytes
        Tokens of one question, in the same style as the sentences fed to
        Word2Vec. Bytes tokens are decoded as UTF-8 before lookup, because a
        bytes key never matches a text-keyed vocabulary and every such
        question would silently collapse to the zero vector.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all tokens found in the vocabulary, or an
        all-zero vector of the model's dimensionality when no token has a
        usable (non-NaN) vector.
    """
    # Prefer the keyed vectors when present; indexing the Word2Vec object
    # itself is a deprecated shortcut for model.wv[word].
    vectors = getattr(model, 'wv', model)

    word_vecs = []
    for word in sentence:
        if isinstance(word, bytes):
            word = word.decode('utf-8', errors='ignore')
        try:
            new_word = vectors[word]
        except KeyError:
            # out-of-vocabulary token: contributes nothing to the average
            continue
        # skip vectors containing NaN so they cannot poison the mean
        if not np.isnan(np.sum(new_word)):
            word_vecs.append(new_word)

    # if no appropriate word vectors were found, return an array of zeros
    if not word_vecs:
        size = getattr(model, 'layer1_size', None)
        if size is None:
            size = vectors.vector_size
        return np.zeros(size)
    return np.array(word_vecs).mean(axis=0)

In [None]:
# Tokenize exactly as the Word2Vec training corpus was tokenized
# (gensim.utils.tokenize with deacc=True, lower=True). A plain x.split()
# keeps case and punctuation, and yields bytes tokens for the UTF-8 encoded
# questions, so words miss the vocabulary and the question falls back to the
# all-zero vector.
def _question_to_vector(text):
    """Return the averaged word vector for one raw question."""
    tokens = list(gensim.utils.tokenize(text, deacc=True, lower=True))
    return make_question_vectors(model, tokens)

vec1 = df_train['question1'].apply(_question_to_vector)
vec2 = df_train['question2'].apply(_question_to_vector)
df_train['q1_feats'] = list(vec1)
df_train['q2_feats'] = list(vec2)

In [None]:
# Preview df_train, which now carries the q1_feats / q2_feats columns.
df_train.head()

TIME TO TRAIN

In [None]:
from __future__ import absolute_import
from __future__ import print_function
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, merge, BatchNormalization, Activation, Input, Merge
from keras import backend as K


In [None]:
def _build_branch():
    """Build one question tower: Dense(1000, relu) on a 300-d input, then Dropout(0.2)."""
    branch = Sequential()
    branch.add(Dense(1000, input_shape=(300,), activation='relu'))
    branch.add(Dropout(0.2))
    return branch

# NOTE(review): the two towers are separate Sequential instances, so they do
# not share weights the way a classic Siamese network would - confirm this is
# intended.
q1_branch = _build_branch()
q2_branch = _build_branch()

# Concatenate the two tower outputs into one feature vector.
merged = Merge([q1_branch, q2_branch], mode='concat')

# Classifier head: two Dense(500)+Dropout stages, then a 2-way softmax.
final_model = Sequential()
for layer in (
    merged,
    Dense(500, activation='relu'),
    Dropout(0.2),
    Dense(500, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='softmax'),
):
    final_model.add(layer)

In [None]:
# Compile the model: Adam minimizes the categorical cross-entropy loss;
# accuracy is only reported as a metric, it is not what gets optimized.
# NOTE(review): categorical_crossentropy with the 2-unit softmax expects
# one-hot targets - confirm the labels are encoded that way before fitting.
final_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Shuffle df_train with a fixed seed so that the train/test split is
# reproducible after a kernel restart (an unseeded permutation gives a
# different hold-out set on every run).
SPLIT_SEED = 42
shuffle_rng = np.random.RandomState(SPLIT_SEED)
df_train = df_train.reindex(shuffle_rng.permutation(df_train.index))

# set number of train and test instances: 88% for training, the rest held out
num_train = int(df_train.shape[0] * 0.88)
num_test = df_train.shape[0] - num_train
print("Number of training pairs: %i"%(num_train))
print("Number of testing pairs: %i"%(num_test))



In [None]:
# Pre-allocate zero-filled arrays: a (2, 300) block per pair for the inputs
# and a 1-D array for the labels, for each of the two splits.
X_train = np.zeros((num_train, 2, 300))
Y_train = np.zeros(num_train)
X_test = np.zeros((num_test, 2, 300))
Y_test = np.zeros(num_test)

In [None]:
def _stack_rows(column):
    """Stack a column of 1-D vectors into a 2-D array, one row per entry."""
    rows = [vec[np.newaxis, :] for vec in list(column.values)]
    return np.concatenate(rows, axis=0)

# Turn the per-row feature vectors into dense matrices, one per question side.
q1_feats = _stack_rows(df_train['q1_feats'])
q2_feats = _stack_rows(df_train['q2_feats'])

In [None]:
# Preview df_train again; the rows are now in the shuffled order produced by
# the reindex step.
df_train.head()

In [None]:
null