In [49]:
import keras
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Activation, Embedding, LSTM, RepeatVector
from keras.preprocessing.sequence import pad_sequences

import re
from gensim.models import Word2Vec
import nltk

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', -1)  

In [2]:
train_df = pd.read_csv("train.csv.zip", sep=",", compression="zip")
print(len(train_df))
train_df.head()

404290


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0


## Cleaning the corpus and getting word2vec embeddings

In [3]:
stopwords = set(nltk.corpus.stopwords.words("english"))
stopwords = stopwords.difference(set(["can", "who", "any", "which", "when", "whom", 
                                      "if", "what", "how", "why", "where", "only", "same", "more", "now"
                                     ]))

def process_question(raw_question):
    tokens = re.sub(r"[^a-z0-9]+", " ", str(raw_question).lower()).split()
    return [x for x in tokens if x not in stopwords]

In [4]:
corpus = []
seen_ids = set()

for index, row in train_df.iterrows():
    q1_id = row["qid1"]
    if q1_id not in seen_ids:
        corpus.append(process_question(row["question1"]))
        seen_ids.add(q1_id)
    q2_id = row["qid2"]
    if q2_id not in seen_ids:
        corpus.append(process_question(row["question2"]))
        seen_ids.add(q2_id)    
        
print(len(corpus))
print(corpus[:10])

537933
[['what', 'step', 'step', 'guide', 'invest', 'share', 'market', 'india'], ['what', 'step', 'step', 'guide', 'invest', 'share', 'market'], ['what', 'story', 'kohinoor', 'koh', 'noor', 'diamond'], ['what', 'would', 'happen', 'if', 'indian', 'government', 'stole', 'kohinoor', 'koh', 'noor', 'diamond', 'back'], ['how', 'can', 'increase', 'speed', 'internet', 'connection', 'using', 'vpn'], ['how', 'can', 'internet', 'speed', 'increased', 'hacking', 'dns'], ['why', 'mentally', 'lonely', 'how', 'can', 'solve'], ['find', 'remainder', 'when', 'math', '23', '24', 'math', 'divided', '24', '23'], ['which', 'one', 'dissolve', 'water', 'quikly', 'sugar', 'salt', 'methane', 'carbon', 'di', 'oxide'], ['which', 'fish', 'would', 'survive', 'salt', 'water']]


In [5]:
w2v_embeddings = Word2Vec(corpus, size=100)

In [6]:
w2v_embeddings.most_similar(positive=["cat"])

[('xat', 0.7255841493606567),
 ('gmat', 0.6986239552497864),
 ('percentile', 0.6843202114105225),
 ('toefl', 0.6835753321647644),
 ('bitsat', 0.6814593076705933),
 ('aipmt', 0.6771522760391235),
 ('clat', 0.6725986003875732),
 ('cmat', 0.6654667854309082),
 ('gre', 0.6620069146156311),
 ('aiims', 0.6526833176612854)]

## Building an embedding layer

In [29]:
word2index_map = {}
index2word_map = {}
max_question_length = 0
current_index = 1

for index, row in train_df.iterrows():
    tokens_a = process_question(row["question1"])
    tokens_b = process_question(row["question2"])
    max_question_length = max(max_question_length, max(len(tokens_a), len(tokens_b)))
    for token in tokens_a + tokens_b:
        if token not in word2index_map:
            word2index_map[token] = current_index
            index2word_map[current_index] = token
            current_index += 1
            
print(len(word2index_map))
print(max_question_length)            

85180
108


In [30]:
embedding_matrix = np.zeros((len(index2word_map) + 1, 100))
for i in range(1, len(index2word_map) + 1):
    if index2word_map[i] in w2v_embeddings:
        embedding_matrix[i, :] = w2v_embeddings[index2word_map[i]]

In [31]:
embedding_layer = Embedding(input_dim=len(index2word_map)+1,
                            output_dim=100,
                            weights=[embedding_matrix],
                            input_length=max_question_length,
                            trainable=False)

## Building a dataset of duplicate pairs

In [32]:
duplicates = train_df[train_df["is_duplicate"] == 1].reset_index()
duplicate_pairs_num = len(duplicates)
print(duplicate_pairs_num)
duplicates.head()

149263


Unnamed: 0,index,id,qid1,qid2,question1,question2,is_duplicate
0,5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?,"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",1
1,7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
2,11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
3,12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
4,13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [33]:
duplicates_left = np.zeros((duplicate_pairs_num, max_question_length))
duplicates_right = np.zeros((duplicate_pairs_num, max_question_length))

for index, row in duplicates.iterrows():
    token_left = process_question(row["question1"])
    indices_left = [word2index_map[token] for token in token_left]
    duplicates_left[index, :] = pad_sequences([indices_left], maxlen=max_question_length)
    
    token_right = process_question(row["question2"])
    indices_right = [word2index_map[token] for token in token_right]
    duplicates_right[index, :] = pad_sequences([indices_right], maxlen=max_question_length)

## Building Keras encoder-decoder model

In [56]:
encoder_model = Sequential()
encoder_model.add(embedding_layer)
encoder_model.add(LSTM(output_dim=100, input_dim=100, input_length=max_question_length, return_sequences=False))
encoder_model.add(RepeatVector(max_question_length))
encoder_model.add(LSTM(input_dim=100, output_dim=100, return_sequences=True))

Some helpful links:
* https://github.com/fchollet/keras/issues/369
* http://stackoverflow.com/questions/42253934/keras-how-to-use-the-learned-embedding-layer-for-input-and-output
* https://keras.io/layers/recurrent/
* http://stackoverflow.com/questions/42140922/keras-lstm-training-data-format?rq=1
* https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
* http://stackoverflow.com/questions/38134379/keras-sequence-to-sequence-encoder-decoder-part-of-speech-tagging-example-with-a

In [59]:
def custom_loss(y_true, y_pred):    
    return keras.metrics.mean_squared_error(embedding_layer(y_true), y_pred)
encoder_model.compile(loss=custom_loss, optimizer="adam")
encoder_model.fit(duplicates_left, duplicates_right, nb_epoch=1)

Epoch 1/1


TypeError: Bad input argument to theano function with name "C:\Users\lyapin roman\Anaconda2\envs\dl_env\lib\site-packages\keras\backend\theano_backend.py:955" at index 1 (0-based).  
Backtrace when that variable is created:

  File "C:\Users\lyapin roman\Anaconda2\envs\dl_env\lib\site-packages\ipykernel\zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\lyapin roman\Anaconda2\envs\dl_env\lib\site-packages\IPython\core\interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\lyapin roman\Anaconda2\envs\dl_env\lib\site-packages\IPython\core\interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\lyapin roman\Anaconda2\envs\dl_env\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-59-54b60236f0d2>", line 3, in <module>
    encoder_model.compile(loss=custom_loss, optimizer="adam")
  File "C:\Users\lyapin roman\Anaconda2\envs\dl_env\lib\site-packages\keras\models.py", line 594, in compile
    **kwargs)
  File "C:\Users\lyapin roman\Anaconda2\envs\dl_env\lib\site-packages\keras\engine\training.py", line 650, in compile
    dtype=K.dtype(self.outputs[i])))
  File "C:\Users\lyapin roman\Anaconda2\envs\dl_env\lib\site-packages\keras\backend\theano_backend.py", line 110, in placeholder
    x = T.TensorType(dtype, broadcast)(name)
Wrong number of dimensions: expected 3, got 2 with shape (32, 108).