In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from keras.layers import Embedding,LSTM,Dense,GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

Text Similarity

In [4]:
df=pd.read_csv('/content/drive/MyDrive/DuplicateQuestions/train.csv')

In [5]:
df.dropna(inplace=True,axis=0)

In [6]:
# df=df.sample(50000,random_state=42)
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [7]:
df['text']=df['question1']+df['question2']

In [8]:
from sklearn.model_selection import train_test_split
x_tr,x_val,y_tr,y_val = train_test_split(df.iloc[:,6:].values,df['is_duplicate'].values,test_size=0.3,random_state=1)

In [9]:
x_tr

array([['Can you arrange DOHLAROAC to form a meaningful word?Who is the most important character for the whole life with one word? Where do you come form?'],
       ['Who are the major contributors of air pollution in Delhi?What was the real cause of air pollution in Delhi?'],
       ['What is an easy way make money online?What is best way to make money online?'],
       ...,
       ['What could be the reason for an extreme chest pain that happens like once every 2 months or more?What does right side chest pain indicate?'],
       ["Should I take coaching for SSB (for TGC entry)?I have joined a company 1 year back..I haven't got right project after training till now.. shall I quit And go for higher studies?"],
       ["Is Run Ze Cao's falsification of Einstein's relativity valid?Why do Republican party supporters ask such loaded questions on Quora? Do they actually believe what they say or are they just being provocative?"]],
      dtype=object)

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x_tr.flatten()))

In [11]:
x_tr_seq  = tokenizer.texts_to_sequences(x_tr.flatten()) 
x_val_seq = tokenizer.texts_to_sequences(x_val.flatten())

In [12]:
x_tr_seq  = pad_sequences(x_tr_seq)
x_val_seq = pad_sequences(x_val_seq)

In [13]:
x_tr_seq[1].shape

(269,)

In [14]:
size_of_vocabulary=len(tokenizer.word_index) + 1 #+1 for padding
print(size_of_vocabulary)

81824


In [15]:
print('Indexing word vectors.')

embeddings_index = {}
f = open('/content/drive/MyDrive/collab_data/Glove6B/Glove6B/glove.6B.300d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [16]:
embedding_matrix = np.zeros((size_of_vocabulary, 300))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
model=Sequential()

#embedding layer
model.add(Embedding(size_of_vocabulary,300,weights=[embedding_matrix],trainable=False)) 

#lstm layer
model.add(LSTM(64,return_sequences=False,dropout=0.2))

#Global Maxpooling
#model.add(GlobalMaxPooling1D())

#Dense Layer
model.add(Dense(32,activation='relu')) 
model.add(Dense(1,activation='sigmoid')) 

#Add loss function, metrics, optimizer
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=["acc"]) 

#Adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)  
mc=ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True,verbose=1)  

#Print summary of model
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         24547200  
                                                                 
 lstm (LSTM)                 (None, 64)                93440     
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 24,642,753
Trainable params: 95,553
Non-trainable params: 24,547,200
_________________________________________________________________
None


In [18]:
history = model.fit(np.array(x_tr_seq),np.array(y_tr),batch_size=128,epochs=20,validation_data=(np.array(x_val_seq),np.array(y_val)),verbose=1,callbacks=[es,mc])

Epoch 1/20
Epoch 1: val_acc improved from -inf to 0.74581, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_acc improved from 0.74581 to 0.76262, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_acc improved from 0.76262 to 0.76767, saving model to best_model.h5
Epoch 4/20
Epoch 4: val_acc improved from 0.76767 to 0.77837, saving model to best_model.h5
Epoch 5/20
Epoch 5: val_acc improved from 0.77837 to 0.77956, saving model to best_model.h5
Epoch 6/20
Epoch 6: val_acc improved from 0.77956 to 0.78486, saving model to best_model.h5
Epoch 7/20
Epoch 7: val_acc did not improve from 0.78486
Epoch 8/20
Epoch 8: val_acc did not improve from 0.78486
Epoch 9/20
Epoch 9: val_acc did not improve from 0.78486
Epoch 10/20
Epoch 10: val_acc improved from 0.78486 to 0.78610, saving model to best_model.h5
Epoch 11/20
Epoch 11: val_acc improved from 0.78610 to 0.78906, saving model to best_model.h5
Epoch 12/20
Epoch 12: val_acc did not improve from 0.78906
Epoch 13/20
Epoch 13: val_acc imp

In [19]:
cp /content/best_model.h5 /content/drive/MyDrive/collab_data