In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Question duplicates

The task of identifying duplicated questions can be viewed as an instance of the paraphrase identification problem, which is a well-studied NLP task that uses natural language sentence matching
(NLSM) to determine whether two sentences are paraphrases or not (2). This task has a wide array of
useful NLP applications. For example, in question-and-answer (QA) forums, there are vast numbers
of duplicate questions. Identifying these duplicates and consolidating their answers increases the
efficiency of such QA forums. Moreover, identifying questions with the same semantic content could
help web-scale question answering systems that are increasingly concentrating on retrieving focused
answers to users’ queries.


In this project, we focus on a dataset published by Quora.com containing over 400K annotated
question pairs with binary paraphrase labels (1).
We believe that this dataset presents a great
opportunity for NLP practitioners due to its scale and quality; it can result in systems that accurately
identify duplicate questions, thus increasing the quality of many QA forums.

In [None]:
# Load the Quora question-pairs training set straight from the zipped CSV
# shipped with the Kaggle dataset.
df = pd.read_csv('../input/quora-question-pairs/train.csv.zip')

In [None]:
# Preview the first ten rows to see what a question pair looks like.
df.head(10)

<p>Quora is a place to gain and share knowledge—about anything. It’s a platform to ask questions and connect with people who contribute unique insights and quality answers. This empowers people to learn from each other and to better understand the world.</p>
<p>
Over 100 million people visit Quora every month, so it's no surprise that many people ask similarly worded questions. Multiple questions with the same intent can cause seekers to spend more time finding the best answer to their question, and make writers feel they need to answer multiple versions of the same question. Quora values canonical questions because they provide a better experience to active seekers and writers, and offer more value to both of these groups in the long term.
</p>
<br>
> Credits: Kaggle 


__Problem Statement__
- Identify which questions asked on Quora are duplicates of questions that have already been asked.
- This could be useful to instantly provide answers to questions that have already been answered.
- We are tasked with predicting whether a pair of questions are duplicates or not.

### Data Overview

- Data will be in a file Train.csv <br>
- Train.csv contains 6 columns : id, qid1, qid2, question1, question2, is_duplicate
- Number of rows in Train.csv = 404,290

In [None]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

We are given a minimal number of data fields here, consisting of:

- id:  Looks like a simple rowID
- qid{1, 2}:  The unique ID of each question in the pair
- question{1, 2}:  The actual textual contents of the questions.
- is_duplicate:  The label that we are trying to predict - whether the two questions are duplicates of each other.

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Bar chart of how many pairs carry each is_duplicate label.
pairs_per_label = df.groupby('is_duplicate')['id'].count()
pairs_per_label.plot.bar()

In [None]:
#%config Completer.use_jedi = False

In [None]:
# Row count of the training frame == number of question pairs.
n_pairs = df.shape[0]
print(f'Total number of question pairs is : {n_pairs}')

In [None]:
# The mean of the is_duplicate label, scaled to a percentage, is the share of duplicate pairs.
duplicate_pct = df['is_duplicate'].mean()*100
print('Total Percentage of questions that are not similar is: {}'.format(round(100-duplicate_pct,2)))
print('Total Percentage of questions that are similar is: {}'.format(round(duplicate_pct,2)))

In [None]:
# Mean of the is_duplicate label, i.e. the fraction of pairs marked as duplicates.
df['is_duplicate'].mean()

In [None]:
# Pool the ids from both sides of every pair so each appearance of a question is counted.
quids = pd.Series(df['qid1'].tolist() + df['qid2'].tolist())
unique_qs = np.unique(quids)
len_unique_qs = len(unique_qs)

# Count occurrences once and reuse the result for every statistic below
# (the original recomputed value_counts() three times).
question_counts = quids.value_counts()
qs_morethan_one = np.sum(question_counts > 1)

print('Length of total questions are:',len(quids))
print('Toal number of unique questions are :',len_unique_qs)
print('Question appearing more than once are:',(qs_morethan_one))
print('Percentage of Question appearing more than once are:',(qs_morethan_one/len(unique_qs))*100)
print('Max number of times a single question is asked:',max(question_counts))

In [None]:
# Occurrence count of every question id, kept as a plain NumPy array.
q_vals = quids.value_counts().values

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compare the number of distinct questions with the number that occur more than once.
x = ["unique_questions" , "Repeated Questions"]
y =  [len_unique_qs , qs_morethan_one]

plt.figure(figsize=(10,6))
plt.title('Plot representing unique and repeated questions')
# Pass the data by keyword: seaborn >= 0.12 no longer accepts x/y positionally.
sns.barplot(x=x, y=y)
plt.show()

<h3>Checking for Duplicates </h3>

In [None]:
#checking whether there are any repeated pair of questions

# Collapse rows sharing the same (qid1, qid2); one row per distinct pair remains.
pair_duplicates = df[['qid1','qid2','is_duplicate']].groupby(['qid1','qid2']).count().reset_index()

# Repeated pairs = total rows minus distinct pairs. The subtraction used to be the
# other way round, which would print a negative number whenever repeats exist.
print ("Number of duplicate questions", df.shape[0] - (pair_duplicates).shape[0])

In [None]:
# Per-question occurrence counts, computed once for both the plot and the printout.
question_counts = quids.value_counts()

plt.figure(figsize=(20,10))
plt.hist(question_counts,bins=160)
# Matplotlib 3.3 renamed the log-scale keyword nonposy to nonpositive; the old
# spelling was subsequently removed, so use the new one.
plt.yscale('log',nonpositive='clip')
plt.title('Log-Histogram of question apperance counts ')
plt.xlabel('Number of occurance of questions')
plt.ylabel('Number of questions')
print ('Maximum number of times a single question is repeated: {}\n'.format(max(question_counts)))
plt.show()

In [None]:
# Number of missing values per column.
df.isnull().sum()

There are three null values, which we can fill with an empty string.

In [None]:
# Replace every missing value with an empty string (the feature cells below call
# string methods on the question columns), then re-count nulls to confirm none remain.
df=df.fillna('')
df.isnull().sum()

In [None]:
# Inspect every pair whose first question has id 3.
df.loc[df['qid1'] == 3]

# Basic Feature Extraction (before cleaning)

Let us now construct a few features like:
 - ____freq_qid1____ = Frequency of qid1's
 - ____freq_qid2____ = Frequency of qid2's 
 - ____q1len____ = Length of q1
 - ____q2len____ = Length of q2
 - ____q1_n_words____ = Number of words in Question 1
 - ____q2_n_words____ = Number of words in Question 2
 - ____word_Common____ = (Number of common unique words in Question 1 and Question 2)
 - ____word_Total____ =(Total num of words in Question 1 + Total num of words in Question 2)
 - ____word_share____ = (word_common)/(word_Total)
 - ____freq_q1+freq_q2____ = sum total of frequency of qid1 and qid2 
 - ____freq_q1-freq_q2____ = absolute difference of frequency of qid1 and qid2 

In [None]:
# Frequency of each question id within its own column (how often qid1 / qid2 repeats).
for qid_col in ('qid1', 'qid2'):
    df[f'{qid_col}_freq'] = df.groupby(qid_col)[qid_col].transform('count')

# Character length of each question.
for side in (1, 2):
    df[f'q{side}_len'] = df[f'question{side}'].str.len()

# Number of space-separated tokens in each question.
for side in (1, 2):
    df[f'q{side}_words'] = df[f'question{side}'].apply(lambda text: len(text.split(' ')))

In [None]:
#common words in question1 and question2
def common_words(row):
    """Return, as a float, how many distinct words question1 and question2 share.

    Each question is split on single spaces; tokens are lower-cased and
    stripped before the two resulting sets are intersected.
    """
    def _tokens(text):
        return {token.lower().strip() for token in text.split(' ')}

    shared = _tokens(row['question1']) & _tokens(row['question2'])
    return 1.0 * len(shared)

# Row-wise apply: one shared-word count per question pair.
df['len_common_words'] = df.apply(common_words,axis=1)

In [None]:
#Total number of words in question1 and question2

def word_length(row):
    """Return, as a float, the number of distinct words in question1 plus question2.

    Tokens are split on single spaces, lower-cased and stripped, which is the
    same normalisation ``common_words`` applies. Previously the tokens were
    left untouched (an identity ``map``), so the denominator of ``word_share``
    was built from a different vocabulary than its numerator.

    Parameters
    ----------
    row : mapping with string values under 'question1' and 'question2'.
    """
    def _normalised_words(text):
        return {word.lower().strip() for word in text.split(' ')}

    w1 = _normalised_words(row['question1'])
    w2 = _normalised_words(row['question2'])

    return 1.0 * (len(w1) + len(w2))

# Row-wise apply: combined distinct-word count per question pair.
df['q1_q2_word_length'] = df.apply(word_length,axis=1)

In [None]:
#word share of question1 and question2: shared words divided by the combined word count

df['word_share'] = df['len_common_words'] / df['q1_q2_word_length']

# qid1 and qid2 frequency sum

df['qid1_qid2_freq'] = df['qid1_freq'] + df['qid2_freq']

# absolute difference of qid1 and qid2 frequency, as specified in the feature list
# (the signed difference made the feature depend on which question was listed first)

df['diff_qid1_qid2'] = (df['qid1_freq'] - df['qid2_freq']).abs()

In [None]:
#save the preprocessed file
# Write the frame, including the new feature columns, to the working directory
# without the index column.
df.to_csv('df_fe_without_preprocessing_train.csv',index=False)

In [None]:
# Quick look at the frame now that the feature columns have been added.
df.head()

# Analysis of some of the extracted features

In [None]:
# Shortest question on each side, measured in characters.
min_q1_len = min(df['q1_len'])
min_q2_len = min(df['q2_len'])

print ("Minimum length of the questions in question1 : " , min_q1_len)

print ("Minimum length of the questions in question2 : " , min_q2_len)

# Count the rows that actually reach the minimum rather than assuming the minimum is 1.
print ("Number of Questions with minimum length [question1] :", df[df['q1_len']== min_q1_len].shape[0])

print ("Number of Questions with minimum length [question2] :", df[df['q2_len']== min_q2_len].shape[0])

In [None]:
plt.figure(figsize=(12, 8))

# Left panel: distribution of word_share for each class.
plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'word_share', data = df)

# Right panel: overlaid word_share distributions (red = duplicate, black = not duplicate).
plt.subplot(1,2,2)
sns.distplot(df[df['is_duplicate'] == 1.0]['word_share'],label = 1,color = 'red')
sns.distplot(df[df['is_duplicate'] == 0.0]['word_share'],label = 0,color = 'black')
# The labels passed above are only rendered once a legend is requested.
plt.legend()
plt.show()

# Word Common

In [None]:
plt.figure(figsize=(12, 8))

# Left panel: distribution of the shared-word count for each class.
plt.subplot(1,2,1)
sns.violinplot(x = 'is_duplicate', y = 'len_common_words', data = df)

# Right panel: overlaid shared-word-count distributions (red = duplicate, blue = not duplicate).
plt.subplot(1,2,2)
sns.distplot(df[df['is_duplicate'] == 1.0]['len_common_words'] , label = "1", color = 'red')
sns.distplot(df[df['is_duplicate'] == 0.0]['len_common_words'] , label = "0" , color = 'blue' )
# The labels passed above are only rendered once a legend is requested.
plt.legend()
plt.show()

# Training the Model for Detecting question duplicates

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import keras.layers as layers
from keras.models import Model
from keras import backend as K
# Fix NumPy's global seed so NumPy-driven randomness is repeatable.
np.random.seed(10)
# Turn off TF2 behaviour; the training and prediction cells use tf.compat.v1 sessions.
tf.compat.v1.disable_v2_behavior() 

## Using Universal Sentence Encoder - For creating embedding of text at sentence level

In [None]:
# Load the pretrained Universal Sentence Encoder from TensorFlow Hub; it provides
# the sentence embeddings used by the custom model.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.load(module_url)

# Embedding function wrapped by the Lambda layer of each question input.
def UniversalEmbedding(x):
    """Embed a batch of questions with the Universal Sentence Encoder.

    ``x`` is the output of an ``Input(shape=(1,), dtype=tf.string)`` layer, i.e.
    a (batch, 1) string tensor. Only axis 1 is squeezed so the encoder always
    receives a 1-D batch of strings; an unqualified ``tf.squeeze`` would also
    drop the batch axis when the batch holds a single row.
    """
    return embed(tf.squeeze(tf.cast(x,tf.string), axis=1))
 


In [None]:
# Taking the question1 as input and ceating a embedding for each question before feed it to neural network

# Dropout rate applied after every hidden Dense layer.
dropout = 0.1

# One string input per question; a Lambda layer turns each into a sentence embedding
# (declared output shape: 512).
q1 = layers.Input(shape=(1,),dtype=tf.string)
embeding_q1 = layers.Lambda(UniversalEmbedding,output_shape=(512,))(q1)

q2 = layers.Input(shape=(1,),dtype=tf.string)
embeding_q2 = layers.Lambda(UniversalEmbedding,output_shape=(512,))(q2)

# Join both embeddings into a single feature vector.
merged = layers.concatenate([embeding_q1,embeding_q2])

# Four identical hidden stages: Dense(200, relu) -> Dropout -> BatchNormalization.
for _ in range(4):
    merged = layers.Dense(200,activation='relu')(merged)
    merged = layers.Dropout(dropout)(merged)
    merged = layers.BatchNormalization()(merged)

# Two sigmoid output units trained with binary cross-entropy.
pred = layers.Dense(2,activation='sigmoid')(merged)
model = Model(inputs=[q1,q2],outputs=pred)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model
# Render the layer graph of the compiled model.
plot_model(model)

In [None]:
# Pushing all the strings to a list and converting to ndarray
# Plain Python lists of both question columns and the label column.
# Note: q1 / q2 are rebound here; before this point they named the model's Input layers.
q1 = df['question1'].tolist()
q2 = df['question2'].tolist()
labels = df['is_duplicate'].tolist()

In [None]:
from sklearn.model_selection import train_test_split
# Both question columns and the label are split together so rows stay aligned:
# 80 % train / 20 % test, with a fixed random_state for a repeatable split.
X1, X2, y = df['question1'], df['question2'], df['is_duplicate']
(X1_train, X1_test,
 X2_train, X2_test,
 y_train, y_test) = train_test_split(X1, X2, y, test_size=0.2, random_state=42)

In [None]:
# The model's inputs are declared with shape (1,), so every question list is turned
# into an object array with one column: shape (n_samples, 1).
train_q1 = X1_train.tolist()
train_q2 = X2_train.tolist()
trainq1 = np.array(train_q1, dtype=object).reshape(-1, 1)
trainq2 = np.array(train_q2, dtype=object).reshape(-1, 1)

test_q1 = np.array(X1_test.tolist(), dtype=object).reshape(-1, 1)
test_q2 = np.array(X2_test.tolist(), dtype=object).reshape(-1, 1)

# Indicator-encode the labels (one column per class) as int8.
train_labels = np.asarray(pd.get_dummies(y_train), dtype=np.int8)
test_labels = np.asarray(pd.get_dummies(y_test), dtype=np.int8)


# Training the Model

In [None]:
from keras.callbacks import ModelCheckpoint
# Training runs inside an explicit TF1-style session.
tf.compat.v1.disable_v2_behavior() 

with tf.compat.v1.Session() as session:
    # Register the session with Keras, then initialise variables and lookup tables.
    tf.compat.v1.keras.backend.set_session(session)
    session.run(tf.compat.v1.global_variables_initializer())
    session.run(tf.compat.v1.tables_initializer())
    
    # Weights-only checkpoint written every epoch (save_best_only=False); the file name
    # embeds the epoch number and the validation accuracy.
    # NOTE(review): the template reads the log key 'val_acc'; some Keras versions report
    # 'val_accuracy' instead — confirm against the installed version.
    filepath = 'model-{epoch:02d}-{val_acc:.2f}.hdf5'
    checkpoint = ModelCheckpoint(filepath,monitor = 'val_loss',save_best_only=False,save_weights_only=True,mode='auto',period=1)
    callback_list = [checkpoint]
    
    # The 20 % hold-out split doubles as validation data.
    history = model.fit([trainq1,trainq2],train_labels,
                        validation_data=([test_q1,test_q2],test_labels),
                        epochs=10,
                        batch_size=512,callbacks=callback_list)
    # Persist the full model after the last epoch, in addition to the per-epoch weights.
    model.save('final_model.h5')

In [None]:
import numpy as np
import tensorflow as tf

tf.compat.v1.disable_v2_behavior()
# Read the two questions to compare from the user (blocks until something is typed).
q1 = input("Type Question 1 here -->")
q2 = input("Type Question 2 here -->") 
# Each question is repeated to form a two-row batch of shape (2, 1); only row 0 is read below.
# NOTE(review): presumably a workaround for single-row batches being mishandled by the
# embedding Lambda — confirm.
q1 = np.array([[q1],[q1]])
q2 = np.array([[q2],[q2]])

# A new TF1 session is opened and registered with Keras for this prediction.
with tf.compat.v1.Session() as session:
    tf.compat.v1.keras.backend.set_session(session)
    session.run(tf.compat.v1.global_variables_initializer())
    session.run(tf.compat.v1.tables_initializer())
    # Variables were just re-initialised, so the trained weights are restored from a
    # hard-coded checkpoint file, which must already exist on disk.
    model.load_weights('./model-09-0.86.hdf5')
    predicts = model.predict([q1,q2],verbose= 0)
    # Despite its name, this holds the arg-max column index per row, not logits.
    predict_logits = predicts.argmax(axis=1)
    print("----FINAL RESULT----")
    # Column 1 is treated as the "similar" class.
    if(predict_logits[0] == 1):
        print("****Questions are Similar****")
    else:
        print("****Questions are not Similar****")

In [None]:
import numpy as np
import tensorflow as tf

# Verbatim repeat of the preceding prediction cell, kept to try another question pair.
tf.compat.v1.disable_v2_behavior()
# Read the two questions to compare from the user (blocks until something is typed).
q1 = input("Type Question 1 here -->")
q2 = input("Type Question 2 here -->") 
# Each question is repeated to form a two-row batch of shape (2, 1); only row 0 is read below.
q1 = np.array([[q1],[q1]])
q2 = np.array([[q2],[q2]])

# A new TF1 session is opened and registered with Keras for this prediction.
with tf.compat.v1.Session() as session:
    tf.compat.v1.keras.backend.set_session(session)
    session.run(tf.compat.v1.global_variables_initializer())
    session.run(tf.compat.v1.tables_initializer())
    # Restore the trained weights from the hard-coded checkpoint file.
    model.load_weights('./model-09-0.86.hdf5')
    predicts = model.predict([q1,q2],verbose= 0)
    # Arg-max column index per row (class id), not logits.
    predict_logits = predicts.argmax(axis=1)
    print("----FINAL RESULT----")
    if(predict_logits[0] == 1):
        print("****Questions are Similar****")
    else:
        print("****Questions are not Similar****")

In [None]:
import numpy as np
import tensorflow as tf

# Verbatim repeat of the earlier prediction cells, kept to try another question pair.
tf.compat.v1.disable_v2_behavior()
# Read the two questions to compare from the user (blocks until something is typed).
q1 = input("Type Question 1 here -->")
q2 = input("Type Question 2 here -->") 
# Each question is repeated to form a two-row batch of shape (2, 1); only row 0 is read below.
q1 = np.array([[q1],[q1]])
q2 = np.array([[q2],[q2]])

# A new TF1 session is opened and registered with Keras for this prediction.
with tf.compat.v1.Session() as session:
    tf.compat.v1.keras.backend.set_session(session)
    session.run(tf.compat.v1.global_variables_initializer())
    session.run(tf.compat.v1.tables_initializer())
    # Restore the trained weights from the hard-coded checkpoint file.
    model.load_weights('./model-09-0.86.hdf5')
    predicts = model.predict([q1,q2],verbose= 0)
    # Arg-max column index per row (class id), not logits.
    predict_logits = predicts.argmax(axis=1)
    print("----FINAL RESULT----")
    if(predict_logits[0] == 1):
        print("****Questions are Similar****")
    else:
        print("****Questions are not Similar****")