In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Quora duplicate-questions CSV (path is relative to the working directory).
DATA_PATH = 'quora_duplicate_questions.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check which columns were loaded.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,False
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,False
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,False
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,False
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,False


In [4]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404351 entries, 0 to 404350
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404351 non-null  int64 
 1   qid1          404351 non-null  int64 
 2   qid2          404351 non-null  int64 
 3   question1     404350 non-null  object
 4   question2     404348 non-null  object
 5   is_duplicate  404351 non-null  bool  
dtypes: bool(1), int64(3), object(2)
memory usage: 15.8+ MB


In [5]:
# Encode the boolean target as 0/1 integers for the classifiers.
# `astype` is an explicit cast: unlike a dict-based `.map`, it never turns an
# unmapped value into NaN (which would silently upcast the column to float),
# so this cell is safe to re-run once the column is already numeric.
df['is_duplicate'] = df['is_duplicate'].astype('int64')
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
# The full dataset is too large to process in reasonable time, so we work on a
# random subset of 30,000 question pairs instead.
# - Rows with a missing question are dropped before sampling: the full frame
#   contains a few nulls in question1/question2, and CountVectorizer rejects
#   NaN documents.
# - `random_state` pins the sample so the notebook reproduces on a fresh run.
new_df = df.dropna(subset=['question1', 'question2']).sample(30000, random_state=42)
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
216988,216988,427839,427840,Can I connect a 5.1 speaker with a 3.5 mm jack...,Which is the best 5.1 home theatre below 10000?,0
367150,367150,718605,718606,How do you determine the specific gravity of c...,What are the characteristics of specific gravity?,0
384318,384318,751490,751491,What is the corporate culture like at Burlingt...,What is the corporate culture like at Smart & ...,0
290600,290600,570965,570966,What are the biggest regrets of your life?,What is your biggest regrett in life?,1
126442,126442,250508,250509,What is the United Nations Organization?,How are the United Nations organized?,0


In [7]:
# Check dtypes and non-null counts of the sampled frame.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 216988 to 184646
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            30000 non-null  int64 
 1   qid1          30000 non-null  int64 
 2   qid2          30000 non-null  int64 
 3   question1     30000 non-null  object
 4   question2     30000 non-null  object
 5   is_duplicate  30000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 1.6+ MB


In [8]:
# Count rows in the sample that are exact duplicates of an earlier row.
new_df.duplicated().sum()

0

In [9]:
# Keep only the two question-text columns for vectorisation.
ques_df = new_df.loc[:, ['question1', 'question2']]
ques_df.head()

Unnamed: 0,question1,question2
216988,Can I connect a 5.1 speaker with a 3.5 mm jack...,Which is the best 5.1 home theatre below 10000?
367150,How do you determine the specific gravity of c...,What are the characteristics of specific gravity?
384318,What is the corporate culture like at Burlingt...,What is the corporate culture like at Smart & ...
290600,What are the biggest regrets of your life?,What is your biggest regrett in life?
126442,What is the United Nations Organization?,How are the United Nations organized?


In [10]:
# Build one flat list: every question1 followed by every question2.
questions = [*ques_df['question1'], *ques_df['question2']]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser whose vocabulary is capped at VOCAB_SIZE terms.
VOCAB_SIZE = 3000
cv = CountVectorizer(max_features=VOCAB_SIZE)

In [12]:
# Learn the vocabulary over every sampled question and build dense count
# vectors. `questions` holds all of question1 followed by all of question2,
# so cutting the matrix into two equal row blocks yields one array per column.
# NOTE(review): the vocabulary is fitted before the train/test split further
# down, so held-out questions also shape the features - confirm this is intended.
bow_matrix = cv.fit_transform(questions).toarray()
q1_arr, q2_arr = np.split(bow_matrix, 2, axis=0)

In [13]:
# Wrap each count matrix in a DataFrame that carries the sampled rows' original
# index, so the two halves line up row by row.
row_index = ques_df.index
temp_df1 = pd.DataFrame(q1_arr, index=row_index)
temp_df2 = pd.DataFrame(q2_arr, index=row_index)

In [14]:
# Place the question1 and question2 count vectors side by side, one row per pair.
temp_df = pd.concat((temp_df1, temp_df2), axis='columns')

In [15]:
# Peek at the first five rows of the combined feature frame.
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
216988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
367150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
384318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
290600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
126442,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# (rows, columns) of the combined feature frame.
temp_df.shape

(30000, 6000)

In [17]:
# Attach the target as an extra column; the assignment aligns on the row index.
target = new_df['is_duplicate']
temp_df['is_duplicate'] = target
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
216988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
367150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
384318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
290600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
126442,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Split the frame into the feature matrix (every column except the last) and
# the target vector (the last column, `is_duplicate`).
n_features = temp_df.shape[1] - 1
X = temp_df.iloc[:, :n_features].to_numpy()
Y = temp_df.iloc[:, n_features].to_numpy()

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline model: random forest on the bag-of-words features.
# Bootstrapping and per-split feature sampling are random, so the seed is
# fixed to make the reported accuracy reproducible across runs.
rf = RandomForestClassifier(random_state=1)

rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)

# Fraction of held-out pairs classified correctly.
accuracy_score(Y_test, Y_pred)

0.744

In [24]:
from xgboost import XGBClassifier

# Second model: gradient-boosted trees with default settings, trained and
# scored on the same train/test arrays.
xgb = XGBClassifier()
Y_pred = xgb.fit(X_train, Y_train).predict(X_test)

# Fraction of held-out pairs classified correctly.
accuracy_score(Y_test, Y_pred)

0.7258333333333333