In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv(r'questions.csv')

In [3]:
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
data.shape

(404351, 6)

# Working with only 50,000 rows

In [5]:
new_data = data.sample(50000)

In [6]:
new_data.shape

(50000, 6)

In [7]:
# null values
new_data.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [8]:
# dropping null values
new_data = new_data.dropna()

In [9]:
new_data.isna().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [10]:
# duplicate values
new_data.duplicated().sum()

0

In [11]:
que_data = new_data[['question1', 'question2']]

In [12]:
que_data.head()

Unnamed: 0,question1,question2
287739,"Which engine is more efficient, powerful and r...",Is a diesel engine stronger than a petrol engine?
162456,How cheating is different from lying?,What are the differences between cheating and ...
369418,Yamaha R15 vs Honda CBR150R. which one is a be...,Which is a better bike: the Yamaha R15 or Puls...
296723,What is the origin of Telugu language?,How did the Telugu language originate?
367897,Can I get admitted to Stanford?,How do I get into Stanford as an undergraduate?


# Vectorization with Bag of Words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3000)

cv.fit(list(que_data['question1']) + list(que_data['question2']))

In [14]:
# transform each column separately
q1_vector = cv.transform(que_data['question1']).toarray()
q2_vector = cv.transform(que_data['question2']).toarray()

In [15]:
print(q1_vector.shape)
print(q2_vector.shape)

(50000, 3000)
(50000, 3000)


In [16]:
# build dataframes
q1_df = pd.DataFrame(q1_vector, index=que_data.index)
q2_df = pd.DataFrame(q2_vector, index=que_data.index)

In [17]:
q1_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
361904,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
154391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
207554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108670,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# concatenate both dataframes
final_df = pd.concat([q1_df, q2_df], axis=1)

In [19]:
final_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
287739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
162456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
369418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
296723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
367897,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387778,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
151563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
99063,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
153221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
final_df.shape

(50000, 6000)

* question 1 has 3000 features and question 2 has 3000 features so total 6000 features

In [21]:
final_df['is_duplicate'] = new_data['is_duplicate']

In [22]:
final_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
287739,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
162456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
369418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
296723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
367897,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387778,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
151563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
99063,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
153221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
X = final_df.iloc[:, :-1]
y = final_df.iloc[:, -1]

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
print(X_train.shape)
print(X_test.shape)

(40000, 6000)
(10000, 6000)


# Model Building

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_rfc))

Accuracy: 0.7522


In [27]:
from sklearn.linear_model import LogisticRegression

lor = LogisticRegression(max_iter=500)
lor.fit(X_train, y_train)
y_pred_lr = lor.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

Accuracy: 0.7076
