In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/content/drive/MyDrive/train.csv')

In [3]:
df.shape

(404290, 6)

In [4]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
new_df = df.sample(50000)

In [6]:
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [7]:
new_df.duplicated().sum()

0

In [8]:
ques_df = new_df[['question1','question2']]
ques_df.head()

Unnamed: 0,question1,question2
198790,What are the best available micro touch pH ele...,Does air have a pH?
278550,"What is the difference between vitrified, doub...",How do I know if my floor tiles have asbestos?
174992,Why should I vote for Trump?,Why should I vote for Hillary Clinton over Don...
224055,How do I avoid sleeping while reading?,What should I do to avoid sleep for studies?
338284,Which are the courses on Coursera or other MOO...,Which are the courses on Coursera or other MOO...


**Using Bag of words for question1 and 2 columns**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# merge texts
questions = list(ques_df['question1']) + list(ques_df['question2'])


In [11]:
cv = CountVectorizer(max_features=4000)


In [12]:
bow = cv.fit_transform(questions)

In [13]:
q1_arr, q2_arr = np.vsplit(bow.toarray(),2)

**Creating a temporary dataframe with Bow as columns**


In [14]:
temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)
temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(50000, 8000)

In [15]:
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999
198790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
174992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
224055,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
338284,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139457,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
257633,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
273463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
temp_df['is_duplicate'] = new_df['is_duplicate']

In [17]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3991,3992,3993,3994,3995,3996,3997,3998,3999,is_duplicate
198790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
174992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
224055,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
338284,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# temp_df.to_csv('/content/drive/MyDrive/temp_df.csv')

**Train - Test split**

In [3]:
temp_df = pd.read_csv('/content/drive/MyDrive/temp_df.csv',index_col=0)

In [4]:
temp_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3991.1,3992.1,3993.1,3994.1,3995.1,3996.1,3997.1,3998.1,3999.1,is_duplicate
69665,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109449,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
100788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188650,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
28932,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [5]:
from sklearn.model_selection import train_test_split

In [7]:
x = temp_df.iloc[:,0:-1].values  

50000

In [8]:
y = temp_df.iloc[:,-1].values

In [9]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# X_train,X_test,y_train,y_test = train_test_split( temp_df.iloc[:,0:-1].values,temp_df.iloc[:,-1].values,test_size=0.2,random_state=1)

**Model Building**

In [10]:
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [11]:
rf = RandomForestClassifier()
xgb = XGBClassifier()
dt = DecisionTreeClassifier()
gb = GradientBoostingClassifier()

In [12]:
rf.fit(X_train,y_train)

RandomForestClassifier()

In [None]:
xgb.fit(X_train,y_train)

In [None]:
dt.fit(X_train,y_train)

In [None]:
gb.fit(X_train,y_train)

In [None]:
# Accuracy score of random forest
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
# Accuracy score of xgBoost
y_pred = xgb.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
# Accuracy score of decision tree
y_pred = dt.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
# Accuracy score of random forest
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)