# Importing the required dependencies

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,precision_score,recall_score
from sklearn.svm import SVC


# Loading the dataset into pandas dataframe

In [5]:
df = pd.read_csv('quora_duplicate_questions.tsv',sep = '\t')

# Data Preprocessing

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = df.sample(20000)

In [8]:
df.head(5)    # Viewing the dataset

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
1938,1938,3857,3858,How do you delete a question you asked on Quora,Is it possible to permanently delete a Quora q...,1
309728,309728,60047,433761,Why is Quora called Quora?,Does Quora give you an interview call if you a...,0
27035,27035,50250,50251,Why does btech life is so strange?,How does a Product manager get back to work af...,0
112611,112611,184223,184224,How do I calculate the physical address of dat...,We use so many assumptions and imperfect calcu...,0
340835,340835,468614,468615,"What did John Wooden mean when he said ""be qui...",What was it like to play for John Wooden at UCLA?,0


In [9]:
df = df.drop('id',1)    # Dropping the not required id column

  """Entry point for launching an IPython kernel.


In [10]:
df.head(5)          # Viewing the dataset

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
1938,3857,3858,How do you delete a question you asked on Quora,Is it possible to permanently delete a Quora q...,1
309728,60047,433761,Why is Quora called Quora?,Does Quora give you an interview call if you a...,0
27035,50250,50251,Why does btech life is so strange?,How does a Product manager get back to work af...,0
112611,184223,184224,How do I calculate the physical address of dat...,We use so many assumptions and imperfect calcu...,0
340835,468614,468615,"What did John Wooden mean when he said ""be qui...",What was it like to play for John Wooden at UCLA?,0


In [11]:
df.shape          # Dimensions of the dataset

(20000, 5)

In [12]:
df = df.dropna(how='any',axis=0)    # Dropping rows with null values

In [13]:
df.info()                  # Getting Information about the dataset

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19999 entries, 1938 to 248119
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   qid1          19999 non-null  int64 
 1   qid2          19999 non-null  int64 
 2   question1     19999 non-null  object
 3   question2     19999 non-null  object
 4   is_duplicate  19999 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 937.5+ KB


# Creating a dataframe containing question1 and question2 columns only

In [19]:
que_df = df[["question1","question2"]]
que_df.head()

Unnamed: 0,question1,question2
1938,How do you delete a question you asked on Quora,Is it possible to permanently delete a Quora q...
309728,Why is Quora called Quora?,Does Quora give you an interview call if you a...
27035,Why does btech life is so strange?,How does a Product manager get back to work af...
112611,How do I calculate the physical address of dat...,We use so many assumptions and imperfect calcu...
340835,"What did John Wooden mean when he said ""be qui...",What was it like to play for John Wooden at UCLA?


# Taking all the questions into a list

In [20]:
questions = list(que_df["question1"]) + list(que_df["question2"])
print(len(questions))

39998


# Applying CountVectorizer and creating a new data frame

In [21]:
cv = CountVectorizer(max_features=3000) #taking only 2000 features
q1_arr,q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2) #splitting questions
temp_df1 = pd.DataFrame(q1_arr,index=que_df.index)
temp_df2 = pd.DataFrame(q2_arr,index=que_df.index)
#concating two dataframes
temp_df = pd.concat([temp_df1,temp_df2],axis = 1)

# Checking the dimension and viewing the created dataframe

In [22]:
temp_df.shape    
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
1938,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
309728,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
27035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114046,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
207014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
239839,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
300705,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Adding the is_duplicate column from our original dataframe

In [23]:
temp_df['is_duplicate'] = df['is_duplicate']
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
1938,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
309728,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
27035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
340835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114046,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
207014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
239839,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
300705,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Feature Engineering

In [24]:
#feature engineering
temp_df["q1_len"] = df["question1"].str.len()
temp_df["q2_len"] = df["question2"].str.len()

In [25]:
temp_df.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2993,2994,2995,2996,2997,2998,2999,is_duplicate,q1_len,q2_len
141514,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,70,74
181803,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,54,57
88036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,45,78


In [26]:
temp_df["q1_num_words"] = df["question1"].apply(lambda row: len(row.split(" ")))
temp_df["q2_num_words"] = df["question2"].apply(lambda row: len(row.split(" ")))
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2995,2996,2997,2998,2999,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words
1938,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,47,54,10,9
309728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,26,77,5,14
27035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,34,68,7,14
112611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,82,112,16,21
340835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,67,49,13,11


In [27]:
def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return (len(w1) + len(w2))

In [28]:
temp_df['word_total'] = df.apply(total_words, axis=1)
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2996,2997,2998,2999,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_total
1938,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,47,54,10,9,18
309728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,26,77,5,14,16
27035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,34,68,7,14,20
112611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,82,112,16,21,34
340835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,67,49,13,11,24


In [29]:
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return len(w1 & w2)

In [30]:
temp_df['word_common'] = df.apply(common_words, axis=1)
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2997,2998,2999,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_total,word_common
1938,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,47,54,10,9,18,3
309728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,26,77,5,14,16,1
27035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,34,68,7,14,20,1
112611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,82,112,16,21,34,3
340835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,67,49,13,11,24,3


In [31]:
temp_df['word_share'] = round(temp_df['word_common']/temp_df['word_total'],2)
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2998,2999,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_total,word_common,word_share
1938,0,0,0,0,0,0,0,0,0,0,...,0,0,1,47,54,10,9,18,3,0.17
309728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,26,77,5,14,16,1,0.06
27035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,34,68,7,14,20,1,0.05
112611,0,0,0,0,0,0,0,0,0,0,...,0,0,0,82,112,16,21,34,3,0.09
340835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,67,49,13,11,24,3,0.12


# Dividing the dataframe into input and output frames

In [32]:
X = temp_df.drop(["is_duplicate"],axis=1)
y = temp_df["is_duplicate"]

# Splitting into train and test data

In [39]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

# Building Model using SVM and checking the test accuracy

In [40]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')
classifier.fit(X_test,y_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(recall_score(y_test,y_pred))



0.7245
[[2050  464]
 [ 638  848]]
0.6463414634146342
0.5706594885598923
