In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore")


In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_pickle("/content/gdrive/MyDrive/Dataset/questions_preprocessed.pkl")

In [6]:
# creating a single list of unique questions
uniq_questions = pd.concat((df['question1'], df['question2'])).unique() 

In [7]:
# applying count vectorizer
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(uniq_questions)

CountVectorizer(token_pattern='\\w{1,}')

In [8]:
# transform question1 and question2 using count vectorizer
question1_vect = count_vect.transform(df['question1'])
question2_vect = count_vect.transform(df['question2'])


In [9]:
# list all column names
print(df.columns)

Index(['question1', 'question2', 'is_duplicate', 'q1_len', 'q2_len',
       'q1_word_len', 'q2_word_len', 'q1_char_len', 'q2_char_len', 'len_diff',
       'word_len_diff', 'char_len_diff', 'common_words', 'common_words_ratio',
       'fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio',
       'token_set_ratio'],
      dtype='object')


In [10]:
# all features
from scipy.sparse import hstack
X = hstack((question1_vect, question2_vect, df[['q1_len','q2_len','q1_word_len','q1_word_len','q1_char_len','q1_char_len','len_diff','word_len_diff','char_len_diff','common_words','common_words_ratio', 'fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio', 'token_set_ratio']]))

# output feature: is_duplicate
y = df['is_duplicate']

In [11]:
# splitting the training and testing dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [12]:
Classification_Perfomance=pd.DataFrame(columns=['Classifier','Feature extraction','Accuracy'])

###Logistic regression

In [13]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, log_loss, f1_score

In [14]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)

In [15]:
print("Accuracy Score: {}\n".format(accuracy_score(y_test, y_pred)))
print('Classification Report:\n')
print(classification_report(y_test, y_pred))

Accuracy Score: 0.6902718345741918

Classification Report:

              precision    recall  f1-score   support

           0       0.75      0.77      0.76     25407
           1       0.59      0.56      0.57     15022

    accuracy                           0.69     40429
   macro avg       0.67      0.66      0.66     40429
weighted avg       0.69      0.69      0.69     40429



In [16]:
Acc_Log=accuracy_score(y_test,logreg.predict(X_test))
Classification_Perfomance=Classification_Perfomance.append({"Classifier":"Logistic Regression","Feature extraction":"Bag of Words",'Accuracy':Acc_Log},ignore_index=True)
Classification_Perfomance

Unnamed: 0,Classifier,Feature extraction,Accuracy
0,Logistic Regression,Bag of Words,0.690272


###XGBoost 

In [17]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(objective='binary:logistic', eval_metric='logloss', n_jobs=-1, random_state=42)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

In [18]:
print("Accuracy Score: {}\n".format(accuracy_score(y_test, y_pred)))
print('Classification Report:\n')
print(classification_report(y_test, y_pred))

Accuracy Score: 0.7538400652996612

Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.76      0.80     25407
           1       0.65      0.74      0.69     15022

    accuracy                           0.75     40429
   macro avg       0.74      0.75      0.74     40429
weighted avg       0.76      0.75      0.76     40429



In [19]:
Acc_Log=accuracy_score(y_test,xgb_clf.predict(X_test))
Classification_Perfomance=Classification_Perfomance.append({"Classifier":"XGBoost","Feature extraction":"Bag of Words",'Accuracy':Acc_Log},ignore_index=True)
Classification_Perfomance

Unnamed: 0,Classifier,Feature extraction,Accuracy
0,Logistic Regression,Bag of Words,0.690272
1,XGBoost,Bag of Words,0.75384


###SVM

In [20]:
from sklearn import svm
svm_clf = svm.LinearSVC(random_state=20)
svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)

In [21]:
print("Accuracy Score: {}\n".format(accuracy_score(y_test, y_pred)))
print('Classification Report:\n')
print(classification_report(y_test, y_pred))

Accuracy Score: 0.7752108634890796

Classification Report:

              precision    recall  f1-score   support

           0       0.84      0.80      0.82     25407
           1       0.68      0.74      0.71     15022

    accuracy                           0.78     40429
   macro avg       0.76      0.77      0.76     40429
weighted avg       0.78      0.78      0.78     40429



In [22]:
Acc_Log=accuracy_score(y_test,svm_clf.predict(X_test))
Classification_Perfomance=Classification_Perfomance.append({"Classifier":"SVM","Feature extraction":"Bag of Words",'Accuracy':Acc_Log},ignore_index=True)
Classification_Perfomance

Unnamed: 0,Classifier,Feature extraction,Accuracy
0,Logistic Regression,Bag of Words,0.690272
1,XGBoost,Bag of Words,0.75384
2,SVM,Bag of Words,0.775211
