In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pickle
import warnings
warnings.filterwarnings("ignore")


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
df = pd.read_pickle("/content/gdrive/MyDrive/Dataset/questions_preprocessed.pkl")

In [6]:
# creating a single list of unique questions
uniq_questions = pd.concat((df['question1'], df['question2'])).unique()   

In [7]:
# applying tfidf vectorizer- word level
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
tfidf_vect.fit(uniq_questions)

TfidfVectorizer(token_pattern='\\w{1,}')

In [8]:
# transform question1 and question2 using tfidf vectorizer

question1_vect = tfidf_vect.transform(df['question1'])
question2_vect = tfidf_vect.transform(df['question2'])

In [9]:
# list all column names
print(df.columns)

Index(['question1', 'question2', 'is_duplicate', 'q1_len', 'q2_len',
       'q1_word_len', 'q2_word_len', 'q1_char_len', 'q2_char_len', 'len_diff',
       'word_len_diff', 'char_len_diff', 'common_words', 'common_words_ratio',
       'fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio',
       'token_set_ratio'],
      dtype='object')


In [11]:
# all features
from scipy.sparse import hstack
X = hstack((question1_vect, question2_vect, df[['q1_len','q2_len','q1_word_len','q1_word_len','q1_char_len','q1_char_len','len_diff','word_len_diff','char_len_diff','common_words','common_words_ratio', 'fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio', 'token_set_ratio']]))

# output feature: is_duplicate
y = df['is_duplicate']

In [12]:
# splitting the training and testing dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

###Logistic regression

In [13]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)

In [15]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, log_loss, f1_score
print("Accuracy Score: {}\n".format(accuracy_score(y_test, y_pred)))
print('Classification Report:\n')
print(classification_report(y_test, y_pred))

Accuracy Score: 0.6736253679289619

Classification Report:

              precision    recall  f1-score   support

           0       0.73      0.77      0.75     25407
           1       0.57      0.51      0.54     15022

    accuracy                           0.67     40429
   macro avg       0.65      0.64      0.64     40429
weighted avg       0.67      0.67      0.67     40429



In [17]:
Classification_Perfomance=pd.DataFrame(columns=['Classifier','Feature extraction','Accuracy'])
Acc_Log=accuracy_score(y_test,logreg.predict(X_test))
Classification_Perfomance=Classification_Perfomance.append({"Classifier":"Logistic Regression","Feature extraction":"TF-IDF",'Accuracy':Acc_Log},ignore_index=True)
Classification_Perfomance

Unnamed: 0,Classifier,Feature extraction,Accuracy
0,Logistic Regression,TF-IDF,0.673625


###XGBoost

In [18]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(objective='binary:logistic', eval_metric='logloss', n_jobs=-1, random_state=42)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

In [19]:
print("Accuracy Score: {}\n".format(accuracy_score(y_test, y_pred)))
print('Classification Report:\n')
print(classification_report(y_test, y_pred))

Accuracy Score: 0.7531227584159885

Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.77      0.80     25407
           1       0.65      0.72      0.69     15022

    accuracy                           0.75     40429
   macro avg       0.74      0.75      0.74     40429
weighted avg       0.76      0.75      0.76     40429



In [20]:
Acc_Log=accuracy_score(y_test,xgb_clf.predict(X_test))
Classification_Perfomance=Classification_Perfomance.append({"Classifier":"XGBoost","Feature extraction":"TF-IDF",'Accuracy':Acc_Log},ignore_index=True)
Classification_Perfomance

Unnamed: 0,Classifier,Feature extraction,Accuracy
0,Logistic Regression,TF-IDF,0.673625
1,XGBoost,TF-IDF,0.753123


###SVM

In [21]:
from sklearn import svm
svm_clf = svm.LinearSVC(random_state=20)
svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)

In [22]:
print("Accuracy Score: {}\n".format(accuracy_score(y_test, y_pred)))
print('Classification Report:\n')
print(classification_report(y_test, y_pred))

Accuracy Score: 0.7529743500952286

Classification Report:

              precision    recall  f1-score   support

           0       0.75      0.91      0.82     25407
           1       0.76      0.49      0.60     15022

    accuracy                           0.75     40429
   macro avg       0.76      0.70      0.71     40429
weighted avg       0.75      0.75      0.74     40429



In [23]:
Acc_Log=accuracy_score(y_test,svm_clf.predict(X_test))
Classification_Perfomance=Classification_Perfomance.append({"Classifier":"SVM","Feature extraction":"TF-IDF",'Accuracy':Acc_Log},ignore_index=True)
Classification_Perfomance

Unnamed: 0,Classifier,Feature extraction,Accuracy
0,Logistic Regression,TF-IDF,0.673625
1,XGBoost,TF-IDF,0.753123
2,SVM,TF-IDF,0.752974
