In [26]:
import utils
import os
import pandas as pd
import sklearn
from sklearn import *
import pickle
import numpy as np

# Simple model (Count Vectorizer)

In [27]:
path_folder_quora = "./Datasets/QuoraQuestionPairs/"

In [28]:
train_df = pd.read_csv(os.path.join(path_folder_quora, "quora_train_data.csv"))
A_df, te_df = sklearn.model_selection.train_test_split(train_df,test_size=0.05,random_state=123)
tr_df, va_df = sklearn.model_selection.train_test_split(A_df,test_size=0.05,random_state=123)
print('tr_df.shape=',tr_df.shape)
print('va_df.shape=',va_df.shape)
print('te_df.shape=',te_df.shape)

tr_df.shape= (291897, 6)
va_df.shape= (15363, 6)
te_df.shape= (16172, 6)


In [29]:
q1_train =  utils.cast_list_as_strings(list(tr_df["question1"]))
q2_train =  utils.cast_list_as_strings(list(tr_df["question2"]))
q1_validation =  utils.cast_list_as_strings(list(va_df["question1"]))
q2_validation =  utils.cast_list_as_strings(list(va_df["question2"]))
q1_test = utils.cast_list_as_strings(list(te_df["question1"]))
q2_test = utils.cast_list_as_strings(list(te_df["question2"]))

In [30]:
count_vectorizer = pickle.load(open('model_artifacts/count_vectorizer.pkl', 'rb'))
logistic = pickle.load(open('model_artifacts/logistic.pkl', 'rb'))

In [31]:
X_tr_q1q2 = utils.get_features_from_df(tr_df, count_vectorizer)
X_va_q1q2 = utils.get_features_from_df(va_df, count_vectorizer)
X_te_q1q2 = utils.get_features_from_df(te_df, count_vectorizer)

X_tr_q1q2.shape, tr_df.shape, X_va_q1q2.shape, va_df.shape, X_te_q1q2.shape, te_df.shape

((291897, 149650),
 (291897, 6),
 (15363, 149650),
 (15363, 6),
 (16172, 149650),
 (16172, 6))

In [32]:
y_train = tr_df["is_duplicate"].values
y_validation = va_df["is_duplicate"].values
y_test = te_df["is_duplicate"].values

auc_tr = metrics.roc_auc_score(y_train, logistic.predict(X_tr_q1q2))
auc_va = metrics.roc_auc_score(y_validation, logistic.predict(X_va_q1q2))
auc_te = metrics.roc_auc_score(y_test, logistic.predict(X_te_q1q2))

print('Training AUC Score: ' + str(auc_tr))
print('Validation AUC Score: ' + str(auc_va))
print('Test AUC Score: ' + str(auc_te))

Training AUC Score: 0.7874902327238901
Validation AUC Score: 0.7202777566666142
Test AUC Score: 0.7293595330779445


In [33]:
print(metrics.classification_report(y_train, logistic.predict(X_tr_q1q2)))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86    184298
           1       0.78      0.69      0.73    107599

    accuracy                           0.81    291897
   macro avg       0.81      0.79      0.79    291897
weighted avg       0.81      0.81      0.81    291897



In [34]:
print(metrics.classification_report(y_validation, logistic.predict(X_va_q1q2)))

              precision    recall  f1-score   support

           0       0.78      0.83      0.81      9696
           1       0.68      0.61      0.64      5667

    accuracy                           0.75     15363
   macro avg       0.73      0.72      0.72     15363
weighted avg       0.75      0.75      0.75     15363



In [35]:
print(metrics.classification_report(y_test, logistic.predict(X_te_q1q2)))

              precision    recall  f1-score   support

           0       0.79      0.84      0.81     10165
           1       0.70      0.62      0.65      6007

    accuracy                           0.76     16172
   macro avg       0.74      0.73      0.73     16172
weighted avg       0.75      0.76      0.75     16172



# Improved model

In [36]:
path_folder_quora = "./Datasets/QuoraQuestionPairs/"

In [37]:
train_df = pd.read_csv(os.path.join(path_folder_quora, "quora_train_data.csv"))
A_df, te_df = sklearn.model_selection.train_test_split(train_df,test_size=0.05,random_state=123)
tr_df, va_df = sklearn.model_selection.train_test_split(A_df,test_size=0.05,random_state=123)
print('tr_df.shape=',tr_df.shape)
print('va_df.shape=',va_df.shape)
print('te_df.shape=',te_df.shape)

tr_df.shape= (291897, 6)
va_df.shape= (15363, 6)
te_df.shape= (16172, 6)


In [38]:
tr_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
61482,125898,203030,203031,Is Java or C++ or C the most popular language ...,How do I develop a software which will have a ...,0
131546,36249,66113,66114,How do you convert direct speech into reported...,I feel weak at spoken English. I have sentence...,0
22927,199864,301469,301470,Where can I buy used wine barrels?,Where can you buy used wine barrels?,1
183520,277339,17728,138400,What was the best day of your life? (Excluding...,What is the Best Day of your life till date?,1
67694,392907,525647,525648,How is web-work.in works?,How do I get web designing work?,0


## Preprocessing

First, a preprocessing is applied to the 2 questions: Casting it as strings, lowering and removing stop words. We also tokenize them.

In [39]:
#Cast
q1_train_c =  utils.cast_list_as_strings(list(tr_df["question1"]))
q2_train_c =  utils.cast_list_as_strings(list(tr_df["question2"]))
q1_validation_c =  utils.cast_list_as_strings(list(va_df["question1"]))
q2_validation_c =  utils.cast_list_as_strings(list(va_df["question2"]))
q1_test_c =  utils.cast_list_as_strings(list(te_df["question1"]))
q2_test_c =  utils.cast_list_as_strings(list(te_df["question2"]))

In [40]:
#Lower
q1_train_lw = utils.lower_list(q1_train_c)
q2_train_lw = utils.lower_list(q2_train_c)
q1_validation_lw = utils.lower_list(q1_validation_c)
q2_validation_lw = utils.lower_list(q2_validation_c)
q1_test_lw = utils.lower_list(q1_test_c)
q2_test_lw = utils.lower_list(q2_test_c)

In [41]:
#Remove stop words
stop_words = ["a", "an", "the", "and", "but", "or", "in", "on", "at", "to", "of", "for","i","you","he","she","it","we","they",
             "me","him","her","us","them","my","your","his","its","our","their","mine","yours","hers","ours","theirs","myself",
              "yourself","himself","herself","itself","ourselves","yourselves","themselves","this","that","these","those"]
q1_train = utils.remove_sw(q1_train_lw,stop_words)
q2_train = utils.remove_sw(q2_train_lw,stop_words)
q1_validation = utils.remove_sw(q1_validation_lw,stop_words)
q2_validation = utils.remove_sw(q2_validation_lw,stop_words)
q1_test = utils.remove_sw(q1_test_lw,stop_words)
q2_test = utils.remove_sw(q2_test_lw,stop_words)

In [42]:
#Tokenize
q1_train_tokens = utils.tokenize(q1_train)
q2_train_tokens = utils.tokenize(q2_train)
q1_validation_tokens = utils.tokenize(q1_validation)
q2_validation_tokens = utils.tokenize(q2_validation)
q1_test_tokens = utils.tokenize(q1_test)
q2_test_tokens = utils.tokenize(q2_test)

## Jaccard distance at sentence level

In [43]:
jd_feature_train = utils.generate_jd_feature(q1_train_tokens,q2_train_tokens)
jd_feature_val = utils.generate_jd_feature(q1_validation_tokens,q2_validation_tokens)
jd_feature_test = utils.generate_jd_feature(q1_test_tokens,q2_test_tokens)

In [44]:
jd_feature_train = np.array(jd_feature_train).reshape(-1,1)
jd_feature_val = np.array(jd_feature_val).reshape(-1,1)
jd_feature_test = np.array(jd_feature_test).reshape(-1,1)

## Evaluation of the logistic model

In [45]:
#Concatenate here all your features, bros
X_tr = jd_feature_train
X_va = jd_feature_val
X_te = jd_feature_test

In [46]:
logistic_imp = pickle.load(open('model_artifacts/logistic_imp.pkl', 'rb'))

In [47]:
y_train = tr_df["is_duplicate"].values
y_validation = va_df["is_duplicate"].values
y_test = te_df["is_duplicate"].values

auc_tr = metrics.roc_auc_score(y_train, logistic_imp.predict(X_tr))
auc_va = metrics.roc_auc_score(y_validation, logistic_imp.predict(X_va))
auc_te = metrics.roc_auc_score(y_test, logistic_imp.predict(X_te))

print('Training AUC Score: ' + str(auc_tr))
print('Validation AUC Score: ' + str(auc_va))
print('Test AUC Score: ' + str(auc_te))

Training AUC Score: 0.5813001290293094
Validation AUC Score: 0.5812464675199654
Test AUC Score: 0.5785385979023816


In [48]:
print(metrics.classification_report(y_train, logistic_imp.predict(X_tr)))

              precision    recall  f1-score   support

           0       0.68      0.82      0.75    184298
           1       0.53      0.34      0.41    107599

    accuracy                           0.64    291897
   macro avg       0.60      0.58      0.58    291897
weighted avg       0.62      0.64      0.62    291897



In [49]:
print(metrics.classification_report(y_validation, logistic_imp.predict(X_va)))

              precision    recall  f1-score   support

           0       0.68      0.83      0.75      9696
           1       0.53      0.34      0.41      5667

    accuracy                           0.65     15363
   macro avg       0.61      0.58      0.58     15363
weighted avg       0.63      0.65      0.62     15363



In [50]:
print(metrics.classification_report(y_test, logistic_imp.predict(X_te)))

              precision    recall  f1-score   support

           0       0.68      0.82      0.74     10165
           1       0.53      0.34      0.41      6007

    accuracy                           0.64     16172
   macro avg       0.60      0.58      0.58     16172
weighted avg       0.62      0.64      0.62     16172

