In [1]:
import utils
import os
import pandas as pd
import sklearn
from sklearn import *
import os
import pickle
import numpy as np
import scipy
from gensim.models import KeyedVectors

# Simple model (Count Vectorizer)

In [2]:
# Locate the dataset folder relative to the parent of the working directory.
par_dir = os.path.normpath(
    os.path.join(os.getcwd(), os.pardir)
)
data_dir = "/nlp_quora/Datasets/QuoraQuestionPairs/"
# data_dir already starts with a separator, so plain concatenation is enough.
path_folder_quora = par_dir + data_dir

In [3]:
# Read the full training CSV from the dataset folder.
train_df = pd.read_csv(os.path.join(path_folder_quora, "quora_train_data.csv"))

# Hold out 5% of all rows for testing, then 5% of the remainder for validation.
# The fixed random_state keeps both splits reproducible.
A_df, te_df = sklearn.model_selection.train_test_split(
    train_df, test_size=0.05, random_state=123
)
tr_df, va_df = sklearn.model_selection.train_test_split(
    A_df, test_size=0.05, random_state=123
)

# Report the size of every split.
for _label, _frame in (('tr_df.shape=', tr_df), ('va_df.shape=', va_df), ('te_df.shape=', te_df)):
    print(_label, _frame.shape)

tr_df.shape= (291897, 6)
va_df.shape= (15363, 6)
te_df.shape= (16172, 6)


In [4]:
# Pass both question columns of every split through utils.cast_list_as_strings
# (question1 first, then question2).
q1_train, q2_train = (
    utils.cast_list_as_strings(list(tr_df[column]))
    for column in ("question1", "question2")
)
q1_validation, q2_validation = (
    utils.cast_list_as_strings(list(va_df[column]))
    for column in ("question1", "question2")
)
q1_test, q2_test = (
    utils.cast_list_as_strings(list(te_df[column]))
    for column in ("question1", "question2")
)

In [8]:
# Locate the folder holding the pickled model artifacts, relative to the
# parent of the working directory.
par_dir = os.path.normpath(
    os.path.join(os.getcwd(), os.pardir)
)
models_dir = "/nlp_quora/model_artifacts/"
# models_dir already starts with a separator, so plain concatenation is enough.
models_dir_quora = par_dir + models_dir

In [6]:
# Load the pre-trained artifacts of the simple model: the count vectorizer
# and the logistic classifier. The context managers close each file handle
# as soon as unpickling finishes (the bare open() call left them open).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts produced by this project.
with open(f'{models_dir_quora}/count_vectorizer.pkl', 'rb') as _artifact_file:
    count_vectorizer = pickle.load(_artifact_file)
with open(f'{models_dir_quora}/logistic.pkl', 'rb') as _artifact_file:
    logistic = pickle.load(_artifact_file)

In [6]:
# Build the feature matrix of every split with the loaded count vectorizer.
X_tr_q1q2, X_va_q1q2, X_te_q1q2 = (
    utils.get_features_from_df(split_df, count_vectorizer)
    for split_df in (tr_df, va_df, te_df)
)

# Show each feature-matrix shape next to the shape of its source frame.
(
    X_tr_q1q2.shape, tr_df.shape,
    X_va_q1q2.shape, va_df.shape,
    X_te_q1q2.shape, te_df.shape,
)

((291897, 149650),
 (291897, 6),
 (15363, 149650),
 (15363, 6),
 (16172, 149650),
 (16172, 6))

In [7]:
# Extract the target column of every split as an array.
y_train, y_validation, y_test = (
    split_df["is_duplicate"].values for split_df in (tr_df, va_df, te_df)
)

In [8]:
# 3x5 table: one row per split (train, validation, test), filled with the
# values returned by utils.calculate_metrics for the simple logistic model.
simple_model_metrics = np.zeros((3, 5))
for _row, (_labels, _features) in enumerate(
    ((y_train, X_tr_q1q2), (y_validation, X_va_q1q2), (y_test, X_te_q1q2))
):
    simple_model_metrics[_row, :] = utils.calculate_metrics(_labels, _features, logistic)

In [9]:
# Label the metrics table: splits as the index, metric names as the columns.
rows = ["Train", "Val.", "Test"]
columns = ["Roc auc", "Accuracy", "Precision", "Recall", "f1 score"]
simple_model_df = pd.DataFrame(
    data=simple_model_metrics,
    index=rows,
    columns=columns,
)
simple_model_df

Unnamed: 0,Roc auc,Accuracy,Precision,Recall,f1 score
Train,0.78749,0.813972,0.78206,0.686707,0.731288
Val.,0.720278,0.749007,0.677167,0.610729,0.642234
Test,0.72936,0.757791,0.695546,0.618778,0.65492


# Improved model

In [10]:
# Dataset folder, resolved relative to the parent of the working directory.
# The path includes the `/nlp_quora` project folder so that it agrees with the
# dataset path of the simple-model section and with the model-artifact path
# ("/nlp_quora/model_artifacts/"); without the prefix the two dataset cells
# pointed at different folders when the notebook is run top to bottom.
par_dir = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
data_dir = "/nlp_quora/Datasets/QuoraQuestionPairs/"
path_folder_quora = f"{par_dir}{data_dir}"

In [11]:
# Read the training CSV for the improved model.
train_df = pd.read_csv(os.path.join(path_folder_quora, "quora_train_data.csv"))

# Two-stage split with a fixed seed: 5% of all rows go to the test set,
# then 5% of what is left goes to the validation set.
A_df, te_df = sklearn.model_selection.train_test_split(
    train_df, test_size=0.05, random_state=123
)
tr_df, va_df = sklearn.model_selection.train_test_split(
    A_df, test_size=0.05, random_state=123
)

# Report the size of every split.
for _label, _frame in (('tr_df.shape=', tr_df), ('va_df.shape=', va_df), ('te_df.shape=', te_df)):
    print(_label, _frame.shape)

tr_df.shape= (291897, 6)
va_df.shape= (15363, 6)
te_df.shape= (16172, 6)


In [12]:
# Preview the first five rows of the training split.
tr_df.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
61482,125898,203030,203031,Is Java or C++ or C the most popular language ...,How do I develop a software which will have a ...,0
131546,36249,66113,66114,How do you convert direct speech into reported...,I feel weak at spoken English. I have sentence...,0
22927,199864,301469,301470,Where can I buy used wine barrels?,Where can you buy used wine barrels?,1
183520,277339,17728,138400,What was the best day of your life? (Excluding...,What is the Best Day of your life till date?,1
67694,392907,525647,525648,How is web-work.in works?,How do I get web designing work?,0


## Preprocessing

First, both questions are preprocessed: each one is cast to a string, lowercased, and stripped of stop words. The cleaned questions are then tokenized.

In [13]:
# Cast: pass both question columns of every split through
# utils.cast_list_as_strings (question1 first, then question2).
q1_train_c, q2_train_c = (
    utils.cast_list_as_strings(list(tr_df[column]))
    for column in ("question1", "question2")
)
q1_validation_c, q2_validation_c = (
    utils.cast_list_as_strings(list(va_df[column]))
    for column in ("question1", "question2")
)
q1_test_c, q2_test_c = (
    utils.cast_list_as_strings(list(te_df[column]))
    for column in ("question1", "question2")
)

In [14]:
# Lower: apply utils.lower_list to the cast questions of every split.
q1_train_lw, q2_train_lw = map(utils.lower_list, (q1_train_c, q2_train_c))
q1_validation_lw, q2_validation_lw = map(utils.lower_list, (q1_validation_c, q2_validation_c))
q1_test_lw, q2_test_lw = map(utils.lower_list, (q1_test_c, q2_test_c))

In [15]:
# Remove stop words: hand-written list of articles, conjunctions,
# prepositions, pronouns and demonstratives handed to utils.remove_sw.
stop_words = [
    "a", "an", "the",
    "and", "but", "or",
    "in", "on", "at", "to", "of", "for",
    "i", "you", "he", "she", "it", "we", "they",
    "me", "him", "her", "us", "them",
    "my", "your", "his", "its", "our", "their",
    "mine", "yours", "hers", "ours", "theirs",
    "myself", "yourself", "himself", "herself", "itself",
    "ourselves", "yourselves", "themselves",
    "this", "that", "these", "those",
]

q1_train, q2_train = (
    utils.remove_sw(questions, stop_words) for questions in (q1_train_lw, q2_train_lw)
)
q1_validation, q2_validation = (
    utils.remove_sw(questions, stop_words) for questions in (q1_validation_lw, q2_validation_lw)
)
q1_test, q2_test = (
    utils.remove_sw(questions, stop_words) for questions in (q1_test_lw, q2_test_lw)
)

In [16]:
# Tokenize: apply utils.tokenize to the stop-word-filtered questions of every split.
q1_train_tokens, q2_train_tokens = map(utils.tokenize, (q1_train, q2_train))
q1_validation_tokens, q2_validation_tokens = map(utils.tokenize, (q1_validation, q2_validation))
q1_test_tokens, q2_test_tokens = map(utils.tokenize, (q1_test, q2_test))

## Jaccard distance at sentence level

In [17]:
# Compute the Jaccard-distance feature for each split from its pair of
# token lists (question 1, question 2).
jd_feature_train, jd_feature_val, jd_feature_test = (
    utils.generate_jd_feature(q1_tokens, q2_tokens)
    for q1_tokens, q2_tokens in (
        (q1_train_tokens, q2_train_tokens),
        (q1_validation_tokens, q2_validation_tokens),
        (q1_test_tokens, q2_test_tokens),
    )
)

In [18]:
# Turn each Jaccard feature into a single-column 2-D array so it can be
# stacked horizontally with the other features.
jd_feature_train, jd_feature_val, jd_feature_test = (
    np.array(values).reshape(-1, 1)
    for values in (jd_feature_train, jd_feature_val, jd_feature_test)
)

# Cosine Distance

In [19]:
# Folder with the saved artifacts (vectorizers, word vectors, models),
# resolved relative to the parent of the working directory.
par_dir = os.path.normpath(
    os.path.join(os.getcwd(), os.pardir)
)
models_dir = "/nlp_quora/model_artifacts/"
# models_dir already starts with a separator, so plain concatenation is enough.
models_dir_quora = par_dir + models_dir

In [None]:
# Load the pre-fitted TF-IDF vectorizer. The context manager closes the file
# handle as soon as unpickling finishes (the bare open() call left it open).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts produced by this project.
with open(f'{models_dir_quora}/tfidf_vectorizer.pkl', 'rb') as _artifact_file:
    tfidf_vectorizer = pickle.load(_artifact_file)

# Transform the preprocessed questions of every split; question 1 and
# question 2 are vectorized separately with the same vectorizer.
tfidf_vectorizer_train_q1 = tfidf_vectorizer.transform(q1_train)
tfidf_vectorizer_train_q2 = tfidf_vectorizer.transform(q2_train)

tfidf_vectorizer_val_q1 = tfidf_vectorizer.transform(q1_validation)
tfidf_vectorizer_val_q2 = tfidf_vectorizer.transform(q2_validation)

tfidf_vectorizer_test_q1 = tfidf_vectorizer.transform(q1_test)
tfidf_vectorizer_test_q2 = tfidf_vectorizer.transform(q2_test)

In [21]:
# Place the TF-IDF matrices of question 1 and question 2 side by side,
# one combined matrix per split.
tfidf_train = scipy.sparse.hstack(
    [tfidf_vectorizer_train_q1, tfidf_vectorizer_train_q2]
)
tfidf_val = scipy.sparse.hstack(
    [tfidf_vectorizer_val_q1, tfidf_vectorizer_val_q2]
)
tfidf_test = scipy.sparse.hstack(
    [tfidf_vectorizer_test_q1, tfidf_vectorizer_test_q2]
)

In [22]:
# Row-wise cosine distance between the TF-IDF vectors of question 1 and
# question 2, stored as a single-column array per split. Any NaN produced by
# utils.cosine_distance is replaced with 0.0.
cosine_distance_train = np.nan_to_num(
    np.array([
        utils.cosine_distance(q1_row, q2_row)
        for q1_row, q2_row in zip(tfidf_vectorizer_train_q1, tfidf_vectorizer_train_q2)
    ]).reshape(-1, 1),
    nan=0.0,
)
cosine_distance_val = np.nan_to_num(
    np.array([
        utils.cosine_distance(q1_row, q2_row)
        for q1_row, q2_row in zip(tfidf_vectorizer_val_q1, tfidf_vectorizer_val_q2)
    ]).reshape(-1, 1),
    nan=0.0,
)
cosine_distance_test = np.nan_to_num(
    np.array([
        utils.cosine_distance(q1_row, q2_row)
        for q1_row, q2_row in zip(tfidf_vectorizer_test_q1, tfidf_vectorizer_test_q2)
    ]).reshape(-1, 1),
    nan=0.0,
)

  vector2.T.toarray()[0]/np.linalg.norm(vector2.toarray()))
  vector1.T.toarray()[0]/np.linalg.norm(vector1.toarray()),


# Embeddings

In [23]:
# Load the saved word vectors memory-mapped in read-only mode.
wv = KeyedVectors.load(f'{models_dir_quora}/wordvectors', mmap='r')

# Embed the token lists of both questions for every split with utils.w2v_embedding.
embedings_train_q1, embedings_train_q2 = (
    utils.w2v_embedding(tokens, wv) for tokens in (q1_train_tokens, q2_train_tokens)
)
embedings_val_q1, embedings_val_q2 = (
    utils.w2v_embedding(tokens, wv) for tokens in (q1_validation_tokens, q2_validation_tokens)
)
embedings_test_q1, embedings_test_q2 = (
    utils.w2v_embedding(tokens, wv) for tokens in (q1_test_tokens, q2_test_tokens)
)

In [24]:
# Pairwise cosine distance between the embeddings of question 1 and
# question 2, reshaped into a single column per split.
cosine_distance_embeddings_train = np.array([
    utils.cosine_distance(q1_vec, q2_vec)
    for q1_vec, q2_vec in zip(embedings_train_q1, embedings_train_q2)
]).reshape(-1, 1)

cosine_distance_embeddings_val = np.array([
    utils.cosine_distance(q1_vec, q2_vec)
    for q1_vec, q2_vec in zip(embedings_val_q1, embedings_val_q2)
]).reshape(-1, 1)

cosine_distance_embeddings_test = np.array([
    utils.cosine_distance(q1_vec, q2_vec)
    for q1_vec, q2_vec in zip(embedings_test_q1, embedings_test_q2)
]).reshape(-1, 1)

## Comparing Words (first one and ordered)

In [25]:
# Compute the key-word feature for each split from its pair of token lists.
kw_feature_train, kw_feature_val, kw_feature_test = (
    utils.generate_key_words_feature(q1_tokens, q2_tokens)
    for q1_tokens, q2_tokens in (
        (q1_train_tokens, q2_train_tokens),
        (q1_validation_tokens, q2_validation_tokens),
        (q1_test_tokens, q2_test_tokens),
    )
)

In [26]:
# Turn each key-word feature into a single-column 2-D array so it can be
# stacked horizontally with the other features.
kw_feature_train, kw_feature_val, kw_feature_test = (
    np.array(values).reshape(-1, 1)
    for values in (kw_feature_train, kw_feature_val, kw_feature_test)
)

In [27]:
# Compute the ordered-words feature for each split from its pair of token lists.
ow_feature_train, ow_feature_val, ow_feature_test = (
    utils.generate_ordered_words_feature(q1_tokens, q2_tokens)
    for q1_tokens, q2_tokens in (
        (q1_train_tokens, q2_train_tokens),
        (q1_validation_tokens, q2_validation_tokens),
        (q1_test_tokens, q2_test_tokens),
    )
)

In [28]:
# Turn each ordered-words feature into a single-column 2-D array so it can be
# stacked horizontally with the other features.
ow_feature_train, ow_feature_val, ow_feature_test = (
    np.array(values).reshape(-1, 1)
    for values in (ow_feature_train, ow_feature_val, ow_feature_test)
)

## Evaluation of the logistic model

In [29]:
# Assemble the final feature matrix of every split. The column order is the
# same for all three splits: Jaccard distance, TF-IDF (q1 | q2), key-word
# feature, ordered-words feature, TF-IDF cosine distance, q1 embedding,
# q2 embedding, embedding cosine distance.
X_tr = scipy.sparse.hstack([
    jd_feature_train,
    tfidf_train,
    kw_feature_train,
    ow_feature_train,
    cosine_distance_train,
    embedings_train_q1,
    embedings_train_q2,
    cosine_distance_embeddings_train,
])
X_va = scipy.sparse.hstack([
    jd_feature_val,
    tfidf_val,
    kw_feature_val,
    ow_feature_val,
    cosine_distance_val,
    embedings_val_q1,
    embedings_val_q2,
    cosine_distance_embeddings_val,
])
X_te = scipy.sparse.hstack([
    jd_feature_test,
    tfidf_test,
    kw_feature_test,
    ow_feature_test,
    cosine_distance_test,
    embedings_test_q1,
    embedings_test_q2,
    cosine_distance_embeddings_test,
])

In [11]:
# Load the pre-trained improved logistic model. The context manager closes
# the file handle as soon as unpickling finishes (the bare open() call left
# it open).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts produced by this project.
with open(f'{models_dir_quora}/logistic_imp.pkl', 'rb') as _artifact_file:
    logistic_imp = pickle.load(_artifact_file)

In [31]:
# Extract the target column of every split as an array.
y_train, y_validation, y_test = (
    split_df["is_duplicate"].values for split_df in (tr_df, va_df, te_df)
)

In [32]:
# 3x5 table: one row per split (train, validation, test), filled with the
# values returned by utils.calculate_metrics for the improved logistic model.
imp_model_metrics = np.zeros((3, 5))
for _row, (_labels, _features) in enumerate(
    zip((y_train, y_validation, y_test), (X_tr, X_va, X_te))
):
    imp_model_metrics[_row, :] = utils.calculate_metrics(_labels, _features, logistic_imp)

In [33]:
# Label the improved-model metrics table: splits as the index, metric names
# as the columns.
rows = ["Train", "Val.", "Test"]
columns = ["Roc auc", "Accuracy", "Precision", "Recall", "f1 score"]
imp_model_df = pd.DataFrame(
    data=imp_model_metrics,
    index=rows,
    columns=columns,
)
imp_model_df

Unnamed: 0,Roc auc,Accuracy,Precision,Recall,f1 score
Train,0.803387,0.824212,0.782721,0.724133,0.752288
Val.,0.766909,0.791903,0.740179,0.671608,0.704228
Test,0.770093,0.794274,0.746233,0.676045,0.709407


In [34]:
# Re-display the simple model's metrics table for comparison with the improved model.
simple_model_df

Unnamed: 0,Roc auc,Accuracy,Precision,Recall,f1 score
Train,0.78749,0.813972,0.78206,0.686707,0.731288
Val.,0.720278,0.749007,0.677167,0.610729,0.642234
Test,0.72936,0.757791,0.695546,0.618778,0.65492
