In [1]:
import utils
import os
import pandas as pd
import sklearn
from sklearn import *
import os
import errno
import pickle
import numpy as np
import scipy

# Simple model (Count Vectorizer)

In [2]:
path_folder_quora = "./Datasets/QuoraQuestionPairs/"

In [3]:
train_df = pd.read_csv(os.path.join(path_folder_quora, "quora_train_data.csv"))
A_df, te_df = sklearn.model_selection.train_test_split(train_df,test_size=0.05,random_state=123)
tr_df, va_df = sklearn.model_selection.train_test_split(A_df,test_size=0.05,random_state=123)
print('tr_df.shape=',tr_df.shape)
print('va_df.shape=',va_df.shape)
print('te_df.shape=',te_df.shape)

tr_df.shape= (291897, 6)
va_df.shape= (15363, 6)
te_df.shape= (16172, 6)


In [4]:
tr_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
61482,125898,203030,203031,Is Java or C++ or C the most popular language ...,How do I develop a software which will have a ...,0
131546,36249,66113,66114,How do you convert direct speech into reported...,I feel weak at spoken English. I have sentence...,0
22927,199864,301469,301470,Where can I buy used wine barrels?,Where can you buy used wine barrels?,1
183520,277339,17728,138400,What was the best day of your life? (Excluding...,What is the Best Day of your life till date?,1
67694,392907,525647,525648,How is web-work.in works?,How do I get web designing work?,0


In [5]:
q1_train =  utils.cast_list_as_strings(list(tr_df["question1"]))
q2_train =  utils.cast_list_as_strings(list(tr_df["question2"]))
q1_validation =  utils.cast_list_as_strings(list(va_df["question1"]))
q2_validation =  utils.cast_list_as_strings(list(va_df["question2"]))

In [6]:
q1_train[0], q2_train[0]

('Is Java or C++ or C the most popular language amongst startups for backend development?',
 'How do I develop a software which will have a Java GUI and a C++ or C backend?')

In [7]:
all_train_questions = q1_train + q2_train

count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(all_train_questions)

In [8]:
X_tr_q1q2 = utils.get_features_from_df(tr_df, count_vectorizer)

X_tr_q1q2.shape, tr_df.shape

((291897, 149650), (291897, 6))

In [9]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
y_train = tr_df["is_duplicate"].values
logistic.fit(X_tr_q1q2, y_train)

In [10]:
mistake_indices_tr, predictions_tr = utils.get_mistakes(logistic, X_tr_q1q2,  y_train)

utils.print_mistake_k(10, tr_df, mistake_indices_tr, predictions_tr)

What is a good website for free ebooks?
What are the sites to download free eBooks?
true class: 1
prediction: 0


In [11]:
# check if directory exists
if not os.path.exists('model_artifacts'):
    try:
        # create folder
        os.makedirs('model_artifacts')
        # open file descriptors for writing
        countvectorizer_file = open('model_artifacts/count_vectorizer.pkl', 'wb')
        logistic_file = open('model_artifacts/logistic.pkl', 'wb')
        # serialize and write objects to files
        pickle.dump(count_vectorizer, countvectorizer_file)
        pickle.dump(logistic, logistic_file)
        # close file descriptors
        countvectorizer_file.close()
        logistic_file.close()
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

# Improved model

In [2]:
path_folder_quora = "./Datasets/QuoraQuestionPairs/"

In [3]:
train_df = pd.read_csv(os.path.join(path_folder_quora, "quora_train_data.csv"))
A_df, te_df = sklearn.model_selection.train_test_split(train_df,test_size=0.05,random_state=123)
tr_df, va_df = sklearn.model_selection.train_test_split(A_df,test_size=0.05,random_state=123)
print('tr_df.shape=',tr_df.shape)
print('va_df.shape=',va_df.shape)
print('te_df.shape=',te_df.shape)

tr_df.shape= (291897, 6)
va_df.shape= (15363, 6)
te_df.shape= (16172, 6)


In [4]:
tr_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
61482,125898,203030,203031,Is Java or C++ or C the most popular language ...,How do I develop a software which will have a ...,0
131546,36249,66113,66114,How do you convert direct speech into reported...,I feel weak at spoken English. I have sentence...,0
22927,199864,301469,301470,Where can I buy used wine barrels?,Where can you buy used wine barrels?,1
183520,277339,17728,138400,What was the best day of your life? (Excluding...,What is the Best Day of your life till date?,1
67694,392907,525647,525648,How is web-work.in works?,How do I get web designing work?,0


## Preprocessing

First, a preprocessing is applied to the 2 questions: Casting it as strings, lowering and removing stop words. We also tokenize them.

In [5]:
#Cast
q1_train_c =  utils.cast_list_as_strings(list(tr_df["question1"]))
q2_train_c =  utils.cast_list_as_strings(list(tr_df["question2"]))
q1_validation_c =  utils.cast_list_as_strings(list(va_df["question1"]))
q2_validation_c =  utils.cast_list_as_strings(list(va_df["question2"]))

In [6]:
#Lower
q1_train_lw = utils.lower_list(q1_train_c)
q2_train_lw = utils.lower_list(q2_train_c)
q1_validation_lw = utils.lower_list(q1_validation_c)
q2_validation_lw = utils.lower_list(q2_validation_c)

In [7]:
#Remove stop words
stop_words = ["a", "an", "the", "and", "but", "or", "in", "on", "at", "to", "of", "for","i","you","he","she","it","we","they",
             "me","him","her","us","them","my","your","his","its","our","their","mine","yours","hers","ours","theirs","myself",
              "yourself","himself","herself","itself","ourselves","yourselves","themselves","this","that","these","those"]
q1_train = utils.remove_sw(q1_train_lw,stop_words)
q2_train = utils.remove_sw(q2_train_lw,stop_words)
q1_validation = utils.remove_sw(q1_validation_lw,stop_words)
q2_validation = utils.remove_sw(q2_validation_lw,stop_words)

In [8]:
#Tokenize
q1_train_tokens = utils.tokenize(q1_train)
q2_train_tokens = utils.tokenize(q2_train)
q1_validation_tokens = utils.tokenize(q1_validation)
q2_validation_tokens = utils.tokenize(q2_validation)

In [9]:
print(len(q1_train_tokens),len(q2_train_tokens))

291897 291897


## Jaccard distance at sentence level

In [10]:
jd_feature_train = utils.generate_jd_feature(q1_train_tokens,q2_train_tokens)
jd_feature_val = utils.generate_jd_feature(q1_validation_tokens,q2_validation_tokens)

In [11]:
jd_feature_train = np.array(jd_feature_train).reshape(-1,1)
jd_feature_train.shape

(291897, 1)

In [12]:
print(q1_train_tokens[2])
print(q2_train_tokens[2])
print(tr_df["is_duplicate"].values[2])
print(jd_feature_train[2])

['where', 'can', 'buy', 'used', 'wine', 'barrels']
['where', 'can', 'buy', 'used', 'wine', 'barrels']
1
[0.]


In [13]:
print(q1_train_tokens[3])
print(q2_train_tokens[3])
print(tr_df["is_duplicate"].values[3])
print(jd_feature_train[3])

['what', 'was', 'best', 'day', 'life', 'excluding', 'family', 'things', 'like', 'births']
['what', 'is', 'best', 'day', 'life', 'till', 'date']
1
[0.69230769]


## Unit tests TF-IDF and Cosine Distatnce

In [15]:
all_train_questions = q1_train + q2_train

vectorizer = utils.tfidf_vectorizer(all_train_questions, max_features=10000)

tfidf_vectorizer_train_q1 = vectorizer.transform(q1_train)
tfidf_vectorizer_train_q2 = vectorizer.transform(q2_train)

tfidf_vectorizer_val_q1 = vectorizer.transform(q1_validation)
tfidf_vectorizer_val_q2 = vectorizer.transform(q2_validation)

In [16]:
utils.save_model(vectorizer, 'tfidf_vectorizer_train')

In [17]:
tfidf_train = scipy.sparse.hstack((tfidf_vectorizer_train_q1,tfidf_vectorizer_train_q2))
tfidf_val = scipy.sparse.hstack((tfidf_vectorizer_val_q1,tfidf_vectorizer_val_q2))

In [18]:
cosine_distance_train = np.array([utils.cosine_distance(i,j) for i, j in zip(tfidf_vectorizer_train_q1,tfidf_vectorizer_train_q2)]).reshape(-1,1)
cosine_distance_val = np.array([utils.cosine_distance(i,j) for i, j in zip(tfidf_vectorizer_val_q1,tfidf_vectorizer_val_q2)]).reshape(-1,1)

cosine_distance_train = np.nan_to_num(cosine_distance_train, nan=0.0)
cosine_distance_val = np.nan_to_num(cosine_distance_val, nan=0.0)

MemoryError: Unable to allocate 41.7 GiB for an array with shape (74780, 74780) and data type float64

# Embeddings

In [None]:
all_tokens_train = utils.tokenize(all_train_questions)

word2vec_train = utils.build_w2v_model(all_tokens_train, n_fueatures=300)
tokens_train_q1 = utils.tokenize(q1_train)
tokens_train_q2 = utils.tokenize(q2_train)
tokens_val_q1 = utils.tokenize(q1_validation)
tokens_val_q2 = utils.tokenize(q2_validation)

embedings_train_q1 = utils.w2v_embedding(tokens_train_q1, word2vec_train)
embedings_train_q2 = utils.w2v_embedding(tokens_train_q2, word2vec_train)

all_tokens_val = utils.tokenize(q1_validation + q2_validation)

word2vec_val = utils.build_w2v_model(all_tokens_val, n_fueatures=300)

embedings_val_q1 = utils.w2v_embedding(tokens_val_q1, word2vec_val)
embedings_val_q2 = utils.w2v_embedding(tokens_val_q2, word2vec_val)

utils.save_model(word2vec_train, 'word2vec_train')
utils.save_model(word2vec_val, 'word2vec_val')

In [None]:
cosine_distance_embeddings_train = np.array([utils.cosine_distance(i,j) for i, j in 
                                  zip(embedings_train_q1,embedings_train_q2)]).reshape(-1,1)

cosine_distance_embeddings_val = np.array([utils.cosine_distance(i,j) for i, j in 
                                  zip(embedings_val_q1,embedings_val_q2)]).reshape(-1,1)

## Comparing Words (keys ones and by order)

In [19]:
kw_feature_train = utils.generate_key_words_feature(q1_train_tokens,q2_train_tokens)
kw_feature_val = utils.generate_key_words_feature(q1_validation_tokens,q2_validation_tokens)

In [20]:
kw_feature_train = np.array(kw_feature_train).reshape(-1,1)
kw_feature_train.shape

(291897, 1)

In [21]:
print(q1_train_tokens[20])
print(q2_train_tokens[20])
print(tr_df["is_duplicate"].values[20])
print(kw_feature_train[20])

['what', 'do', 'hate', 'about', 'toptal', 's', 'interviewing', 'process']
['need', 'indian', 'representative', 'supply', 'raw', 'materials', 'pharmaceutical', 'products', 'company', 'roger', 'rene957', 'yahoo', 'com', 'bst', 'rgd', 'rene']
0
[0]


In [22]:
print(q1_train_tokens[3])
print(q2_train_tokens[3])
print(tr_df["is_duplicate"].values[3])
print(kw_feature_train[3])

['what', 'was', 'best', 'day', 'life', 'excluding', 'family', 'things', 'like', 'births']
['what', 'is', 'best', 'day', 'life', 'till', 'date']
1
[1]


In [23]:
ow_feature_train = utils.generate_ordered_words_feature(q1_train_tokens,q2_train_tokens)
ow_feature_val = utils.generate_ordered_words_feature(q1_validation_tokens,q2_validation_tokens)

ow_feature_train = np.array(ow_feature_train).reshape(-1,1)
ow_feature_train.shape

(291897, 1)

In [24]:
print(q1_train_tokens[2])
print(q2_train_tokens[2])
print(tr_df["is_duplicate"].values[2])
print(ow_feature_train[2])
print(q1_train_tokens[3])
print(q2_train_tokens[3])
print(tr_df["is_duplicate"].values[3])
print(ow_feature_train[3])

['where', 'can', 'buy', 'used', 'wine', 'barrels']
['where', 'can', 'buy', 'used', 'wine', 'barrels']
1
[1.]
['what', 'was', 'best', 'day', 'life', 'excluding', 'family', 'things', 'like', 'births']
['what', 'is', 'best', 'day', 'life', 'till', 'date']
1
[0.57142857]


## Negation

In [23]:
neg_feature_train = utils.generate_negation_feature(q1_train,q2_train)
neg_feature_val = utils.generate_negation_feature(q1_validation,q2_validation)

neg_feature_train = np.array(neg_feature_train).reshape(-1,1)
neg_feature_train.shape

100%|██████████| 291897/291897 [56:50<00:00, 85.59it/s]  
100%|██████████| 15363/15363 [03:10<00:00, 80.77it/s] 


(291897, 1)

In [32]:
print(q1_train[2])
print(q2_train[2])
print(tr_df["is_duplicate"].values[2])
print(neg_feature_train[2])

where can  buy used wine barrels?
where can  buy used wine barrels?
1
[1]


In [33]:
print(q1_train[27])
print(q2_train[27])
print(tr_df["is_duplicate"].values[27])
print(neg_feature_train[27])

can someone still get  message  snapchat if  haven't read  yet   deactivated  account?
if  block someone  snapchat will  still see  last message  sent?
0
[0]


## Training a logistic model

In [60]:
#Concatenate here all your features, bros
X_tr = scipy.sparse.hstack((jd_feature_train, tfidf_train, ow_feature_train, cosine_distance_train, embedings_train_q1, embedings_train_q2, cosine_distance_embeddings_train))

X_tr.shape

(291897, 149562)

In [61]:
logistic_improved = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
y_train = tr_df["is_duplicate"].values
logistic_improved.fit(X_tr, y_train)

In [62]:
metrics.accuracy_score(y_train, logistic_improved.predict(X_tr))

0.821368496421683

In [37]:
jd_feature_val = np.array(jd_feature_val).reshape(-1,1)
kw_feature_val = np.array(kw_feature_val).reshape(-1,1)
ow_feature_val = np.array(ow_feature_val).reshape(-1,1)

In [63]:
X_val = scipy.sparse.hstack((jd_feature_val, tfidf_val, ow_feature_val, cosine_distance_val, embedings_val_q1, embedings_val_q2, cosine_distance_embeddings_val))
y_val = va_df["is_duplicate"].values

In [64]:
metrics.accuracy_score(y_val, logistic_improved.predict(X_val))

0.7907309770227169

In [67]:
mistake_indices_tr, predictions_tr = utils.get_mistakes(logistic_improved, X_tr,  y_train)

utils.print_mistake_k(0, tr_df, mistake_indices_tr, predictions_tr)

How is web-work.in works?
How do I get web designing work?
true class: 0
prediction: 1


In [68]:
try:
    # open file descriptor for writing
    logistic_file = open('model_artifacts/logistic_imp.pkl', 'wb')
    # serialize and write object to file
    pickle.dump(logistic_improved, logistic_file)
    # close file descriptor
    logistic_file.close()
except OSError as e:
    if e.errno != errno.EEXIST:
        raise