### Feature Extraction using Tfidf weighted Word-Vectors

In [1]:
# Import the required libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import os
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from tqdm import tqdm


In [2]:
# Read the training question pairs from disk and preview the first rows
TRAIN_CSV = './Data/train.csv'
original_data = pd.read_csv(TRAIN_CSV)
original_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Show column dtypes and non-null counts to spot missing questions
original_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
id              404290 non-null int64
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404289 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [4]:
# Coerce both question columns to plain Python strings.
# Missing values (NaN) become the literal text 'nan' as a result.
for question_column in ('question1', 'question2'):
    original_data[question_column] = original_data[question_column].apply(str)

#### Calculate the Tfidf Vectors

In [5]:
# Fit the TfidfVectorizer on every distinct question text (question1 and question2 pooled)
all_questions = pd.concat((original_data.question1, original_data.question2)).unique()

# lowercase=False keeps the learned vocabulary case-sensitive
tfidf = TfidfVectorizer(lowercase=False)
# Only the fitted vocabulary and idf_ are used below, so fit() is enough;
# fit_transform() would also build a document-term matrix that is thrown away.
tfidf.fit(all_questions)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary_terms = tfidf.get_feature_names_out()
else:
    vocabulary_terms = tfidf.get_feature_names()

# Maps each vocabulary term to its idf weight (a corpus-level weight, not a per-document tf-idf score)
word_tfidf_score_dictionary = dict(zip(vocabulary_terms, tfidf.idf_))



#### Use GLOVE model from SPACY for Word to Vector Representation

 GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.
 
 https://spacy.io/usage/vectors-similarity

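The pre-trained vectors are trained on Wikipedia, which makes them strong at capturing word semantics. Note that the pipeline loaded below (`en_core_web_sm`) is spaCy's small English model, which does not include pre-trained static word vectors; load `en_core_web_md` or `en_core_web_lg` to use them.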

In [6]:
# Load the small NLP English Model
# NOTE(review): per the spaCy model documentation the `sm` pipelines ship without
# static word vectors, so `token.vector` is presumably a context tensor rather than
# a pre-trained embedding here -- confirm, or switch to `en_core_web_md` / `en_core_web_lg`.
nlp = spacy.load('en_core_web_sm')
def calculate_tfidf_weightd_word_vecs(questions):
    """Build one idf-weighted vector per question.

    Each question is parsed with the module-level spaCy pipeline ``nlp``. Every
    token vector is scaled by the token's weight in
    ``word_tfidf_score_dictionary`` (tokens missing from it get weight 0) and
    the scaled vectors are added together.

    Note that the result is the idf-weighted *sum* of the token vectors, not an
    average. The earlier implementation accumulated the same sum into every row
    of an ``(n_tokens, dim)`` matrix and then took the row mean, which yields
    this sum (up to floating-point rounding) at a much higher memory cost.

    Parameters
    ----------
    questions : iterable of str
        Question texts to vectorise.

    Returns
    -------
    list of numpy.ndarray
        One 1-D float64 vector per question, in input order. A question that
        parses to zero tokens yields an all-zero vector instead of raising
        ``IndexError``.
    """
    weighted_vectors = []
    vector_size = None       # learned from the first document that has tokens
    empty_positions = []     # indices of questions that produced no tokens

    for question in tqdm(list(questions)):
        # Parse the document with spaCy
        document = nlp(question)

        if len(document) == 0:
            # No token to read the vector size from; back-fill with zeros after the loop
            empty_positions.append(len(weighted_vectors))
            weighted_vectors.append(None)
            continue

        vector_size = len(document[0].vector)
        weighted_sum = np.zeros(vector_size)

        for word in document:
            # Tokens outside the fitted vocabulary contribute nothing
            idf_score = word_tfidf_score_dictionary.get(str(word), 0)
            weighted_sum += word.vector * idf_score

        weighted_vectors.append(weighted_sum)

    # If every question was empty the size is unknown, so fall back to length 0
    for position in empty_positions:
        weighted_vectors[position] = np.zeros(vector_size or 0)

    return weighted_vectors
        

In [7]:
# Calculate the tfidf weighted word vectors for question 1.
# This is slow, so it is skipped when the cached feature file already exists.
# The path uses './Data/' to match every other read in this notebook
# ('./data/' is a different directory on case-sensitive filesystems).
if not os.path.isfile('./Data/tfidf_w_vec_features_train.csv'):
    original_data['question_1_wordvec'] = pd.Series(calculate_tfidf_weightd_word_vecs(list(original_data.question1)))

In [8]:
# Calculate the tfidf weighted word vectors for question 2.
# This is slow, so it is skipped when the cached feature file already exists.
# The path uses './Data/' to match every other read in this notebook
# ('./data/' is a different directory on case-sensitive filesystems).
if not os.path.isfile('./Data/tfidf_w_vec_features_train.csv'):
    original_data['question_2_wordvec'] = pd.Series(calculate_tfidf_weightd_word_vecs(list(original_data.question2)))

In [9]:
# Persist the word-vector features on the first run, then always work from the file.
# One path is used for the existence check, the write and the read; the original mixed
# './data/' and './Data/', which are different directories on case-sensitive filesystems.
TFIDF_W_VEC_FEATURES_CSV = './Data/tfidf_w_vec_features_train.csv'

if not os.path.isfile(TFIDF_W_VEC_FEATURES_CSV):
    original_data.to_csv(TFIDF_W_VEC_FEATURES_CSV)

# Reload even right after writing: the cells below parse the vector columns with
# `.str.split()`, which needs the string form stored in the CSV rather than the
# in-memory numpy arrays.
original_data = pd.read_csv(TFIDF_W_VEC_FEATURES_CSV)

In [10]:
# Preview the frame, including the question_1_wordvec / question_2_wordvec columns
original_data.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question_1_wordvec,question_2_wordvec
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,[ -5.59272313 37.58352843 -67.61382651 3...,[ -14.00575626 59.74545303 -52.98290777 1...
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,[ 8.892131 -80.98844707 -46.14297098 7...,[ -2.23745598 -18.84306788 -129.61781764 ...
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,[ 95.56324315 24.1265742 -39.51886807 1...,[ 158.04430708 59.85587764 -8.1835475 2...
3,3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,[ 57.96231103 -22.90785587 -4.45696445 -8...,[ 40.78553909 56.19196877 31.05742574 -...
4,4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,[ 81.20799637 -40.57675505 -81.5618248 -5...,[ -14.21143198 -4.57012299 -68.61176462 -4...


#### We have to join all the features that we have preprocessed and extracted

In [11]:
# Read back the feature tables that were extracted previously
SIMPLE_FEATURES_CSV = './Data/question_feature_extracted.csv'
NLP_FEATURES_CSV = './Data/nlp_advanced_features_train.csv'

simple_features_data = pd.read_csv(SIMPLE_FEATURES_CSV)
nlp_features_data = pd.read_csv(NLP_FEATURES_CSV)


In [12]:
# Preview the simple (frequency / length / shared-word) features
simple_features_data.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1_len,q2_len,q1_n_words,q2_n_words,words_common,total_unique_words,words_shared_bw_qs,freq_q1+q2,freq_q1-q2
0,0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66,57,14,12,10,23,0.434783,2,0
1,1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,4,1,51,88,8,13,4,20,0.2,5,3
2,2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,1,1,73,59,14,10,4,24,0.166667,2,0
3,3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,1,1,50,65,11,9,0,19,0.0,2,0
4,4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3,1,76,39,13,7,2,20,0.1,4,2


In [13]:
# Preview the advanced NLP (token-overlap and fuzzy-ratio) features
nlp_features_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,0.99998,0.833319,0.999988,0.999988,...,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,3,4,what is the story of kohinoor koh i noor diamond,what would happen if the Indian government sto...,0,0.799984,0.399996,0.999986,0.874989,...,0.466664,0.0,1.0,5.0,12.5,86,63,65,73,0.58
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,0.399992,0.333328,0.999986,0.874989,...,0.285712,0.0,1.0,4.0,12.0,66,66,54,54,0.166667
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...,0,0.0,0.0,0.857131,0.857131,...,0.0,0.0,0.0,2.0,12.0,36,36,36,41,0.04
4,4,9,10,which one dissolve in water quickly sugar salt...,which fish would survive in salt water,0,0.399992,0.199998,0.999983,0.749991,...,0.30769,0.0,1.0,6.0,10.0,67,46,45,56,0.175


In [14]:
q1_word_vectors_data = pd.DataFrame(original_data.question_1_wordvec.str.split().values.tolist())

In [15]:
q1_word_vectors_data=q1_word_vectors_data.drop([0],axis=1)

In [16]:
# Preview the per-component columns built from the question-1 vectors
q1_word_vectors_data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,88,89,90,91,92,93,94,95,96,97
0,-5.59272313,37.58352843,-67.61382651,31.80409777,143.10172033,134.56082559,17.37897539,53.67839028,80.79305518,231.94682908,...,-59.83876254,-22.37941504,102.24721466,-67.81757915,-54.84739025,-67.23370951,116.47177976,60.22977319,-12.01418304],
1,8.892131,-80.98844707,-46.14297098,78.98283601,184.42664099,101.5130302,74.53171651,48.53591487,127.78073668,113.15878177,...,-99.21034664,19.42594814,-21.37699652,-76.87456751,83.65138116,41.63923484,129.90671277,116.81679153,4.26553643],
2,95.56324315,24.1265742,-39.51886807,16.43975806,55.84492457,47.35515308,8.02114137,39.4617095,107.11958885,226.09264362,...,87.76531097,3.47331452,56.98986173,-44.36421299,-54.44539499,-52.14603806,77.61747181,102.43186593,-33.71415892],
3,57.96231103,-22.90785587,-4.45696445,-88.3697232,-4.70386136,-54.37595606,75.2712594,105.41978294,15.52152705,38.52360508,...,41.30827757,-10.98244941,16.51094508,-35.8200686,9.65364176,-64.15589905,95.12407374,-33.37442064,69.85657161],
4,81.20799637,-40.57675505,-81.5618248,-52.35726821,78.33695567,-19.65145421,53.29571784,98.38354611,159.28732747,285.82424641,...,110.25452018,-90.91986871,-26.02538925,131.62270793,-15.1205253,-98.78080773,205.83000337,282.35832238,67.39155388],


In [17]:
q2_word_vectors_data = pd.DataFrame(original_data.question_2_wordvec.str.split().values.tolist())

In [18]:
q2_word_vectors_data=q2_word_vectors_data.drop([0],axis=1)

In [19]:
# Preview the per-component columns built from the question-2 vectors
q2_word_vectors_data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,88,89,90,91,92,93,94,95,96,97
0,-14.00575626,59.74545303,-52.98290777,19.13617885,113.8144992,101.10490113,8.0952456,65.33954014,32.32999611,210.19543636,...,-36.83040191,-31.28040695,93.27955209,-44.51305062,-34.11760159,-75.91725892,99.41412181,50.51768955,-17.51887226],
1,-2.23745598,-18.84306788,-129.61781764,2.30889046,81.28253603,25.06501657,79.11327255,83.13555571,129.23629934,276.68512525,...,-67.93706304,-14.76568208,-3.93812315,26.13085419,169.9645927,-55.65178585,195.01965582,127.96923757,55.5324496,]
2,158.04430708,59.85587764,-8.1835475,28.79510415,132.41035867,110.72395122,90.32491708,21.89750004,23.78260994,169.31577286,...,-18.67186002,74.84405112,25.36805932,-90.57822466,-177.25953549,-90.22422969,21.30134118,20.07493424,48.81069972],
3,40.78553909,56.19196877,31.05742574,-5.34382874,33.06373858,78.59863746,15.45697208,39.32268262,20.92085993,101.13109529,...,29.84074485,48.69019422,27.50041473,25.38212584,-31.81959295,-3.66474462,-13.90011597,4.84666491,7.49122095],
4,-14.21143198,-4.57012299,-68.61176462,-48.33305228,18.58716631,-50.60609037,23.7025404,60.04910313,31.59406807,56.0245004,...,46.91105819,-49.57773685,44.14995003,39.04330338,-33.47314119,-97.51764727,21.98912784,68.03776824,21.44643831],


#### Save the final features

In [20]:
# Join every feature table on `id` and cache the result.
# `all_data` is defined on both branches so the cells below also work when the
# cached file already exists.
ALL_FEATURES_CSV = './Data/all_features_final.csv'

if not os.path.isfile(ALL_FEATURES_CSV):
    # Attach the pair id to each vector table. This relies on the vector rows being in
    # the same order as nlp_features_data (alignment is by row index).
    # The source frames are not modified, so the cell can be re-run.
    q1_vectors_with_id = q1_word_vectors_data.assign(id=nlp_features_data['id'])
    q2_vectors_with_id = q2_word_vectors_data.assign(id=nlp_features_data['id'])

    # Drop the raw text / question-id columns; is_duplicate is kept once, from the NLP table
    nlp_features_trimmed = nlp_features_data.drop(['qid1','qid2','question1','question2'],axis=1)
    simple_features_trimmed = simple_features_data.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)

    nlp_n_simple_data = nlp_features_trimmed.merge(simple_features_trimmed,on='id',how='left')
    # The overlapping component labels get the default _x (question 1) / _y (question 2) suffixes
    word_vectors_data = q1_vectors_with_id.merge(q2_vectors_with_id,on='id',how='left')
    all_data = nlp_n_simple_data.merge(word_vectors_data,on = 'id',how ='left')

    all_data.to_csv(ALL_FEATURES_CSV)
else:
    # index_col=0 restores the index written by to_csv, so the columns match the freshly built frame
    all_data = pd.read_csv(ALL_FEATURES_CSV, index_col=0)


In [21]:
# Preview the combined feature table
all_data.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,...,88_y,89_y,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y
0,0,0,0.99998,0.833319,0.999988,0.999988,0.916659,0.785709,0.0,1.0,...,-36.83040191,-31.28040695,93.27955209,-44.51305062,-34.11760159,-75.91725892,99.41412181,50.51768955,-17.51887226],
1,1,0,0.799984,0.399996,0.999986,0.874989,0.699993,0.466664,0.0,1.0,...,-67.93706304,-14.76568208,-3.93812315,26.13085419,169.9645927,-55.65178585,195.01965582,127.96923757,55.5324496,]
2,2,0,0.399992,0.333328,0.999986,0.874989,0.399996,0.285712,0.0,1.0,...,-18.67186002,74.84405112,25.36805932,-90.57822466,-177.25953549,-90.22422969,21.30134118,20.07493424,48.81069972],
3,3,0,0.0,0.0,0.857131,0.857131,0.0,0.0,0.0,0.0,...,29.84074485,48.69019422,27.50041473,25.38212584,-31.81959295,-3.66474462,-13.90011597,4.84666491,7.49122095],
4,4,0,0.399992,0.199998,0.999983,0.749991,0.57142,0.30769,0.0,1.0,...,46.91105819,-49.57773685,44.14995003,39.04330338,-33.47314119,-97.51764727,21.98912784,68.03776824,21.44643831],


In [23]:
# Print every column name of the combined feature table as one flat list
column_names = list(all_data.columns)
print(column_names)

['id', 'is_duplicate', 'cwc_min', 'cwc_max', 'csc_min', 'csc_max', 'ctc_min', 'ctc_max', 'last_word_eq', 'first_word_eq', 'abs_len_diff', 'mean_len', 'token_set_ratio', 'token_sort_ratio', 'fuzz_ratio', 'fuzz_partial_ratio', 'longest_substr_ratio', 'Unnamed: 0', 'freq_qid1', 'freq_qid2', 'q1_len', 'q2_len', 'q1_n_words', 'q2_n_words', 'words_common', 'total_unique_words', 'words_shared_bw_qs', 'freq_q1+q2', 'freq_q1-q2', '1_x', '2_x', '3_x', '4_x', '5_x', '6_x', '7_x', '8_x', '9_x', '10_x', '11_x', '12_x', '13_x', '14_x', '15_x', '16_x', '17_x', '18_x', '19_x', '20_x', '21_x', '22_x', '23_x', '24_x', '25_x', '26_x', '27_x', '28_x', '29_x', '30_x', '31_x', '32_x', '33_x', '34_x', '35_x', '36_x', '37_x', '38_x', '39_x', '40_x', '41_x', '42_x', '43_x', '44_x', '45_x', '46_x', '47_x', '48_x', '49_x', '50_x', '51_x', '52_x', '53_x', '54_x', '55_x', '56_x', '57_x', '58_x', '59_x', '60_x', '61_x', '62_x', '63_x', '64_x', '65_x', '66_x', '67_x', '68_x', '69_x', '70_x', '71_x', '72_x', '73_x', 