<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# extract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [38]:
# Inputs read by this cell:
#   nlp_features_train.csv           (Advanced NLP Features)
#   df_fe_without_preprocessing.csv  (Simple Feature Extraction)
# Fail fast when an input is missing: merely printing a hint would leave
# `dfnlp` / `dfppro` undefined and the next cell would stop with an unrelated
# NameError instead of this message.
if not os.path.isfile('nlp_features_train.csv'):
    raise FileNotFoundError("Please run the previous notebook 2.Quora_Preprocessing.ipynb")
dfnlp = pd.read_csv("nlp_features_train.csv", encoding='latin-1')

if not os.path.isfile('df_fe_without_preprocessing.csv'):
    raise FileNotFoundError("Please run the earlier notebook 1.Quora.ipynb")
dfppro = pd.read_csv("df_fe_without_preprocessing.csv", encoding='latin-1')

In [39]:
# Remove the id/text/label columns from the NLP feature frame, then left-join
# the basic features on 'id'; those columns come back in from `dfppro`
# through the merge.
df = (
    dfnlp
    .drop(columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate'])
    .merge(dfppro, on='id', how='left')
)
df.head(2)

Unnamed: 0,id,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,...,question1,question2,is_duplicate,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share
0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,...,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66,57,14,12,10.0,23.0,0.434783
1,1,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,...,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51,88,8,13,4.0,20.0,0.2


In [40]:
from sklearn.model_selection import train_test_split

# The duplicate flag is the target; every other column stays in the feature frame.
y = df['is_duplicate']
X = df.drop(columns=['is_duplicate'])

# 70/30 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [41]:
# Pool question1 and question2 of the training split into a single corpus, so
# the idf weights are learned from training text only.
questions = list(X_train['question1']) + list(X_train['question2'])

# lowercase=False keeps the original casing of every term in the vocabulary.
tfidf = TfidfVectorizer(lowercase=False)
# Only the fitted idf_ weights are needed below, so fit() is sufficient; the
# tf-idf matrix that fit_transform() would return was never used.
tfidf.fit(questions)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back for older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary_terms = tfidf.get_feature_names_out()
else:
    vocabulary_terms = tfidf.get_feature_names()

# dictionary with key:word and value:idf score
word2tfidf = dict(zip(vocabulary_terms, tfidf.idf_))

- After computing the TF-IDF scores, we convert each question into an average of its word vectors, weighted by these scores.
- Here we use a pre-trained GloVe model that ships with spaCy: https://spacy.io/usage/vectors-similarity (note that the cell below loads the `en_core_web_sm` pipeline).
- It is trained on Wikipedia and is therefore strong at capturing word semantics.

In [42]:
# The earlier note here described 'en_vectors_web_lg' (a spaCy GloVe package
# with over 1 million unique vectors), but the pipeline actually loaded is
# 'en_core_web_sm'.
# NOTE(review): per the spaCy model docs the small ("sm") packages presumably
# ship without a static word-vector table -- confirm, and load a vectors
# package here if GloVe vectors are what is intended.
nlp = spacy.load('en_core_web_sm')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(X_train['question1'])):
    doc1 = nlp(qu1)
    # One accumulator per question, as wide as a token vector (96 for this
    # pipeline, see the shape check further down). The previous
    # (len(doc1), dim) array held identical rows that were averaged away
    # again, so a 1-D running sum yields the same vector up to float rounding.
    mean_vec1 = np.zeros(len(doc1[0].vector))
    for word1 in doc1:
        token_text = str(word1)
        # idf learned on the training questions; tokens missing from the
        # vocabulary get weight 0. dict.get replaces a bare `except:` that
        # would also have swallowed a kernel interrupt during this long loop.
        idf = word2tfidf.get(token_text, 0)
        # NOTE(review): str.count counts substring matches in the raw question
        # text (a token like 'a' also matches inside longer words), so this is
        # not a true per-token frequency. Left unchanged so that every
        # vectorisation cell keeps producing comparable features.
        tf = qu1.count(token_text) / len(doc1)
        # computing final vec
        mean_vec1 += word1.vector * idf * tf
    vecs1.append(mean_vec1)
X_train['q1_feats_m'] = list(vecs1)

100%|█████████████████████████████████████████████████████████████████████████| 283003/283003 [31:58<00:00, 147.52it/s]


In [43]:
# TF-IDF weighted vector for every question2 of the training split.
vecs2 = []
for qu2 in tqdm(list(X_train['question2'])):
    doc2 = nlp(qu2)
    # Size the accumulator from this document's own token vector. The previous
    # code used len(doc1), a variable left over from a different cell, so this
    # cell could not run on its own. A 1-D running sum equals the old
    # mean over identical rows up to float rounding.
    mean_vec2 = np.zeros(len(doc2[0].vector))
    for word2 in doc2:
        token_text = str(word2)
        # idf learned on the training questions; tokens missing from the
        # vocabulary get weight 0. dict.get replaces a bare `except:` that
        # would also have swallowed a kernel interrupt during this long loop.
        idf = word2tfidf.get(token_text, 0)
        # NOTE(review): str.count counts substring matches in the raw question
        # text (a token like 'a' also matches inside longer words), so this is
        # not a true per-token frequency. Left unchanged so that every
        # vectorisation cell keeps producing comparable features.
        tf = qu2.count(token_text) / len(doc2)
        # computing final vec
        mean_vec2 += word2.vector * idf * tf
    vecs2.append(mean_vec2)
X_train['q2_feats_m'] = list(vecs2)

100%|█████████████████████████████████████████████████████████████████████████| 283003/283003 [30:54<00:00, 152.60it/s]


In [44]:
# TF-IDF weighted vector for every question1 of the test split, using the idf
# weights stored in `word2tfidf`.
vecs1 = []
for qu1 in tqdm(list(X_test['question1'])):
    doc1 = nlp(qu1)
    # One accumulator per question, as wide as a token vector. The previous
    # (len(doc1), dim) array held identical rows that were averaged away
    # again, so a 1-D running sum yields the same vector up to float rounding.
    mean_vec1 = np.zeros(len(doc1[0].vector))
    for word1 in doc1:
        token_text = str(word1)
        # Tokens missing from the vocabulary get weight 0. dict.get replaces a
        # bare `except:` that would also have swallowed a kernel interrupt
        # during this long loop.
        idf = word2tfidf.get(token_text, 0)
        # NOTE(review): str.count counts substring matches in the raw question
        # text (a token like 'a' also matches inside longer words), so this is
        # not a true per-token frequency. Left unchanged so that every
        # vectorisation cell keeps producing comparable features.
        tf = qu1.count(token_text) / len(doc1)
        # computing final vec
        mean_vec1 += word1.vector * idf * tf
    vecs1.append(mean_vec1)
X_test['q1_feats_m'] = list(vecs1)

100%|█████████████████████████████████████████████████████████████████████████| 121287/121287 [13:10<00:00, 153.49it/s]


In [45]:
# TF-IDF weighted vector for every question2 of the test split, using the idf
# weights stored in `word2tfidf`.
vecs2 = []
for qu2 in tqdm(list(X_test['question2'])):
    doc2 = nlp(qu2)
    # Size the accumulator from this document's own token vector. The previous
    # code used len(doc1), a variable left over from a different cell, so this
    # cell could not run on its own. A 1-D running sum equals the old
    # mean over identical rows up to float rounding.
    mean_vec2 = np.zeros(len(doc2[0].vector))
    for word2 in doc2:
        token_text = str(word2)
        # Tokens missing from the vocabulary get weight 0. dict.get replaces a
        # bare `except:` that would also have swallowed a kernel interrupt
        # during this long loop.
        idf = word2tfidf.get(token_text, 0)
        # NOTE(review): str.count counts substring matches in the raw question
        # text (a token like 'a' also matches inside longer words), so this is
        # not a true per-token frequency. Left unchanged so that every
        # vectorisation cell keeps producing comparable features.
        tf = qu2.count(token_text) / len(doc2)
        # computing final vec
        mean_vec2 += word2.vector * idf * tf
    vecs2.append(mean_vec2)
X_test['q2_feats_m'] = list(vecs2)

100%|█████████████████████████████████████████████████████████████████████████| 121287/121287 [13:15<00:00, 152.39it/s]


In [46]:
# Preview the first two training rows, now including the two vector columns.
X_train.head(n=2)

Unnamed: 0,id,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,...,question2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,q1_feats_m,q2_feats_m
223376,223376,0.399992,0.333328,0.0,0.0,0.22222,0.199998,0.0,0.0,1.0,...,What are the healthy ways of gaining weight an...,49,56,9,10,2.0,19.0,0.105263,"[-5.563232835382223, 0.7567734122276306, -12.2...","[-3.342882439494133, -0.3936847001314163, -12...."
3424,3424,0.799984,0.799984,0.999986,0.999986,0.916659,0.916659,0.0,1.0,0.0,...,What is unusual or different about the food an...,67,65,12,12,11.0,24.0,0.458333,"[-4.0849579041823745, 3.3718531392514706, -13....","[-2.9109734958037734, 4.073800444602966, -12.4..."


In [47]:
# Positional access: X_train keeps the shuffled row labels of the original
# frame, so a label lookup such as [1] only works when that particular row
# happened to land in the training split.
X_train['q1_feats_m'].iloc[0].shape

(96,)

In [48]:
# Columns removed once the vectors are expanded: question ids, raw question
# text and the two array-valued vector columns.
non_feature_cols = ['qid1', 'qid2', 'question1', 'question2', 'q1_feats_m', 'q2_feats_m']

# Turn each per-question vector into one column per dimension, keeping the
# row index of the split it came from.
train_q1 = pd.DataFrame(X_train['q1_feats_m'].tolist(), index=X_train.index)
train_q2 = pd.DataFrame(X_train['q2_feats_m'].tolist(), index=X_train.index)
X_train = X_train.drop(columns=non_feature_cols)

test_q1 = pd.DataFrame(X_test['q1_feats_m'].tolist(), index=X_test.index)
test_q2 = pd.DataFrame(X_test['q2_feats_m'].tolist(), index=X_test.index)
X_test = X_test.drop(columns=non_feature_cols)

In [57]:
# Write the combined training features to CSV, unless the file already exists.
train_csv = 'final_features_train_w2v.csv'
if not os.path.isfile(train_csv):
    # Both vector frames share X_train's index, so 'id' aligns row by row.
    for vec_frame in (train_q1, train_q2):
        vec_frame['id'] = X_train['id']
    # q1 and q2 vector columns are joined on 'id', then attached to the
    # remaining features.
    df1 = train_q1.merge(train_q2, on='id', how='left')
    result = X_train.merge(df1, on='id', how='left')
    result.to_csv(train_csv)

In [58]:
# Write the combined test features to CSV, unless the file already exists.
test_csv = 'final_features_test_w2v.csv'
if not os.path.isfile(test_csv):
    # Both vector frames share X_test's index, so 'id' aligns row by row.
    for vec_frame in (test_q1, test_q2):
        vec_frame['id'] = X_test['id']
    # q1 and q2 vector columns are joined on 'id', then attached to the
    # remaining features.
    df1 = test_q1.merge(test_q2, on='id', how='left')
    result = X_test.merge(df1, on='id', how='left')
    result.to_csv(test_csv)