<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# extract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [2]:
# Read the raw question pairs.
df = pd.read_csv("train.csv")

# Cast every question to `str` so that non-string cells (e.g. NaN for a
# missing question) become text and the later NLP steps always get strings.
# In Python 3 `str` is already unicode, so the Python 2 `unicode(str(x), "utf-8")`
# conversion from https://stackoverflow.com/a/6812069 is not needed.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(str)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Pool both question columns so the IDF statistics cover every question.
questions = list(df['question1']) + list(df['question2'])

# Keep the original casing: the tokens looked up in `word2tfidf` later are
# not lowercased either.
tfidf = TfidfVectorizer(lowercase=False)
# Only the fitted IDF weights are used, so there is no need to build (and
# throw away) the full TF-IDF matrix with fit_transform().
tfidf.fit(questions)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    vocabulary_terms = tfidf.get_feature_names_out()
else:
    vocabulary_terms = tfidf.get_feature_names()

# dict key: word, value: IDF weight of that word (corpus-level `idf_`,
# not a per-document tf-idf score)
word2tfidf = dict(zip(vocabulary_terms, tfidf.idf_))

- After computing the TF-IDF scores, we convert each question into a weighted average of word2vec vectors, using these scores as the weights.
- Here we use a pre-trained GloVe model that ships for free with spaCy: https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and is therefore stronger in terms of word semantics.
- Note: the code below loads the small `en_core_web_sm` model, which yields 96-dimensional vectors.

In [12]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz --no-deps

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0MB)
[K    100% |████████████████████████████████| 12.0MB 123kB/s  eta 0:00:01
[?25hBuilding wheels for collected packages: en-core-web-sm
  Running setup.py bdist_wheel for en-core-web-sm ... [?25ldone
[?25h  Stored in directory: /home/shanud6711/.cache/pip/wheels/6a/47/fb/6b5a0b8906d8e8779246c67d4658fd8a544d4a03a75520197a
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.2.5
Note: you may need to restart the kernel to use updated packages.


In [4]:
import en_core_web_sm

In [10]:
# NOTE: the model loaded here is en_core_web_sm, not en_vectors_web_lg (the
# package with over 1 million unique vectors); the vectors used below are
# 96-dimensional.
nlp = en_core_web_sm.load()
vecs1 = []

for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1)
    # Running IDF-weighted sum of this question's token vectors
    # (96 is the number of dimensions of the vectors).
    # A single accumulator gives the same values as averaging len(doc1)
    # identical rows, and yields zeros instead of NaN for an empty doc.
    weighted_sum1 = np.zeros(96)
    for word1 in doc1:
        # Tokens that are not in the TF-IDF vocabulary get weight 0;
        # dict.get avoids a bare `except` that would hide unrelated errors.
        idf = word2tfidf.get(str(word1), 0)
        weighted_sum1 += word1.vector * idf
    vecs1.append(weighted_sum1)
df['q1_feats_m'] = list(vecs1)


100%|██████████| 404290/404290 [1:00:15<00:00, 111.83it/s]


In [11]:
vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2)
    # Running IDF-weighted sum of this question's token vectors
    # (96 is the number of dimensions of the vectors).
    # The accumulator no longer depends on `doc1`, a variable left over from
    # the question1 cell, so this cell only uses the current question.
    weighted_sum2 = np.zeros(96)
    for word2 in doc2:
        # Tokens that are not in the TF-IDF vocabulary get weight 0;
        # dict.get avoids a bare `except` that would hide unrelated errors.
        idf = word2tfidf.get(str(word2), 0)
        weighted_sum2 += word2.vector * idf
    vecs2.append(weighted_sum2)
df['q2_feats_m'] = list(vecs2)

100%|██████████| 404290/404290 [1:02:00<00:00, 108.67it/s]


In [12]:
# Feature files expected in the working directory:
#   nlp_features_train.csv                 (NLP features)
#   df_fe_without_preprocessing_train.csv  (simple features, computed without preprocessing)
nlp_features_path = 'nlp_features_train.csv'
basic_features_path = 'df_fe_without_preprocessing_train.csv'

# Stop here with a clear message if an input is missing. Merely printing a
# hint would leave `dfnlp` / `dfppro` undefined (or stale from an earlier
# run) and make a later cell fail in a confusing way.
missing_files = [
    path for path in (nlp_features_path, basic_features_path)
    if not os.path.isfile(path)
]
if missing_files:
    raise FileNotFoundError(
        "download " + ", ".join(missing_files) + " from drive or run previous notebook"
    )

dfnlp = pd.read_csv(nlp_features_path, encoding='latin-1')
dfppro = pd.read_csv(basic_features_path, encoding='latin-1')

In [13]:
# Columns that identify or hold the raw question pair rather than features.
pair_cols = ['qid1', 'qid2', 'question1', 'question2']

# The NLP frame keeps `is_duplicate`; the other two frames drop it as well.
df1 = dfnlp.drop(columns=pair_cols)
df2 = dfppro.drop(columns=pair_cols + ['is_duplicate'])
df3 = df.drop(columns=pair_cols + ['is_duplicate'])

# Expand each per-question vector into one numeric column per dimension,
# keeping the row index of `df3`.
df3_q1 = pd.DataFrame(df3['q1_feats_m'].tolist(), index=df3.index)
df3_q2 = pd.DataFrame(df3['q2_feats_m'].tolist(), index=df3.index)

In [14]:
# Preview the first five rows of the NLP feature frame.
df1.head(n=5)

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,66,66,54,54,0.166667
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,36,36,35,40,0.039216
4,4,0,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,6.0,10.0,67,47,46,56,0.175


In [15]:
# Preview the first five rows of the features computed without preprocessing.
df2.head(n=5)

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,4,1,51,88,8,13,4.0,20.0,0.2,5,3
2,2,1,1,73,59,14,10,4.0,24.0,0.166667,2,0
3,3,1,1,50,65,11,9,0.0,19.0,0.0,2,0
4,4,3,1,76,39,13,7,2.0,20.0,0.1,4,2


In [16]:
# Preview the first five rows of the weighted word vectors for question 1.
df3_q1.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-6.179502,37.450734,-67.929891,32.22426,143.348841,135.37458,17.865204,54.562352,81.618952,232.909844,...,-71.834681,-60.222836,-22.026407,103.336721,-68.477435,-54.976582,-67.802667,116.269998,60.515907,-12.245918
1,9.23666,-80.371412,-45.785899,78.291651,183.56822,100.894079,74.344814,48.3608,127.297406,112.987303,...,-32.130517,-98.080321,19.113786,-20.507512,-76.981003,82.66507,41.085575,129.377789,115.868471,4.383535
2,97.546825,22.972184,-39.558384,18.72342,56.928621,48.307634,8.719282,36.893752,106.899967,226.283079,...,-66.835018,87.592126,4.032439,56.851707,-43.625406,-57.580953,-50.425837,78.59199,105.714349,-33.304142
3,57.586979,-22.017081,-4.599306,-88.939274,-4.732172,-54.209051,74.614933,106.533731,15.520613,39.009704,...,28.362964,41.981225,-11.204994,16.83343,-36.372474,8.927553,-64.55319,95.054234,-34.157576,70.821923
4,83.185766,-40.506968,-83.403915,-52.648649,79.07488,-19.038236,53.728726,97.648597,160.555808,290.54138,...,-4.390929,109.604429,-91.16017,-25.739906,133.123054,-13.508796,-100.115195,208.424405,286.930891,68.02763


In [17]:
# Preview the first five rows of the weighted word vectors for question 2.
df3_q2.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,-14.616961,59.755496,-53.263741,19.514483,113.916494,101.657068,8.561492,66.232771,32.888134,210.81274,...,-72.266623,-37.072061,-31.142731,94.064858,-45.053226,-34.155226,-76.548091,99.282783,50.791735,-17.566246
1,-3.565752,-16.844562,-130.911787,0.320241,79.350263,23.562032,79.124558,84.119844,128.684146,279.539867,...,6.193168,-65.084206,-15.654533,-3.47583,26.999822,170.172597,-57.038937,194.269548,128.207809,55.490069
2,156.833635,59.991907,-8.414309,29.251429,133.680214,112.457563,89.84979,21.613006,24.331765,171.114489,...,-26.185222,-19.283222,75.602445,24.144021,-91.874393,-178.454113,-91.471484,19.92271,21.266688,49.574857
3,41.472446,56.717314,31.530609,-5.520159,33.454808,79.596186,15.509003,40.042088,21.094026,101.998123,...,-17.779016,30.152287,49.300139,27.783794,25.93719,-32.107066,-3.817625,-14.231002,4.772116,7.711626
4,-14.446979,-4.338253,-70.196203,-48.636356,18.35685,-50.807065,24.311191,60.043682,32.42199,57.1487,...,36.08947,47.193193,-49.969586,44.796025,39.740805,-33.763316,-98.282342,22.118792,68.802085,21.02538


In [18]:
# Column count of each feature frame.
n_nlp_cols = len(df1.columns)
n_prepro_cols = len(df2.columns)
n_q1_cols = len(df3_q1.columns)
n_q2_cols = len(df3_q2.columns)

# The total is a plain sum of the four counts, so a column present in more
# than one frame (such as the `id` merge key) is counted once per frame.
n_total_cols = n_nlp_cols + n_prepro_cols + n_q1_cols + n_q2_cols

print("Number of features in nlp dataframe :", n_nlp_cols)
print("Number of features in preprocessed dataframe :", n_prepro_cols)
print("Number of features in question1 w2v  dataframe :", n_q1_cols)
print("Number of features in question2 w2v  dataframe :", n_q2_cols)
print("Number of features in final dataframe  :", n_total_cols)

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12
Number of features in question1 w2v  dataframe : 96
Number of features in question2 w2v  dataframe : 96
Number of features in final dataframe  : 221


In [19]:
# Build and store the combined feature table, unless it was already written.
final_features_path = 'final_features.csv'
if not os.path.isfile(final_features_path):
    # Give both vector frames the `id` key so they can be joined.
    for vector_frame in (df3_q1, df3_q2):
        vector_frame['id'] = df1['id']

    # NLP + basic features, then question1 + question2 vectors
    # (note: `df1` and `df2` are rebound to the merged frames here).
    df1 = df1.merge(df2, how='left', on='id')
    df2 = df3_q1.merge(df3_q2, how='left', on='id')

    # Join everything on `id`; the CSV is written with the row index included.
    result = df1.merge(df2, how='left', on='id')
    result.to_csv(final_features_path)