<h3>4.4 Featurising TF-IDF and AVG W2V</h3>

In [27]:

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import os
import gc

import re
from nltk.corpus import stopwords
import distance
from nltk.stem import PorterStemmer
import re
from nltk.corpus import stopwords
# This package is used for finding longest common subsequence between two strings
# you can write your own dp code for this
from bs4 import BeautifulSoup
from thefuzz import fuzz
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from tqdm import tqdm
import pickle

import final_inference as fi

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Reading dataset
abspath = os.getcwd()
# Load the CSV file
df = pd.read_csv(os.path.join(abspath, 'Datasets/train.csv'))

In [3]:
# before applying we can preprocess the data
def pre_process(data):
    x = str(data).lower()
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")

    # Replacing Date with date string
    pattern = '([0-2][0-9]|(3)[0-1])(\/|\.)(((0)[0-9])|((1)[0-2]))(\/|\.)\d{2,4}'
    x = re.sub(pattern, 'date', x)

    # Replacing links with link string
    pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    x = re.sub(pattern, 'link', x)

    #million and thousand representation
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)

    # Stemming
    porter = PorterStemmer()
    # Removing special chars
    pattern = re.compile('\W')

    if type(x) == type(''):
        x = re.sub(pattern, ' ', x)

    if type(x) == type(''):
        x = porter.stem(x)
        example1 = BeautifulSoup(x)
        x = example1.get_text()

    return x

In [4]:
# Applying the preprocess function over description data
df['question1'] = df['question1'].apply(pre_process)
df['question2'] = df['question2'].apply(pre_process)

In [5]:
# merging the questions
questions = df['question1'].values + df['question2'].values

In [9]:
# creating TFIDF
tf_idf = TfidfVectorizer(min_df = 50)
tf_idf.fit_transform(questions)
print('Total Dimensions: ', len(tf_idf.get_feature_names_out()))
# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tf_idf.get_feature_names_out(), tf_idf.idf_))
pickle.dump(tf_idf, open('trained_model/tf_idf.pkl', 'wb'))

Total Dimensions:  7333


In [40]:
# Loading the pretrained model to obtain the w2v
nlp = spacy.load("en_core_web_lg")

In [41]:
# Now for each row we need w2v 
# creating word to vec for question2
q1_w2v = [] # this for each row

#Step1. We need to loop thrrough each rows sentence
for sentence in tqdm(df['question1'].values):
    word_vec1 =  nlp(sentence)
    # for each word we need to have empty vectors of the dimension (1, number of dimension trained)
    word_vec = np.zeros(word_vec1.vector.shape)
    word_cnt = 0
    # Step:2 Looping through each sentence words
    for word in word_vec1:
        # Obtaining the polarity and numerical vector for each word
        vec = word.vector
        if str(word) in word2tfidf.keys():
            # Step:3 Multiplying the numerical vector with its IDF
            word_vec += (word2tfidf[str(word)] * vec)
        if np.sum(word_vec) != 0:
            word_cnt += 1
    # Storing Avg IDF W2V for each sentences
    q1_w2v.append(word_vec / word_cnt)

df['q1_feats_m'] = list(q1_w2v)


100%|██████████| 404290/404290 [1:04:12<00:00, 104.93it/s]  


In [42]:
# Now for each row we need w2v 

# creating word to vec for question2
q2_w2v = [] # this for each row

#Step1. We need to loop thrrough each rows sentence
for sentence in tqdm(df['question2'].values):
    word_vec1 =  nlp(sentence)
    # for each word we need to have empty vectors of the dimension (1, number of dimension trained)
    word_vec = np.zeros(word_vec1.vector.shape)
    word_cnt = 0
    # Step:2 Looping through each sentence words
    for word in word_vec1:
        # Obtaining the polarity and numerical vector for each word
        vec = word.vector
        if str(word) in word2tfidf.keys():
            # Step:3 Multiplying the numerical vector with its IDF
            word_vec += (word2tfidf[str(word)] * vec)
        if np.sum(word_vec) != 0:
            word_cnt += 1
    # Storing Avg IDF W2V for each sentences
    q2_w2v.append(word_vec / word_cnt)

df['q2_feats_m'] = list(q2_w2v)

100%|██████████| 404290/404290 [3:58:39<00:00, 28.23it/s]     


In [46]:
#prepro_features_train.csv (Simple Preprocessing Feartures)
#nlp_features_train.csv (NLP Features)
if os.path.isfile('DataSets/nlp_and_fuzzy_train.csv'):
    dfnlp = pd.read_csv("DataSets/nlp_and_fuzzy_train.csv",encoding='latin-1')
else:
    print("run previous notebook -> 2_Quora_Preprocessing_new.ipynb")

if os.path.isfile('DataSets/df_feature_extraction.csv'):
    dfppro = pd.read_csv("DataSets/df_feature_extraction.csv",encoding='latin-1')
else:
    print("Run previous notebook -> 2_Quora_Preprocessing_new.ipynb")

In [47]:
df1 = dfnlp.drop(['qid1','qid2','question1','question2'],axis=1)
df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index= df3.index)
df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index= df3.index)

In [48]:
print("Number of features in nlp dataframe :", df1.shape[1])
print("Number of features in preprocessed dataframe :", df2.shape[1])
print("Number of features in question1 w2v  dataframe :", df3_q1.shape[1])
print("Number of features in question2 w2v  dataframe :", df3_q2.shape[1])
print("Number of features in final dataframe  :", df1.shape[1]+df2.shape[1]+df3_q1.shape[1]+df3_q2.shape[1])

Number of features in nlp dataframe : 16
Number of features in preprocessed dataframe : 12
Number of features in question1 w2v  dataframe : 300
Number of features in question2 w2v  dataframe : 300
Number of features in final dataframe  : 628


In [50]:
# storing the final features to csv file
if not os.path.isfile('DataSets/final_features_train.csv'):
    df3_q1['id']=df1['id']
    df3_q2['id']=df1['id']
    df1  = df1.merge(df2, on='id',how='left')
    df2  = df3_q1.merge(df3_q2, on='id',how='left')
    result  = df1.merge(df2, on='id',how='left')
    result.to_csv('DataSets/final_features_train.csv')