# Convert training data to TF-IDF-weighted word vectors, extract features from them, and combine these features with the features extracted during pre-processing

In [None]:
#References
#Followed part of code from https://www.kaggle.com/code/creatorghost/quora-question-pair

In [1]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
import spacy

In [2]:
# Load the Quora question-pairs training set
df = pd.read_csv("quora-question-pairs/train.csv")

# Coerce both question columns to plain Python strings (missing values become
# the text produced by str(), so later text processing never sees a float)
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].map(str)

df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


# Calculate IDF weights (via a TF-IDF vectorizer) for all words in the combined questions

In [5]:
# Pool both question columns so the vocabulary and IDF statistics cover every question
questions = list(df['question1']) + list(df['question2'])

# Fit the vectorizer on the pooled questions. Only the learned vocabulary and
# idf_ weights are used afterwards, so the document-term matrix is not built.
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit(questions)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Dictionary with key: word and value: that word's IDF weight.
# Keys are case-sensitive because the vectorizer was built with lowercase=False.
word2tfidf = dict(zip(feature_names, tfidf.idf_))

# Use spaCy to calculate TF-IDF-weighted word vectors

In [6]:
#Load the Spacy library
!python -m spacy download en_core_web_lg
!python -m spacy download en

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
     -------------------------------------- 777.4/777.4 MB 5.8 MB/s eta 0:00:00
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.2.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     --------------------------------------- 13.9/13.9 MB 21.8 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spac

In [None]:
# Load spaCy's en_core_web_lg English pipeline. The feature-extraction loops
# below call nlp(text) and read each resulting token's .vector attribute.
nlp = spacy.load('en_core_web_lg')

In [8]:
# Build one TF-IDF-weighted vector per question2: each token's spaCy vector is
# scaled by the token's IDF weight and the scaled vectors are added together.
# NOTE: the feature is a weighted *sum* (not divided by the token count); this
# matches the values this cell has always produced, up to float rounding.
vec_dim = nlp.vocab.vectors_length  # known up front, so an empty doc cannot raise IndexError
vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2)
    # 1-D accumulator; stays all-zero for a question with no tokens
    weighted_vec2 = np.zeros(vec_dim)
    for word2 in doc2:
        # Tokens absent from the TF-IDF vocabulary get weight 0 and contribute nothing
        idf = word2tfidf.get(str(word2), 0)
        weighted_vec2 += word2.vector * idf
    vecs2.append(weighted_vec2)
df['q2_feats_m'] = vecs2

100%|██████████| 404290/404290 [23:52<00:00, 282.23it/s]


In [10]:
# Build one TF-IDF-weighted vector per question1: each token's spaCy vector is
# scaled by the token's IDF weight and the scaled vectors are added together.
# NOTE: the feature is a weighted *sum* (not divided by the token count); this
# matches the values this cell has always produced, up to float rounding.
vec_dim = nlp.vocab.vectors_length  # known up front, so an empty doc cannot raise IndexError
vecs1 = []
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1)
    # 1-D accumulator; stays all-zero for a question with no tokens
    weighted_vec1 = np.zeros(vec_dim)
    for word1 in doc1:
        # Tokens absent from the TF-IDF vocabulary get weight 0 and contribute nothing
        idf = word2tfidf.get(str(word1), 0)
        weighted_vec1 += word1.vector * idf
    vecs1.append(weighted_vec1)
df['q1_feats_m'] = vecs1

100%|██████████| 404290/404290 [24:24<00:00, 276.10it/s]


# Combine pre-process and TFIDF features

In [14]:
# Read the feature files created during pre-processing.
# A missing file raises immediately: merely printing a message would let the
# cell continue and fail later with a NameError (or silently reuse a stale
# dfnlp/dfppro left over in the kernel from an earlier run).
nlp_features_path = 'quora-question-pairs/nlp_features_train.csv'
basic_features_path = 'quora-question-pairs/trainingdata_before_preprocessing.csv'
for required_path in (nlp_features_path, basic_features_path):
    if not os.path.isfile(required_path):
        raise FileNotFoundError(required_path + " does not exist")

dfnlp = pd.read_csv(nlp_features_path, encoding='latin-1')
dfppro = pd.read_csv(basic_features_path, encoding='latin-1')

# Drop the original id/text columns so only engineered features (plus 'id',
# and the label in df1) remain in each frame
df1 = dfnlp.drop(['qid1','qid2','question1','question2'],axis=1)
df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)

# Expand each per-question vector (stored as one array per row) into a frame
# with one column per vector dimension, keeping df3's row index
df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index= df3.index)
df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index= df3.index)

In [15]:
# Preview df1: the features read from nlp_features_train.csv with the qid and
# question-text columns dropped
df1.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,66,66,54,54,0.166667
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,36,36,35,40,0.039216
4,4,0,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,6.0,10.0,67,47,46,56,0.175


In [16]:
# Preview the TF-IDF-weighted word vectors for question 1, expanded to one
# column per vector dimension
df3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-5.856872,17.449559,4.86272,7.971019,20.345586,-5.514759,-4.0778,-2.820742,8.029026,146.599092,...,-17.370964,5.393082,0.384676,-8.362788,-1.88029,-10.799672,-12.999799,3.225858,1.256145,16.807275
1,9.356103,13.098566,18.945098,-2.079594,-15.703841,-2.173409,8.969065,-20.458267,-20.674299,13.760798,...,25.948247,0.603713,-10.516349,6.040723,30.476707,3.97689,-28.25461,12.613432,-7.770673,31.456654
2,0.90952,16.050299,-8.126856,-4.848289,-2.80619,9.75228,4.349992,-5.120332,6.785252,106.342974,...,-20.942061,2.398984,8.663028,-0.654124,16.220601,-2.719094,10.485332,-1.103132,-7.290877,19.31425
3,-4.950745,17.098874,-15.474965,1.04468,-2.392017,-0.051889,2.650595,-8.451192,2.584123,116.184408,...,-2.551312,-4.97148,-0.478381,-1.930166,9.336016,2.574459,4.803863,-1.182989,-2.962115,3.225704
4,-11.520302,19.769948,-4.510997,-6.548994,-20.835286,33.663909,-30.390504,0.826553,-19.571472,84.458577,...,-8.331733,-4.866335,18.828458,-40.357679,-10.336167,15.29463,-0.989347,-9.072091,-8.194567,23.84756


In [None]:
# Preview the TF-IDF-weighted word vectors for question 2, expanded to one
# column per vector dimension
df3_q2.head()

In [17]:
# Report the column count of every feature frame, then their plain sum.
# The sum counts each frame's columns as-is, so a column that appears in more
# than one frame (such as 'id') is counted once per frame.
width_nlp = df1.shape[1]
width_pre = df2.shape[1]
width_q1 = df3_q1.shape[1]
width_q2 = df3_q2.shape[1]

report_rows = [
    ("Number of features in nlp dataframe :", width_nlp),
    ("Number of features in preprocessed dataframe :", width_pre),
    ("Number of features in question1 w2v  dataframe :", width_q1),
    ("Number of features in question2 w2v  dataframe :", width_q2),
    ("Number of features in final dataframe  :", width_nlp + width_pre + width_q1 + width_q2),
]
for label, count in report_rows:
    print(label, count)

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12
Number of features in question1 w2v  dataframe : 300
Number of features in question2 w2v  dataframe : 300
Number of features in final dataframe  : 629


# Create a final CSV containing 629 features

In [18]:
# Store all the features in the final dataset.
# The existence check and the write use the same path, so the guard really
# skips the work once the file has been produced (previously it tested a
# different file name from the one that was written).
final_features_path = 'final_features_Copy.csv'
if not os.path.isfile(final_features_path):
    # Attach the row id to the vector frames so they can be joined on it.
    # assign() returns new frames, leaving df3_q1/df3_q2 (and df1/df2 below)
    # untouched so this cell gives the same result when it is re-run.
    q1_vectors = df3_q1.assign(id=df1['id'])
    q2_vectors = df3_q2.assign(id=df1['id'])
    # Pre-processed features: NLP/fuzzy features joined with the basic features
    preprocessed_features = df1.merge(df2, on='id',how='left')
    # Question-1 and question-2 vectors side by side (pandas suffixes the
    # overlapping dimension column names with _x / _y)
    vector_features = q1_vectors.merge(q2_vectors, on='id',how='left')
    result  = preprocessed_features.merge(vector_features, on='id',how='left')
    result.to_csv(final_features_path)