## Vectorizing text data with TF-IDF weighted word vectors

### 1. Loading necessary libraries 

In [1]:
import os
import pickle
import re
import sys
import time
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from tqdm import tqdm

### 2. Reading data

In [2]:
df = pd.read_csv("quora_train.csv")

# Force both question columns to plain Python strings: any non-string cell
# (e.g. a missing value) is replaced by its str() form, so later text
# processing never receives a non-string.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(str)

In [3]:
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404290 non-null  object
 4   question2     404290 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


### 3. Merge texts of Question 1 & Question 2

In [3]:
# One flat list: every question1 text followed by every question2 text.
questions = df['question1'].tolist() + df['question2'].tolist()

### 4. Generate tf-idf score for each word in questions

In [4]:
# Fit on the combined question list so the vocabulary and IDF statistics cover
# both question columns. lowercase=False keeps tokens case-sensitive, so later
# lookups must use the token text exactly as it appears in the question.
tfidf = TfidfVectorizer(lowercase=False)
# Only the fitted vocabulary / idf_ are needed below; fit() avoids building
# (and immediately discarding) the full TF-IDF matrix that fit_transform()
# would return.
tfidf.fit(questions)

# dict key: word, value: IDF weight of that word (tfidf.idf_).
# Note: this is the corpus-level IDF, not a per-document TF-IDF score.
word2tfidf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

- After fitting TF-IDF, we use each word's IDF score as its weight and combine the word vectors of every question into a single weighted vector.
- Here we use a pre-trained GloVe model that comes bundled with the spaCy library. It is trained on Wikipedia and is therefore stronger in terms of word semantics.

In [15]:
!python -m spacy download en_core_web_lg
!python -m spacy download en

[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use
the full pipeline package name 'en_core_web_sm' instead.[0m
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


### 5. Generate GLOVE vectors

In [5]:
# Load the large English spaCy pipeline (en_core_web_lg); its word vectors
# are what the feature loops below read through `token.vector`.

nlp = spacy.load('en_core_web_lg')

# Sanity check: dimensionality of the vector produced for a single word.
x = nlp('man')
x.vector.shape[0]

300

- __Observation:__ The word 'man' is converted into a 300-dimensional numerical vector.

In [6]:
# Build one fixed-length vector per question1: the IDF-weighted SUM of the
# word vectors of its tokens.
#
# The previous version filled an (n_tokens x dim) matrix with identical rows
# and averaged them, which is numerically the same weighted sum (up to
# floating-point rounding) but allocates far more memory, and it indexed
# doc[0], which raises IndexError for a question that yields no tokens.
vecs1 = []

# Width of the word vectors; taken from the vocab so that an empty question
# can still be given an all-zero vector of the right size.
vec_dim = nlp.vocab.vectors_length

for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1)
    weighted_vec1 = np.zeros(vec_dim)

    for word1 in doc1:
        # Tokens missing from the TF-IDF vocabulary get weight 0 and therefore
        # contribute nothing. A dict lookup with a default replaces the bare
        # `except:`, which would also have hidden unrelated errors.
        idf = word2tfidf.get(str(word1), 0)

        # accumulate the weighted word vector
        weighted_vec1 += word1.vector * idf

    vecs1.append(weighted_vec1)

df['q1_feats_m'] = vecs1

100%|█████████████████████████████████████████████████████████████████████████| 404290/404290 [35:26<00:00, 190.12it/s]


In [7]:
# Build one fixed-length vector per question2: the IDF-weighted SUM of the
# word vectors of its tokens.
#
# The previous version filled an (n_tokens x dim) matrix with identical rows
# and averaged them, which is numerically the same weighted sum (up to
# floating-point rounding) but allocates far more memory, and it indexed
# doc[0], which raises IndexError for a question that yields no tokens.
vecs2 = []

# Width of the word vectors; taken from the vocab so that an empty question
# can still be given an all-zero vector of the right size.
vec_dim = nlp.vocab.vectors_length

for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2)
    weighted_vec2 = np.zeros(vec_dim)

    for word2 in doc2:
        # Tokens missing from the TF-IDF vocabulary get weight 0 and therefore
        # contribute nothing. A dict lookup with a default replaces the bare
        # `except:`, which would also have hidden unrelated errors.
        idf = word2tfidf.get(str(word2), 0)

        # accumulate the weighted word vector
        weighted_vec2 += word2.vector * idf

    vecs2.append(weighted_vec2)

df['q2_feats_m'] = vecs2

100%|█████████████████████████████████████████████████████████████████████████| 404290/404290 [36:49<00:00, 182.99it/s]


### 6. Overview of final dataset

In [8]:
# Feature files this cell expects to find in the working directory:
#   df_fe_without_preprocessing_train.csv ---> (Basic features before preprocessing)
#   nlp_features_train.csv ---> (NLP Features)
# If a file is missing, only a hint is printed and the corresponding
# dataframe name is left undefined.

if not os.path.isfile('nlp_features_train.csv'):
    print("Run previous notebooks.")
else:
    dfnlp = pd.read_csv("nlp_features_train.csv", encoding='latin-1')

if not os.path.isfile('df_fe_without_preprocessing_train.csv'):
    print("Run previous notebooks.")
else:
    dfppro = pd.read_csv("df_fe_without_preprocessing_train.csv", encoding='latin-1')

- __Dataframe: dfnlp__

In [9]:
dfnlp

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,1,2,what is the step by step guid to invest in sha...,what is the step by step guid to invest in sha...,0,0.999980,0.833319,0.999983,0.999983,...,0.785709,0.0,1.0,2.0,13.0,100,92,92,100,0.982143
1,1,3,4,what is the stori of kohinoor koh i noor diamond,what would happen if the indian govern stole t...,0,0.799984,0.399996,0.749981,0.599988,...,0.466664,0.0,1.0,5.0,12.5,86,65,67,75,0.571429
2,2,5,6,how can i increas the speed of my internet con...,how can internet speed be increas by hack thro...,0,0.599988,0.499992,0.399992,0.249997,...,0.357140,0.0,1.0,4.0,12.0,73,66,54,56,0.188679
3,3,7,8,whi am i mental veri lone how can i solv it,find the remaind when math 23 24 math is divid...,0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,2.0,12.0,38,37,35,40,0.045455
4,4,9,10,which one dissolv in water quikli sugar salt m...,which fish would surviv in salt water,0,0.399992,0.199998,0.999950,0.666644,...,0.307690,0.0,1.0,6.0,10.0,68,49,45,54,0.157895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,404285,433578,379845,how mani keyword are there in the racket progr...,how mani keyword are there in perl program lan...,0,0.857131,0.857131,0.999980,0.833319,...,0.785709,1.0,1.0,1.0,13.5,96,87,91,86,0.410959
404286,404286,18840,155606,do you believ there is life after death,is it true that there is life after death,1,0.666644,0.666644,0.599988,0.599988,...,0.555549,1.0,0.0,1.0,8.5,79,70,72,74,0.650000
404287,404287,537928,537929,what is one coin,what is thi coin,0,0.499975,0.499975,0.999950,0.999950,...,0.749981,1.0,1.0,0.0,4.0,86,81,81,81,0.470588
404288,404288,537930,537931,what is the approx annual cost of live while s...,i am have littl hairfal problem but i want to ...,0,0.000000,0.000000,0.124998,0.099999,...,0.040000,0.0,0.0,8.0,21.0,47,47,41,45,0.056818


In [10]:
dfnlp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    404290 non-null  int64  
 1   qid1                  404290 non-null  int64  
 2   qid2                  404290 non-null  int64  
 3   question1             404274 non-null  object 
 4   question2             404279 non-null  object 
 5   is_duplicate          404290 non-null  int64  
 6   cwc_min               404290 non-null  float64
 7   cwc_max               404290 non-null  float64
 8   csc_min               404290 non-null  float64
 9   csc_max               404290 non-null  float64
 10  ctc_min               404290 non-null  float64
 11  ctc_max               404290 non-null  float64
 12  last_word_eq          404290 non-null  float64
 13  first_word_eq         404290 non-null  float64
 14  abs_len_diff          404290 non-null  float64
 15  

- __Dataframe: dfppro__

In [11]:
dfppro

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,freq_q1+q2,freq_q1-q2,word_Common,word_Total,word_share
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66,57,14,12,2,0,10.0,23.0,0.434783
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,4,1,51,88,8,13,5,3,4.0,20.0,0.200000
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,1,1,73,59,14,10,2,0,4.0,24.0,0.166667
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,1,1,50,65,11,9,2,0,0.0,19.0,0.000000
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3,1,76,39,13,7,4,2,2.0,20.0,0.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0,2,2,85,79,14,13,4,0,11.0,25.0,0.440000
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1,12,1,41,42,8,9,13,11,5.0,16.0,0.312500
404287,404287,537928,537929,What is one coin?,What's this coin?,0,1,1,17,17,4,3,2,0,1.0,7.0,0.142857
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0,1,1,94,127,17,25,2,0,1.0,40.0,0.025000


In [12]:
dfppro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            404290 non-null  int64  
 1   qid1          404290 non-null  int64  
 2   qid2          404290 non-null  int64  
 3   question1     404289 non-null  object 
 4   question2     404288 non-null  object 
 5   is_duplicate  404290 non-null  int64  
 6   freq_qid1     404290 non-null  int64  
 7   freq_qid2     404290 non-null  int64  
 8   q1len         404290 non-null  int64  
 9   q2len         404290 non-null  int64  
 10  q1_n_words    404290 non-null  int64  
 11  q2_n_words    404290 non-null  int64  
 12  freq_q1+q2    404290 non-null  int64  
 13  freq_q1-q2    404290 non-null  int64  
 14  word_Common   404290 non-null  float64
 15  word_Total    404290 non-null  float64
 16  word_share    404290 non-null  float64
dtypes: float64(3), int64(12), object(2)
memory usage

- __Dataframe: df__

In [13]:
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_feats_m,q2_feats_m
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[-17.302040576934814, 65.07790648937225, -262....","[-5.95164680480957, 77.03271555900574, -248.92..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[-21.27127695083618, 42.34149789810181, 84.177...","[-111.66787052154541, 108.6460747718811, -29.2..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[-48.436742186546326, 112.14766091108322, -157...","[-29.703657031059265, 19.86686795949936, -63.5..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[137.4333928823471, 79.76209062337875, -111.17...","[-130.56681299209595, -40.23609447479248, -93...."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[8.331833839416504, -108.39999568462372, -214....","[-5.494155645370483, 17.43384277820587, -121.1..."
...,...,...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0,"[-101.96317064762115, -24.91492909193039, 101....","[-72.29086697101593, -55.78942668437958, 175.6..."
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1,"[18.76608443260193, 81.44477081298828, -135.08...","[13.67344719171524, 74.36056709289551, -98.983..."
404287,404287,537928,537929,What is one coin?,What's this coin?,0,"[-73.12651538848877, 42.51656985282898, 17.824...","[-41.6717848777771, 39.25337290763855, -0.7525..."
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0,"[-140.13347053527832, -15.863579392433167, -11...","[-116.28652596473694, 271.0460817962885, -345...."


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404290 non-null  object
 4   question2     404290 non-null  object
 5   is_duplicate  404290 non-null  int64 
 6   q1_feats_m    404290 non-null  object
 7   q2_feats_m    404290 non-null  object
dtypes: int64(4), object(4)
memory usage: 24.7+ MB


In [15]:
df['q1_feats_m']

0         [-17.302040576934814, 65.07790648937225, -262....
1         [-21.27127695083618, 42.34149789810181, 84.177...
2         [-48.436742186546326, 112.14766091108322, -157...
3         [137.4333928823471, 79.76209062337875, -111.17...
4         [8.331833839416504, -108.39999568462372, -214....
                                ...                        
404285    [-101.96317064762115, -24.91492909193039, 101....
404286    [18.76608443260193, 81.44477081298828, -135.08...
404287    [-73.12651538848877, 42.51656985282898, 17.824...
404288    [-140.13347053527832, -15.863579392433167, -11...
404289    [-56.34378623962402, 49.95776480436325, -120.5...
Name: q1_feats_m, Length: 404290, dtype: object

In [16]:
df['q1_feats_m'][0]

array([-1.73020406e+01,  6.50779065e+01, -2.62939456e+02, -2.06865619e+01,
        1.68056481e+02,  3.70790873e+01, -6.28020022e+01,  1.63003715e+02,
       -2.56633855e+02, -5.61347729e+00,  4.29033543e+02,  2.02931574e+02,
       -4.57172807e+02,  1.43723528e+02, -5.50389922e+01,  1.85617480e+02,
        1.62979617e+02,  1.47945432e+02, -5.95168309e+01, -2.13590037e+02,
        4.83582387e+01, -7.42316880e+01, -1.80441754e+02, -1.21027371e+02,
       -1.90992724e+02, -2.88042920e+01, -9.19915497e+01, -1.27441321e+02,
        7.18430538e+01,  4.35519301e+01,  2.00901523e+02, -4.99544508e+01,
       -1.11133328e+02, -5.28859767e+01,  1.59588275e+01, -6.48045990e+01,
        3.50147365e+01,  2.94780710e+02,  3.64744709e+01, -2.98325251e+01,
       -8.12138438e+01,  6.42383418e+01,  1.33297369e+01,  2.16547847e-01,
       -1.10282119e+02,  1.57877754e+02,  2.19726710e+02, -1.71487647e+02,
        5.13175529e+01,  4.77328691e+01,  1.64068550e+00,  2.03963968e+02,
       -5.60416582e+01, -

In [17]:
df['q1_feats_m'][0].shape

(300,)

In [18]:
# Strip identifier / raw-text columns from each frame, keeping 'id' as the
# join key. df2 additionally drops the label and the freq_* columns; df3 drops
# the label and is left with 'id' plus the two vector columns.
df1 = dfnlp.drop(columns=['qid1', 'qid2', 'question1', 'question2'])
df2 = dfppro.drop(
    columns=[
        'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
        'freq_qid1', 'freq_qid2', 'freq_q1+q2', 'freq_q1-q2',
    ]
)
df3 = df.drop(columns=['qid1', 'qid2', 'question1', 'question2', 'is_duplicate'])

# Expand each per-row vector into one numeric column per dimension,
# keeping the original row index.
df3_q1 = pd.DataFrame(df3['q1_feats_m'].tolist(), index=df3.index)
df3_q2 = pd.DataFrame(df3['q2_feats_m'].tolist(), index=df3.index)

In [19]:
# Dataframe of NLP features

print(df1.shape)
df1.head()

(404290, 17)


Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,92,92,100,0.982143
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,65,67,75,0.571429
2,2,0,0.599988,0.499992,0.399992,0.249997,0.499995,0.35714,0.0,1.0,4.0,12.0,73,66,54,56,0.188679
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,38,37,35,40,0.045455
4,4,0,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,6.0,10.0,68,49,45,54,0.157895


In [20]:
# Dataframe of Basic features

print(df2.shape)
df2.head()

(404290, 8)


Unnamed: 0,id,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share
0,0,66,57,14,12,10.0,23.0,0.434783
1,1,51,88,8,13,4.0,20.0,0.2
2,2,73,59,14,10,4.0,24.0,0.166667
3,3,50,65,11,9,0.0,19.0,0.0
4,4,76,39,13,7,2.0,20.0,0.1


In [21]:
# Dataframe of Questions 1 tfidf weighted word2vec

print(df3_q1.shape)
df3_q1.head()

(404290, 300)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-17.302041,65.077906,-262.939456,-20.686562,168.056481,37.079087,-62.802002,163.003715,-256.633855,-5.613477,...,120.913699,-132.868682,-111.14414,76.800932,-17.031813,18.101131,-26.370254,-169.293018,-136.936277,95.165242
1,-21.271277,42.341498,84.177521,-106.393414,88.151337,-43.99884,45.112466,109.73658,21.342126,-31.136926,...,51.731451,21.67201,50.035608,-10.191149,-110.361197,60.418664,36.341111,-174.373243,-63.843372,79.621566
2,-48.436742,112.147661,-157.016985,66.946747,200.748916,-25.216032,68.918125,361.157204,-185.235991,79.779303,...,-38.794809,-44.498598,87.357947,27.83921,-68.311364,156.044141,117.030545,-217.245489,-194.690445,63.290935
3,137.433393,79.762091,-111.175812,-172.055902,-100.82188,73.477431,4.726071,176.659988,-144.263077,71.553731,...,134.599823,-151.525408,67.42563,-109.629912,-40.876037,-106.195349,70.469539,85.817208,-158.018648,224.495181
4,8.331834,-108.399996,-214.260982,35.240404,148.534434,-233.111588,-4.039943,342.230251,179.777785,-97.866637,...,69.994948,-110.065816,307.267988,-82.233615,-271.016133,149.470724,368.845373,14.56945,53.777558,-70.254787


In [22]:
# Dataframe of Questions 2 tfidf weighted word2vec

print(df3_q2.shape)
df3_q2.head()

(404290, 300)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-5.951647,77.032716,-248.923202,-25.578063,109.20947,59.628984,-78.496038,139.783484,-279.317965,9.203245,...,89.509871,-110.533625,-100.479424,90.664803,19.507948,-15.140525,-19.345136,-129.408695,-118.284831,89.844395
1,-111.667871,108.646075,-29.244612,-92.022102,76.001256,50.877741,28.651991,189.795767,68.172862,6.925361,...,81.915579,52.01094,91.056819,-0.052428,-165.972075,103.07754,-83.073252,-68.480301,-117.666431,173.873648
2,-29.703657,19.866868,-63.588384,82.38214,110.056585,37.244608,-10.899151,268.314468,-127.871219,-0.33702,...,88.559925,19.308219,128.521377,126.306732,-44.066754,148.308988,152.486339,-244.803431,2.997449,250.107358
3,-130.566813,-40.236094,-93.626929,-28.221276,178.629043,-39.46899,-4.871327,47.261978,-40.398232,-66.204589,...,46.10954,-82.95861,25.99856,32.833132,-39.527964,20.387944,-82.090781,-163.982581,34.050951,9.135598
4,-5.494156,17.433843,-121.100618,21.822141,79.132462,-75.508125,-38.197533,179.805744,-40.293043,39.91961,...,113.437943,-65.196309,123.960645,-68.888675,-154.013207,-66.353766,8.589358,-33.926499,-25.562273,3.6695


In [23]:
# Column count of every feature frame, followed by their total.
# These are raw column counts: 'id' (present in both df1 and df2) and the
# 'is_duplicate' label in df1 are included in the numbers.
feature_counts = [
    ("Number of features in nlp dataframe :", df1.shape[1]),
    ("Number of features in preprocessed dataframe :", df2.shape[1]),
    ("Number of features in question1 w2v dataframe :", df3_q1.shape[1]),
    ("Number of features in question2 w2v dataframe :", df3_q2.shape[1]),
]

for label, n_columns in feature_counts:
    print(label, n_columns)

print(
    "Number of features in final dataframe :",
    sum(n_columns for _, n_columns in feature_counts),
)

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 8
Number of features in question1 w2v dataframe : 300
Number of features in question2 w2v dataframe : 300
Number of features in final dataframe : 625


### 7. Storing final dataframe

In [24]:
# storing the final features to csv file
#
# The merge is done on separately named intermediates so that df1, df2,
# df3_q1 and df3_q2 are left exactly as the earlier cells built them.
# Overwriting df1/df2 and adding 'id' to df3_q1/df3_q2 in place made this cell
# (and the overview cells above it) give different results when re-run.
# The written CSV is unchanged.

if not os.path.isfile('final_features.csv'):
    # Attach the join key to copies of the per-question vector frames.
    q1_vectors = df3_q1.assign(id=df1['id'])
    q2_vectors = df3_q2.assign(id=df1['id'])

    # NLP + basic features, then q1 + q2 vectors, then everything together.
    handcrafted_feats = df1.merge(df2, on='id', how='left')
    vector_feats = q1_vectors.merge(q2_vectors, on='id', how='left')
    result = handcrafted_feats.merge(vector_feats, on='id', how='left')

    result.to_csv('final_features.csv')

In [25]:
# Persist the fitted TF-IDF vectorizer and the loaded spaCy pipeline.
# `with` guarantees each file is flushed and closed even if pickling fails;
# the previous bare open(...) calls never closed their handles.
with open('tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

with open('nlp.pkl', 'wb') as nlp_file:
    pickle.dump(nlp, nlp_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# import re
# from bs4 import BeautifulSoup
# import distance
# from fuzzywuzzy import fuzz
# import pickle
# import numpy as np
# # import nltk

# cv = pickle.load(open('cv.pkl','rb'))


# def test_common_words(q1,q2):
#     w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
#     w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
#     return len(w1 & w2)

# def test_total_words(q1,q2):
#     w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
#     w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
#     return (len(w1) + len(w2))


# def test_fetch_token_features(q1, q2):
#     SAFE_DIV = 0.0001

#     STOP_WORDS = pickle.load(open('stop.pkl','rb'))
#     # STOP_WORDS = stopwords.words("english")
#     # STOP_WORDS = nltk.download('stopwords')

#     token_features = [0.0] * 8

#     # Converting the Sentence into Tokens:
#     q1_tokens = q1.split()
#     q2_tokens = q2.split()

#     if len(q1_tokens) == 0 or len(q2_tokens) == 0:
#         return token_features

#     # Get the non-stopwords in Questions
#     q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
#     q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])

#     # Get the stopwords in Questions
#     q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
#     q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

#     # Get the common non-stopwords from Question pair
#     common_word_count = len(q1_words.intersection(q2_words))

#     # Get the common stopwords from Question pair
#     common_stop_count = len(q1_stops.intersection(q2_stops))

#     # Get the common Tokens from Question pair
#     common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

#     token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
#     token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
#     token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
#     token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
#     token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
#     token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

#     # Last word of both question is same or not
#     token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])

#     # First word of both question is same or not
#     token_features[7] = int(q1_tokens[0] == q2_tokens[0])

#     return token_features


# def test_fetch_length_features(q1, q2):
#     length_features = [0.0] * 3

#     # Converting the Sentence into Tokens:
#     q1_tokens = q1.split()
#     q2_tokens = q2.split()

#     if len(q1_tokens) == 0 or len(q2_tokens) == 0:
#         return length_features

#     # Absolute length features
#     length_features[0] = abs(len(q1_tokens) - len(q2_tokens))

#     # Average Token Length of both Questions
#     length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2

#     strs = list(distance.lcsubstrings(q1, q2))
#     length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)

#     return length_features


# def test_fetch_fuzzy_features(q1, q2):
#     fuzzy_features = [0.0] * 4

#     # fuzz_ratio
#     fuzzy_features[0] = fuzz.QRatio(q1, q2)

#     # fuzz_partial_ratio
#     fuzzy_features[1] = fuzz.partial_ratio(q1, q2)

#     # token_sort_ratio
#     fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)

#     # token_set_ratio
#     fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)

#     return fuzzy_features


# def preprocess(q):
#     q = str(q).lower().strip()

#     # Replace certain special characters with their string equivalents
#     q = q.replace('%', ' percent')
#     q = q.replace('$', ' dollar ')
#     q = q.replace('₹', ' rupee ')
#     q = q.replace('€', ' euro ')
#     q = q.replace('@', ' at ')

#     # The pattern '[math]' appears around 900 times in the whole dataset.
#     q = q.replace('[math]', '')

#     # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
#     q = q.replace(',000,000,000 ', 'b ')
#     q = q.replace(',000,000 ', 'm ')
#     q = q.replace(',000 ', 'k ')
#     q = re.sub(r'([0-9]+)000000000', r'\1b', q)
#     q = re.sub(r'([0-9]+)000000', r'\1m', q)
#     q = re.sub(r'([0-9]+)000', r'\1k', q)

#     # Decontracting words
#     # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
#     # https://stackoverflow.com/a/19794953
#     contractions = {
#         "ain't": "am not",
#         "aren't": "are not",
#         "can't": "can not",
#         "can't've": "can not have",
#         "'cause": "because",
#         "could've": "could have",
#         "couldn't": "could not",
#         "couldn't've": "could not have",
#         "didn't": "did not",
#         "doesn't": "does not",
#         "don't": "do not",
#         "hadn't": "had not",
#         "hadn't've": "had not have",
#         "hasn't": "has not",
#         "haven't": "have not",
#         "he'd": "he would",
#         "he'd've": "he would have",
#         "he'll": "he will",
#         "he'll've": "he will have",
#         "he's": "he is",
#         "how'd": "how did",
#         "how'd'y": "how do you",
#         "how'll": "how will",
#         "how's": "how is",
#         "i'd": "i would",
#         "i'd've": "i would have",
#         "i'll": "i will",
#         "i'll've": "i will have",
#         "i'm": "i am",
#         "i've": "i have",
#         "isn't": "is not",
#         "it'd": "it would",
#         "it'd've": "it would have",
#         "it'll": "it will",
#         "it'll've": "it will have",
#         "it's": "it is",
#         "let's": "let us",
#         "ma'am": "madam",
#         "mayn't": "may not",
#         "might've": "might have",
#         "mightn't": "might not",
#         "mightn't've": "might not have",
#         "must've": "must have",
#         "mustn't": "must not",
#         "mustn't've": "must not have",
#         "needn't": "need not",
#         "needn't've": "need not have",
#         "o'clock": "of the clock",
#         "oughtn't": "ought not",
#         "oughtn't've": "ought not have",
#         "shan't": "shall not",
#         "sha'n't": "shall not",
#         "shan't've": "shall not have",
#         "she'd": "she would",
#         "she'd've": "she would have",
#         "she'll": "she will",
#         "she'll've": "she will have",
#         "she's": "she is",
#         "should've": "should have",
#         "shouldn't": "should not",
#         "shouldn't've": "should not have",
#         "so've": "so have",
#         "so's": "so as",
#         "that'd": "that would",
#         "that'd've": "that would have",
#         "that's": "that is",
#         "there'd": "there would",
#         "there'd've": "there would have",
#         "there's": "there is",
#         "they'd": "they would",
#         "they'd've": "they would have",
#         "they'll": "they will",
#         "they'll've": "they will have",
#         "they're": "they are",
#         "they've": "they have",
#         "to've": "to have",
#         "wasn't": "was not",
#         "we'd": "we would",
#         "we'd've": "we would have",
#         "we'll": "we will",
#         "we'll've": "we will have",
#         "we're": "we are",
#         "we've": "we have",
#         "weren't": "were not",
#         "what'll": "what will",
#         "what'll've": "what will have",
#         "what're": "what are",
#         "what's": "what is",
#         "what've": "what have",
#         "when's": "when is",
#         "when've": "when have",
#         "where'd": "where did",
#         "where's": "where is",
#         "where've": "where have",
#         "who'll": "who will",
#         "who'll've": "who will have",
#         "who's": "who is",
#         "who've": "who have",
#         "why's": "why is",
#         "why've": "why have",
#         "will've": "will have",
#         "won't": "will not",
#         "won't've": "will not have",
#         "would've": "would have",
#         "wouldn't": "would not",
#         "wouldn't've": "would not have",
#         "y'all": "you all",
#         "y'all'd": "you all would",
#         "y'all'd've": "you all would have",
#         "y'all're": "you all are",
#         "y'all've": "you all have",
#         "you'd": "you would",
#         "you'd've": "you would have",
#         "you'll": "you will",
#         "you'll've": "you will have",
#         "you're": "you are",
#         "you've": "you have"
#     }

#     q_decontracted = []

#     for word in q.split():
#         if word in contractions:
#             word = contractions[word]

#         q_decontracted.append(word)

#     q = ' '.join(q_decontracted)
#     q = q.replace("'ve", " have")
#     q = q.replace("n't", " not")
#     q = q.replace("'re", " are")
#     q = q.replace("'ll", " will")

#     # Removing HTML tags
#     q = BeautifulSoup(q)
#     q = q.get_text()

#     # Remove punctuations
#     pattern = re.compile('\W')
#     q = re.sub(pattern, ' ', q).strip()

#     return q


# def query_point_creator(q1, q2):
#     input_query = []

#     # preprocess
#     q1 = preprocess(q1)
#     q2 = preprocess(q2)

#     # fetch basic features
#     input_query.append(len(q1))
#     input_query.append(len(q2))

#     input_query.append(len(q1.split(" ")))
#     input_query.append(len(q2.split(" ")))

#     input_query.append(test_common_words(q1, q2))
#     input_query.append(test_total_words(q1, q2))
#     input_query.append(round(test_common_words(q1, q2) / test_total_words(q1, q2), 2))

#     # fetch token features
#     token_features = test_fetch_token_features(q1, q2)
#     input_query.extend(token_features)

#     # fetch length based features
#     length_features = test_fetch_length_features(q1, q2)
#     input_query.extend(length_features)

#     # fetch fuzzy features
#     fuzzy_features = test_fetch_fuzzy_features(q1, q2)
#     input_query.extend(fuzzy_features)

#     # bow feature for q1
#     q1_bow = cv.transform([q1]).toarray()

#     # bow feature for q2
#     q2_bow = cv.transform([q2]).toarray()

#     return np.hstack((np.array(input_query).reshape(1, 22), q1_bow, q2_bow))