In [8]:
import pandas as pd
import numpy as np

# Load the Quora question-pairs TSV, skipping malformed rows instead of aborting.
# `on_bad_lines='skip'` is the replacement for `error_bad_lines=False`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv(
    '../data/quora_duplicate_questions.tsv',
    sep='\t',
    on_bad_lines='skip',
    encoding='utf-8',
)
# Drop the identifier columns; the cells below only use the question texts and the label.
data = data.drop(['id', 'qid1', 'qid2'], axis=1)

data.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [9]:
# Basic size and overlap features for each question pair.
# Every value goes through str() first, so a missing question is measured as the text 'nan'.

# Raw string length of each question, and the signed difference (q1 - q2).
for _feature, _source in (('len_q1', 'question1'), ('len_q2', 'question2')):
    data[_feature] = data[_source].apply(lambda text: len(str(text)))
data['diff_len'] = data['len_q1'] - data['len_q2']

# Number of *distinct* non-space characters (despite the "len_char" name).
for _feature, _source in (('len_char_q1', 'question1'), ('len_char_q2', 'question2')):
    data[_feature] = data[_source].apply(
        lambda text: len(set(str(text).replace(' ', '')))
    )

# Number of whitespace-separated tokens.
for _feature, _source in (('len_word_q1', 'question1'), ('len_word_q2', 'question2')):
    data[_feature] = data[_source].apply(lambda text: len(str(text).split()))


def _shared_word_count(row):
    """Count distinct lower-cased tokens that appear in both questions of a row."""
    words_q1 = set(str(row['question1']).lower().split())
    words_q2 = set(str(row['question2']).lower().split())
    return len(words_q1 & words_q2)


# Size of the vocabulary shared by the two questions.
data['common_words'] = data.apply(_shared_word_count, axis=1)

In [10]:
# Preview the first five rows, now including the length and overlap features.
data.head(n=5)

Unnamed: 0,question1,question2,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66,57,9,20,20,14,12,10
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51,88,-37,21,29,8,13,4
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,73,59,14,25,24,14,10,4
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,50,65,-15,19,26,11,9,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,76,39,37,25,18,13,7,2


In [11]:
# Feature set 1: names of the length / overlap columns created above.
fs_1 = [
    'len_q1',
    'len_q2',
    'diff_len',
    'len_char_q1',
    'len_char_q2',
    'len_word_q1',
    'len_word_q2',
    'common_words',
]

In [12]:
from fuzzywuzzy import fuzz

In [13]:
# Demo: QRatio score for two differently worded questions on the same topic.
fuzz.QRatio(
    "Why did Trump win the Presidency?",
    "How did Donald Trump win the 2016 Presidential Election",
)

67

In [14]:
# Demo: QRatio score for two e-commerce questions that ask different things.
fuzz.QRatio(
    "How can I start an online shopping (e-commerce) website?",
    "Which web technology is best suitable for building a big E-Commerce website?",
)

60

In [15]:
# Demo: partial_ratio on the same Trump question pair, for comparison with QRatio.
fuzz.partial_ratio(
    "Why did Trump win the Presidency?",
    "How did Donald Trump win the 2016 Presidential Election",
)

73

In [16]:
# Demo: partial_ratio on the same e-commerce question pair, for comparison with QRatio.
fuzz.partial_ratio(
    "How can I start an online shopping (e-commerce) website?",
    "Which web technology is best suitable for building a big E-Commerce website?",
)

57

In [25]:
# One fuzzywuzzy similarity column per scorer, computed row by row on the
# str()-converted question pair. Order here fixes the order of the new columns.
_fuzz_scorers = (
    ('fuzz_qratio', fuzz.QRatio),
    ('fuzz_wratio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
)

for _column, _scorer in _fuzz_scorers:
    # Bind the scorer as a default argument so each lambda uses its own function.
    data[_column] = data.apply(
        lambda row, score=_scorer: score(str(row['question1']), str(row['question2'])),
        axis=1,
    )

In [26]:
# Feature set 2: names of the fuzzywuzzy similarity columns.
# The names must match the columns created by the fuzzy-feature cell exactly:
# the QRatio column is 'fuzz_qratio' (there is no 'fuzz_ratio' column), and each
# of the seven scorers is listed once.
fs_2 = [
    'fuzz_qratio',
    'fuzz_wratio',
    'fuzz_partial_ratio',
    'fuzz_partial_token_set_ratio',
    'fuzz_partial_token_sort_ratio',
    'fuzz_token_set_ratio',
    'fuzz_token_sort_ratio',
]

In [28]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
from copy import deepcopy
# Word-level TF-IDF over unigrams and bigrams, ignoring terms seen in fewer than 3 documents.
# token_pattern must be r'\w{1,}' (runs of word characters); without the backslash the
# pattern only matches runs of the literal letter "w".
# The idf/tf switches are real booleans: recent scikit-learn releases validate parameter types.
tfv_q1 = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# Independent, identically configured vectorizer for question2 (fitted separately).
tfv_q2 = deepcopy(tfv_q1)

In [29]:
# Fit one vectorizer per question column; missing questions become empty strings.
q1_tfidf = tfv_q1.fit_transform(data['question1'].fillna(""))
q2_tfidf = tfv_q2.fit_transform(data['question2'].fillna(""))

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


In [32]:
from sklearn.decomposition import TruncatedSVD

# Reduce each question's sparse TF-IDF matrix to 8 dense components.
# TruncatedSVD uses a randomized solver by default, so a fixed random_state is
# needed for the components to be reproducible across kernel restarts.
svd_q1 = TruncatedSVD(n_components=8, random_state=42)
svd_q2 = TruncatedSVD(n_components=8, random_state=42)

question1_vectors = svd_q1.fit_transform(q1_tfidf)
question2_vectors = svd_q2.fit_transform(q2_tfidf)

In [None]:
from scipy import sparse

# fs3_1: the two per-question TF-IDF matrices side by side (kept sparse).
fs3_1 = sparse.hstack((q1_tfidf, q2_tfidf))

# fs3_2: a single TF-IDF fitted on "question1 question2" concatenated per row.
# token_pattern is r'\w{1,}' (word characters); without the backslash it would
# only match runs of the literal letter "w". Switches are passed as booleans.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

q1q2 = data.question1.fillna("")
q1q2 += " " + data.question2.fillna("")
fs3_2 = tfv.fit_transform(q1q2)

# fs3_3: the two dense SVD projections side by side.
fs3_3 = np.hstack((question1_vectors, question2_vectors))

In [None]:
import gensim
# Load the pre-trained GoogleNews word2vec vectors from the gzipped binary file.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords  # needed for stopwords.words() below

# English stop words as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))


def sent2vec(s, model):
    """Embed a sentence as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that are
    stop words, are not purely alphabetic, or are missing from ``model`` are skipped.

    Parameters
    ----------
    s : object
        The sentence; converted with ``str()`` first, so non-string values
        (e.g. NaN) are tokenised as their string form.
    model : mapping-like
        Word-vector lookup supporting ``word in model`` and ``model[word]``.

    Returns
    -------
    numpy.ndarray
        Unit-length sum of the kept word vectors, or an all-zero vector when no
        token is kept (or the summed vector has zero norm). The zero vector has
        ``model.vector_size`` entries when that attribute exists, else 300.
    """
    dim = getattr(model, 'vector_size', 300)

    vectors = [
        model[token]
        for token in word_tokenize(str(s).lower())
        if token not in stop_words and token.isalpha() and token in model
    ]
    if not vectors:
        return np.zeros(dim)

    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid a 0/0 division producing NaNs.
        return np.zeros(dim)
    return v / norm

In [36]:
# One sent2vec embedding per question, stacked into a 2-D array per question column.
w2v_q1 = np.array([sent2vec(q, model) for q in data.question1])
w2v_q2 = np.array([sent2vec(q, model) for q in data.question2])

NameError: name 'sent2vec' is not defined

In [35]:
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

# Pairwise distances between the two sentence embeddings of every row.
# The scipy functions are imported as `euclidean` and `minkowski`; the
# `*_distance` names only exist as column names.
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]
# p=3: scipy's default p=2 would make this column identical to euclidean_distance.
data['minkowski_distance'] = [minkowski(x, y, p=3) for (x, y) in zip(w2v_q1, w2v_q2)]
data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(w2v_q1, w2v_q2)]

NameError: name 'w2v_q1' is not defined

In [None]:
# Feature set 4.1: names of the embedding-distance columns.
fs4_1 = [
    'cosine_distance',
    'cityblock_distance',
    'jaccard_distance',
    'canberra_distance',
    'euclidean_distance',
    'minkowski_distance',
    'braycurtis_distance',
]

# Raw sentence embeddings of both questions, concatenated column-wise.
w2v = np.hstack([w2v_q1, w2v_q2])

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    # Local import: no cell visible in this notebook imports `stopwords`.
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('english'))


def wmd(s1, s2, model):
    """Word Mover's Distance between two sentences after stop-word removal.

    Both inputs are converted with ``str()``, lower-cased and split on
    whitespace; English stop words are dropped from *both* token lists before
    the distance is computed.

    Parameters
    ----------
    s1, s2 : object
        The two sentences (any value; ``str()`` is applied).
    model : object
        Word-vector model exposing ``wmdistance(tokens1, tokens2)``.

    Returns
    -------
    The value returned by ``model.wmdistance`` for the filtered token lists.
    """
    stop_words = _english_stop_words()
    s1 = [w for w in str(s1).lower().split() if w not in stop_words]
    s2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return model.wmdistance(s1, s2)
    

In [None]:
def _wmd_for_row(row):
    """Word Mover's Distance between the two questions of one row, using the global `model`."""
    return wmd(row['question1'], row['question2'], model)


# First pass: distance computed with the vectors as currently stored in `model`.
data['wmd'] = data.apply(_wmd_for_row, axis=1)

# init_sims(replace=True) runs between the two passes, so 'norm_wmd' is the same
# distance recomputed after that call has modified `model`.
model.init_sims(replace=True)
data['norm_wmd'] = data.apply(_wmd_for_row, axis=1)

# Names of the two WMD feature columns.
fs2_4 = ['wmd', 'norm_wmd']

In [None]:
# test ml models

import gc
import psutil

# Release the large intermediates before model training. fs3_3 and w2v were
# built with np.hstack, which returns new arrays, so they stay usable after
# their inputs are deleted.
del tfv_q1, tfv_q2, tfv, q1q2
del question1_vectors, question2_vectors
del svd_q1, svd_q2, q1_tfidf, q2_tfidf

del w2v_q1, w2v_q2
del model
gc.collect()
# Last expression of the cell: display current memory usage.
psutil.virtual_memory()

In [None]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler  # class name is StandardScaler
import xgboost as xgb

In [None]:
scaler = StandardScaler()

# Target as a float32 column vector.
y = data.is_duplicate.values
y = y.astype('float32').reshape(-1, 1)

# Hand-crafted feature columns: length/overlap (fs_1), fuzzy ratios (fs_2),
# embedding distances (fs4_1) and the WMD columns (defined as fs2_4 above).
X = data[fs_1 + fs_2 + fs4_1 + fs2_4]
# Some features can be infinite or NaN; map both to 0 before scaling.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0).values
X = scaler.fit_transform(X)
# Append the dense SVD projections (fs3_3) unscaled.
X = np.hstack((X, fs3_3))