In [199]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [200]:
import warnings
warnings.filterwarnings('ignore')

In [201]:
from toolz import partition_all

In [202]:
INPUT_BUCKET = 'dq-data'
HASH_BUCKET = 'dq-hashed'

In [203]:
#load train_set
data = 'train.csv'
filestream = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
dtypes = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
df = pd.read_csv(#urlpath=s3_in_url, 
                                     #storage_options=s3_options,
                                     filestream,
                                     header=0, 
                                     usecols=dtypes.keys(), 
                                     #names=dtypes.keys(),
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     encoding='utf-8')
df = df.set_index('id')
df = df.dropna()

In [204]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404287 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404287 non-null int64
qid2            404287 non-null int64
question1       404287 non-null object
question2       404287 non-null object
is_duplicate    404287 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


### Train-test split

In [205]:
from sklearn.model_selection import train_test_split
#shrink df to 150,000 records
df = df.iloc[:75000]

X = df.drop(columns=['is_duplicate'])

y = df['is_duplicate']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [206]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50250 entries, 71916 to 15795
Data columns (total 4 columns):
qid1         50250 non-null int64
qid2         50250 non-null int64
question1    50250 non-null object
question2    50250 non-null object
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


In [207]:
del X,y,df

# Feature Extraction

### Tokenizing and preprocessing

In [208]:
from gensim.parsing.preprocessing import preprocess_string
def get_tokens(process='train'):
    if process=='test':
        X = X_test
    else:
        X = X_train
    series = pd.Series(pd.concat([X['question1'], X['question2']]),dtype=str)
    series.dropna()
    for question in series:
        yield preprocess_string(question)

### Word2Vec (fasttext)

In [209]:
ps.get_file(bucket=INPUT_BUCKET, filename='cc.en.300.bin.gz', filepath='/tmp/cc.en.300.bin.gz')

<minio.definitions.Object at 0x7f0866aa0668>

In [210]:
import gzip
import shutil
with gzip.open('/tmp/cc.en.300.bin.gz', 'rb') as f_in:
    with open('/tmp/cc.en.300.bin', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)  

In [None]:
import os
os.remove('/tmp/cc.en.300.bin.gz')
from gensim.models import FastText
model = FastText.load_fasttext_format('/tmp/cc.en.300.bin')



In [104]:
def get_ft_vectors(model, process):
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vector = model.wv[token]
            except:
                continue
            vectors.append(vector)
        yield np.array(vectors)

In [105]:
X_ft = np.array([vectors for vectors in get_ft_vectors(model,'train')])
X_ft.shape

(100500,)

In [106]:
# split back into two
X1_ft = X_ft[:len(X_train)]
X2_ft = X_ft[len(X_train):]

In [107]:
del X_ft

In [24]:
#X_train = pd.concat([X_train, pd.Series(X1_ft, name='q1_ft',index=X_train.index), pd.Series(X2_ft, name='q2_ft',index=X_train.index)], axis=1)
#X_train.head()

##### Test set

In [108]:
X_ft_test = np.array([vectors for vectors in get_ft_vectors(model,'test')])

In [109]:
X_ft_test.shape

(49500,)

In [110]:
del model

In [111]:
# split back into two
X1_ft_test = X_ft_test[:len(X_test)]
X2_ft_test = X_ft_test[len(X_test):]

In [112]:
del X_ft_test

### Pairwise Metrics

In [113]:
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from scipy.spatial.distance import cdist
def compute_pairwise_kernel(pc1, pc2, method='linear'):
    if method=='polynomial':
        return polynomial_kernel(pc1, pc2, 2)
    elif method=='rbf':
        return rbf_kernel(pc1, pc2)
    elif method=='sigmoid':
        return sigmoid_kernel(pc1, pc2)
    elif method=='laplacian':
        return laplacian_kernel(pc1, pc2)
    else:
        return linear_kernel(pc1, pc2)
    
def compute_pairwise_dist(pc1, pc2, method='euclidean'):
    if pc1.size == 0:
        return []
    if pc2.size == 0:
        return []
    return cdist(pc1, pc2, metric=method)
        
def assign_pwmetric(df, method='euclidean'):
    #return compute_pairwise_kernel(pc1_embd, pc2_embd, method=method)
    return df.apply(compute_pairwise_dist, method, axis=1)

In [114]:
def get_q_lengths(X):
    q_meta = []
    for q in X:
        q_meta.append(len(q))
    return q_meta

In [115]:
X1_ft.shape

(50250,)

In [116]:
def split_arrays(X):
    for y in (x for x in X if x.size>0):
        yield np.vsplit(y,len(y))[0]

In [117]:
q_meta = get_q_lengths(X1_ft) + get_q_lengths(X2_ft)

In [118]:
X = np.vstack((np.concatenate([x for x in split_arrays(X1_ft)]), np.concatenate([x for x in split_arrays(X2_ft)])))

In [119]:
X.shape

(100367, 300)

In [120]:
del X1_ft, X2_ft

In [174]:
import sys
# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) 
        for x in dir() if not x.startswith('_') 
        and x not in sys.modules and x not in ipython_vars], 
       key=lambda x: x[1], reverse=True)

[('X', 120440512),
 ('X_test', 59344912),
 ('X_train', 13036526),
 ('q_meta', 804064),
 ('y_train', 804024),
 ('mahalanobis', 406496),
 ('X1_rd', 402096),
 ('X2_rd', 402096),
 ('braycurtis', 402048),
 ('canberra', 402048),
 ('chebyshev', 402048),
 ('correlation', 402048),
 ('cosine', 402048),
 ('dice', 402048),
 ('hamming', 402048),
 ('jaccard', 402048),
 ('kulsinski', 402048),
 ('rogerstanimoto', 402048),
 ('russellrao', 402048),
 ('sokalmichener', 402048),
 ('yule', 402048),
 ('q_meta_test', 396064),
 ('y_test', 396024),
 ('Client', 3096),
 ('FastText', 2000),
 ('HTTPResponse', 1464),
 ('Geometry', 1056),
 ('Dict', 888),
 ('List', 888),
 ('Tuple', 888),
 ('y', 816),
 ('dtypes', 368),
 ('f_out', 176),
 ('assign_pwmetric', 136),
 ('cdist', 136),
 ('compute', 136),
 ('compute_pairwise_dist', 136),
 ('compute_pairwise_kernel', 136),
 ('get_ft_vectors', 136),
 ('get_q_lengths', 136),
 ('get_tokens', 136),
 ('laplacian_kernel', 136),
 ('linear_kernel', 136),
 ('mmread', 136),
 ('mmwrite', 

In [122]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_train.mtx', X )

In [123]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_train.mtx', source='wor2vec_300_train.mtx')

pushed file wor2vec_300_train.mtx from wor2vec_300_train.mtx to minio bucket dq-data


True

In [124]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_train.mtx', filepath='embed_train.mtx')

<minio.definitions.Object at 0x7f07f6394e10>

In [125]:
from scipy.io import mmread
X_rd = mmread('embed_train.mtx')

In [126]:
X_rd.shape

(100367, 3)

In [127]:
len(q_meta)

100500

In [128]:
# rebuild X1_rd and X2_rd
X1_list = []
X2_list = []
q_halflen = int(len(q_meta)/2)
q1_meta = q_meta[:q_halflen]
q2_meta = q_meta[q_halflen:]
X_rd_halflen = int(X_rd.shape[0]/2)
X1_rd_tmp = X_rd[:X_rd_halflen]
X2_rd_tmp = X_rd[X_rd_halflen:]
for len_q1, len_q2 in zip(q1_meta, q2_meta):
    q1 = X1_rd_tmp[:len_q1]
    q2 = X2_rd_tmp[:len_q2]
    X1_list.append(q1)
    X2_list.append(q2)
    X1_rd_tmp = X1_rd_tmp[len_q1:]
    X2_rd_tmp = X2_rd_tmp[len_q2:]
X1_rd = np.array(X1_list)
X2_rd = np.array(X2_list)

In [129]:
del X1_list, X2_list, q1_meta, q2_meta, X_rd, X1_rd_tmp, X2_rd_tmp

In [130]:
import dask.dataframe as dd
from dask import delayed, compute
from dask.distributed import Client
from utils import dask
client = dask.create_dask_client(num_workers=8)

In [131]:
jaccard = []
chebyshev = []
braycurtis = []
cosine = []
correlation = []
hamming = []
canberra = []
#mahalanobis = []
yule = []
dice = []
kulsinski = []
rogerstanimoto = []
russellrao = []
sokalmichener = []
for q_tuple in zip(X1_rd, X2_rd):
    if q_tuple:
        q1_rd, q2_rd = q_tuple
        jaccard.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
        chebyshev.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
        braycurtis.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
        cosine.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cosine'))
        correlation.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'correlation'))
        hamming.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hamming'))
        canberra.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'canberra'))
        #mahalanobis.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'mahalanobis'))
        yule.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'yule'))
        dice.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'dice'))
        kulsinski.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'kulsinski'))
        rogerstanimoto.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'rogerstanimoto'))
        russellrao.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'russellrao'))
        sokalmichener.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sokalmichener'))
    else:
        jaccard.append(delayed([]))
        chebyshev.append(delayed([]))
        braycurtis.append(delayed([])) 
        cosine.append(delayed([]))
        correlation.append(delayed([]))
        hamming.append(delayed([])) 
        canberra.append(delayed([]))
        #mahalanobis.append(delayed([]))
        yule.append(delayed([])) 
        dice.append(delayed([]))
        kulsinski.append(delayed([]))
        rogerstanimoto.append(delayed([])) 
        russellrao.append(delayed([]))
        sokalmichener.append(delayed([])) 

In [132]:
jaccard = compute(*jaccard)

In [133]:
chebyshev = compute(*chebyshev)

In [134]:
braycurtis = compute(*braycurtis)

In [135]:
cosine = compute(*cosine)

In [136]:
correlation = compute(*correlation)

In [137]:
hamming = compute(*hamming)

In [138]:
canberra = compute(*canberra)

In [140]:
yule = compute(*yule)

In [141]:
dice = compute(*dice)

In [142]:
kulsinski = compute(*kulsinski)

In [143]:
rogerstanimoto = compute(*rogerstanimoto)

In [144]:
russellrao = compute(*russellrao)



In [145]:
sokalmichener = compute(*sokalmichener)



In [146]:
len(braycurtis)

50250

#### add above metrics to X_train

In [197]:
X_train = pd.concat([X_train, 
                     pd.Series(jaccard, name='jaccard',index=X_train.index), 
                     pd.Series(chebyshev, name='chebyshev',index=X_train.index), 
                     pd.Series(braycurtis, name='braycurtis',index=X_train.index), 
                     pd.Series(cosine, name='cosine',index=X_train.index), 
                     pd.Series(correlation, name='correlation',index=X_train.index), 
                     pd.Series(hamming, name='hamming',index=X_train.index), 
                     pd.Series(canberra, name='canberra',index=X_train.index), 
                     pd.Series(yule, name='yule',index=X_train.index), 
                     pd.Series(dice, name='dice',index=X_train.index), 
                     pd.Series(kulsinski, name='kulsinski',index=X_train.index), 
                     pd.Series(rogerstanimoto, name='rogerstanimoto',index=X_train.index), 
                     pd.Series(russellrao, name='russellrao',index=X_train.index), 
                     pd.Series(sokalmichener, name='sokalmichener',index=X_train.index)
                    ], axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,yule,dice,kulsinski,rogerstanimoto,russellrao,sokalmichener
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,"[[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]","[[0.014126931602426835, 0.012581112662755235],...","[[7.192769595735011, 1.0508811404370453], [3.3...","[[1.9928794846028173, 0.9876017385688001], [2....","[[1.9990646340809, 0.8290676419744057], [1.961...","[[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]","[[3.0, 2.374214601134712], [0.0004180739245818...","[[nan, nan], [nan, nan], [nan, nan]]","[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]","[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]","[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]","[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]","[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]"
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,"[[1.0, 1.0], [1.0, 1.0]]","[[0.0025916664134411427, 0.003734074629871132]...","[[0.8056192916790164, 0.8820333320079468], [0....","[[0.4904439764915841, 0.9601788570284864], [0....","[[0.5529982013627186, 0.19406683678255077], [0...","[[1.0, 1.0], [1.0, 1.0]]","[[1.9797973210198996, 2.2020987126465954], [1....","[[nan, nan], [nan, nan]]","[[0.0, 0.0], [0.0, 0.0]]","[[0.0, 0.0], [0.0, 0.0]]","[[0.0, 0.0], [0.0, 0.0]]","[[0.0, 0.0], [0.0, 0.0]]","[[0.0, 0.0], [0.0, 0.0]]"
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[[0.013495506135781002, 0.008361488983873884, ...","[[1.7389287628370538, 3.082026603286921, 1.122...","[[1.706409657188209, 1.8502682602333516, 1.187...","[[1.3143016943547168, 1.7405977529289918, 1.73...","[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[[2.791127304823243, 2.1952025342832835, 2.464...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,"[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1....","[[0.006135665268631926, 0.0022207229085029864,...","[[0.8118536465828418, 0.758105060833011, 0.238...","[[0.8179461458545746, 0.908097707647699, 0.007...","[[0.8191842962784616, 0.8942658152743844, 4.90...","[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1....","[[1.8498885147898858, 2.0753995912271237, 0.82...","[[nan, nan, nan, nan, nan, nan, nan], [nan, na...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0....","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0...."
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,"[[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0], [...","[[0.0027977673856161055, 0.005010864998591739,...","[[1.0450967041196748, 2.1282196707603966, 0.77...","[[1.1759060353880262, 1.721726227280588, 0.951...","[[1.1268274287351194, 0.9627227139611261, 0.74...","[[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0], [...","[[2.257326933695591, 3.0, 2.0013512893087393, ...","[[nan, nan, nan, nan], [nan, nan, nan, nan], [...","[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [...","[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [...","[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [...","[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [...","[[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [..."


##### Test set

In [149]:
q_meta_test = get_q_lengths(X1_ft_test) + get_q_lengths(X2_ft_test)

In [150]:
X_test = np.vstack((np.concatenate([x for x in split_arrays(X1_ft_test)]), np.concatenate([x for x in split_arrays(X2_ft_test)])))

In [151]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_test.mtx', X_test )

In [152]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_test.mtx', source='wor2vec_300_test.mtx')

pushed file wor2vec_300_test.mtx from wor2vec_300_test.mtx to minio bucket dq-data


True

In [153]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_test.mtx', filepath='embed_test.mtx')

<minio.definitions.Object at 0x7f07d54e6978>

In [177]:
from scipy.io import mmread
X_rd_test = mmread('embed_test.mtx')

In [178]:
X_rd_test.shape

(49454, 3)

In [179]:
# rebuild X1_rd and X2_rd
X1_list = []
X2_list = []
q_halflen = int(len(q_meta_test)/2)
q1_meta = q_meta_test[:q_halflen]
q2_meta = q_meta_test[q_halflen:]
X_rd_halflen = int(X_rd_test.shape[0]/2)
X1_rd_tmp = X_rd_test[:X_rd_halflen]
X2_rd_tmp = X_rd_test[X_rd_halflen:]
for len_q1, len_q2 in zip(q1_meta, q2_meta):
    q1 = X1_rd_tmp[:len_q1]
    q2 = X2_rd_tmp[:len_q2]
    X1_list.append(q1)
    X2_list.append(q2)
    X1_rd_tmp = X1_rd_tmp[len_q1:]
    X2_rd_tmp = X2_rd_tmp[len_q2:]
X1_rd_test = np.array(X1_list)
X2_rd_test = np.array(X2_list)

In [180]:
X1_rd_test.shape

(24750,)

In [181]:
del X1_list, X2_list, q1_meta, q2_meta, X_rd_test, X1_rd_tmp, X2_rd_tmp

In [182]:
jaccard_test = []
chebyshev_test = []
braycurtis_test = []
cosine_test = []
correlation_test = []
hamming_test = []
canberra_test = []
#mahalanobis_test = []
yule_test = []
dice_test = []
kulsinski_test = []
rogerstanimoto_test = []
russellrao_test = []
sokalmichener_test = []
for q_tuple in zip(X1_rd_test, X2_rd_test):
    if q_tuple:
        q1_rd, q2_rd = q_tuple
        jaccard_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
        chebyshev_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
        braycurtis_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
        cosine_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cosine'))
        correlation_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'correlation'))
        hamming_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hamming'))
        canberra_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'canberra'))
        #mahalanobis_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'mahalanobis'))
        yule_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'yule'))
        dice_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'dice'))
        kulsinski_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'kulsinski'))
        rogerstanimoto_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'rogerstanimoto'))
        russellrao_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'russellrao'))
        sokalmichener_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sokalmichener'))
    else:
        jaccard_test.append(delayed([]))
        chebyshev_test.append(delayed([]))
        braycurtis_test.append(delayed([])) 
        cosine_test.append(delayed([]))
        correlation_test.append(delayed([]))
        hamming_test.append(delayed([])) 
        canberra_test.append(delayed([]))
        #mahalanobis_test.append(delayed([]))
        yule_test.append(delayed([])) 
        dice_test.append(delayed([]))
        kulsinski_test.append(delayed([]))
        rogerstanimoto_test.append(delayed([])) 
        russellrao_test.append(delayed([]))
        sokalmichener_test.append(delayed([])) 



In [183]:
jaccard_test = compute(*jaccard_test)



In [184]:
chebyshev_test = compute(*chebyshev_test)



In [185]:
braycurtis_test = compute(*braycurtis_test)



In [186]:
cosine_test = compute(*cosine_test)



In [187]:
correlation_test = compute(*correlation_test)



In [188]:
hamming_test = compute(*hamming_test)



In [189]:
canberra_test = compute(*canberra_test)



In [190]:
#mahalanobis_test = compute(*mahalanobis_test)

In [191]:
yule_test = compute(*yule_test)



In [192]:
dice_test = compute(*dice_test)



In [193]:
kulsinski_test = compute(*kulsinski_test)



In [194]:
rogerstanimoto_test = compute(*rogerstanimoto_test)



In [195]:
russellrao_test = compute(*russellrao_test)



In [196]:
sokalmichener_test = compute(*sokalmichener_test)



In [198]:
X_test = pd.concat([X_test, 
                     pd.Series(jaccard_test, name='jaccard',index=X_test.index), 
                     pd.Series(chebyshev_test, name='chebyshev',index=X_test.index), 
                     pd.Series(braycurtis_test, name='braycurtis',index=X_test.index), 
                     pd.Series(cosine_test, name='cosine',index=X_test.index), 
                     pd.Series(correlation_test, name='correlation',index=X_test.index), 
                     pd.Series(hamming_test, name='hamming',index=X_test.index), 
                     pd.Series(canberra_test, name='canberra',index=X_test.index), 
                     pd.Series(yule_test, name='yule',index=X_test.index), 
                     pd.Series(dice_test, name='dice',index=X_test.index), 
                     pd.Series(kulsinski_test, name='kulsinski',index=X_test.index), 
                     pd.Series(rogerstanimoto_test, name='rogerstanimoto',index=X_test.index), 
                     pd.Series(russellrao_test, name='russellrao',index=X_test.index), 
                     pd.Series(sokalmichener_test, name='sokalmichener',index=X_test.index)
                    ], axis=1)
X_test.head()

AttributeError: 'numpy.ndarray' object has no attribute 'index'

### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tokens('train'))

In [None]:
X_trfmd

In [None]:
# dimension reduction using SVD
from sklearn.decomposition import TruncatedSVD
import time
start = time.time()
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
X_svd.shape

In [None]:
# split back into two
X1 = X_svd[:len(X_train), :]
X2 = X_svd[len(X_train):, :]

##### Test set

In [None]:
X_test_trfmd = tfidf.transform(get_tokens('test'))

In [None]:
X_test_trfmd

In [None]:
# dimension reduction using SVD
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
# split back into two
X1_test = X_test_svd[:len(X_test), :]
X2_test = X_test_svd[len(X_test):, :]

In [None]:
# build complete feature dataframe
X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
X_test_temp.head()

### Fuzzy-wuzzy

In [None]:
# difference in text size
compute_size_diff = lambda row: abs(len(str(row['question1'])) - len(str(row['question2'])))
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
X_train.head()

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# ratio
compute_ratio = lambda row: fuzz.ratio(str(row['question1']), str(row['question2']))
X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
X_train.head()

In [None]:
# partial ratio
compute_partial_ratio = lambda row: fuzz.partial_ratio(str(row['question1']), str(row['question2']))
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
X_train.head()

In [None]:
# token_sort_ratio
compute_token_sort_ratio = lambda row: fuzz.token_sort_ratio(str(row['question1']), str(row['question2']))
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
X_train.head()

In [None]:
# token_set_ratio
compute_token_set_ratio = lambda row: fuzz.token_set_ratio(str(row['question1']), str(row['question2']))
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
X_train.head()

In [None]:
# build complete feature dataframe
X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
X_train_temp.head()

In [None]:
X_train = pd.concat([X_train_temp, X_train], axis=1)
del X_train_temp
X_train = X_train.drop(columns=['qid1', 'qid2','question1','question2'])
X_train.head()

In [None]:
X_train.info()

##### Test set

In [None]:
# difference in text size
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
X_test.head()

In [None]:
# ratio
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
X_test.head()

In [None]:
# partial ratio
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
X_test.head()

In [None]:
# token_sort_ratio
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
X_test.head()

In [None]:
# token_set_ratio
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
X_test.head()

In [None]:
X_test = pd.concat([X_test_temp, X_test], axis=1)
del X_test_temp
X_test = X_test.drop(columns=['question1','question2', 'qid1', 'qid2'])
X_test.head()

# Modeling

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
logr_model = LogisticRegression(random_state=42)
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             #'solver': ['lbfgs']
             #'max_iter': np.linspace(10, 1000, 10)
             }
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1)
logr_cv.fit(X_train, y_train)

In [None]:
logr_cv.best_params_

In [None]:
logr_model = LogisticRegression(#solver=logr_cv.best_params_['solver'], 
                                random_state=42, 
                                C=logr_cv.best_params_['C'], 
                                tol=logr_cv.best_params_['tol'], 
                                #max_iter=logr_cv.best_params_['max_iter'], 
                                n_jobs=-1)
logr_model.fit(X_train, y_train)

In [None]:
logr_pred = logr_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
logr_acc_score = accuracy_score(y_test, logr_pred)
logr_prec_score = precision_score(y_test, logr_pred)
logr_rec_score = recall_score(y_test, logr_pred)
print('Logistic Regression')
print('accuracy score : {}'.format(logr_acc_score))
print('precision score : {}'.format(logr_prec_score))
print('recall score : {}'.format(logr_rec_score))