In [1]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from toolz import partition_all

In [4]:
INPUT_BUCKET = 'dq-data'
HASH_BUCKET = 'dq-hashed'

In [5]:
# Load the training set from the input bucket.
data = 'train.csv'
# Only the keys of this mapping are used (as the list of columns to read);
# the dtype strings themselves are not passed to read_csv.
dtypes = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
filestream = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
df = pd.read_csv(filestream,
                 header=0,
                 usecols=dtypes.keys(),
                 skipinitialspace=True,
                 skip_blank_lines=True,
                 encoding='utf-8')
# Index by the pair id, then discard every row that has a missing value.
df = df.set_index('id').dropna()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404287 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404287 non-null int64
qid2            404287 non-null int64
question1       404287 non-null object
question2       404287 non-null object
is_duplicate    404287 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


### Train-test split

In [7]:
from sklearn.model_selection import train_test_split
# keep only the first 75,000 question pairs (150,000 individual questions)
df = df.iloc[:75000]

# separate the duplicate label from the remaining columns
y = df['is_duplicate']
X = df.drop(columns=['is_duplicate'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50250 entries, 71916 to 15795
Data columns (total 4 columns):
qid1         50250 non-null int64
qid2         50250 non-null int64
question1    50250 non-null object
question2    50250 non-null object
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


In [9]:
#del X,y,df

# Feature Extraction

### Tokenizing and preprocessing

In [10]:
from gensim.parsing.preprocessing import preprocess_string
def get_tokens(process='train'):
    """Yield the preprocessed token list of every question in one split.

    Reads the notebook-level frames ``X_train`` / ``X_test``. All
    ``question1`` values are emitted first, followed by all ``question2``
    values, so the output has ``2 * len(X)`` items.

    Parameters
    ----------
    process : str
        ``'test'`` selects ``X_test``; any other value selects ``X_train``.

    Yields
    ------
    The result of ``preprocess_string`` for each question, in order.
    """
    X = X_test if process == 'test' else X_train
    series = pd.Series(pd.concat([X['question1'], X['question2']]), dtype=str)
    # Rows are intentionally never dropped here: callers split the output
    # back into its question1 / question2 halves by position (at len(X)),
    # so the number and order of items must match the source frame.
    for question in series:
        yield preprocess_string(question)

### Word2Vec (fasttext)

In [11]:
ps.get_file(bucket=INPUT_BUCKET, filename='cc.en.300.bin.gz', filepath='/tmp/cc.en.300.bin.gz')

<minio.definitions.Object at 0x7f80e7c310b8>

In [12]:
import gzip
import shutil
# decompress /tmp/cc.en.300.bin.gz into /tmp/cc.en.300.bin
with gzip.open('/tmp/cc.en.300.bin.gz', 'rb') as f_in, open('/tmp/cc.en.300.bin', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [13]:
import os
os.remove('/tmp/cc.en.300.bin.gz')
from gensim.models import FastText
model = FastText.load_fasttext_format('/tmp/cc.en.300.bin')

In [14]:
def get_ft_vectors(model, process):
    """Yield, per question, the array of word vectors of its tokens.

    Parameters
    ----------
    model
        Object exposing ``model.wv[token]`` lookups (the FastText model
        loaded earlier in this notebook).
    process : str
        Forwarded to ``get_tokens`` to choose the split.

    Yields
    ------
    numpy.ndarray
        ``np.array`` of the vectors found for one question. Tokens whose
        lookup raises ``KeyError`` are skipped, so a question with no usable
        token yields an empty array.
    """
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vectors.append(model.wv[token])
            except KeyError:
                # No vector is available for this token; skip it. Only the
                # lookup failure is caught so that real errors (and
                # KeyboardInterrupt) are no longer silently swallowed.
                continue
        yield np.array(vectors)

In [15]:
# Collect the per-question vector matrices into a 1-D object array. The
# matrices have different numbers of rows, so the array is filled element by
# element instead of asking np.array() to infer a ragged layout (which raises
# ValueError on recent NumPy releases).
ft_train_vectors = list(get_ft_vectors(model, 'train'))
X_ft = np.empty(len(ft_train_vectors), dtype=object)
for i, vectors in enumerate(ft_train_vectors):
    X_ft[i] = vectors
X_ft.shape

(100500,)

In [16]:
# split back into two
X1_ft = X_ft[:len(X_train)]
X2_ft = X_ft[len(X_train):]

In [None]:
#del X_ft

##### Test set

In [17]:
# Same construction as for the training split: fill a 1-D object array
# element by element, because the per-question matrices are ragged and
# np.array() refuses to infer such a layout on recent NumPy releases.
ft_test_vectors = list(get_ft_vectors(model, 'test'))
X_ft_test = np.empty(len(ft_test_vectors), dtype=object)
for i, vectors in enumerate(ft_test_vectors):
    X_ft_test[i] = vectors

In [18]:
X_ft_test.shape

(49500,)

In [19]:
#del model

In [20]:
# split back into two
X1_ft_test = X_ft_test[:len(X_test)]
X2_ft_test = X_ft_test[len(X_test):]

In [None]:
#del X_ft_test

### Pairwise Metrics

In [21]:
def get_q_lengths(X):
    """Lazily yield ``len(q)`` for every element ``q`` of ``X``, in order."""
    yield from map(len, X)

In [22]:
X1_ft.shape

(50250,)

In [23]:
# (number of vectors in question1, number of vectors in question2) per training pair
q_meta_train = list(zip(get_q_lengths(X1_ft), get_q_lengths(X2_ft)))

In [24]:
# Stack the vectors of every non-empty question into one 2-D matrix
# (one row per token vector), keeping the order of X_ft.
X_train_300 = np.vstack([question_vectors for question_vectors in X_ft if question_vectors.size > 0])

In [25]:
X_train_300.shape

(482518, 300)

In [None]:
#del X1_ft, X2_ft

In [None]:
#import sys
## These are the usual ipython objects, including this one you are creating
#ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
#sorted([(x, sys.getsizeof(globals().get(x))) 
#        for x in dir() if not x.startswith('_') 
#        and x not in sys.modules and x not in ipython_vars], 
#       key=lambda x: x[1], reverse=True)

In [None]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_train.mtx', X_train_300 )

In [None]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_train.mtx', source='wor2vec_300_train.mtx')

In [None]:
#del X_train_300

In [26]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_train.mtx', filepath='embed_train.mtx')

<minio.definitions.Object at 0x7f7ed99c3860>

In [27]:
from scipy.io import mmread
X_rd = mmread('embed_train.mtx')

In [28]:
X_rd.shape

(482518, 3)

In [29]:
len(q_meta_train)

50250

In [30]:
# rebuild X1_rd and X2_rd
# q_meta_train holds one (len_q1, len_q2) pair of row counts per training
# pair. The rows of X_rd are consumed as: all question1 blocks first, then
# all question2 blocks.
# NOTE(review): this assumes embed_train.mtx keeps the row order in which
# X_train_300 was written — confirm against the job that produces it.
X1_list = []
X2_list = []
q1_ptr = 0
for len_q1, _ in q_meta_train:
    q1 = np.array(X_rd[q1_ptr:q1_ptr + len_q1])
    X1_list.append(q1)
    q1_ptr += len_q1
# the question2 rows start right after the last question1 row
q2_ptr = q1_ptr
for _, len_q2 in q_meta_train:
    q2 = np.array(X_rd[q2_ptr:q2_ptr + len_q2])
    X2_list.append(q2)
    q2_ptr += len_q2
# The blocks have different row counts, so build the 1-D object arrays
# explicitly; np.array() on a ragged list raises ValueError on recent NumPy.
X1_rd = np.empty(len(X1_list), dtype=object)
X2_rd = np.empty(len(X2_list), dtype=object)
for i, (q1, q2) in enumerate(zip(X1_list, X2_list)):
    X1_rd[i] = q1
    X2_rd[i] = q2

In [None]:
#del X1_list, X2_list, X_rd, X1_rd_tmp, X2_rd_tmp

In [31]:
import dask.dataframe as dd
from dask import delayed, compute
from dask.distributed import Client
from utils import dask
client = dask.create_dask_client(num_workers=8)

In [32]:
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from scipy.spatial.distance import cdist, directed_hausdorff
from scipy.stats import wasserstein_distance
from MDAnalysis.analysis.psa import hausdorff, hausdorff_wavg, hausdorff_avg, discrete_frechet
def compute_pairwise_kernel(pc1, pc2, method='linear'):
    """Return the pairwise kernel matrix between two point clouds.

    ``method`` selects the scikit-learn kernel: ``'polynomial'`` (called with
    2 as its third positional argument), ``'rbf'``, ``'sigmoid'`` or
    ``'laplacian'``. Any other value falls back to the linear kernel.
    """
    kernels = {
        'polynomial': lambda a, b: polynomial_kernel(a, b, 2),
        'rbf': rbf_kernel,
        'sigmoid': sigmoid_kernel,
        'laplacian': laplacian_kernel,
    }
    kernel = kernels.get(method, linear_kernel)
    return kernel(pc1, pc2)
    
def compute_pairwise_dist(pc1, pc2, method='euclidean'):
    """Summarise all point-to-point distances between two point clouds.

    Parameters
    ----------
    pc1, pc2
        Arrays passed to ``scipy.spatial.distance.cdist``; their ``.size``
        is read, so numpy arrays are expected.
    method : str
        Forwarded to ``cdist`` as ``metric``.

    Returns
    -------
    tuple
        ``(mean, min, max)`` of the distance matrix, or three NaNs when
        either input is empty.
    """
    if 0 in (pc1.size, pc2.size):
        return (np.nan, np.nan, np.nan)
    distances = cdist(pc1, pc2, metric=method)
    return (distances.mean(), distances.min(), distances.max())

def compute_pairwise_metric(pc1, pc2, method='hausdorff'):
    """Compute one set-to-set distance between two point clouds.

    Parameters
    ----------
    pc1, pc2
        Point clouds; their ``.size`` is read, so numpy arrays are expected.
    method : str
        ``'hausdorff'`` uses ``scipy.spatial.distance.directed_hausdorff``
        (distance component only). ``'mda_hausdorff'``,
        ``'mda_hausdorff_wavg'``, ``'mda_hausdorff_avg'`` and
        ``'discrete_frechet'`` use the functions of the same (un-prefixed)
        name from ``MDAnalysis.analysis.psa``.

    Returns
    -------
    The selected distance, or ``np.nan`` when either cloud is empty.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above.
    """
    if pc1.size == 0 or pc2.size == 0:
        return np.nan
    if method == 'hausdorff':
        return directed_hausdorff(pc1, pc2)[0]

    # method name -> attribute of MDAnalysis.analysis.psa
    mda_function_names = {
        'mda_hausdorff': 'hausdorff',
        'mda_hausdorff_wavg': 'hausdorff_wavg',
        'mda_hausdorff_avg': 'hausdorff_avg',
        'discrete_frechet': 'discrete_frechet',
    }
    if method not in mda_function_names:
        raise ValueError('unknown pairwise metric: {!r}'.format(method))

    # Resolve the functions through the module instead of the notebook's
    # global names: later cells rebind `hausdorff` and `discrete_frechet` to
    # result lists, which would make a global lookup call a list.
    from MDAnalysis.analysis import psa
    return getattr(psa, mda_function_names[method])(pc1, pc2)
        
def assign_pwmetric(df, method='euclidean'):
    """Apply ``compute_pairwise_dist`` to every row of ``df``.

    ``method`` is bound inside the per-row callable rather than handed to
    ``DataFrame.apply`` positionally, where it would be taken as the ``axis``
    argument and clash with ``axis=1``.

    NOTE(review): assumes the first two columns of ``df`` hold the two point
    clouds of each pair; the expected schema is not visible in this notebook
    and nothing here calls this function — confirm before relying on it.

    Returns whatever ``DataFrame.apply(..., axis=1)`` builds from the per-row
    ``(mean, min, max)`` tuples.
    """
    return df.apply(
        lambda row: compute_pairwise_dist(row.iloc[0], row.iloc[1], method),
        axis=1,
    )

In [None]:
#import numpy as np
#compute_pairwise_metric(np.array([[1,0,0],[1,0,1],[1,1,0]]), np.array([[2,2,2],[1,0,0]]), method='hausdorff')

In [33]:
# One list of dask Delayed tasks per metric; entry i belongs to training pair i.
# NOTE: `hausdorff` and `discrete_frechet` rebind names that were imported
# from MDAnalysis.analysis.psa in the previous cell.
jaccard = []
chebyshev = []
braycurtis = []
cosine = []
correlation = []
hamming = []
canberra = []
hausdorff = []
mda_hausdorff = []
mda_hausdorff_wavg = []
mda_hausdorff_avg = []
discrete_frechet = []
# zip() always yields non-empty 2-tuples, so no emptiness check is made on
# the tuple itself; empty point clouds are handled inside
# compute_pairwise_dist / compute_pairwise_metric, which return NaN for them.
for q1_rd, q2_rd in zip(X1_rd, X2_rd):
    # (mean, min, max) of the point-to-point distance matrix
    jaccard.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
    chebyshev.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
    braycurtis.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
    cosine.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cosine'))
    correlation.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'correlation'))
    hamming.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hamming'))
    canberra.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'canberra'))
    # single set-to-set distances
    hausdorff.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'hausdorff'))
    mda_hausdorff.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'mda_hausdorff'))
    mda_hausdorff_wavg.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'mda_hausdorff_wavg'))
    mda_hausdorff_avg.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'mda_hausdorff_avg'))
    discrete_frechet.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'discrete_frechet'))

In [None]:
jaccard = compute(*jaccard)

In [None]:
chebyshev = compute(*chebyshev)

In [None]:
braycurtis = compute(*braycurtis)

In [None]:
cosine = compute(*cosine)

In [None]:
correlation = compute(*correlation)

In [None]:
hamming = compute(*hamming)

In [None]:
canberra = compute(*canberra)

In [None]:
hausdorff = compute(*hausdorff)

In [None]:
mda_hausdorff = compute(*mda_hausdorff)

In [None]:
mda_hausdorff_wavg = compute(*mda_hausdorff_wavg)

In [None]:
mda_hausdorff_avg = compute(*mda_hausdorff_avg)

In [None]:
discrete_frechet = compute(*discrete_frechet)

In [None]:
#yule = compute(*yule)

In [None]:
#dice = compute(*dice)

In [None]:
#kulsinski = compute(*kulsinski)

In [None]:
#rogerstanimoto = compute(*rogerstanimoto)

In [None]:
#russellrao = compute(*russellrao)

In [None]:
#sokalmichener = compute(*sokalmichener)

In [None]:
len(braycurtis)

#### add above metrics to X_train

In [None]:
# (column prefix, per-pair (mean, min, max) tuples), in output column order
train_dist_features = [
    ('jaccard', jaccard),
    ('chebyshev', chebyshev),
    ('braycurtis', braycurtis),
    ('cosine', cosine),
    ('correlation', correlation),
    ('hamming', hamming),
    ('canberra', canberra),
]
# (column name, per-pair scalar values), in output column order
train_scalar_features = [
    ('hausdorff', hausdorff),
    ('mda_hausdorff', mda_hausdorff),
    ('mda_hausdorff_wavg', mda_hausdorff_wavg),
    ('mda_hausdorff_avg', mda_hausdorff_avg),
    ('discrete_frechet', discrete_frechet),
]

train_feature_columns = []
for prefix, triples in train_dist_features:
    # one column per statistic: <prefix>_mean, <prefix>_min, <prefix>_max
    for position, stat in enumerate(('mean', 'min', 'max')):
        train_feature_columns.append(
            pd.Series([triple[position] for triple in triples],
                      name='{}_{}'.format(prefix, stat),
                      index=X_train.index))
for column, values in train_scalar_features:
    train_feature_columns.append(pd.Series(values, name=column, index=X_train.index))

# results are positional, so every series is labelled with X_train's index
X_train = pd.concat([X_train] + train_feature_columns, axis=1)
X_train.head()

In [None]:
X_train[X_train.isnull().any(axis=1)]

##### Test set

In [None]:
# (number of vectors in question1, number of vectors in question2) per test pair
q_meta_test = list(zip(get_q_lengths(X1_ft_test), get_q_lengths(X2_ft_test)))

In [None]:
# Stack the vectors of every non-empty test question into one 2-D matrix
# (one row per token vector), keeping the order of X_ft_test.
X_test_300 = np.vstack([question_vectors for question_vectors in X_ft_test if question_vectors.size > 0])

In [None]:
X_test_300.shape

In [None]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_test.mtx', X_test_300 )

In [None]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_test.mtx', source='wor2vec_300_test.mtx')

In [None]:
del X_test_300

In [None]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_test.mtx', filepath='embed_test.mtx')

In [None]:
from scipy.io import mmread
X_rd_test = mmread('embed_test.mtx')

In [None]:
X_rd_test.shape

In [None]:
# rebuild X1_rd_test and X2_rd_test
# q_meta_test holds one (len_q1, len_q2) pair of row counts per test pair.
# The rows of X_rd_test are consumed as: all question1 blocks first, then
# all question2 blocks.
# NOTE(review): this assumes embed_test.mtx keeps the row order in which
# X_test_300 was written — confirm against the job that produces it.
X1_list = []
X2_list = []
q1_ptr = 0
for len_q1, _ in q_meta_test:
    q1 = np.array(X_rd_test[q1_ptr:q1_ptr + len_q1])
    X1_list.append(q1)
    q1_ptr += len_q1
# the question2 rows start right after the last question1 row
q2_ptr = q1_ptr
for _, len_q2 in q_meta_test:
    q2 = np.array(X_rd_test[q2_ptr:q2_ptr + len_q2])
    X2_list.append(q2)
    q2_ptr += len_q2
# The blocks have different row counts, so build the 1-D object arrays
# explicitly; np.array() on a ragged list raises ValueError on recent NumPy.
X1_rd_test = np.empty(len(X1_list), dtype=object)
X2_rd_test = np.empty(len(X2_list), dtype=object)
for i, (q1, q2) in enumerate(zip(X1_list, X2_list)):
    X1_rd_test[i] = q1
    X2_rd_test[i] = q2

In [None]:
X1_rd_test.shape

In [None]:
# Free the intermediates of the test-set rebuild. Only names that are
# actually assigned in this notebook are deleted; deleting an undefined name
# raises NameError.
del X1_list, X2_list, X_rd_test

In [None]:
# One list of dask Delayed tasks per metric; entry i belongs to test pair i.
jaccard_test = []
chebyshev_test = []
braycurtis_test = []
cosine_test = []
correlation_test = []
hamming_test = []
canberra_test = []
hausdorff_test = []
mda_hausdorff_test = []
mda_hausdorff_wavg_test = []
mda_hausdorff_avg_test = []
discrete_frechet_test = []
# zip() always yields non-empty 2-tuples, so no emptiness check is made on
# the tuple itself; empty point clouds are handled inside
# compute_pairwise_dist / compute_pairwise_metric, which return NaN for them.
for q1_rd, q2_rd in zip(X1_rd_test, X2_rd_test):
    # (mean, min, max) of the point-to-point distance matrix
    jaccard_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
    chebyshev_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
    braycurtis_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
    cosine_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cosine'))
    correlation_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'correlation'))
    hamming_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hamming'))
    canberra_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'canberra'))
    # Set-to-set distances go through compute_pairwise_metric, exactly as for
    # the training split: these names are not cdist metrics, so routing them
    # to compute_pairwise_dist fails inside cdist.
    hausdorff_test.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'hausdorff'))
    mda_hausdorff_test.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'mda_hausdorff'))
    mda_hausdorff_wavg_test.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'mda_hausdorff_wavg'))
    mda_hausdorff_avg_test.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'mda_hausdorff_avg'))
    discrete_frechet_test.append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, 'discrete_frechet'))

In [None]:
jaccard_test = compute(*jaccard_test)

In [None]:
chebyshev_test = compute(*chebyshev_test)

In [None]:
braycurtis_test = compute(*braycurtis_test)

In [None]:
cosine_test = compute(*cosine_test)

In [None]:
correlation_test = compute(*correlation_test)

In [None]:
hamming_test = compute(*hamming_test)

In [None]:
canberra_test = compute(*canberra_test)

In [None]:
hausdorff_test = compute(*hausdorff_test)

In [None]:
mda_hausdorff_test = compute(*mda_hausdorff_test)

In [None]:
mda_hausdorff_wavg_test = compute(*mda_hausdorff_wavg_test)

In [None]:
mda_hausdorff_avg_test = compute(*mda_hausdorff_avg_test)

In [None]:
discrete_frechet_test = compute(*discrete_frechet_test)

In [None]:
#mahalanobis_test = compute(*mahalanobis_test)

In [None]:
#yule_test = compute(*yule_test)

In [None]:
#dice_test = compute(*dice_test)

In [None]:
#kulsinski_test = compute(*kulsinski_test)

In [None]:
#rogerstanimoto_test = compute(*rogerstanimoto_test)

In [None]:
#russellrao_test = compute(*russellrao_test)

In [None]:
#sokalmichener_test = compute(*sokalmichener_test)

In [None]:
# Add the pairwise metrics to X_test using the same column layout as the
# training features: three columns (<prefix>_mean/_min/_max) per cdist
# metric and one column per set-to-set metric. Every series is labelled with
# X_test's own index, since the results are positional per test pair.

# (column prefix, per-pair (mean, min, max) tuples), in output column order
test_dist_features = [
    ('jaccard', jaccard_test),
    ('chebyshev', chebyshev_test),
    ('braycurtis', braycurtis_test),
    ('cosine', cosine_test),
    ('correlation', correlation_test),
    ('hamming', hamming_test),
    ('canberra', canberra_test),
]
# (column name, per-pair scalar values), in output column order
test_scalar_features = [
    ('hausdorff', hausdorff_test),
    ('mda_hausdorff', mda_hausdorff_test),
    ('mda_hausdorff_wavg', mda_hausdorff_wavg_test),
    ('mda_hausdorff_avg', mda_hausdorff_avg_test),
    ('discrete_frechet', discrete_frechet_test),
]

test_feature_columns = []
for prefix, triples in test_dist_features:
    for position, stat in enumerate(('mean', 'min', 'max')):
        test_feature_columns.append(
            pd.Series([triple[position] for triple in triples],
                      name='{}_{}'.format(prefix, stat),
                      index=X_test.index))
for column, values in test_scalar_features:
    test_feature_columns.append(pd.Series(values, name=column, index=X_test.index))

X_test = pd.concat([X_test] + test_feature_columns, axis=1)
X_test.head()

In [None]:
X_test[X_test.isnull().any(axis=1)]

### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def pass_through(x):
    """Identity function: return the argument unchanged."""
    return x
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tokens('train'))

In [None]:
X_trfmd

In [None]:
# dimension reduction using SVD
from sklearn.decomposition import TruncatedSVD
import time
start = time.time()
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
X_svd.shape

In [None]:
# split back into two
X1 = X_svd[:len(X_train), :]
X2 = X_svd[len(X_train):, :]

##### Test set

In [None]:
X_test_trfmd = tfidf.transform(get_tokens('test'))

In [None]:
X_test_trfmd

In [None]:
# dimension reduction using SVD
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
# split back into two
X1_test = X_test_svd[:len(X_test), :]
X2_test = X_test_svd[len(X_test):, :]

In [None]:
# build complete feature dataframe
X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
X_test_temp.head()

### Fuzzy-wuzzy

In [None]:
# difference in text size
def compute_size_diff(row):
    """Return the absolute difference in length between the string forms of
    ``row['question1']`` and ``row['question2']``."""
    q1_text = str(row['question1'])
    q2_text = str(row['question2'])
    return abs(len(q1_text) - len(q2_text))
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
X_train.head()

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# ratio
def compute_ratio(row):
    """Return ``fuzz.ratio`` of the string forms of the row's two questions."""
    q1_text, q2_text = str(row['question1']), str(row['question2'])
    return fuzz.ratio(q1_text, q2_text)
X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
X_train.head()

In [None]:
# partial ratio
def compute_partial_ratio(row):
    """Return ``fuzz.partial_ratio`` of the string forms of the row's two questions."""
    q1_text, q2_text = str(row['question1']), str(row['question2'])
    return fuzz.partial_ratio(q1_text, q2_text)
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
X_train.head()

In [None]:
# token_sort_ratio
def compute_token_sort_ratio(row):
    """Return ``fuzz.token_sort_ratio`` of the string forms of the row's two questions."""
    q1_text, q2_text = str(row['question1']), str(row['question2'])
    return fuzz.token_sort_ratio(q1_text, q2_text)
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
X_train.head()

In [None]:
# token_set_ratio
def compute_token_set_ratio(row):
    """Return ``fuzz.token_set_ratio`` of the string forms of the row's two questions."""
    q1_text, q2_text = str(row['question1']), str(row['question2'])
    return fuzz.token_set_ratio(q1_text, q2_text)
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
X_train.head()

In [None]:
# build complete feature dataframe
#X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
#                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
#X_train_temp.head()

In [None]:
#X_train = pd.concat([X_train_temp, X_train], axis=1)
#del X_train_temp
X_train_final = X_train.drop(columns=['qid1', 'qid2','question1','question2']).dropna()
X_train_final.info()

In [None]:
X_train_final.tail(20)

##### Test set

In [None]:
# difference in text size
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
X_test.head()

In [None]:
# ratio
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
X_test.head()

In [None]:
# partial ratio
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
X_test.head()

In [None]:
# token_sort_ratio
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
X_test.head()

In [None]:
# token_set_ratio
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
X_test.head()

In [None]:
X_test_final = X_test.drop(columns=['question1','question2', 'qid1', 'qid2']).dropna()
X_test_final.info()

# Modeling

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
logr_model = LogisticRegression(random_state=42)
# search space: inverse regularisation strength and stopping tolerance
param_grid = {'C': np.logspace(-2, 7, 10),
              'tol': np.logspace(-5, -1, 5)}
# random_state fixes which parameter combinations are sampled, so that the
# search (and therefore best_params_) is reproducible across runs
logr_cv = RandomizedSearchCV(logr_model,
                             param_distributions=param_grid,
                             cv=5,
                             n_jobs=-1,
                             random_state=42)
# X_train_final was built with dropna(); keep only the labels of the rows
# that survived so features and labels stay aligned
y_train_final = y_train.loc[X_train_final.index]
logr_cv.fit(X_train_final, y_train_final)

In [None]:
logr_cv.best_params_

In [None]:
# refit a standalone model with the best C and tol found by the search
best_params = logr_cv.best_params_
logr_model = LogisticRegression(C=best_params['C'],
                                tol=best_params['tol'],
                                random_state=42,
                                n_jobs=-1)
logr_model.fit(X_train_final, y_train_final)

In [None]:
logr_pred = logr_model.predict(X_test_final)
y_test_final = y_test.loc[X_test_final.index]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# score the test-set predictions against the aligned labels
logr_acc_score = accuracy_score(y_test_final, logr_pred)
logr_prec_score = precision_score(y_test_final, logr_pred)
logr_rec_score = recall_score(y_test_final, logr_pred)

print('Logistic Regression')
for template, score in (('accuracy score : {}', logr_acc_score),
                        ('precision score : {}', logr_prec_score),
                        ('recall score : {}', logr_rec_score)):
    print(template.format(score))