In [1]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from toolz import partition_all

In [4]:
INPUT_BUCKET = 'dq-data'
HASH_BUCKET = 'dq-hashed'

In [299]:
#load train_set
data = 'train.csv'
filestream = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
dtypes = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
df = pd.read_csv(#urlpath=s3_in_url, 
                                     #storage_options=s3_options,
                                     filestream,
                                     header=0, 
                                     usecols=dtypes.keys(), 
                                     #names=dtypes.keys(),
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     encoding='utf-8')
df = df.set_index('id')
df = df.dropna()

In [300]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404287 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404287 non-null int64
qid2            404287 non-null int64
question1       404287 non-null object
question2       404287 non-null object
is_duplicate    404287 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


### Train-test split

In [301]:
from sklearn.model_selection import train_test_split
#shrink df to 150,000 records
df = df.iloc[:75000]

X = df.drop(columns=['is_duplicate'])

y = df['is_duplicate']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [302]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50250 entries, 71916 to 15795
Data columns (total 4 columns):
qid1         50250 non-null int64
qid2         50250 non-null int64
question1    50250 non-null object
question2    50250 non-null object
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


In [270]:
del X,y,df

# Feature Extraction

### Tokenizing and preprocessing

In [10]:
from gensim.parsing.preprocessing import preprocess_string
def get_tokens(process='train'):
    if process=='test':
        X = X_test
    else:
        X = X_train
    series = pd.Series(pd.concat([X['question1'], X['question2']]),dtype=str)
    series.dropna()
    for question in series:
        yield preprocess_string(question)

### Word2Vec (fasttext)

In [11]:
ps.get_file(bucket=INPUT_BUCKET, filename='cc.en.300.bin.gz', filepath='/tmp/cc.en.300.bin.gz')

<minio.definitions.Object at 0x7fd0ccbd6eb8>

In [12]:
import gzip
import shutil
with gzip.open('/tmp/cc.en.300.bin.gz', 'rb') as f_in:
    with open('/tmp/cc.en.300.bin', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)  

In [303]:
import os
os.remove('/tmp/cc.en.300.bin.gz')
from gensim.models import FastText
model = FastText.load_fasttext_format('/tmp/cc.en.300.bin')

In [14]:
def get_ft_vectors(model, process):
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vector = model.wv[token]
            except:
                continue
            vectors.append(vector)
        yield np.array(vectors)

In [304]:
X_ft = np.array([vectors for vectors in get_ft_vectors(model,'train')])
X_ft.shape

(100500,)

In [305]:
# split back into two
X1_ft = X_ft[:len(X_train)]
X2_ft = X_ft[len(X_train):]

In [274]:
del X_ft

In [18]:
#X_train = pd.concat([X_train, pd.Series(X1_ft, name='q1_ft',index=X_train.index), pd.Series(X2_ft, name='q2_ft',index=X_train.index)], axis=1)
#X_train.head()

##### Test set

In [306]:
X_ft_test = np.array([vectors for vectors in get_ft_vectors(model,'test')])

In [307]:
X_ft_test.shape

(49500,)

In [277]:
del model

In [308]:
# split back into two
X1_ft_test = X_ft_test[:len(X_test)]
X2_ft_test = X_ft_test[len(X_test):]

In [279]:
del X_ft_test

### Pairwise Metrics

In [309]:
def get_q_lengths(X):
    #q_meta = []
    for q in X:
        #q_meta.append(len(q))
        yield len(q)
    #return q_meta

In [310]:
X1_ft.shape

(50250,)

In [27]:
def split_arrays(X):
    for y in (x for x in X if x.size>0):
        yield np.vsplit(y,len(y))[0]

In [311]:
q_meta_train = [(q1_len, q2_len) for q1_len, q2_len in zip(get_q_lengths(X1_ft), get_q_lengths(X2_ft))]

In [353]:
X_train_300 = np.concatenate( 
    np.vstack( [np.array(np.vsplit(y, y.shape[0])) for y in (x for x in X_ft if x.size>0)] )
)

In [354]:
X_train_300.shape

(482518, 300)

In [284]:
del X1_ft, X2_ft

In [32]:
#import sys
## These are the usual ipython objects, including this one you are creating
#ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
#sorted([(x, sys.getsizeof(globals().get(x))) 
#        for x in dir() if not x.startswith('_') 
#        and x not in sys.modules and x not in ipython_vars], 
#       key=lambda x: x[1], reverse=True)

In [355]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_train.mtx', X_train_300 )

In [356]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_train.mtx', source='wor2vec_300_train.mtx')

pushed file wor2vec_300_train.mtx from wor2vec_300_train.mtx to minio bucket dq-data


True

In [287]:
del X_train_300

In [317]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_train.mtx', filepath='embed_train.mtx')

<minio.definitions.Object at 0x7fcff1489908>

In [318]:
from scipy.io import mmread
X_rd = mmread('embed_train.mtx')

In [319]:
X_rd.shape

(100367, 3)

In [338]:
len(q_meta_train)

50250

In [328]:
X_rd_halflen = int(X_rd.shape[0]/2)
X_rd[-20:]

array([[-3.74598133e-03,  7.14532917e-04, -4.45617058e-03],
       [ 6.53278401e-04, -1.30925216e-03, -5.23273672e-03],
       [-7.19155826e-03,  3.12010063e-03, -8.30275541e-03],
       [-8.57720872e-04, -7.38833537e-04,  7.66427598e-04],
       [-2.84270654e-03,  3.30637048e-03,  2.72318815e-03],
       [-9.78708603e-04, -2.60696575e-03,  2.63144678e-03],
       [-4.07846945e-03,  4.06954832e-03,  1.58547115e-04],
       [-7.19222671e-03,  3.12021587e-03, -8.30256797e-03],
       [-3.54968165e-03,  3.24023707e-03, -2.03453508e-03],
       [-1.20300859e-03,  1.95971352e-03, -1.66474326e-03],
       [-8.20778779e-04, -6.62160825e-04,  4.15418822e-03],
       [-7.19235245e-03,  3.11763306e-03, -8.30384669e-03],
       [-5.47330631e-03,  5.72216858e-04, -4.91344930e-04],
       [ 1.14653108e-03, -3.84291051e-04,  1.01223989e-03],
       [-1.42622266e-03,  3.72392377e-03, -4.76971979e-04],
       [-1.23269583e-03,  3.26893889e-03, -4.80229418e-03],
       [ 1.38685421e-03, -3.23103815e-03

In [330]:
# rebuild X1_rd and X2_rd
X1_list = []
X2_list = []
#q_halflen = int(len(q_meta_train)/2)
#q1_meta = q_meta_train[:q_halflen]
#q2_meta = q_meta_train[q_halflen:]
q1_ptr = 0
for len_q1, _ in q_meta_train:
    q1 = np.array(X_rd[q1_ptr:q1_ptr+len_q1])
    #q2 = X2_rd_tmp[:len_q2]
    X1_list.append(q1)
    #X2_list.append(q2)
    #X1_rd_tmp = X1_rd_tmp[len_q1:]
    #X2_rd_tmp = X2_rd_tmp[len_q2:]
    q1_ptr = q1_ptr+len_q1
q2_ptr = q1_ptr
for _, len_q2 in q_meta_train:
    #q1 = X1_rd_tmp[q1_ptr:q1_ptr+len_q1]
    q2 = np.array(X_rd[q2_ptr:q2_ptr+len_q2])
    #X1_list.append(q1)
    X2_list.append(q2)
    #X1_rd_tmp = X1_rd_tmp[len_q1:]
    #X2_rd_tmp = X2_rd_tmp[len_q2:]
    q2_ptr = q2_ptr+len_q2
X1_rd = np.array(X1_list)
X2_rd = np.array(X2_list)

In [213]:
del X1_list, X2_list, X_rd, X1_rd_tmp, X2_rd_tmp

In [42]:
import dask.dataframe as dd
from dask import delayed, compute
from dask.distributed import Client
from utils import dask
client = dask.create_dask_client(num_workers=8)

In [214]:
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from scipy.spatial.distance import cdist, directed_hausdorff
def compute_pairwise_kernel(pc1, pc2, method='linear'):
    if method=='polynomial':
        return polynomial_kernel(pc1, pc2, 2)
    elif method=='rbf':
        return rbf_kernel(pc1, pc2)
    elif method=='sigmoid':
        return sigmoid_kernel(pc1, pc2)
    elif method=='laplacian':
        return laplacian_kernel(pc1, pc2)
    else:
        return linear_kernel(pc1, pc2)
    
def compute_pairwise_dist(pc1, pc2, method='euclidean'):
    if pc1.size == 0:
        return np.nan
    if pc2.size == 0:
        return np.nan
    if method == 'hausdorff':
        return directed_hausdorff(pc1, pc2)[0]
    return np.mean(cdist(pc1, pc2, metric=method))
        
def assign_pwmetric(df, method='euclidean'):
    #return compute_pairwise_kernel(pc1_embd, pc2_embd, method=method)
    return df.apply(compute_pairwise_dist, method, axis=1)

In [326]:
q_meta_train[-20:]

[(3, 6),
 (4, 7),
 (5, 4),
 (7, 7),
 (5, 7),
 (4, 2),
 (4, 5),
 (3, 3),
 (8, 8),
 (6, 6),
 (5, 6),
 (4, 4),
 (3, 4),
 (3, 4),
 (10, 19),
 (4, 5),
 (8, 6),
 (4, 4),
 (4, 5),
 (4, 6)]

In [337]:
X1_rd

array([array([[-0.00171841,  0.00040429, -0.00041477],
       [-0.00106731, -0.00099031,  0.00050008],
       [-0.00389859,  0.00471625,  0.00320875]]),
       array([[-0.00303273,  0.0012433 , -0.00013369],
       [-0.00142547, -0.00013464, -0.00248457]]),
       array([[-4.42830005e-03,  6.54272156e-03, -1.13187236e-03],
       [-5.63124560e-03,  5.44332657e-03, -3.40553598e-03],
       [-6.12664820e-03,  1.96141408e-03, -2.55055704e-03],
       [ 6.50481743e-04,  3.54164642e-03,  3.86373919e-05]]),
       ..., array([], shape=(0, 3), dtype=float64),
       array([], shape=(0, 3), dtype=float64),
       array([], shape=(0, 3), dtype=float64)], dtype=object)

In [215]:
jaccard = []
chebyshev = []
braycurtis = []
cosine = []
correlation = []
hamming = []
canberra = []
hausdorff = []
#mahalanobis = []
#yule = []
#dice = []
#kulsinski = []
#rogerstanimoto = []
#russellrao = []
#sokalmichener = []
for q_tuple in zip(X1_rd, X2_rd):
    if q_tuple:
        q1_rd, q2_rd = q_tuple
        jaccard.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
        chebyshev.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
        braycurtis.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
        cosine.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cosine'))
        correlation.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'correlation'))
        hamming.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hamming'))
        canberra.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'canberra'))
        hausdorff.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hausdorff'))
        #mahalanobis.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'mahalanobis'))
        #yule.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'yule'))
        #dice.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'dice'))
        #kulsinski.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'kulsinski'))
        #rogerstanimoto.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'rogerstanimoto'))
        #russellrao.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'russellrao'))
        #sokalmichener.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sokalmichener'))
    else:
        jaccard.append(delayed(np.nan))
        chebyshev.append(delayed(np.nan))
        braycurtis.append(delayed(np.nan)) 
        cosine.append(delayed(np.nan))
        correlation.append(delayed(np.nan))
        hamming.append(delayed(np.nan)) 
        canberra.append(delayed(np.nan))
        hausdorff.append(delayed(np.nan))
        #mahalanobis.append(delayed([]))
        #yule.append(delayed([])) 
        #dice.append(delayed([]))
        #kulsinski.append(delayed([]))
        #rogerstanimoto.append(delayed([])) 
        #russellrao.append(delayed([]))
        #sokalmichener.append(delayed([])) 

In [216]:
jaccard = compute(*jaccard)

In [217]:
chebyshev = compute(*chebyshev)

In [218]:
braycurtis = compute(*braycurtis)

In [219]:
cosine = compute(*cosine)

In [220]:
correlation = compute(*correlation)

In [221]:
hamming = compute(*hamming)

In [222]:
canberra = compute(*canberra)

In [223]:
hausdorff = compute(*hausdorff)

In [140]:
#yule = compute(*yule)

In [141]:
#dice = compute(*dice)

In [142]:
#kulsinski = compute(*kulsinski)

In [143]:
#rogerstanimoto = compute(*rogerstanimoto)

In [144]:
#russellrao = compute(*russellrao)



In [145]:
#sokalmichener = compute(*sokalmichener)



In [224]:
len(braycurtis)

50250

#### add above metrics to X_train

In [225]:
X_train = pd.concat([X_train,
                     pd.Series(jaccard, name='jaccard',index=X_train.index), 
                     pd.Series(chebyshev, name='chebyshev',index=X_train.index), 
                     pd.Series(braycurtis, name='braycurtis',index=X_train.index), 
                     pd.Series(cosine, name='cosine',index=X_train.index), 
                     pd.Series(correlation, name='correlation',index=X_train.index), 
                     pd.Series(hamming, name='hamming',index=X_train.index), 
                     pd.Series(canberra, name='canberra',index=X_train.index) , 
                     pd.Series(hausdorff, name='hausdorff',index=X_train.index)
                     #pd.Series(yule, name='yule',index=X_train.index), 
                     #pd.Series(dice, name='dice',index=X_train.index), 
                     #pd.Series(kulsinski, name='kulsinski',index=X_train.index), 
                     #pd.Series(rogerstanimoto, name='rogerstanimoto',index=X_train.index), 
                     #pd.Series(russellrao, name='russellrao',index=X_train.index), 
                     #pd.Series(sokalmichener, name='sokalmichener',index=X_train.index)
                    ], axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,hausdorff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,1.0,0.004183,0.749184,0.577846,0.849309,1.0,1.386667,0.00732
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,1.0,0.004974,1.196143,1.049221,1.028845,1.0,2.483181,0.005846
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,1.0,0.00613,0.941305,0.914124,0.885923,1.0,2.03401,0.006177
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,1.0,0.006051,1.053847,1.055724,1.013987,1.0,2.433737,0.005185
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,1.0,0.003852,0.963405,0.849872,0.723745,1.0,2.430036,0.005125


##### Test set

In [357]:
q_meta_test = [(q1_len, q2_len) for q1_len, q2_len in zip(get_q_lengths(X1_ft_test), get_q_lengths(X2_ft_test))]

In [358]:
X_test_300 = np.concatenate( 
    np.vstack( [np.array(np.vsplit(y, y.shape[0])) for y in (x for x in X_ft_test if x.size>0)] )
)

In [None]:
X_test_300.shape

In [359]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_test.mtx', X_test_300 )

In [360]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_test.mtx', source='wor2vec_300_test.mtx')

pushed file wor2vec_300_test.mtx from wor2vec_300_test.mtx to minio bucket dq-data


True

In [187]:
del X_test_300

In [71]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_test.mtx', filepath='embed_test.mtx')

<minio.definitions.Object at 0x7fd07388fc18>

In [226]:
from scipy.io import mmread
X_rd_test = mmread('embed_test.mtx')

In [227]:
X_rd_test.shape

(49454, 3)

In [228]:
# rebuild X1_rd and X2_rd
X1_list = []
X2_list = []
q_halflen = int(len(q_meta_test)/2)
q1_meta = q_meta_test[:q_halflen]
q2_meta = q_meta_test[q_halflen:]
X_rd_halflen = int(X_rd_test.shape[0]/2)
X1_rd_tmp = X_rd_test[:X_rd_halflen]
X2_rd_tmp = X_rd_test[X_rd_halflen:]
for len_q1, len_q2 in zip(q1_meta, q2_meta):
    q1 = X1_rd_tmp[:len_q1]
    q2 = X2_rd_tmp[:len_q2]
    X1_list.append(q1)
    X2_list.append(q2)
    X1_rd_tmp = X1_rd_tmp[len_q1:]
    X2_rd_tmp = X2_rd_tmp[len_q2:]
X1_rd_test = np.array(X1_list)
X2_rd_test = np.array(X2_list)

In [229]:
X1_rd_test.shape

(24750,)

In [230]:
del X1_list, X2_list, q1_meta, q2_meta, X_rd_test, X1_rd_tmp, X2_rd_tmp

In [231]:
jaccard_test = []
chebyshev_test = []
braycurtis_test = []
cosine_test = []
correlation_test = []
hamming_test = []
canberra_test = []
hausdorff_test = []
#mahalanobis_test = []
#yule_test = []
#dice_test = []
#kulsinski_test = []
#rogerstanimoto_test = []
#russellrao_test = []
#sokalmichener_test = []
for q_tuple in zip(X1_rd_test, X2_rd_test):
    if q_tuple:
        q1_rd, q2_rd = q_tuple
        jaccard_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
        chebyshev_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
        braycurtis_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
        cosine_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cosine'))
        correlation_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'correlation'))
        hamming_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hamming'))
        canberra_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'canberra'))
        hausdorff_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hausdorff'))
        #mahalanobis_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'mahalanobis'))
        #yule_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'yule'))
        #dice_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'dice'))
        #kulsinski_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'kulsinski'))
        #rogerstanimoto_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'rogerstanimoto'))
        #russellrao_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'russellrao'))
        #sokalmichener_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sokalmichener'))
    else:
        jaccard_test.append(delayed(np.nan))
        chebyshev_test.append(delayed(np.nan))
        braycurtis_test.append(delayed(np.nan)) 
        cosine_test.append(delayed(np.nan))
        correlation_test.append(delayed(np.nan))
        hamming_test.append(delayed(np.nan)) 
        canberra_test.append(delayed(np.nan))
        hausdorff_test.append(delayed(np.nan))
        #mahalanobis_test.append(delayed([]))
        #yule_test.append(delayed([])) 
        #dice_test.append(delayed([]))
        #kulsinski_test.append(delayed([]))
        #rogerstanimoto_test.append(delayed([])) 
        #russellrao_test.append(delayed([]))
        #sokalmichener_test.append(delayed([])) 

In [232]:
jaccard_test = compute(*jaccard_test)

In [233]:
chebyshev_test = compute(*chebyshev_test)

In [234]:
braycurtis_test = compute(*braycurtis_test)

In [235]:
cosine_test = compute(*cosine_test)

In [236]:
correlation_test = compute(*correlation_test)

In [237]:
hamming_test = compute(*hamming_test)

In [238]:
canberra_test = compute(*canberra_test)

In [239]:
hausdorff_test = compute(*hausdorff_test)

In [190]:
#mahalanobis_test = compute(*mahalanobis_test)

In [191]:
#yule_test = compute(*yule_test)



In [192]:
#dice_test = compute(*dice_test)



In [193]:
#kulsinski_test = compute(*kulsinski_test)



In [194]:
#rogerstanimoto_test = compute(*rogerstanimoto_test)



In [195]:
#russellrao_test = compute(*russellrao_test)



In [196]:
#sokalmichener_test = compute(*sokalmichener_test)



In [240]:
X_test = pd.concat([X_test, 
                     pd.Series(jaccard_test, name='jaccard',index=X_test.index), 
                     pd.Series(chebyshev_test, name='chebyshev',index=X_test.index), 
                     pd.Series(braycurtis_test, name='braycurtis',index=X_test.index), 
                     pd.Series(cosine_test, name='cosine',index=X_test.index), 
                     pd.Series(correlation_test, name='correlation',index=X_test.index), 
                     pd.Series(hamming_test, name='hamming',index=X_test.index), 
                     pd.Series(canberra_test, name='canberra',index=X_test.index),
                     pd.Series(hausdorff_test, name='hausdorff',index=X_test.index) 
                     #pd.Series(yule_test, name='yule',index=X_test.index), 
                     #pd.Series(dice_test, name='dice',index=X_test.index), 
                     #pd.Series(kulsinski_test, name='kulsinski',index=X_test.index), 
                     #pd.Series(rogerstanimoto_test, name='rogerstanimoto',index=X_test.index), 
                     #pd.Series(russellrao_test, name='russellrao',index=X_test.index), 
                     #pd.Series(sokalmichener_test, name='sokalmichener',index=X_test.index)
                    ], axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,hausdorff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
26837,49895,49896,Why can aquatic animals survive in frozen lake...,"When a lake freezes over entirely, how do the ...",1.0,0.004548,1.024092,0.758146,0.754297,1.0,2.043331,0.000195
2592,5150,5151,What kind of bird is this please?,What kind of bird has teeth?,1.0,0.008146,0.984704,0.662968,0.884503,1.0,2.430144,0.008853
18359,34790,34791,Is it tacky for my step mom to wear the weddin...,"What did Jack Reacher mean when he said, ""The ...",1.0,0.005041,0.608893,0.535137,0.849425,1.0,1.715228,0.01465
73292,125781,125782,What are the basic building blocks of matter?,What are the basic building blocks of SEO?,1.0,0.006368,1.004397,1.019846,1.054118,1.0,2.265732,0.008221
60127,105210,105211,Do pigeons have feelings?,Does every country have pigeons?,1.0,0.004689,0.753609,0.613558,0.973406,1.0,2.190644,0.005121


### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tokens('train'))

In [None]:
X_trfmd

In [None]:
# dimension reduction using SVD
from sklearn.decomposition import TruncatedSVD
import time
start = time.time()
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
X_svd.shape

In [None]:
# split back into two
X1 = X_svd[:len(X_train), :]
X2 = X_svd[len(X_train):, :]

##### Test set

In [None]:
X_test_trfmd = tfidf.transform(get_tokens('test'))

In [None]:
X_test_trfmd

In [None]:
# dimension reduction using SVD
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
# split back into two
X1_test = X_test_svd[:len(X_test), :]
X2_test = X_test_svd[len(X_test):, :]

In [None]:
# build complete feature dataframe
X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
X_test_temp.head()

### Fuzzy-wuzzy

In [241]:
# difference in text size
compute_size_diff = lambda row: abs(len(str(row['question1'])) - len(str(row['question2'])))
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,hausdorff,size_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,1.0,0.004183,0.749184,0.577846,0.849309,1.0,1.386667,0.00732,5
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,1.0,0.004974,1.196143,1.049221,1.028845,1.0,2.483181,0.005846,16
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,1.0,0.00613,0.941305,0.914124,0.885923,1.0,2.03401,0.006177,82
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,1.0,0.006051,1.053847,1.055724,1.013987,1.0,2.433737,0.005185,3
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,1.0,0.003852,0.963405,0.849872,0.723745,1.0,2.430036,0.005125,3


In [242]:
from fuzzywuzzy import fuzz

In [243]:
# ratio
compute_ratio = lambda row: fuzz.ratio(str(row['question1']), str(row['question2']))
X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,hausdorff,size_diff,ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,1.0,0.004183,0.749184,0.577846,0.849309,1.0,1.386667,0.00732,5,68
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,1.0,0.004974,1.196143,1.049221,1.028845,1.0,2.483181,0.005846,16,64
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,1.0,0.00613,0.941305,0.914124,0.885923,1.0,2.03401,0.006177,82,27
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,1.0,0.006051,1.053847,1.055724,1.013987,1.0,2.433737,0.005185,3,62
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,1.0,0.003852,0.963405,0.849872,0.723745,1.0,2.430036,0.005125,3,47


In [244]:
# partial ratio
compute_partial_ratio = lambda row: fuzz.partial_ratio(str(row['question1']), str(row['question2']))
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,hausdorff,size_diff,ratio,partial_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,1.0,0.004183,0.749184,0.577846,0.849309,1.0,1.386667,0.00732,5,68,78
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,1.0,0.004974,1.196143,1.049221,1.028845,1.0,2.483181,0.005846,16,64,60
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,1.0,0.00613,0.941305,0.914124,0.885923,1.0,2.03401,0.006177,82,27,44
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,1.0,0.006051,1.053847,1.055724,1.013987,1.0,2.433737,0.005185,3,62,59
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,1.0,0.003852,0.963405,0.849872,0.723745,1.0,2.430036,0.005125,3,47,47


In [245]:
# token_sort_ratio
compute_token_sort_ratio = lambda row: fuzz.token_sort_ratio(str(row['question1']), str(row['question2']))
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,hausdorff,size_diff,ratio,partial_ratio,token_sort_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,1.0,0.004183,0.749184,0.577846,0.849309,1.0,1.386667,0.00732,5,68,78,74
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,1.0,0.004974,1.196143,1.049221,1.028845,1.0,2.483181,0.005846,16,64,60,72
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,1.0,0.00613,0.941305,0.914124,0.885923,1.0,2.03401,0.006177,82,27,44,29
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,1.0,0.006051,1.053847,1.055724,1.013987,1.0,2.433737,0.005185,3,62,59,64
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,1.0,0.003852,0.963405,0.849872,0.723745,1.0,2.430036,0.005125,3,47,47,48


In [246]:
# token_set_ratio
compute_token_set_ratio = lambda row: fuzz.token_set_ratio(str(row['question1']), str(row['question2']))
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,hausdorff,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,1.0,0.004183,0.749184,0.577846,0.849309,1.0,1.386667,0.00732,5,68,78,74,74
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,1.0,0.004974,1.196143,1.049221,1.028845,1.0,2.483181,0.005846,16,64,60,72,72
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,1.0,0.00613,0.941305,0.914124,0.885923,1.0,2.03401,0.006177,82,27,44,29,32
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,1.0,0.006051,1.053847,1.055724,1.013987,1.0,2.433737,0.005185,3,62,59,64,71
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,1.0,0.003852,0.963405,0.849872,0.723745,1.0,2.430036,0.005125,3,47,47,48,55


In [None]:
# build complete feature dataframe
#X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
#                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
#X_train_temp.head()

In [247]:
#X_train = pd.concat([X_train_temp, X_train], axis=1)
#del X_train_temp
X_train = X_train.drop(columns=['qid1', 'qid2','question1','question2'])
X_train.head()

Unnamed: 0_level_0,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,hausdorff,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
71916,1.0,0.004183,0.749184,0.577846,0.849309,1.0,1.386667,0.00732,5,68,78,74,74
43137,1.0,0.004974,1.196143,1.049221,1.028845,1.0,2.483181,0.005846,16,64,60,72,72
66647,1.0,0.00613,0.941305,0.914124,0.885923,1.0,2.03401,0.006177,82,27,44,29,32
21351,1.0,0.006051,1.053847,1.055724,1.013987,1.0,2.433737,0.005185,3,62,59,64,71
68289,1.0,0.003852,0.963405,0.849872,0.723745,1.0,2.430036,0.005125,3,47,47,48,55


In [248]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50250 entries, 71916 to 15795
Data columns (total 13 columns):
jaccard             10306 non-null float64
chebyshev           10306 non-null float64
braycurtis          10306 non-null float64
cosine              10306 non-null float64
correlation         10306 non-null float64
hamming             10306 non-null float64
canberra            10306 non-null float64
hausdorff           10306 non-null float64
size_diff           50250 non-null int64
ratio               50250 non-null int64
partial_ratio       50250 non-null int64
token_sort_ratio    50250 non-null int64
token_set_ratio     50250 non-null int64
dtypes: float64(8), int64(5)
memory usage: 5.4 MB


In [249]:
X_train.tail(20)

Unnamed: 0_level_0,jaccard,chebyshev,braycurtis,cosine,correlation,hamming,canberra,hausdorff,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
71932,,,,,,,,,24,71,85,71,93
28693,,,,,,,,,24,80,82,80,100
53707,,,,,,,,,16,60,57,57,72
5311,,,,,,,,,3,80,89,86,91
67969,,,,,,,,,26,36,37,39,41
64925,,,,,,,,,25,51,49,49,53
62955,,,,,,,,,14,46,47,56,62
59735,,,,,,,,,6,63,63,82,82
769,,,,,,,,,1,92,92,89,100
64820,,,,,,,,,8,69,70,70,79


##### Test set

In [None]:
# difference in text size
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
X_test.head()

In [None]:
# ratio
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
X_test.head()

In [None]:
# partial ratio
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
X_test.head()

In [None]:
# token_sort_ratio
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
X_test.head()

In [None]:
# token_set_ratio
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
X_test.head()

In [None]:
X_test = pd.concat([X_test_temp, X_test], axis=1)
del X_test_temp
X_test = X_test.drop(columns=['question1','question2', 'qid1', 'qid2'])
X_test.head()

# Modeling

### Logistic Regression

In [159]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
logr_model = LogisticRegression(random_state=42)
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             #'solver': ['lbfgs']
             #'max_iter': np.linspace(10, 1000, 10)
             }
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1)
logr_cv.fit(X_train_ft, y_train)

ValueError: setting an array element with a sequence.

In [None]:
logr_cv.best_params_

In [None]:
logr_model = LogisticRegression(#solver=logr_cv.best_params_['solver'], 
                                random_state=42, 
                                C=logr_cv.best_params_['C'], 
                                tol=logr_cv.best_params_['tol'], 
                                #max_iter=logr_cv.best_params_['max_iter'], 
                                n_jobs=-1)
logr_model.fit(X_train_ft, y_train)

In [None]:
logr_pred = logr_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
logr_acc_score = accuracy_score(y_test, logr_pred)
logr_prec_score = precision_score(y_test, logr_pred)
logr_rec_score = recall_score(y_test, logr_pred)
print('Logistic Regression')
print('accuracy score : {}'.format(logr_acc_score))
print('precision score : {}'.format(logr_prec_score))
print('recall score : {}'.format(logr_rec_score))