In [35]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [36]:
import warnings
warnings.filterwarnings('ignore')

In [37]:
from toolz import partition_all

In [38]:
INPUT_BUCKET = 'dq-data'
HASH_BUCKET = 'dq-hashed'

In [39]:
#load train_set
data = 'train.csv'
filestream = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
dtypes = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
df = pd.read_csv(#urlpath=s3_in_url, 
                                     #storage_options=s3_options,
                                     filestream,
                                     header=0, 
                                     usecols=dtypes.keys(), 
                                     #names=dtypes.keys(),
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     encoding='utf-8')
df = df.set_index('id')
df = df.dropna()

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404287 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404287 non-null int64
qid2            404287 non-null int64
question1       404287 non-null object
question2       404287 non-null object
is_duplicate    404287 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


### Train-test split

In [41]:
from sklearn.model_selection import train_test_split
#shrink df to 150,000 records
df = df.iloc[:75000]

X = df.drop(columns=['is_duplicate'])

y = df['is_duplicate']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [42]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50250 entries, 71916 to 15795
Data columns (total 4 columns):
qid1         50250 non-null int64
qid2         50250 non-null int64
question1    50250 non-null object
question2    50250 non-null object
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


In [43]:
#del X,y,df

# Feature Extraction

### Tokenizing and preprocessing

In [44]:
from gensim.parsing.preprocessing import preprocess_string
def get_tokens(process='train'):
    if process=='test':
        X = X_test
    else:
        X = X_train
    series = pd.Series(pd.concat([X['question1'], X['question2']]),dtype=str)
    series.dropna()
    for question in series:
        yield preprocess_string(question)

### Word2Vec (fasttext)

In [13]:
ps.get_file(bucket=INPUT_BUCKET, filename='cc.en.300.bin.gz', filepath='/tmp/cc.en.300.bin.gz')

<minio.definitions.Object at 0x7fca6ad3ba58>

In [None]:
import gzip
import shutil
with gzip.open('/tmp/cc.en.300.bin.gz', 'rb') as f_in:
    with open('/tmp/cc.en.300.bin', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)  

In [45]:
import os
os.remove('/tmp/cc.en.300.bin.gz')
from gensim.models import FastText
model = FastText.load_fasttext_format('/tmp/cc.en.300.bin')

In [46]:
def get_ft_vectors(model, process):
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vector = model.wv[token]
            except:
                continue
            vectors.append(vector)
        yield np.array(vectors)

In [47]:
X_ft = np.array([vectors for vectors in get_ft_vectors(model,'train')])
X_ft.shape

(100500,)

In [48]:
# split back into two
X1_ft = X_ft[:len(X_train)]
X2_ft = X_ft[len(X_train):]

In [None]:
#del X_ft

##### Test set

In [49]:
X_ft_test = np.array([vectors for vectors in get_ft_vectors(model,'test')])

In [50]:
X_ft_test.shape

(49500,)

In [None]:
#del model

In [51]:
# split back into two
X1_ft_test = X_ft_test[:len(X_test)]
X2_ft_test = X_ft_test[len(X_test):]

In [None]:
#del X_ft_test

### Pairwise Metrics

In [52]:
def get_q_lengths(X):
    #q_meta = []
    for q in X:
        #q_meta.append(len(q))
        yield len(q)
    #return q_meta

In [53]:
X1_ft.shape

(50250,)

In [54]:
q_meta_train = [(q1_len, q2_len) for q1_len, q2_len in zip(get_q_lengths(X1_ft), get_q_lengths(X2_ft))]

In [55]:
X_train_300 = np.concatenate( 
    np.vstack( [np.array(np.vsplit(y, y.shape[0])) for y in (x for x in X_ft if x.size>0)] )
)

In [56]:
X_train_300.shape

(482518, 300)

In [None]:
#del X1_ft, X2_ft

In [None]:
#import sys
## These are the usual ipython objects, including this one you are creating
#ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
#sorted([(x, sys.getsizeof(globals().get(x))) 
#        for x in dir() if not x.startswith('_') 
#        and x not in sys.modules and x not in ipython_vars], 
#       key=lambda x: x[1], reverse=True)

In [None]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_train.mtx', X_train_300 )

In [None]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_train.mtx', source='wor2vec_300_train.mtx')

In [None]:
#del X_train_300

In [57]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_train.mtx', filepath='embed_train.mtx')

<minio.definitions.Object at 0x7fca4a3a7470>

In [58]:
from scipy.io import mmread
X_rd = mmread('embed_train.mtx')

In [59]:
X_rd.shape

(482518, 3)

In [60]:
len(q_meta_train)

50250

In [61]:
# rebuild X1_rd and X2_rd
X1_list = []
X2_list = []
q1_ptr = 0
for len_q1, _ in q_meta_train:
    q1 = np.array(X_rd[q1_ptr:q1_ptr+len_q1])
    X1_list.append(q1)
    q1_ptr = q1_ptr+len_q1
q2_ptr = q1_ptr
for _, len_q2 in q_meta_train:
    q2 = np.array(X_rd[q2_ptr:q2_ptr+len_q2])
    X2_list.append(q2)
    q2_ptr = q2_ptr+len_q2
X1_rd = np.array(X1_list)
X2_rd = np.array(X2_list)

In [None]:
#del X1_list, X2_list, X_rd, X1_rd_tmp, X2_rd_tmp

In [62]:
import dask.dataframe as dd
from dask import delayed, compute
from dask.distributed import Client
from utils import dask
client = dask.create_dask_client(num_workers=8)

In [63]:
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist, directed_hausdorff
def compute_pairwise_kernel(pc1, pc2, method='linear'):
    if method=='polynomial':
        return polynomial_kernel(pc1, pc2, 2)
    elif method=='rbf':
        return rbf_kernel(pc1, pc2)
    elif method=='sigmoid':
        return sigmoid_kernel(pc1, pc2)
    elif method=='laplacian':
        return laplacian_kernel(pc1, pc2)
    else:
        return linear_kernel(pc1, pc2)
    
def compute_pairwise_dist(pc1, pc2, method='euclidean'):
    if pc1.size == 0 or pc2.size == 0:
        return (np.nan,np.nan,np.nan)
    if method=='hausdorff':
        dist = directed_hausdorff(pc1, pc2)
        return dist
    else:
        dist_mat = pairwise_distances(pc1, pc2, metric=method) 
    #dist_mat = cdist(pc1, pc2, metric=method)
    return (np.mean(dist_mat), np.min(dist_mat), np.max(dist_mat))

def compute_pairwise_metric(pc1, pc2, method='hausdorff'):
    if pc1.size == 0 or pc2.size == 0:
        return np.nan
    if method == 'mda_hausdorff':
        return hausdorff(pc1, pc2)
    if method == 'mda_hausdorff_wavg':
        return hausdorff_wavg(pc1, pc2)
    if method == 'mda_hausdorff_avg':
        return hausdorff_avg(pc1, pc2)
    if method == 'discrete_frechet':
        return discrete_frechet(pc1, pc2)
    #return directed_hausdorff(pc1, pc2)[0]
        
def assign_pwmetric(df, method='euclidean'):
    #return compute_pairwise_kernel(pc1_embd, pc2_embd, method=method)
    return df.apply(compute_pairwise_dist, method, axis=1)

#### Procrustes Analysis

In [153]:
! pip install rpy2

Collecting rpy2
[?25l  Downloading https://files.pythonhosted.org/packages/02/d1/074ffbbe7b4bf74c60b75d74c8e67a1e4515b0d85f85cd6540e39610754a/rpy2-2.9.5.tar.gz (194kB)
[K    100% |████████████████████████████████| 194kB 2.8MB/s ta 0:00:01
[?25h    Complete output from command python setup.py egg_info:
    Error: Tried to guess R's HOME but no command 'R' in the PATH.
    
    ----------------------------------------
[31mCommand "python setup.py egg_info" failed with error code 1 in /tmp/pip-install-_3pl4lma/rpy2/[0m


In [64]:
jaccard = []
chebyshev = []
braycurtis = []
cosine = []
correlation = []
hamming = []
canberra = []
hausdorff = []
cityblock = []
euclidean = []
l1 = []
l2 = []
manhattan = []
dice = []
kulsinski = []
rogerstanimoto = []
russellrao = []
sokalmichener = []
minkowski = []
seuclidean = []
sokalsneath = []
sqeuclidean = []
for q_tuple in zip(X1_rd, X2_rd):
    if q_tuple:
        q1_rd, q2_rd = q_tuple
        jaccard.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
        chebyshev.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
        braycurtis.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
        cosine.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cosine'))
        correlation.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'correlation'))
        hamming.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hamming'))
        canberra.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'canberra'))
        hausdorff.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hausdorff'))
        cityblock.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cityblock'))
        euclidean.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'euclidean'))
        l1.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'l1'))
        l2.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'l2'))
        manhattan.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'manhattan'))
        dice.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'dice'))
        kulsinski.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'kulsinski'))
        rogerstanimoto.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'rogerstanimoto'))
        russellrao.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'russellrao'))
        sokalmichener.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sokalmichener'))
        minkowski.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'minkowski'))
        seuclidean.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'seuclidean'))
        sokalsneath.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sokalsneath'))
        sqeuclidean.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sqeuclidean'))
    else:
        jaccard.append(delayed((np.nan,np.nan,np.nan)))
        chebyshev.append(delayed((np.nan,np.nan,np.nan)))
        braycurtis.append(delayed((np.nan,np.nan,np.nan))) 
        cosine.append(delayed((np.nan,np.nan,np.nan)))
        correlation.append(delayed((np.nan,np.nan,np.nan)))
        hamming.append(delayed((np.nan,np.nan,np.nan))) 
        canberra.append(delayed((np.nan,np.nan,np.nan)))
        hausdorff.append(delayed((np.nan,np.nan,np.nan)))
        cityblock.append(delayed((np.nan,np.nan,np.nan)))
        euclidean.append(delayed((np.nan,np.nan,np.nan)))
        l1.append(delayed((np.nan,np.nan,np.nan)))
        l2.append(delayed((np.nan,np.nan,np.nan)))
        manhattan.append(delayed((np.nan,np.nan,np.nan)))
        dice.append(delayed((np.nan,np.nan,np.nan)))
        kulsinski.append(delayed((np.nan,np.nan,np.nan)))
        rogerstanimoto.append(delayed((np.nan,np.nan,np.nan))) 
        russellrao.append(delayed((np.nan,np.nan,np.nan)))
        sokalmichener.append(delayed((np.nan,np.nan,np.nan))) 
        minkowski.append(delayed((np.nan,np.nan,np.nan)))
        seuclidean.append(delayed((np.nan,np.nan,np.nan))) 
        sokalsneath.append(delayed((np.nan,np.nan,np.nan)))
        sqeuclidean.append(delayed((np.nan,np.nan,np.nan))) 

In [65]:
jaccard = compute(*jaccard)

In [66]:
chebyshev = compute(*chebyshev)

In [67]:
braycurtis = compute(*braycurtis)

In [68]:
cosine = compute(*cosine)

In [69]:
correlation = compute(*correlation)

In [70]:
hamming = compute(*hamming)

In [71]:
canberra = compute(*canberra)

In [72]:
hausdorff = compute(*hausdorff)

In [73]:
cityblock = compute(*cityblock)

In [74]:
euclidean = compute(*euclidean)

In [75]:
l1 = compute(*l1)

In [76]:
l2 = compute(*l2)

In [77]:
manhattan = compute(*manhattan)

In [78]:
dice = compute(*dice)

In [79]:
kulsinski = compute(*kulsinski)

In [80]:
rogerstanimoto = compute(*rogerstanimoto)

In [81]:
russellrao = compute(*russellrao)

In [82]:
sokalmichener = compute(*sokalmichener)

In [83]:
minkowski = compute(*minkowski)

In [84]:
seuclidean = compute(*seuclidean)

In [85]:
sokalsneath = compute(*sokalsneath)



In [86]:
sqeuclidean = compute(*sqeuclidean)

In [87]:
len(braycurtis)

50250

#### add above metrics to X_train

In [88]:
X_train = pd.concat([X_train,
                     pd.Series((x for x,_,_ in jaccard), name='jaccard_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in jaccard), name='jaccard_min',index=X_train.index), 
                     pd.Series((x for _,_,x in jaccard), name='jaccard_max',index=X_train.index), 
                     pd.Series((x for x,_,_ in chebyshev), name='chebyshev_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in chebyshev), name='chebyshev_min',index=X_train.index), 
                     pd.Series((x for _,_,x in chebyshev), name='chebyshev_max',index=X_train.index), 
                     pd.Series((x for x,_,_ in braycurtis), name='braycurtis_mean',index=X_train.index),
                     pd.Series((x for _,x,_ in braycurtis), name='braycurtis_min',index=X_train.index), 
                     pd.Series((x for _,_,x in braycurtis), name='braycurtis_max',index=X_train.index),  
                     pd.Series((x for x,_,_ in cosine), name='cosine_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in cosine), name='cosine_min',index=X_train.index), 
                     pd.Series((x for _,_,x in cosine), name='cosine_max',index=X_train.index),  
                     pd.Series((x for x,_,_ in correlation), name='correlation_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in correlation), name='correlation_min',index=X_train.index), 
                     pd.Series((x for _,_,x in correlation), name='correlation_max',index=X_train.index),  
                     pd.Series((x for x,_,_ in hamming), name='hamming_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in hamming), name='hamming_min',index=X_train.index), 
                     pd.Series((x for _,_,x in hamming), name='hamming_max',index=X_train.index),  
                     pd.Series((x for x,_,_ in canberra), name='canberra_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in canberra), name='canberra_min',index=X_train.index), 
                     pd.Series((x for _,_,x in canberra), name='canberra_max',index=X_train.index),  
                     pd.Series((x for _,_,x in hausdorff), name='hausdorff',index=X_train.index),
                     pd.Series((x for x,_,_ in cityblock), name='cityblock_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in cityblock), name='cityblock_min',index=X_train.index), 
                     pd.Series((x for _,_,x in cityblock), name='cityblock_max',index=X_train.index),  
                     pd.Series((x for x,_,_ in euclidean), name='euclidean_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in euclidean), name='euclidean_min',index=X_train.index), 
                     pd.Series((x for _,_,x in euclidean), name='euclidean_max',index=X_train.index),    
                     pd.Series((x for x,_,_ in l1), name='l1_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in l1), name='l1_min',index=X_train.index), 
                     pd.Series((x for _,_,x in l1), name='l1_max',index=X_train.index),
                     pd.Series((x for x,_,_ in l2), name='l2_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in l2), name='l2_min',index=X_train.index), 
                     pd.Series((x for _,_,x in l2), name='l2_max',index=X_train.index),
                     pd.Series((x for x,_,_ in manhattan), name='manhattan_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in manhattan), name='manhattan_min',index=X_train.index), 
                     pd.Series((x for _,_,x in manhattan), name='manhattan_max',index=X_train.index),
                     pd.Series((x for x,_,_ in dice), name='dice_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in dice), name='dice_min',index=X_train.index), 
                     pd.Series((x for _,_,x in dice), name='dice_max',index=X_train.index),
                     pd.Series((x for x,_,_ in kulsinski), name='kulsinski_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in kulsinski), name='kulsinski_min',index=X_train.index), 
                     pd.Series((x for _,_,x in kulsinski), name='kulsinski_max',index=X_train.index),
                     pd.Series((x for x,_,_ in rogerstanimoto), name='rogerstanimoto_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in rogerstanimoto), name='rogerstanimoto_min',index=X_train.index), 
                     pd.Series((x for _,_,x in rogerstanimoto), name='rogerstanimoto_max',index=X_train.index),
                     pd.Series((x for x,_,_ in russellrao), name='russellrao_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in russellrao), name='russellrao_min',index=X_train.index), 
                     pd.Series((x for _,_,x in russellrao), name='russellrao_max',index=X_train.index),
                     pd.Series((x for x,_,_ in sokalmichener), name='sokalmichener_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in sokalmichener), name='sokalmichener_min',index=X_train.index), 
                     pd.Series((x for _,_,x in sokalmichener), name='sokalmichener_max',index=X_train.index),
                     pd.Series((x for x,_,_ in minkowski), name='minkowski_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in minkowski), name='minkowski_min',index=X_train.index), 
                     pd.Series((x for _,_,x in minkowski), name='minkowski_max',index=X_train.index),
                     pd.Series((x for x,_,_ in seuclidean), name='seuclidean_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in seuclidean), name='seuclidean_min',index=X_train.index), 
                     pd.Series((x for _,_,x in seuclidean), name='seuclidean_max',index=X_train.index),
                     pd.Series((x for x,_,_ in sokalsneath), name='sokalsneath_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in sokalsneath), name='sokalsneath_min',index=X_train.index), 
                     pd.Series((x for _,_,x in sokalsneath), name='sokalsneath_max',index=X_train.index),
                     pd.Series((x for x,_,_ in sqeuclidean), name='sqeuclidean_mean',index=X_train.index), 
                     pd.Series((x for _,x,_ in sqeuclidean), name='sqeuclidean_min',index=X_train.index), 
                     pd.Series((x for _,_,x in sqeuclidean), name='sqeuclidean_max',index=X_train.index)
                    ], axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,minkowski_max,seuclidean_mean,seuclidean_min,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,0.0,0.0,0.0,5.886291,0.01494,15.20051,...,16.619627,1.733502,0.004492,3.727103,0.0,0.0,0.0,86.404961,0.000277,276.211989
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,0.0,0.0,0.0,5.61875,0.017163,11.22177,...,11.583529,1.553674,0.070014,3.059159,0.0,0.0,0.0,66.970934,0.000865,134.17815
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,0.0,0.0,0.0,17.502108,5.686578,31.353564,...,40.973358,2.358302,0.653995,4.167674,0.0,0.0,0.0,543.472354,43.444939,1678.816072
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,0.0,0.0,0.0,17.314123,0.040721,31.19108,...,35.723565,2.243507,0.005841,4.127567,0.0,0.0,0.0,557.545242,0.003035,1276.173114
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,0.0,0.0,0.0,13.465144,0.048531,31.48746,...,33.616724,2.063921,0.005808,4.096533,0.0,0.0,0.0,334.310971,0.002435,1130.084106


In [89]:
X_train[X_train.isnull().any(axis=1)]

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,minkowski_max,seuclidean_mean,seuclidean_min,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
67651,104950,117046,How do I move on in life?,How does one move on?,,,,,,,...,,,,,,,,,,
26795,49822,49823,What is quoro?,What is ∀?,,,,,,,...,,,,,,,,,,
25228,47035,47036,What?,What should Indians do if Donald Trump becomes...,,,,,,,...,,,,,,,,,,
71374,62051,29755,What is C.P.H.4?,How can you explain black holes to a kid?,,,,,,,...,,,,,,,,,,
33153,60947,60948,What is a Wi-Fi dongle?,What is Wi-Fi?,,,,,,,...,,,,,,,,,,
46087,82473,82474,What is aw?,What does AW stand for?,,,,,,,...,,,,,,,,,,
22364,41068,41978,What is quantum computing?,What is computer?,,,,,,,...,,,,,,,,,,
18950,35855,1601,How did you attract your ex to come back?,How do I get my ex back?,,,,,,,...,,,,,,,,,,
18455,34964,34965,What you have never done?,What is something you have never done?,,,,,,,...,,,,,,,,,,
19007,35957,35958,What is hirarki?,"What is ""what is""?",,,,,,,...,,,,,,,,,,


##### Test set

In [90]:
q_meta_test = [(q1_len, q2_len) for q1_len, q2_len in zip(get_q_lengths(X1_ft_test), get_q_lengths(X2_ft_test))]

In [91]:
X_test_300 = np.concatenate( 
    np.vstack( [np.array(np.vsplit(y, y.shape[0])) for y in (x for x in X_ft_test if x.size>0)] )
)

In [92]:
X_test_300.shape

(238305, 300)

In [None]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_test.mtx', X_test_300 )

In [None]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_test.mtx', source='wor2vec_300_test.mtx')

In [None]:
#del X_test_300

In [93]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_test.mtx', filepath='embed_test.mtx')

<minio.definitions.Object at 0x7fc89bfc9e80>

In [94]:
from scipy.io import mmread
X_rd_test = mmread('embed_test.mtx')

In [95]:
X_rd_test.shape

(238305, 3)

In [96]:
# rebuild X1_rd_test and X2_rd_test
X1_list = []
X2_list = []
q1_ptr = 0
for len_q1, _ in q_meta_test:
    q1 = np.array(X_rd_test[q1_ptr:q1_ptr+len_q1])
    X1_list.append(q1)
    q1_ptr = q1_ptr+len_q1
q2_ptr = q1_ptr
for _, len_q2 in q_meta_test:
    q2 = np.array(X_rd_test[q2_ptr:q2_ptr+len_q2])
    X2_list.append(q2)
    q2_ptr = q2_ptr+len_q2
X1_rd_test = np.array(X1_list)
X2_rd_test = np.array(X2_list)

In [97]:
X1_rd_test.shape

(24750,)

In [None]:
#del X1_list, X2_list, q1_meta, q2_meta, X_rd_test, X1_rd_tmp, X2_rd_tmp

In [98]:
jaccard = []
chebyshev = []
braycurtis = []
cosine = []
correlation = []
hamming = []
canberra = []
hausdorff = []
cityblock = []
euclidean = []
l1 = []
l2 = []
manhattan = []
dice = []
kulsinski = []
rogerstanimoto = []
russellrao = []
sokalmichener = []
minkowski = []
seuclidean = []
sokalsneath = []
sqeuclidean = []
for q_tuple in zip(X1_rd_test, X2_rd_test):
    if q_tuple:
        q1_rd, q2_rd = q_tuple
        jaccard.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
        chebyshev.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
        braycurtis.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
        cosine.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cosine'))
        correlation.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'correlation'))
        hamming.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hamming'))
        canberra.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'canberra'))
        hausdorff.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'hausdorff'))
        cityblock.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'cityblock'))
        euclidean.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'euclidean'))
        l1.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'l1'))
        l2.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'l2'))
        manhattan.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'manhattan'))
        dice.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'dice'))
        kulsinski.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'kulsinski'))
        rogerstanimoto.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'rogerstanimoto'))
        russellrao.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'russellrao'))
        sokalmichener.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sokalmichener'))
        minkowski.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'minkowski'))
        seuclidean.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'seuclidean'))
        sokalsneath.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sokalsneath'))
        sqeuclidean.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'sqeuclidean'))
    else:
        jaccard.append(delayed((np.nan,np.nan,np.nan)))
        chebyshev.append(delayed((np.nan,np.nan,np.nan)))
        braycurtis.append(delayed((np.nan,np.nan,np.nan))) 
        cosine.append(delayed((np.nan,np.nan,np.nan)))
        correlation.append(delayed((np.nan,np.nan,np.nan)))
        hamming.append(delayed((np.nan,np.nan,np.nan))) 
        canberra.append(delayed((np.nan,np.nan,np.nan)))
        hausdorff.append(delayed((np.nan,np.nan,np.nan)))
        cityblock.append(delayed((np.nan,np.nan,np.nan)))
        euclidean.append(delayed((np.nan,np.nan,np.nan)))
        l1.append(delayed((np.nan,np.nan,np.nan)))
        l2.append(delayed((np.nan,np.nan,np.nan)))
        manhattan.append(delayed((np.nan,np.nan,np.nan)))
        dice.append(delayed((np.nan,np.nan,np.nan)))
        kulsinski.append(delayed((np.nan,np.nan,np.nan)))
        rogerstanimoto.append(delayed((np.nan,np.nan,np.nan))) 
        russellrao.append(delayed((np.nan,np.nan,np.nan)))
        sokalmichener.append(delayed((np.nan,np.nan,np.nan))) 
        minkowski.append(delayed((np.nan,np.nan,np.nan)))
        seuclidean.append(delayed((np.nan,np.nan,np.nan))) 
        sokalsneath.append(delayed((np.nan,np.nan,np.nan)))
        sqeuclidean.append(delayed((np.nan,np.nan,np.nan))) 

In [99]:
jaccard = compute(*jaccard)

In [100]:
chebyshev = compute(*chebyshev)

In [101]:
braycurtis = compute(*braycurtis)

In [102]:
cosine = compute(*cosine)

In [103]:
correlation = compute(*correlation)

In [104]:
hamming = compute(*hamming)

In [105]:
canberra = compute(*canberra)

In [106]:
hausdorff = compute(*hausdorff)

In [107]:
cityblock = compute(*cityblock)

In [108]:
euclidean = compute(*euclidean)

In [109]:
l1 = compute(*l1)

In [110]:
l2 = compute(*l2)

In [111]:
manhattan = compute(*manhattan)

In [112]:
dice = compute(*dice)

In [113]:
kulsinski = compute(*kulsinski)

In [114]:
rogerstanimoto = compute(*rogerstanimoto)

In [115]:
russellrao = compute(*russellrao)

In [116]:
sokalmichener = compute(*sokalmichener)

In [117]:
minkowski = compute(*minkowski)

In [118]:
seuclidean = compute(*seuclidean)

In [119]:
sokalsneath = compute(*sokalsneath)

In [120]:
sqeuclidean = compute(*sqeuclidean)

In [121]:
X_test = pd.concat([X_test,
                     pd.Series((x for x,_,_ in jaccard), name='jaccard_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in jaccard), name='jaccard_min',index=X_test.index), 
                     pd.Series((x for _,_,x in jaccard), name='jaccard_max',index=X_test.index), 
                     pd.Series((x for x,_,_ in chebyshev), name='chebyshev_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in chebyshev), name='chebyshev_min',index=X_test.index), 
                     pd.Series((x for _,_,x in chebyshev), name='chebyshev_max',index=X_test.index), 
                     pd.Series((x for x,_,_ in braycurtis), name='braycurtis_mean',index=X_test.index),
                     pd.Series((x for _,x,_ in braycurtis), name='braycurtis_min',index=X_test.index), 
                     pd.Series((x for _,_,x in braycurtis), name='braycurtis_max',index=X_test.index),  
                     pd.Series((x for x,_,_ in cosine), name='cosine_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in cosine), name='cosine_min',index=X_test.index), 
                     pd.Series((x for _,_,x in cosine), name='cosine_max',index=X_test.index),  
                     pd.Series((x for x,_,_ in correlation), name='correlation_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in correlation), name='correlation_min',index=X_test.index), 
                     pd.Series((x for _,_,x in correlation), name='correlation_max',index=X_test.index),  
                     pd.Series((x for x,_,_ in hamming), name='hamming_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in hamming), name='hamming_min',index=X_test.index), 
                     pd.Series((x for _,_,x in hamming), name='hamming_max',index=X_test.index),  
                     pd.Series((x for x,_,_ in canberra), name='canberra_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in canberra), name='canberra_min',index=X_test.index), 
                     pd.Series((x for _,_,x in canberra), name='canberra_max',index=X_test.index),  
                     pd.Series((x for _,_,x in hausdorff), name='hausdorff',index=X_test.index),
                     pd.Series((x for x,_,_ in cityblock), name='cityblock_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in cityblock), name='cityblock_min',index=X_test.index), 
                     pd.Series((x for _,_,x in cityblock), name='cityblock_max',index=X_test.index),  
                     pd.Series((x for x,_,_ in euclidean), name='euclidean_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in euclidean), name='euclidean_min',index=X_test.index), 
                     pd.Series((x for _,_,x in euclidean), name='euclidean_max',index=X_test.index),    
                     pd.Series((x for x,_,_ in l1), name='l1_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in l1), name='l1_min',index=X_test.index), 
                     pd.Series((x for _,_,x in l1), name='l1_max',index=X_test.index),
                     pd.Series((x for x,_,_ in l2), name='l2_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in l2), name='l2_min',index=X_test.index), 
                     pd.Series((x for _,_,x in l2), name='l2_max',index=X_test.index),
                     pd.Series((x for x,_,_ in manhattan), name='manhattan_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in manhattan), name='manhattan_min',index=X_test.index), 
                     pd.Series((x for _,_,x in manhattan), name='manhattan_max',index=X_test.index),
                     pd.Series((x for x,_,_ in dice), name='dice_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in dice), name='dice_min',index=X_test.index), 
                     pd.Series((x for _,_,x in dice), name='dice_max',index=X_test.index),
                     pd.Series((x for x,_,_ in kulsinski), name='kulsinski_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in kulsinski), name='kulsinski_min',index=X_test.index), 
                     pd.Series((x for _,_,x in kulsinski), name='kulsinski_max',index=X_test.index),
                     pd.Series((x for x,_,_ in rogerstanimoto), name='rogerstanimoto_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in rogerstanimoto), name='rogerstanimoto_min',index=X_test.index), 
                     pd.Series((x for _,_,x in rogerstanimoto), name='rogerstanimoto_max',index=X_test.index),
                     pd.Series((x for x,_,_ in russellrao), name='russellrao_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in russellrao), name='russellrao_min',index=X_test.index), 
                     pd.Series((x for _,_,x in russellrao), name='russellrao_max',index=X_test.index),
                     pd.Series((x for x,_,_ in sokalmichener), name='sokalmichener_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in sokalmichener), name='sokalmichener_min',index=X_test.index), 
                     pd.Series((x for _,_,x in sokalmichener), name='sokalmichener_max',index=X_test.index),
                     pd.Series((x for x,_,_ in minkowski), name='minkowski_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in minkowski), name='minkowski_min',index=X_test.index), 
                     pd.Series((x for _,_,x in minkowski), name='minkowski_max',index=X_test.index),
                     pd.Series((x for x,_,_ in seuclidean), name='seuclidean_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in seuclidean), name='seuclidean_min',index=X_test.index), 
                     pd.Series((x for _,_,x in seuclidean), name='seuclidean_max',index=X_test.index),
                     pd.Series((x for x,_,_ in sokalsneath), name='sokalsneath_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in sokalsneath), name='sokalsneath_min',index=X_test.index), 
                     pd.Series((x for _,_,x in sokalsneath), name='sokalsneath_max',index=X_test.index),
                     pd.Series((x for x,_,_ in sqeuclidean), name='sqeuclidean_mean',index=X_test.index), 
                     pd.Series((x for _,x,_ in sqeuclidean), name='sqeuclidean_min',index=X_test.index), 
                     pd.Series((x for _,_,x in sqeuclidean), name='sqeuclidean_max',index=X_test.index)
                    ], axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,minkowski_max,seuclidean_mean,seuclidean_min,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26837,49895,49896,Why can aquatic animals survive in frozen lake...,"When a lake freezes over entirely, how do the ...",0.0,0.0,0.0,14.78399,0.010015,31.995509,...,37.150326,2.196774,0.001384,4.564407,0.0,0.0,0.0,395.695767,0.000139,1380.146722
2592,5150,5151,What kind of bird is this please?,What kind of bird has teeth?,0.0,0.0,0.0,13.255647,0.007708,23.845628,...,27.810974,1.751829,0.001038,3.405401,0.0,0.0,0.0,332.944476,7.6e-05,773.450252
18359,34790,34791,Is it tacky for my step mom to wear the weddin...,"What did Jack Reacher mean when he said, ""The ...",0.0,0.0,0.0,16.630839,3.220897,36.046506,...,39.357945,2.358938,0.569417,4.547128,0.0,0.0,0.0,464.230689,13.481676,1549.047836
73292,125781,125782,What are the basic building blocks of matter?,What are the basic building blocks of SEO?,0.0,0.0,0.0,14.48538,0.010952,23.526312,...,33.635775,1.935479,0.001982,3.713754,0.0,0.0,0.0,456.452404,0.000262,1131.36533
60127,105210,105211,Do pigeons have feelings?,Does every country have pigeons?,0.0,0.0,0.0,13.183021,0.017829,22.212469,...,29.748737,2.085917,0.002717,3.403112,0.0,0.0,0.0,411.08426,0.000434,884.98733


In [122]:
X_test[X_test.isnull().any(axis=1)]

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,minkowski_max,seuclidean_mean,seuclidean_min,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10093,19584,19585,How do you see PM2.5?,Pollution: How do the VOC and PM2.5 levels in ...,,,,,,,...,,,,,,,,,,
13980,26801,26802,What is 4g LTE?,What is 4G and why?,,,,,,,...,,,,,,,,,,
15495,29599,29600,What is 2+2 x 2-2?,What is (X/2)^2?,,,,,,,...,,,,,,,,,,
36767,67007,67008,What should I do with $140MM?,I am 24 years old. Should I invest my money in...,,,,,,,...,,,,,,,,,,
58566,102725,35958,What is goodwell?,"What is ""what is""?",,,,,,,...,,,,,,,,,,
3306,6553,6554,.,Why is Cornell's endowment the lowest in the I...,,,,,,,...,,,,,,,,,,
73083,125456,125457,What should one do in their 20s to avoid regre...,What should one do and should not do in 40s?,,,,,,,...,,,,,,,,,,
17513,33265,33266,6-5=2? What is that?,What is 2+5?,,,,,,,...,,,,,,,,,,
46596,83328,83329,How To Edit DNA?,What?,,,,,,,...,,,,,,,,,,
54029,95429,95430,I'm,I am a 39 year old single woman. Should I have...,,,,,,,...,,,,,,,,,,


### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tokens('train'))

In [None]:
X_trfmd

In [None]:
# dimension reduction using SVD
from sklearn.decomposition import TruncatedSVD
import time
start = time.time()
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
X_svd.shape

In [None]:
# split back into two
X1 = X_svd[:len(X_train), :]
X2 = X_svd[len(X_train):, :]

##### Test set

In [None]:
X_test_trfmd = tfidf.transform(get_tokens('test'))

In [None]:
X_test_trfmd

In [None]:
# dimension reduction using SVD
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
# split back into two
X1_test = X_test_svd[:len(X_test), :]
X2_test = X_test_svd[len(X_test):, :]

In [None]:
# build complete feature dataframe
X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
X_test_temp.head()

### Fuzzy-wuzzy

In [123]:
# difference in text size
compute_size_diff = lambda row: abs(len(str(row['question1'])) - len(str(row['question2'])))
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,seuclidean_mean,seuclidean_min,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,0.0,0.0,0.0,5.886291,0.01494,15.20051,...,1.733502,0.004492,3.727103,0.0,0.0,0.0,86.404961,0.000277,276.211989,5
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,0.0,0.0,0.0,5.61875,0.017163,11.22177,...,1.553674,0.070014,3.059159,0.0,0.0,0.0,66.970934,0.000865,134.17815,16
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,0.0,0.0,0.0,17.502108,5.686578,31.353564,...,2.358302,0.653995,4.167674,0.0,0.0,0.0,543.472354,43.444939,1678.816072,82
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,0.0,0.0,0.0,17.314123,0.040721,31.19108,...,2.243507,0.005841,4.127567,0.0,0.0,0.0,557.545242,0.003035,1276.173114,3
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,0.0,0.0,0.0,13.465144,0.048531,31.48746,...,2.063921,0.005808,4.096533,0.0,0.0,0.0,334.310971,0.002435,1130.084106,3


In [124]:
from fuzzywuzzy import fuzz

In [125]:
# ratio
compute_ratio = lambda row: fuzz.ratio(str(row['question1']), str(row['question2']))
X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,seuclidean_min,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff,ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,0.0,0.0,0.0,5.886291,0.01494,15.20051,...,0.004492,3.727103,0.0,0.0,0.0,86.404961,0.000277,276.211989,5,68
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,0.0,0.0,0.0,5.61875,0.017163,11.22177,...,0.070014,3.059159,0.0,0.0,0.0,66.970934,0.000865,134.17815,16,64
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,0.0,0.0,0.0,17.502108,5.686578,31.353564,...,0.653995,4.167674,0.0,0.0,0.0,543.472354,43.444939,1678.816072,82,27
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,0.0,0.0,0.0,17.314123,0.040721,31.19108,...,0.005841,4.127567,0.0,0.0,0.0,557.545242,0.003035,1276.173114,3,62
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,0.0,0.0,0.0,13.465144,0.048531,31.48746,...,0.005808,4.096533,0.0,0.0,0.0,334.310971,0.002435,1130.084106,3,47


In [126]:
# partial ratio
compute_partial_ratio = lambda row: fuzz.partial_ratio(str(row['question1']), str(row['question2']))
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff,ratio,partial_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,0.0,0.0,0.0,5.886291,0.01494,15.20051,...,3.727103,0.0,0.0,0.0,86.404961,0.000277,276.211989,5,68,78
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,0.0,0.0,0.0,5.61875,0.017163,11.22177,...,3.059159,0.0,0.0,0.0,66.970934,0.000865,134.17815,16,64,60
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,0.0,0.0,0.0,17.502108,5.686578,31.353564,...,4.167674,0.0,0.0,0.0,543.472354,43.444939,1678.816072,82,27,44
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,0.0,0.0,0.0,17.314123,0.040721,31.19108,...,4.127567,0.0,0.0,0.0,557.545242,0.003035,1276.173114,3,62,59
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,0.0,0.0,0.0,13.465144,0.048531,31.48746,...,4.096533,0.0,0.0,0.0,334.310971,0.002435,1130.084106,3,47,47


In [127]:
# token_sort_ratio
compute_token_sort_ratio = lambda row: fuzz.token_sort_ratio(str(row['question1']), str(row['question2']))
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff,ratio,partial_ratio,token_sort_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,0.0,0.0,0.0,5.886291,0.01494,15.20051,...,0.0,0.0,0.0,86.404961,0.000277,276.211989,5,68,78,74
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,0.0,0.0,0.0,5.61875,0.017163,11.22177,...,0.0,0.0,0.0,66.970934,0.000865,134.17815,16,64,60,72
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,0.0,0.0,0.0,17.502108,5.686578,31.353564,...,0.0,0.0,0.0,543.472354,43.444939,1678.816072,82,27,44,29
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,0.0,0.0,0.0,17.314123,0.040721,31.19108,...,0.0,0.0,0.0,557.545242,0.003035,1276.173114,3,62,59,64
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,0.0,0.0,0.0,13.465144,0.048531,31.48746,...,0.0,0.0,0.0,334.310971,0.002435,1130.084106,3,47,47,48


In [128]:
# token_set_ratio
compute_token_set_ratio = lambda row: fuzz.token_set_ratio(str(row['question1']), str(row['question2']))
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71916,123674,123675,What are the uses nitrous oxide?,When is nitrous oxide used?,0.0,0.0,0.0,5.886291,0.01494,15.20051,...,0.0,0.0,86.404961,0.000277,276.211989,5,68,78,74,74
43137,77630,77631,Why is everyone craving for my attention?,Why do I crave attention?,0.0,0.0,0.0,5.61875,0.017163,11.22177,...,0.0,0.0,66.970934,0.000865,134.17815,16,64,60,72,72
66647,115486,115487,Amazing facts about female body?,Is Fantasy cricket leauge is legit and safe in...,0.0,0.0,0.0,17.502108,5.686578,31.353564,...,0.0,0.0,543.472354,43.444939,1678.816072,82,27,44,29,32
21351,40208,24198,Why did not government changed 1000 rupees not...,Why did RBI choose to come out with a ₹2000 no...,0.0,0.0,0.0,17.314123,0.040721,31.19108,...,0.0,0.0,557.545242,0.003035,1276.173114,3,62,59,64,71
68289,118038,118039,What are ideas for Mexican themed party food?,Where can I find Mexican food in mainland China?,0.0,0.0,0.0,13.465144,0.048531,31.48746,...,0.0,0.0,334.310971,0.002435,1130.084106,3,47,47,48,55


In [129]:
# build complete feature dataframe
#X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
#                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
#X_train_temp.head()

In [130]:
#X_train = pd.concat([X_train_temp, X_train], axis=1)
#del X_train_temp
X_train_final = X_train.drop(columns=['qid1', 'qid2','question1','question2']).dropna()
X_train_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50152 entries, 71916 to 15795
Data columns (total 69 columns):
jaccard_mean           50152 non-null float64
jaccard_min            50152 non-null float64
jaccard_max            50152 non-null float64
chebyshev_mean         50152 non-null float64
chebyshev_min          50152 non-null float64
chebyshev_max          50152 non-null float64
braycurtis_mean        50152 non-null float64
braycurtis_min         50152 non-null float64
braycurtis_max         50152 non-null float64
cosine_mean            50152 non-null float64
cosine_min             50152 non-null float64
cosine_max             50152 non-null float64
correlation_mean       50152 non-null float64
correlation_min        50152 non-null float64
correlation_max        50152 non-null float64
hamming_mean           50152 non-null float64
hamming_min            50152 non-null float64
hamming_max            50152 non-null float64
canberra_mean          50152 non-null float64
canberra_min 

In [131]:
X_train_final.tail(20)

Unnamed: 0_level_0,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,braycurtis_mean,braycurtis_min,braycurtis_max,cosine_mean,...,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
71932,0.0,0.0,0.0,18.075379,0.12814,31.268395,1.33358,0.004905,5.335243,0.985253,...,0.0,0.0,565.744444,0.022934,1221.691495,24,71,85,71,93
28693,0.0,0.0,0.0,19.172817,0.039312,32.318418,1.380627,0.002117,5.117623,1.016253,...,0.0,0.0,609.643543,0.004,1394.465389,24,80,82,80,100
53707,0.0,0.0,0.0,15.318126,0.059015,27.517736,1.062881,0.006994,2.458312,0.912164,...,0.0,0.0,426.86367,0.007099,934.535146,16,60,57,57,72
5311,0.0,0.0,0.0,14.276984,0.022868,30.783912,0.989733,0.001004,4.150539,0.906543,...,0.0,0.0,359.305169,0.001201,1238.155986,3,80,89,86,91
67969,0.0,0.0,0.0,16.452165,5.954714,28.343291,1.091722,0.203234,3.416401,1.008159,...,0.0,0.0,438.349989,55.044241,1026.762701,26,36,37,39,41
64925,0.0,0.0,0.0,13.505726,0.023714,23.819968,0.770113,0.000798,1.983836,0.65765,...,0.0,0.0,338.949089,0.001131,606.511521,25,51,49,49,53
62955,0.0,0.0,0.0,15.359228,0.04904,29.729633,1.061056,0.002107,3.955148,0.802339,...,0.0,0.0,400.945356,0.005131,944.295762,14,46,47,56,62
59735,0.0,0.0,0.0,10.049058,0.030972,19.655878,0.624879,0.001899,1.384625,0.640746,...,0.0,0.0,202.772565,0.001001,415.666562,6,63,63,82,82
769,0.0,0.0,0.0,14.492143,0.002499,29.620662,1.192309,0.000103,3.963505,0.893002,...,0.0,0.0,405.154275,1.2e-05,956.02496,1,92,92,89,100
64820,0.0,0.0,0.0,19.78114,0.043448,35.186451,1.111842,0.001408,3.878147,0.921353,...,0.0,0.0,668.885671,0.003718,1908.754297,8,69,70,70,79


##### Test set

In [132]:
# difference in text size
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,seuclidean_mean,seuclidean_min,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26837,49895,49896,Why can aquatic animals survive in frozen lake...,"When a lake freezes over entirely, how do the ...",0.0,0.0,0.0,14.78399,0.010015,31.995509,...,2.196774,0.001384,4.564407,0.0,0.0,0.0,395.695767,0.000139,1380.146722,30
2592,5150,5151,What kind of bird is this please?,What kind of bird has teeth?,0.0,0.0,0.0,13.255647,0.007708,23.845628,...,1.751829,0.001038,3.405401,0.0,0.0,0.0,332.944476,7.6e-05,773.450252,5
18359,34790,34791,Is it tacky for my step mom to wear the weddin...,"What did Jack Reacher mean when he said, ""The ...",0.0,0.0,0.0,16.630839,3.220897,36.046506,...,2.358938,0.569417,4.547128,0.0,0.0,0.0,464.230689,13.481676,1549.047836,9
73292,125781,125782,What are the basic building blocks of matter?,What are the basic building blocks of SEO?,0.0,0.0,0.0,14.48538,0.010952,23.526312,...,1.935479,0.001982,3.713754,0.0,0.0,0.0,456.452404,0.000262,1131.36533,3
60127,105210,105211,Do pigeons have feelings?,Does every country have pigeons?,0.0,0.0,0.0,13.183021,0.017829,22.212469,...,2.085917,0.002717,3.403112,0.0,0.0,0.0,411.08426,0.000434,884.98733,7


In [133]:
# ratio
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,seuclidean_min,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff,ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26837,49895,49896,Why can aquatic animals survive in frozen lake...,"When a lake freezes over entirely, how do the ...",0.0,0.0,0.0,14.78399,0.010015,31.995509,...,0.001384,4.564407,0.0,0.0,0.0,395.695767,0.000139,1380.146722,30,55
2592,5150,5151,What kind of bird is this please?,What kind of bird has teeth?,0.0,0.0,0.0,13.255647,0.007708,23.845628,...,0.001038,3.405401,0.0,0.0,0.0,332.944476,7.6e-05,773.450252,5,79
18359,34790,34791,Is it tacky for my step mom to wear the weddin...,"What did Jack Reacher mean when he said, ""The ...",0.0,0.0,0.0,16.630839,3.220897,36.046506,...,0.569417,4.547128,0.0,0.0,0.0,464.230689,13.481676,1549.047836,9,40
73292,125781,125782,What are the basic building blocks of matter?,What are the basic building blocks of SEO?,0.0,0.0,0.0,14.48538,0.010952,23.526312,...,0.001982,3.713754,0.0,0.0,0.0,456.452404,0.000262,1131.36533,3,90
60127,105210,105211,Do pigeons have feelings?,Does every country have pigeons?,0.0,0.0,0.0,13.183021,0.017829,22.212469,...,0.002717,3.403112,0.0,0.0,0.0,411.08426,0.000434,884.98733,7,56


In [134]:
# partial ratio
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,seuclidean_max,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff,ratio,partial_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26837,49895,49896,Why can aquatic animals survive in frozen lake...,"When a lake freezes over entirely, how do the ...",0.0,0.0,0.0,14.78399,0.010015,31.995509,...,4.564407,0.0,0.0,0.0,395.695767,0.000139,1380.146722,30,55,66
2592,5150,5151,What kind of bird is this please?,What kind of bird has teeth?,0.0,0.0,0.0,13.255647,0.007708,23.845628,...,3.405401,0.0,0.0,0.0,332.944476,7.6e-05,773.450252,5,79,79
18359,34790,34791,Is it tacky for my step mom to wear the weddin...,"What did Jack Reacher mean when he said, ""The ...",0.0,0.0,0.0,16.630839,3.220897,36.046506,...,4.547128,0.0,0.0,0.0,464.230689,13.481676,1549.047836,9,40,39
73292,125781,125782,What are the basic building blocks of matter?,What are the basic building blocks of SEO?,0.0,0.0,0.0,14.48538,0.010952,23.526312,...,3.713754,0.0,0.0,0.0,456.452404,0.000262,1131.36533,3,90,90
60127,105210,105211,Do pigeons have feelings?,Does every country have pigeons?,0.0,0.0,0.0,13.183021,0.017829,22.212469,...,3.403112,0.0,0.0,0.0,411.08426,0.000434,884.98733,7,56,53


In [135]:
# token_sort_ratio
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,sokalsneath_mean,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff,ratio,partial_ratio,token_sort_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26837,49895,49896,Why can aquatic animals survive in frozen lake...,"When a lake freezes over entirely, how do the ...",0.0,0.0,0.0,14.78399,0.010015,31.995509,...,0.0,0.0,0.0,395.695767,0.000139,1380.146722,30,55,66,56
2592,5150,5151,What kind of bird is this please?,What kind of bird has teeth?,0.0,0.0,0.0,13.255647,0.007708,23.845628,...,0.0,0.0,0.0,332.944476,7.6e-05,773.450252,5,79,79,81
18359,34790,34791,Is it tacky for my step mom to wear the weddin...,"What did Jack Reacher mean when he said, ""The ...",0.0,0.0,0.0,16.630839,3.220897,36.046506,...,0.0,0.0,0.0,464.230689,13.481676,1549.047836,9,40,39,47
73292,125781,125782,What are the basic building blocks of matter?,What are the basic building blocks of SEO?,0.0,0.0,0.0,14.48538,0.010952,23.526312,...,0.0,0.0,0.0,456.452404,0.000262,1131.36533,3,90,90,87
60127,105210,105211,Do pigeons have feelings?,Does every country have pigeons?,0.0,0.0,0.0,13.183021,0.017829,22.212469,...,0.0,0.0,0.0,411.08426,0.000434,884.98733,7,56,53,65


In [136]:
# token_set_ratio
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,jaccard_mean,jaccard_min,jaccard_max,chebyshev_mean,chebyshev_min,chebyshev_max,...,sokalsneath_min,sokalsneath_max,sqeuclidean_mean,sqeuclidean_min,sqeuclidean_max,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26837,49895,49896,Why can aquatic animals survive in frozen lake...,"When a lake freezes over entirely, how do the ...",0.0,0.0,0.0,14.78399,0.010015,31.995509,...,0.0,0.0,395.695767,0.000139,1380.146722,30,55,66,56,59
2592,5150,5151,What kind of bird is this please?,What kind of bird has teeth?,0.0,0.0,0.0,13.255647,0.007708,23.845628,...,0.0,0.0,332.944476,7.6e-05,773.450252,5,79,79,81,81
18359,34790,34791,Is it tacky for my step mom to wear the weddin...,"What did Jack Reacher mean when he said, ""The ...",0.0,0.0,0.0,16.630839,3.220897,36.046506,...,0.0,0.0,464.230689,13.481676,1549.047836,9,40,39,47,47
73292,125781,125782,What are the basic building blocks of matter?,What are the basic building blocks of SEO?,0.0,0.0,0.0,14.48538,0.010952,23.526312,...,0.0,0.0,456.452404,0.000262,1131.36533,3,90,90,87,95
60127,105210,105211,Do pigeons have feelings?,Does every country have pigeons?,0.0,0.0,0.0,13.183021,0.017829,22.212469,...,0.0,0.0,411.08426,0.000434,884.98733,7,56,53,65,67


In [137]:
X_test_final = X_test.drop(columns=['question1','question2', 'qid1', 'qid2']).dropna()
X_test_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24712 entries, 26837 to 4165
Data columns (total 69 columns):
jaccard_mean           24712 non-null float64
jaccard_min            24712 non-null float64
jaccard_max            24712 non-null float64
chebyshev_mean         24712 non-null float64
chebyshev_min          24712 non-null float64
chebyshev_max          24712 non-null float64
braycurtis_mean        24712 non-null float64
braycurtis_min         24712 non-null float64
braycurtis_max         24712 non-null float64
cosine_mean            24712 non-null float64
cosine_min             24712 non-null float64
cosine_max             24712 non-null float64
correlation_mean       24712 non-null float64
correlation_min        24712 non-null float64
correlation_max        24712 non-null float64
hamming_mean           24712 non-null float64
hamming_min            24712 non-null float64
hamming_max            24712 non-null float64
canberra_mean          24712 non-null float64
canberra_min  

# Modeling

### Logistic Regression

In [138]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
logr_model = LogisticRegression(random_state=42)
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             #'solver': ['lbfgs']
             #'max_iter': np.linspace(10, 1000, 10)
             }
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1)
y_train_final = y_train.loc[X_train_final.index]
logr_cv.fit(X_train_final, y_train_final)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07]), 'tol': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [139]:
logr_cv.best_params_

{'tol': 1e-05, 'C': 100000.0}

In [140]:
logr_model = LogisticRegression(#solver=logr_cv.best_params_['solver'], 
                                random_state=42, 
                                C=logr_cv.best_params_['C'], 
                                tol=logr_cv.best_params_['tol'], 
                                #max_iter=logr_cv.best_params_['max_iter'], 
                                n_jobs=-1)
logr_model.fit(X_train_final, y_train_final)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
          solver='liblinear', tol=1e-05, verbose=0, warm_start=False)

In [141]:
logr_pred = logr_model.predict(X_test_final)
y_test_final = y_test.loc[X_test_final.index]

In [142]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
logr_acc_score = accuracy_score(y_test_final, logr_pred)
logr_prec_score = precision_score(y_test_final, logr_pred)
logr_rec_score = recall_score(y_test_final, logr_pred)
print('Logistic Regression')
print('accuracy score : {}'.format(logr_acc_score))
print('precision score : {}'.format(logr_prec_score))
print('recall score : {}'.format(logr_rec_score))

Logistic Regression
accuracy score : 0.6776060213661379
precision score : 0.5722578304510824
recall score : 0.5164811176598996


### XQBoost

In [148]:
#! conda install -c conda-forge py-xgboost -y

In [149]:
import xgboost as xgb
# Model selection
params_xgb = {'n_estimators' : [1, 2, 4, 8, 16, 32, 64, 100, 200],
               'gamma':np.linspace(.01, 1, 10, endpoint=True), 
               'learning_rate' : np.linspace(.01, 1, 10, endpoint=True),
               'reg_lambda': np.linspace(0.01, 10, 20, endpoint=True),
               'max_depth' : np.linspace(1, 32, 32, endpoint=True, dtype=int)
                 }
cv_xgb = RandomizedSearchCV(xgb.XGBClassifier(objective='binary:logistic', random_state=42), param_distributions=params_xgb, cv=5, n_jobs=3, random_state=42)
cv_xgb.fit(X_train_final, y_train_final)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=10, n_jobs=3,
          param_distributions={'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200], 'gamma': array([0.01, 0.12, 0.23, 0.34, 0.45, 0.56, 0.67, 0.78, 0.89, 1.  ]), 'learning_rate': array([0.01, 0.12, 0.23, 0.34, 0.45, 0.56, 0.67, 0.78, 0.89, 1.  ]), 'reg_lambda': array([ 0.01   ,  0.53579,  1.06158,  1.58737,  2...10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32])},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
       

In [150]:
cv_xgb.best_params_

{'reg_lambda': 6.319473684210527,
 'n_estimators': 200,
 'max_depth': 29,
 'learning_rate': 0.23,
 'gamma': 0.23}

In [151]:
clf_xgb_model = xgb.XGBClassifier(random_state=42,
                                  n_estimators=cv_xgb.best_params_['n_estimators'],
                                  gamma=cv_xgb.best_params_['gamma'],
                                  learning_rate=cv_xgb.best_params_['learning_rate'],
                                  reg_lambda=cv_xgb.best_params_['reg_lambda'],
                                  max_depth=cv_xgb.best_params_['max_depth'])
clf_xgb_model.fit(X_train_final, y_train_final)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.23, learning_rate=0.23,
       max_delta_step=0, max_depth=29, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=6.319473684210527, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [152]:
y_pred_xgb = clf_xgb_model.predict(X_test_final)
score_xgb = accuracy_score(y_test_final, y_pred_xgb)
rscore_xgb = recall_score(y_test_final, y_pred_xgb)
pscore_xgb = precision_score(y_test_final, y_pred_xgb)
print('Accuracy score for XGBoost ', score_xgb)
print('Recall score for XGBoost ', rscore_xgb)
print('Precision score for XGBoost ', pscore_xgb)

Accuracy score for XGBoost  0.7240207186791842
Recall score for XGBoost  0.6124208688059376
Precision score for XGBoost  0.6318693693693693
