In [None]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from toolz import partition_all

In [None]:
INPUT_BUCKET = 'dq-data'
HASH_BUCKET = 'dq-hashed'

In [5]:
# Load the training set from object storage into a DataFrame indexed by `id`,
# discarding every row that has a missing value.
data = 'train.csv'
filestream = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
# Columns to read; only the keys are used below (as the column selection).
dtypes = dict(
    id='int64',
    qid1='int64',
    qid2='int64',
    question1='object',
    question2='object',
    is_duplicate='int64',
)
df = (
    pd.read_csv(
        filestream,
        header=0,
        usecols=dtypes.keys(),
        skipinitialspace=True,
        skip_blank_lines=True,
        encoding='utf-8',
    )
    .set_index('id')
    .dropna()
)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404287 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404287 non-null int64
qid2            404287 non-null int64
question1       404287 non-null object
question2       404287 non-null object
is_duplicate    404287 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


### Train-test split

In [7]:
from sklearn.model_selection import train_test_split
# Shrink df to its first 75,000 question pairs (i.e. 150,000 individual
# questions once question1 and question2 are counted separately).
# NOTE: this rebinds `df`, so re-running the cell keeps slicing the already
# reduced frame rather than the full one loaded above.
df = df.iloc[:75000]

# Features: every column except the label.
X = df.drop(columns=['is_duplicate'])

# Label column.
y = df['is_duplicate']
# Hold out 33% of the pairs; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50250 entries, 71916 to 15795
Data columns (total 4 columns):
qid1         50250 non-null int64
qid2         50250 non-null int64
question1    50250 non-null object
question2    50250 non-null object
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


In [None]:
#del X,y,df

# Feature Extraction

### Tokenizing and preprocessing

In [9]:
from gensim.parsing.preprocessing import preprocess_string
def get_tokens(process='train'):
    """Yield a preprocessed token list for every question of one split.

    All ``question1`` values are emitted first, followed by all ``question2``
    values, so the generator produces ``2 * len(X)`` items and callers can
    split the result back into halves by position.

    Args:
        process: ``'test'`` selects the global ``X_test``; any other value
            selects the global ``X_train``.

    Yields:
        The return value of ``preprocess_string`` for each question.
    """
    X = X_test if process == 'test' else X_train
    series = pd.Series(pd.concat([X['question1'], X['question2']]), dtype=str)
    # The previous version called ``series.dropna()`` and discarded the result,
    # which did nothing. It is removed rather than made effective: dropping
    # rows here would shift positions and break the positional q1/q2 split
    # that callers perform on the output.
    for question in series:
        yield preprocess_string(question)

### Word vectors (fastText)

In [None]:
ps.get_file(bucket=INPUT_BUCKET, filename='cc.en.300.bin.gz', filepath='/tmp/cc.en.300.bin.gz')

In [None]:
import gzip
import shutil
with gzip.open('/tmp/cc.en.300.bin.gz', 'rb') as f_in:
    with open('/tmp/cc.en.300.bin', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)  

In [10]:
#import os
#os.remove('/tmp/cc.en.300.bin.gz')
from gensim.models import FastText
model = FastText.load_fasttext_format('/tmp/cc.en.300.bin')

In [11]:
def get_ft_vectors(model, process):
    """Yield one array of word vectors per question of the chosen split.

    Args:
        model: object whose ``wv`` attribute supports ``wv[token]`` lookup
            (the notebook passes a gensim FastText model).
        process: forwarded to ``get_tokens`` to pick the split.

    Yields:
        ``np.array`` built from the vectors of the tokens that could be looked
        up, in token order. A question with no resolvable token yields an
        empty array.
    """
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vectors.append(model.wv[token])
            except KeyError:
                # Token has no vector in the model: skip it. Only KeyError is
                # swallowed so that real failures (and Ctrl-C) still surface.
                # NOTE(review): presumably KeyError is what the installed
                # gensim raises for an unrepresentable token -- confirm.
                continue
        yield np.array(vectors)

In [12]:
X_ft = np.array([vectors for vectors in get_ft_vectors(model,'train')])
X_ft.shape

(100500,)

In [13]:
# split back into two
X1_ft = X_ft[:len(X_train)]
X2_ft = X_ft[len(X_train):]

In [None]:
#del X_ft

##### Test set

In [14]:
X_ft_test = np.array([vectors for vectors in get_ft_vectors(model,'test')])

In [15]:
X_ft_test.shape

(49500,)

In [None]:
#del model

In [16]:
# split back into two
X1_ft_test = X_ft_test[:len(X_test)]
X2_ft_test = X_ft_test[len(X_test):]

In [None]:
#del X_ft_test

### Pairwise Metrics

In [17]:
def get_q_lengths(X):
    """Lazily yield ``len(item)`` for each item of ``X``, in iteration order."""
    yield from (len(item) for item in X)

In [18]:
X1_ft.shape

(50250,)

In [19]:
# Per pair: (number of question1 vectors, number of question2 vectors).
q_meta_train = list(zip(get_q_lengths(X1_ft), get_q_lengths(X2_ft)))

In [20]:
# Stack the per-question vector matrices into one flat 2-D matrix, skipping
# questions that produced no vectors. This gives the same result as the former
# vsplit -> array -> vstack -> concatenate round trip, without creating one
# temporary array per row.
X_train_300 = np.vstack([x for x in X_ft if x.size > 0])

In [21]:
X_train_300.shape

(482518, 300)

In [None]:
#del X1_ft, X2_ft

In [None]:
#import sys
## These are the usual ipython objects, including this one you are creating
#ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
#sorted([(x, sys.getsizeof(globals().get(x))) 
#        for x in dir() if not x.startswith('_') 
#        and x not in sys.modules and x not in ipython_vars], 
#       key=lambda x: x[1], reverse=True)

In [None]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_train.mtx', X_train_300 )

In [None]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_train.mtx', source='wor2vec_300_train.mtx')

In [None]:
#del X_train_300

In [22]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_train.mtx', filepath='embed_train.mtx')

<minio.definitions.Object at 0x7f1b83097668>

In [23]:
from scipy.io import mmread
X_rd = mmread('embed_train.mtx')

In [24]:
X_rd.shape

(482518, 3)

In [25]:
len(q_meta_train)

50250

In [26]:
# Rebuild the per-question point clouds X1_rd / X2_rd from the flat matrix X_rd.
# q_meta_train holds, per pair, the row counts of question1 and question2.
# Assumes the rows of X_rd are laid out as all question1 rows followed by all
# question2 rows (the order in which the vectors were stacked) -- TODO confirm
# for the externally produced embed_train.mtx.
def _ragged_object_array(chunks):
    """Pack arrays of differing shapes into a 1-D object array, one per slot.

    ``np.array(chunks)`` on a ragged list raises ValueError on recent NumPy
    unless the result is explicitly an object array, and it silently builds a
    3-D array when all chunks happen to share a shape. Filling slot by slot
    always yields shape ``(len(chunks),)``.
    """
    packed = np.empty(len(chunks), dtype=object)
    for slot, chunk in enumerate(chunks):
        packed[slot] = chunk
    return packed


X1_list = []
X2_list = []
q1_ptr = 0
for len_q1, _ in q_meta_train:
    q1 = np.array(X_rd[q1_ptr:q1_ptr + len_q1])
    X1_list.append(q1)
    q1_ptr += len_q1
# question2 rows begin right after the last question1 row
q2_ptr = q1_ptr
for _, len_q2 in q_meta_train:
    q2 = np.array(X_rd[q2_ptr:q2_ptr + len_q2])
    X2_list.append(q2)
    q2_ptr += len_q2
X1_rd = _ragged_object_array(X1_list)
X2_rd = _ragged_object_array(X2_list)

In [None]:
#del X1_list, X2_list, X_rd, X1_rd_tmp, X2_rd_tmp

In [27]:
import dask.dataframe as dd
from dask import delayed, compute
from dask.distributed import Client
from utils import dask
client = dask.create_dask_client(num_workers=8)

In [28]:
from scipy.spatial import ConvexHull
from math import sqrt
import numpy as np
from math import atan2, cos, sin, pi
from collections import namedtuple


def unit_vector(pt0, pt1):
    """Return the unit-length 2-D direction pointing from ``pt0`` to ``pt1``.

    Only the first two coordinates of each point are read. The components are
    divided by the distance between the points, so coincident points lead to a
    division by zero.
    """
    dx = pt1[0] - pt0[0]
    dy = pt1[1] - pt0[1]
    distance = sqrt(dx**2 + dy**2)
    return dx / distance, dy / distance


def orthogonal_vector(vector):
    """Return ``vector`` turned by 90 degrees: ``(x, y)`` becomes ``(-y, x)``.

    The result is perpendicular to the input and has the same length.
    """
    x, y = vector[0], vector[1]
    return -1 * y, x


def bounding_area(index, hull):
    """Describe the bounding rectangle of ``hull`` aligned with one hull edge.

    The edge runs from ``hull[index]`` to ``hull[index + 1]``. Every hull point
    is projected onto the edge direction and onto its perpendicular; the spread
    of each set of projections is the corresponding side length.

    Returns:
        dict with keys ``area``, ``length_parallel``, ``length_orthogonal``,
        ``rectangle_center`` and ``unit_vector``. ``rectangle_center`` is
        expressed in the projected (parallel, orthogonal) coordinates, not in
        x/y.
    """
    unit_vector_p = unit_vector(hull[index], hull[index + 1])
    unit_vector_o = orthogonal_vector(unit_vector_p)

    along = [np.dot(unit_vector_p, pt) for pt in hull]
    across = [np.dot(unit_vector_o, pt) for pt in hull]

    min_p = min(along)
    min_o = min(across)
    len_p = max(along) - min_p
    len_o = max(across) - min_o

    return dict(
        area=len_p * len_o,
        length_parallel=len_p,
        length_orthogonal=len_o,
        rectangle_center=(min_p + len_p / 2, min_o + len_o / 2),
        unit_vector=unit_vector_p,
    )


def to_xy_coordinates(unit_vector_angle, point):
    """Convert ``point`` from unit-vector coordinates to x/y coordinates.

    ``point[0]`` is the component along the direction at ``unit_vector_angle``
    (radians) and ``point[1]`` the component along the direction a quarter turn
    further. Returns the ``(x, y)`` tuple.
    """
    angle_orthogonal = unit_vector_angle + pi / 2
    along, across = point[0], point[1]
    x = along * cos(unit_vector_angle) + across * cos(angle_orthogonal)
    y = along * sin(unit_vector_angle) + across * sin(angle_orthogonal)
    return x, y


def rotate_points(center_of_rotation, angle, points):
    """Rotate 2-D points around ``center_of_rotation`` by ``angle`` radians.

    Args:
        center_of_rotation: 2-D point, e.g. ``(1.56, -23.4)``.
        angle: rotation angle in radians.
        points: iterable of 2-D points, e.g. ``((1.56, -23.4), (1.56, -23.4))``.

    Returns:
        A list of ``(x, y)`` tuples, one per input point, in input order.
    """
    cx, cy = center_of_rotation[0], center_of_rotation[1]
    rot_points = []
    for pt in points:
        dx, dy = pt[0] - cx, pt[1] - cy
        # Work in polar form relative to the centre: shift the angle, keep
        # the radius.
        theta = atan2(dy, dx) + angle
        radius = sqrt(dx**2 + dy**2)
        rot_points.append((cx + radius * cos(theta), cy + radius * sin(theta)))

    return rot_points


def rectangle_corners(rectangle):
    """Return the four corner points of a bounding rectangle.

    Args:
        rectangle: mapping with the keys ``rectangle_center``,
            ``length_parallel``, ``length_orthogonal`` and
            ``unit_vector_angle`` (as prepared inside ``MinimumBoundingBox``).

    Returns:
        List of four ``(x, y)`` tuples: the axis-aligned corners around the
        centre, rotated about the centre by ``unit_vector_angle``.
    """
    center = rectangle['rectangle_center']
    # Half-side multipliers (parallel, orthogonal), walking around the rectangle.
    half_factors = ((.5, .5), (.5, -.5), (-.5, -.5), (-.5, .5))
    corner_points = [
        (center[0] + f_par * rectangle['length_parallel'],
         center[1] + f_orth * rectangle['length_orthogonal'])
        for f_par, f_orth in half_factors
    ]

    return rotate_points(center, rectangle['unit_vector_angle'], corner_points)


# Immutable record returned by MinimumBoundingBox; field order is part of the
# interface because the tuple can be unpacked positionally.
BoundingBox = namedtuple(
    'BoundingBox',
    [
        'area',
        'length_parallel',
        'length_orthogonal',
        'rectangle_center',
        'unit_vector',
        'unit_vector_angle',
        'corner_points',
    ],
)


def MinimumBoundingBox(points):
    """Find the minimum-area bounding rectangle of a 2-D point cloud.

    One candidate rectangle is built per edge of the convex hull (aligned with
    that edge) and the candidate with the smallest area wins; on ties the
    earliest edge is kept.

    Args:
        points: list or tuple of 2-D points, e.g. ``((5, 2), (3, 4), (6, 8))``.
            More than two points are required.

    Returns:
        BoundingBox with the fields
            area: area of the rectangle
            length_parallel: side length parallel to ``unit_vector``
            length_orthogonal: side length orthogonal to ``unit_vector``
            rectangle_center: centre of the rectangle in x/y coordinates
            unit_vector: direction of the ``length_parallel`` side (its
                perpendicular is available through ``orthogonal_vector``)
            unit_vector_angle: angle of ``unit_vector`` in radians
            corner_points: set holding the corners of the rectangle

    Raises:
        ValueError: if ``points`` has two or fewer entries.
    """
    if len(points) <= 2:
        raise ValueError('More than two points required.')

    # Hull vertices in the order scipy reports them, closed by repeating the
    # first vertex so every edge is (hull[i], hull[i + 1]).
    hull_ordered = [points[index] for index in ConvexHull(points).vertices]
    hull_ordered.append(hull_ordered[0])
    hull_ordered = tuple(hull_ordered)

    candidates = (
        bounding_area(edge, hull_ordered)
        for edge in range(len(hull_ordered) - 1)
    )
    min_rectangle = min(candidates, key=lambda rect: rect['area'])

    direction = min_rectangle['unit_vector']
    angle = atan2(direction[1], direction[0])
    min_rectangle['unit_vector_angle'] = angle
    # bounding_area reports the centre in projected coordinates; convert to x/y.
    min_rectangle['rectangle_center'] = to_xy_coordinates(angle, min_rectangle['rectangle_center'])

    return BoundingBox(
        area=min_rectangle['area'],
        length_parallel=min_rectangle['length_parallel'],
        length_orthogonal=min_rectangle['length_orthogonal'],
        rectangle_center=min_rectangle['rectangle_center'],
        unit_vector=min_rectangle['unit_vector'],
        unit_vector_angle=min_rectangle['unit_vector_angle'],
        corner_points=set(rectangle_corners(min_rectangle)),
    )

In [29]:
def make_array(points_set):
    """Convert an iterable of points into a NumPy array with one row per point."""
    return np.array([list(point) for point in points_set])

In [30]:
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist, directed_hausdorff
from fastdtw import fastdtw
import similaritymeasures
from scipy.spatial import procrustes
def compute_pairwise_kernel(pc1, pc2, method='linear'):
    """Return the kernel matrix between the rows of ``pc1`` and ``pc2``.

    Args:
        pc1, pc2: inputs handed straight to the scikit-learn kernel function.
        method: ``'polynomial'`` (degree 2), ``'rbf'``, ``'sigmoid'`` or
            ``'laplacian'``; any other value falls back to the linear kernel.
    """
    if method == 'polynomial':
        # The polynomial kernel is the only one called with an extra argument.
        return polynomial_kernel(pc1, pc2, 2)
    kernels = {
        'rbf': rbf_kernel,
        'sigmoid': sigmoid_kernel,
        'laplacian': laplacian_kernel,
    }
    return kernels.get(method, linear_kernel)(pc1, pc2)
    
def compute_pairwise_dist(pc1, pc2, method='euclidean'):
    """Reduce the distances between two point clouds to a single number.

    Args:
        pc1, pc2: arrays with one point per row.
        method: ``'hausdorff'`` for the directed Hausdorff distance from
            ``pc1`` to ``pc2``; any other value is passed to
            ``pairwise_distances`` as the metric name.

    Returns:
        ``np.nan`` when either cloud is empty; the directed Hausdorff distance
        for ``'hausdorff'``; otherwise the Frobenius norm of the pairwise
        distance matrix.
    """
    if 0 in (pc1.size, pc2.size):
        return np.nan
    if method == 'hausdorff':
        return directed_hausdorff(pc1, pc2)[0]
    dist_mat = pairwise_distances(pc1, pc2, metric=method)
    return np.linalg.norm(dist_mat, ord='fro')

def compute_pairwise_metric(pc1, pc2, method='dtw'):
    """Compute a curve/shape similarity measure between two point clouds.

    Only the first two columns of each cloud are used.

    Args:
        pc1, pc2: 2-D arrays with one point per row.
        method: one of ``'pcm'``, ``'discrete_frechet'``, ``'area'``,
            ``'curve_length'``, ``'dtw'`` or ``'procrustes'``.

    Returns:
        The measure as a number, or ``np.nan`` when either cloud is empty or,
        for ``'procrustes'``, when either cloud has fewer than three points
        (no bounding box can be built from it).

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if pc1.size == 0 or pc2.size == 0:
        return np.nan

    curve_measures = ('pcm', 'discrete_frechet', 'area', 'curve_length', 'dtw')
    if method != 'procrustes' and method not in curve_measures:
        # Previously an unknown name fell through every branch and died with
        # an UnboundLocalError on ``dist``.
        raise ValueError('Unknown pairwise metric: {!r}'.format(method))

    if method == 'procrustes':
        # MinimumBoundingBox needs more than two points; treat smaller clouds
        # like the other degenerate inputs instead of raising.
        if len(pc1) < 3 or len(pc2) < 3:
            return np.nan
        # Imported here under a private alias: the notebook later rebinds the
        # global name ``procrustes`` to a list of results, which would
        # otherwise replace the scipy function this branch calls.
        from scipy.spatial import procrustes as _procrustes
        mbox1 = MinimumBoundingBox([x[:2] for x in pc1.tolist()])
        mbox2 = MinimumBoundingBox([x[:2] for x in pc2.tolist()])
        # NOTE(review): corner_points is a set, so the corner order fed to the
        # Procrustes analysis is not geometrically defined -- confirm this is
        # the intended comparison.
        _, _, dist = _procrustes(make_array(mbox1.corner_points),
                                 make_array(mbox2.corner_points))
        return dist

    xy1, xy2 = pc1[:, :2], pc2[:, :2]
    if method == 'pcm':
        return similaritymeasures.pcm(xy1, xy2)
    if method == 'discrete_frechet':
        return similaritymeasures.frechet_dist(xy1, xy2)
    if method == 'area':
        return similaritymeasures.area_between_two_curves(xy1, xy2)
    if method == 'curve_length':
        return similaritymeasures.curve_length_measure(xy1, xy2)
    # method == 'dtw': the second return value is discarded.
    dist, _ = similaritymeasures.dtw(xy1, xy2)
    return dist

        
def assign_pwmetric(df, method='euclidean'):
    # Applies compute_pairwise_dist through DataFrame.apply with axis=1.
    # NOTE(review): `method` is passed as the second positional argument of
    # DataFrame.apply, which is the `axis` parameter in the pandas signature,
    # while `axis=1` is also given by keyword -- as written this looks like it
    # raises TypeError when called. compute_pairwise_dist also expects two
    # point clouds, not a single row. No caller is visible in this notebook;
    # confirm the intended frame layout before relying on this helper.
    #return compute_pairwise_kernel(pc1_embd, pc2_embd, method=method)
    return df.apply(compute_pairwise_dist, method, axis=1)

In [None]:
#mbox2 = MinimumBoundingBox([x[:2] for x in q2_temp.tolist()])

In [None]:
#mbox1 = MinimumBoundingBox([x[:2] for x in q1_temp.tolist()])

In [None]:
#make_array(mbox2.corner_points)

#### Dynamic Time Warping

In [None]:
! pip install fastdtw

In [None]:
! pip install similaritymeasures

In [31]:
# Queue one dask task per (question pair, metric). Nothing is evaluated here;
# every list is materialised later by its own `compute(*...)` cell.
_DIST_METRICS = (
    'jaccard', 'chebyshev', 'braycurtis', 'cosine', 'correlation', 'hamming',
    'canberra', 'hausdorff', 'cityblock', 'euclidean', 'l1', 'l2', 'manhattan',
    'dice', 'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener',
    'minkowski', 'seuclidean', 'sokalsneath', 'sqeuclidean',
)
# 'fdtw', 'pcm' and 'area' stay disabled, as before.
_CURVE_METRICS = ('dtw', 'curve_length', 'discrete_frechet', 'procrustes')

_tasks = {name: [] for name in _DIST_METRICS + _CURVE_METRICS}
# The earlier version wrapped the loop body in `if q_tuple: ... else: <NaN>`.
# A 2-tuple produced by zip is always truthy, so the NaN branch never ran;
# empty point clouds are mapped to NaN inside the compute_* functions instead.
for q1_rd, q2_rd in zip(X1_rd, X2_rd):
    for name in _DIST_METRICS:
        _tasks[name].append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, name))
    for name in _CURVE_METRICS:
        _tasks[name].append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, name))

# Bind the task lists to the names the following cells expect
# (same order as _DIST_METRICS / _CURVE_METRICS).
(jaccard, chebyshev, braycurtis, cosine, correlation, hamming,
 canberra, hausdorff, cityblock, euclidean, l1, l2, manhattan,
 dice, kulsinski, rogerstanimoto, russellrao, sokalmichener,
 minkowski, seuclidean, sokalsneath, sqeuclidean) = [_tasks[name] for name in _DIST_METRICS]
# NOTE: binding `procrustes` here rebinds the name imported from scipy.spatial
# earlier in the notebook.
dtw, curve_length, discrete_frechet, procrustes = [_tasks[name] for name in _CURVE_METRICS]

In [32]:
jaccard = compute(*jaccard)

In [33]:
chebyshev = compute(*chebyshev)

In [34]:
braycurtis = compute(*braycurtis)

In [35]:
cosine = compute(*cosine)

In [36]:
correlation = compute(*correlation)

In [37]:
hamming = compute(*hamming)

In [38]:
canberra = compute(*canberra)

In [39]:
hausdorff = compute(*hausdorff)

In [40]:
cityblock = compute(*cityblock)

In [41]:
euclidean = compute(*euclidean)



In [42]:
l1 = compute(*l1)



In [44]:
l2 = compute(*l2)



UnpicklingError: invalid load key, '\x16'.



In [None]:
manhattan = compute(*manhattan)

In [None]:
dice = compute(*dice)

In [None]:
kulsinski = compute(*kulsinski)

In [None]:
rogerstanimoto = compute(*rogerstanimoto)

In [None]:
russellrao = compute(*russellrao)

In [None]:
sokalmichener = compute(*sokalmichener)

In [None]:
minkowski = compute(*minkowski)

In [None]:
seuclidean = compute(*seuclidean)

In [None]:
sokalsneath = compute(*sokalsneath)

In [None]:
sqeuclidean = compute(*sqeuclidean)

In [None]:
#fdtw = compute(*fdtw)

In [None]:
dtw = compute(*dtw)

In [None]:
#pcm = compute(*pcm)

In [None]:
#area = compute(*area)

In [None]:
#curve_length = compute(*curve_length)

In [None]:
discrete_frechet = compute(*discrete_frechet)

In [None]:
procrustes = compute(*procrustes)

In [None]:
len(braycurtis)

In [None]:
np.sum(np.isnan(procrustes))

#### Add the pairwise metrics above to X_train

In [None]:
# Append every computed pairwise metric to X_train as a new column. The values
# are aligned by position and re-labelled with X_train's index.
# fdtw, pcm, area and curve_length are not included.
_train_metric_columns = [
    ('jaccard', jaccard),
    ('chebyshev', chebyshev),
    ('braycurtis', braycurtis),
    ('cosine', cosine),
    ('correlation', correlation),
    ('hamming', hamming),
    ('canberra', canberra),
    ('hausdorff', hausdorff),
    ('dtw', dtw),
    ('discrete_frechet', discrete_frechet),
    ('procrustes', procrustes),
    ('cityblock', cityblock),
    ('euclidean', euclidean),
    ('l1', l1),
    ('l2', l2),
    ('manhattan', manhattan),
    ('dice', dice),
    ('kulsinski', kulsinski),
    ('rogerstanimoto', rogerstanimoto),
    ('russellrao', russellrao),
    ('sokalmichener', sokalmichener),
    ('minkowski', minkowski),
    ('seuclidean', seuclidean),
    ('sokalsneath', sokalsneath),
    ('sqeuclidean', sqeuclidean),
]
X_train = pd.concat(
    [X_train] + [
        pd.Series(values, name=column, index=X_train.index)
        for column, values in _train_metric_columns
    ],
    axis=1,
)
X_train.head()

In [None]:
#X_train = X_train.drop(columns=['curve_length'])

In [None]:
X_train[X_train.isnull().any(axis=1)]

##### Test set

In [None]:
# Per pair: (number of question1 vectors, number of question2 vectors).
q_meta_test = list(zip(get_q_lengths(X1_ft_test), get_q_lengths(X2_ft_test)))

In [None]:
# Stack the per-question vector matrices of the test split into one flat 2-D
# matrix, skipping questions that produced no vectors. This gives the same
# result as the former vsplit -> array -> vstack -> concatenate round trip,
# without creating one temporary array per row.
X_test_300 = np.vstack([x for x in X_ft_test if x.size > 0])

In [None]:
X_test_300.shape

In [None]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_test.mtx', X_test_300 )

In [None]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_test.mtx', source='wor2vec_300_test.mtx')

In [None]:
#del X_test_300

In [None]:
ps.get_file(bucket=INPUT_BUCKET, filename='embed_test.mtx', filepath='embed_test.mtx')

In [None]:
from scipy.io import mmread
X_rd_test = mmread('embed_test.mtx')

In [None]:
X_rd_test.shape

In [None]:
# Rebuild the per-question point clouds X1_rd_test / X2_rd_test from the flat
# matrix X_rd_test. q_meta_test holds, per pair, the row counts of question1
# and question2.
# Assumes the rows of X_rd_test are laid out as all question1 rows followed by
# all question2 rows -- TODO confirm for the externally produced embed_test.mtx.
def _ragged_object_array(chunks):
    """Pack arrays of differing shapes into a 1-D object array, one per slot.

    ``np.array(chunks)`` on a ragged list raises ValueError on recent NumPy
    unless the result is explicitly an object array, and it silently builds a
    3-D array when all chunks happen to share a shape. Filling slot by slot
    always yields shape ``(len(chunks),)``.
    """
    packed = np.empty(len(chunks), dtype=object)
    for slot, chunk in enumerate(chunks):
        packed[slot] = chunk
    return packed


X1_list = []
X2_list = []
q1_ptr = 0
for len_q1, _ in q_meta_test:
    q1 = np.array(X_rd_test[q1_ptr:q1_ptr + len_q1])
    X1_list.append(q1)
    q1_ptr += len_q1
# question2 rows begin right after the last question1 row
q2_ptr = q1_ptr
for _, len_q2 in q_meta_test:
    q2 = np.array(X_rd_test[q2_ptr:q2_ptr + len_q2])
    X2_list.append(q2)
    q2_ptr += len_q2
X1_rd_test = _ragged_object_array(X1_list)
X2_rd_test = _ragged_object_array(X2_list)

In [None]:
X1_rd_test.shape

In [None]:
#del X1_list, X2_list, q1_meta, q2_meta, X_rd_test, X1_rd_tmp, X2_rd_tmp

In [None]:
# Queue one dask task per (test question pair, metric). Nothing is evaluated
# here; every list is materialised later by its own `compute(*...)` cell.
_DIST_METRICS = (
    'jaccard', 'chebyshev', 'braycurtis', 'cosine', 'correlation', 'hamming',
    'canberra', 'hausdorff', 'cityblock', 'euclidean', 'l1', 'l2', 'manhattan',
    'dice', 'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener',
    'minkowski', 'seuclidean', 'sokalsneath', 'sqeuclidean',
)
# 'fdtw', 'pcm', 'area' and 'curve_length' stay disabled for the test split,
# and no 'procrustes' tasks are queued here, as before.
_CURVE_METRICS = ('dtw', 'discrete_frechet')

_tasks = {name: [] for name in _DIST_METRICS + _CURVE_METRICS}
# The earlier version wrapped the loop body in `if q_tuple: ... else: <NaN>`.
# A 2-tuple produced by zip is always truthy, so the NaN branch never ran;
# empty point clouds are mapped to NaN inside the compute_* functions instead.
for q1_rd, q2_rd in zip(X1_rd_test, X2_rd_test):
    for name in _DIST_METRICS:
        _tasks[name].append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, name))
    for name in _CURVE_METRICS:
        _tasks[name].append(delayed(compute_pairwise_metric)(q1_rd, q2_rd, name))

# Bind the task lists to the names the following cells expect
# (same order as _DIST_METRICS / _CURVE_METRICS).
(jaccard, chebyshev, braycurtis, cosine, correlation, hamming,
 canberra, hausdorff, cityblock, euclidean, l1, l2, manhattan,
 dice, kulsinski, rogerstanimoto, russellrao, sokalmichener,
 minkowski, seuclidean, sokalsneath, sqeuclidean) = [_tasks[name] for name in _DIST_METRICS]
dtw, discrete_frechet = [_tasks[name] for name in _CURVE_METRICS]

In [None]:
jaccard = compute(*jaccard)

In [None]:
chebyshev = compute(*chebyshev)

In [None]:
braycurtis = compute(*braycurtis)

In [None]:
cosine = compute(*cosine)

In [None]:
correlation = compute(*correlation)

In [None]:
hamming = compute(*hamming)

In [None]:
canberra = compute(*canberra)

In [None]:
hausdorff = compute(*hausdorff)

In [None]:
cityblock = compute(*cityblock)

In [None]:
euclidean = compute(*euclidean)

In [None]:
l1 = compute(*l1)

In [None]:
l2 = compute(*l2)

In [None]:
manhattan = compute(*manhattan)

In [None]:
dice = compute(*dice)

In [None]:
kulsinski = compute(*kulsinski)

In [None]:
rogerstanimoto = compute(*rogerstanimoto)

In [None]:
russellrao = compute(*russellrao)

In [None]:
sokalmichener = compute(*sokalmichener)

In [None]:
minkowski = compute(*minkowski)

In [None]:
seuclidean = compute(*seuclidean)

In [None]:
sokalsneath = compute(*sokalsneath)

In [None]:
sqeuclidean = compute(*sqeuclidean)

In [None]:
#fdtw = compute(*fdtw)

In [None]:
dtw = compute(*dtw)

In [None]:
#pcm = compute(*pcm)

In [None]:
#area = compute(*area)

In [None]:
#curve_length = compute(*curve_length)

In [None]:
discrete_frechet = compute(*discrete_frechet)

In [None]:
# Append every computed pairwise metric to X_test as a new column. The values
# are aligned by position and re-labelled with X_test's index.
# fdtw, pcm, area and curve_length are not included.
_test_metric_columns = [
    ('jaccard', jaccard),
    ('chebyshev', chebyshev),
    ('braycurtis', braycurtis),
    ('cosine', cosine),
    ('correlation', correlation),
    ('hamming', hamming),
    ('canberra', canberra),
    ('hausdorff', hausdorff),
    ('dtw', dtw),
    ('discrete_frechet', discrete_frechet),
    ('cityblock', cityblock),
    ('euclidean', euclidean),
    ('l1', l1),
    ('l2', l2),
    ('manhattan', manhattan),
    ('dice', dice),
    ('kulsinski', kulsinski),
    ('rogerstanimoto', rogerstanimoto),
    ('russellrao', russellrao),
    ('sokalmichener', sokalmichener),
    ('minkowski', minkowski),
    ('seuclidean', seuclidean),
    ('sokalsneath', sokalsneath),
    ('sqeuclidean', sqeuclidean),
]
X_test = pd.concat(
    [X_test] + [
        pd.Series(values, name=column, index=X_test.index)
        for column, values in _test_metric_columns
    ],
    axis=1,
)
X_test.head()

In [None]:
X_test[X_test.isnull().any(axis=1)]

### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tokens('train'))

In [None]:
X_trfmd

In [None]:
# dimension reduction using SVD
from sklearn.decomposition import TruncatedSVD
import time
start = time.time()
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
X_svd.shape

In [None]:
# split back into two
X1 = X_svd[:len(X_train), :]
X2 = X_svd[len(X_train):, :]

##### Test set

In [None]:
X_test_trfmd = tfidf.transform(get_tokens('test'))

In [None]:
X_test_trfmd

In [None]:
# dimension reduction using SVD
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
# split back into two
X1_test = X_test_svd[:len(X_test), :]
X2_test = X_test_svd[len(X_test):, :]

In [None]:
# build complete feature dataframe
X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
X_test_temp.head()

### Fuzzy-wuzzy

In [None]:
# difference in text size
compute_size_diff = lambda row: abs(len(str(row['question1'])) - len(str(row['question2'])))
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
X_train.head()

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# ratio
def compute_ratio(row):
    """Return fuzz.ratio for the row's question1 and question2, each coerced with str()."""
    q1, q2 = str(row['question1']), str(row['question2'])
    return fuzz.ratio(q1, q2)


X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
X_train.head()

In [None]:
# partial ratio
def compute_partial_ratio(row):
    """Return fuzz.partial_ratio for the row's question1 and question2, each coerced with str()."""
    q1, q2 = str(row['question1']), str(row['question2'])
    return fuzz.partial_ratio(q1, q2)


X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
X_train.head()

In [None]:
# token_sort_ratio
def compute_token_sort_ratio(row):
    """Return fuzz.token_sort_ratio for the row's question1 and question2, each coerced with str()."""
    q1, q2 = str(row['question1']), str(row['question2'])
    return fuzz.token_sort_ratio(q1, q2)


X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
X_train.head()

In [None]:
# token_set_ratio
def compute_token_set_ratio(row):
    """Return fuzz.token_set_ratio for the row's question1 and question2, each coerced with str()."""
    q1, q2 = str(row['question1']), str(row['question2'])
    return fuzz.token_set_ratio(q1, q2)


X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
X_train.head()

In [None]:
# build complete feature dataframe
#X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
#                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
#X_train_temp.head()

In [None]:
# The SVD feature frame is not merged in; that step is kept disabled below.
#X_train = pd.concat([X_train_temp, X_train], axis=1)
#del X_train_temp

# Keep only the engineered feature columns: remove the id and raw-text
# columns, then discard any row that still contains a null.
X_train_final = (
    X_train
    .drop(columns=['qid1', 'qid2', 'question1', 'question2'])
    .dropna()
)
X_train_final.info()

In [None]:
# Inspect the last 20 rows of the final training feature frame.
X_train_final.tail(20)

##### Test set

In [None]:
# difference in text size
# Apply the row-wise compute_size_diff helper from the training-set cells to the test rows.
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
X_test.head()

In [None]:
# ratio
# Apply the row-wise compute_ratio helper from the training-set cells to the test rows.
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
X_test.head()

In [None]:
# partial ratio
# Apply the row-wise compute_partial_ratio helper from the training-set cells to the test rows.
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
X_test.head()

In [None]:
# token_sort_ratio
# Apply the row-wise compute_token_sort_ratio helper from the training-set cells to the test rows.
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
X_test.head()

In [None]:
# token_set_ratio
# Apply the row-wise compute_token_set_ratio helper from the training-set cells to the test rows.
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
X_test.head()

In [None]:
# Keep only the engineered feature columns: remove the raw-text and id
# columns, then discard any row that still contains a null.
X_test_final = (
    X_test
    .drop(columns=['question1', 'question2', 'qid1', 'qid2'])
    .dropna()
)
X_test_final.info()

# Modeling

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

logr_model = LogisticRegression(random_state=42)
# Search space: only C and tol are sampled; the commented-out entries
# (penalty, solver, max_iter) are not searched.
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             #'solver': ['lbfgs']
             #'max_iter': np.linspace(10, 1000, 10)
             }
# random_state fixes which parameter combinations are sampled, so the search
# and its best_params_ are repeatable across runs; the XGBoost search later
# in the notebook seeds its RandomizedSearchCV the same way.
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1, random_state=42)
# X_train_final had rows removed by dropna(), so select the labels by its
# index to keep features and targets aligned.
y_train_final = y_train.loc[X_train_final.index]
logr_cv.fit(X_train_final, y_train_final)

In [None]:
# Parameter combination selected by the logistic-regression search.
logr_cv.best_params_

In [None]:
# Rebuild the classifier with the hyper-parameters chosen by the search.
# best_params_ carries exactly the searched keys (C and tol); solver and
# max_iter were not searched and are therefore not passed here.
logr_model = LogisticRegression(random_state=42, n_jobs=-1, **logr_cv.best_params_)
logr_model.fit(X_train_final, y_train_final)

In [None]:
# X_test_final had rows removed by dropna(), so select the labels by its
# index to keep them aligned with the predictions.
y_test_final = y_test.loc[X_test_final.index]
# Predicted labels for the final test feature frame.
logr_pred = logr_model.predict(X_test_final)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score the logistic-regression predictions against the aligned test labels.
logr_acc_score = accuracy_score(y_test_final, logr_pred)
logr_prec_score = precision_score(y_test_final, logr_pred)
logr_rec_score = recall_score(y_test_final, logr_pred)

print('Logistic Regression')
for metric_label, metric_value in (('accuracy', logr_acc_score),
                                   ('precision', logr_prec_score),
                                   ('recall', logr_rec_score)):
    print('{} score : {}'.format(metric_label, metric_value))

### XGBoost

In [None]:
import xgboost as xgb

# Model selection
# Candidate values for each hyper-parameter sampled by the randomized search.
params_xgb = {
    'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'gamma': np.linspace(.01, 1, 10, endpoint=True),
    'learning_rate': np.linspace(.01, 1, 10, endpoint=True),
    'reg_lambda': np.linspace(0.01, 10, 20, endpoint=True),
    'max_depth': np.linspace(1, 32, 32, endpoint=True, dtype=int),
}

# Base estimator for the search; both it and the search itself are seeded.
xgb_search_estimator = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
cv_xgb = RandomizedSearchCV(
    xgb_search_estimator,
    param_distributions=params_xgb,
    cv=5,
    n_jobs=3,
    random_state=42,
)
cv_xgb.fit(X_train_final, y_train_final)

In [None]:
# Parameter combination selected by the XGBoost search.
cv_xgb.best_params_

In [None]:
# Rebuild the classifier with the hyper-parameters chosen by the search.
# best_params_ carries exactly the five searched keys (n_estimators, gamma,
# learning_rate, reg_lambda, max_depth), so unpacking it passes the same
# keyword arguments as listing them one by one.
clf_xgb_model = xgb.XGBClassifier(random_state=42, **cv_xgb.best_params_)
clf_xgb_model.fit(X_train_final, y_train_final)

In [None]:
# Score the XGBoost predictions against the aligned test labels.
y_pred_xgb = clf_xgb_model.predict(X_test_final)
score_xgb = accuracy_score(y_test_final, y_pred_xgb)
rscore_xgb = recall_score(y_test_final, y_pred_xgb)
pscore_xgb = precision_score(y_test_final, y_pred_xgb)

for metric_label, metric_value in (('Accuracy', score_xgb),
                                   ('Recall', rscore_xgb),
                                   ('Precision', pscore_xgb)):
    print('{} score for XGBoost '.format(metric_label), metric_value)