In [1]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from toolz import partition_all

In [4]:
INPUT_BUCKET = 'dq-data'
HASH_BUCKET = 'dq-hashed'

In [11]:
#load train_set
data = 'train.csv'
filestream = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
dtypes = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
df = pd.read_csv(#urlpath=s3_in_url, 
                                     #storage_options=s3_options,
                                     filestream,
                                     header=0, 
                                     usecols=dtypes.keys(), 
                                     #names=dtypes.keys(),
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     encoding='utf-8')
df = df.set_index('id')
df = df.dropna()

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404287 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404287 non-null int64
qid2            404287 non-null int64
question1       404287 non-null object
question2       404287 non-null object
is_duplicate    404287 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


### Train-test split

In [13]:
from sklearn.model_selection import train_test_split
#shrink df to 150,000 records
df = df.iloc[:150000]

X = df.drop(columns=['is_duplicate'])

y = df['is_duplicate']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100500 entries, 87935 to 121959
Data columns (total 4 columns):
qid1         100500 non-null int64
qid2         100500 non-null int64
question1    100500 non-null object
question2    100500 non-null object
dtypes: int64(2), object(2)
memory usage: 3.8+ MB


In [15]:
del X,y,df

# Feature Extraction

### Tokenizing and preprocessing

In [16]:
from gensim.parsing.preprocessing import preprocess_string
def get_tokens(process='train'):
    if process=='test':
        X = X_test
    else:
        X = X_train
    series = pd.Series(pd.concat([X['question1'], X['question2']]),dtype=str)
    series.dropna()
    for question in series:
        yield preprocess_string(question)

### Word2Vec (fasttext)

In [17]:
ps.get_file(bucket=INPUT_BUCKET, filename='cc.en.300.bin.gz', filepath='/tmp/cc.en.300.bin.gz')

<minio.definitions.Object at 0x7f4f8d95ca20>

In [18]:
import gzip
import shutil
with gzip.open('/tmp/cc.en.300.bin.gz', 'rb') as f_in:
    with open('/tmp/cc.en.300.bin', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)  

In [19]:
import os
os.remove('/tmp/cc.en.300.bin.gz')
from gensim.models import FastText
model = FastText.load_fasttext_format('/tmp/cc.en.300.bin')

In [20]:
def get_ft_vectors(model, process):
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vector = model.wv[token]
            except:
                continue
            vectors.append(vector)
        yield np.array(vectors)

In [21]:
X_ft = np.array([vectors for vectors in get_ft_vectors(model,'train')])
X_ft.shape

(201000,)

In [22]:
# split back into two
X1_ft = X_ft[:len(X_train)]
X2_ft = X_ft[len(X_train):]

In [23]:
del X_ft

In [24]:
#X_train = pd.concat([X_train, pd.Series(X1_ft, name='q1_ft',index=X_train.index), pd.Series(X2_ft, name='q2_ft',index=X_train.index)], axis=1)
#X_train.head()

##### Test set

In [25]:
X_ft_test = np.array([vectors for vectors in get_ft_vectors(model,'test')])

In [26]:
X_ft_test.shape

(99000,)

In [27]:
del model

In [28]:
# split back into two
X1_ft_test = X_ft_test[:len(X_test)]
X2_ft_test = X_ft_test[len(X_test):]

In [29]:
del X_ft_test

In [30]:
#X_test = pd.concat([X_test, pd.Series(X1_ft_test, name='q1_ft',index=X_test.index), pd.Series(X2_ft_test, name='q2_ft',index=X_test.index)], axis=1)
#X_test.head()

### MDS, LLE Embedding

In [31]:
#from sklearn.manifold import MDS, LocallyLinearEmbedding
#def reduce_dim(X, method, dimensions=3):
#    n_jobs = -1
#    n_neighbors = 5
#    if X.shape[0] <= 5:
#        n_neighbors = X.shape[0] - 1
#    if method == 'LLE':
#        embedding = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=dimensions, random_state=42, n_jobs=n_jobs)
#    elif method == 'MLLE':
#        embedding = LocallyLinearEmbedding(n_components=dimensions, method='modified', random_state=42, n_jobs=n_jobs)
#    elif method == 'Hessian':
#        embedding = LocallyLinearEmbedding(n_components=dimensions, method='hessian', random_state=42, n_jobs=n_jobs)
#    else: #method == 'MDS':
#        embedding = MDS(n_components=dimensions, random_state=42, n_jobs=n_jobs)
#    X_trfmd = embedding.fit_transform(X)
#    return X_trfmd

In [32]:
from megaman.geometry.geometry import Geometry

### Pairwise Metrics

In [33]:
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from scipy.spatial.distance import cdist
def compute_pairwise_kernel(pc1, pc2, method='linear'):
    if method=='polynomial':
        return polynomial_kernel(pc1, pc2, 2)
    elif method=='rbf':
        return rbf_kernel(pc1, pc2)
    elif method=='sigmoid':
        return sigmoid_kernel(pc1, pc2)
    elif method=='laplacian':
        return laplacian_kernel(pc1, pc2)
    else:
        return linear_kernel(pc1, pc2)
    
def compute_pairwise_dist(pc1, pc2, method='euclidean'):
    if pc1.size == 0:
        return []
    if pc2.size == 0:
        return []
    return cdist(pc1, pc2, metric=method)

#def compute_red_dim(pc1, pc2, method='MDS'):
#    if pc1.size == 0:
#        if pc2.size > 0:
#            pc = pc2
#            len_pc1 = 0
#        else:
#            return ()
#    elif pc2.size == 0:
#        pc = pc1
#        len_pc1 = pc1.shape[0]
#    else:
#        pc = np.vstack((pc1,pc2))
#        len_pc1 = pc1.shape[0]
#    pc_embd = reduce_dim(pc, method=method, dimensions=3)
#    pc1_embd = pc_embd[:len_pc1]
#    pc2_embd = pc_embd[len_pc1:]
#    return pc1_embd, pc2_embd
        
def assign_pwmetric(df, method='euclidean'):
    #return compute_pairwise_kernel(pc1_embd, pc2_embd, method=method)
    return df.apply(compute_pairwise_dist, method, axis=1)

In [34]:
#X_train['linear_kernel'] = X_train.apply(rowwise_pwkernel, axis=1)
#X_train.head()
#X_train.to_csv('/tmp/X_train.csv')

In [35]:
#import dask.dataframe as dd
#from dask import delayed, compute
#from dask.distributed import Client
#from utils import dask
#client = dask.create_dask_client(num_workers=8)

In [36]:
#dask.perform_dask_test()

In [37]:
def get_q_lengths(X):
    q_meta = []
    for q in X:
        q_meta.append(len(q))
    return q_meta

In [38]:
X1_ft.shape

(100500,)

In [39]:
def split_arrays(X):
    for y in (x for x in X if x.size>0):
        yield np.vsplit(y,len(y))[0]

In [40]:
q_meta = get_q_lengths(X1_ft) + get_q_lengths(X2_ft)

In [41]:
X = np.vstack((np.concatenate([x for x in split_arrays(X1_ft)]), np.concatenate([x for x in split_arrays(X2_ft)])))

In [42]:
X.shape

(200781, 300)

In [43]:
del X1_ft, X2_ft

In [44]:
import sys
# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
y = sorted([(x, sys.getsizeof(globals().get(x))) 
        for x in dir() if not x.startswith('_') 
        and x not in sys.modules and x not in ipython_vars], 
       key=lambda x: x[1], reverse=True)

In [45]:
y

[('X', 240937312),
 ('X_train', 26087192),
 ('X_test', 12830824),
 ('q_meta', 1608064),
 ('y_train', 1608024),
 ('y_test', 792024),
 ('FastText', 2000),
 ('HTTPResponse', 1464),
 ('Geometry', 1056),
 ('Dict', 888),
 ('List', 888),
 ('Tuple', 888),
 ('dtypes', 368),
 ('f_out', 176),
 ('assign_pwmetric', 136),
 ('cdist', 136),
 ('compute_pairwise_dist', 136),
 ('compute_pairwise_kernel', 136),
 ('get_ft_vectors', 136),
 ('get_q_lengths', 136),
 ('get_tokens', 136),
 ('laplacian_kernel', 136),
 ('linear_kernel', 136),
 ('partition_all', 136),
 ('polynomial_kernel', 136),
 ('preprocess_string', 136),
 ('rbf_kernel', 136),
 ('sigmoid_kernel', 136),
 ('split_arrays', 136),
 ('train_test_split', 136),
 ('X1_ft_test', 96),
 ('X2_ft_test', 96),
 ('np', 80),
 ('pd', 80),
 ('ps', 80),
 ('HASH_BUCKET', 58),
 ('data', 58),
 ('INPUT_BUCKET', 56),
 ('f_in', 56),
 ('filestream', 56)]

In [None]:
from scipy.io import mmwrite, mmread
mmwrite( 'wor2vec_300_train.mtx', X )

In [None]:
ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_train.mtx', source='wor2vec_300_train.mtx')

In [None]:
import time
from megaman.embedding import SpectralEmbedding
radius = 20
adjacency_method = 'cyflann'
cyflann_kwds = {'index_type':'kmeans', 'branching':64, 'iterations':20, 'cb_index':0.4}
adjacency_kwds = {'radius':radius, 'cyflann_kwds':cyflann_kwds}
affinity_method = 'gaussian'
affinity_kwds = {'radius':radius}
laplacian_method = 'geometric'
laplacian_kwds = {'scaling_epps':radius}

geom = Geometry(adjacency_method=adjacency_method, adjacency_kwds=adjacency_kwds,
                affinity_method=affinity_method, affinity_kwds=affinity_kwds,
                laplacian_method=laplacian_method, laplacian_kwds=laplacian_kwds)
#Geom = Geometry(X, neighborhood_radius = radius, affinity_radius = radius,
#                distance_method = 'cython', input_type = 'distance',
#                laplacian_type = 'geometric')
geom.set_data_matrix(X)

In [None]:
t0 = time.time()
spec = SpectralEmbedding(n_components=2, eigen_solver='amg',geom=geom)
X_spec = spec.fit_transform(X=X.astype(np.float))
#adjacency_matrix = geom.compute_adjacency_matrix()
t1 = time.time() - t0
print(t1)



In [None]:
#X_rd =  reduce_dim(X, 'LLE', 3)

In [None]:
# rebuild X1_rd and X2_rd
X1_list = []
X2_list = []
for len_q1, len_q2 in q_meta:
    q1 = X_rd[:len_q1]
    q2 = X_rd[len_q1:(len_q1+len_q2)]
    X1_list.append(q1)
    X2_list.append(q2)
    X_rd = X_rd[(len_q1+len_q2):]
X1_rd = np.array(X1_list)
X2_rd = np.array(X2_list)

In [None]:
del X1_list, X2_list, q_meta, X_rd

In [None]:
jaccard = []
chebyshev = []
braycurtis = []
for q_tuple in q_rd:
    if q_tuple:
        q1_rd, q2_rd = q_tuple
        jaccard.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
        chebyshev.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
        braycurtis.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
    else:
        jaccard.append(delayed([]))
        chebyshev.append(delayed([]))
        braycurtis.append(delayed([])) 

In [None]:
jaccard = compute(*jaccard)

In [None]:
chebyshev = compute(*chebyshev)

In [None]:
braycurtis = compute(*braycurtis)

In [None]:
len(braycurtis)

In [None]:
#X_train_ddf = X_train_ddf.map_partitions(reduce_dim,'MDS', meta={'q1_ft':np.float32, 'q2_ft':np.float32})
#X_train_ddf['jaccard'] = X_train_ddf.map_partitions(assign_pwmetric,'jaccard', meta=('jaccard',np.float32))
#X_train_ddf['chebyshev'] = X_train_ddf.map_partitions(assign_pwmetric,'chebyshev', meta=('chebyshev',np.float32))
#X_train_ddf['braycurtis'] = X_train_ddf.map_partitions(assign_pwmetric,'braycurtis', meta=('braycurtis',np.float32))
#X_train = X_train_ddf.compute()
#X_train.head()

##### Test set

In [None]:
X1_ft_test.shape

In [None]:
# dask delayed
q_rd_test = []
for q1, q2 in zip(X1_ft_test, X2_ft_test):
    q = delayed(compute_red_dim)(q1, q2, 'MDS')
    q_rd_test.append(q)

In [None]:
q_rd_test = compute(*q_rd_test)

In [None]:
len(q_rd_test)

In [None]:
del X1_ft_test, X2_ft_test

In [None]:
jaccard_test = []
chebyshev_test = []
braycurtis_test = []
for q_tuple in q_rd_test:
    if q_tuple:
        q1_rd, q2_rd = q_tuple
        jaccard_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'jaccard'))
        chebyshev_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'chebyshev'))
        braycurtis_test.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, 'braycurtis'))
    else:
        jaccard_test.append(delayed([]))
        chebyshev_test.append(delayed([]))
        braycurtis_test.append(delayed([]))        

In [None]:
jaccard_test = compute(*jaccard_test)

In [None]:
chebyshev_test = compute(*chebyshev_test)

In [None]:
braycurtis_test = compute(*braycurtis_test)

#### Graph Kernel estimation

In [None]:
#! pip install grakel-dev

In [None]:
from grakel import GraphKernel
from grakel import Graph
def estimate_grakel(pc1, pc2):
    sp_kernel = GraphKernel(kernel={"name": "shortest_path"})
    sp_kernel.fit_transform([Graph(pc1)])
    return sp_kernel.transform([Graph(pc2)])

In [None]:
def rowwise_grakel(row):
    pc1 = row['q1_ft']
    pc2 = row['q2_ft']
    pc1_embd, pc2_embd = embed_pointclouds(pc1, pc2, method='LLE', dimensions=3)
    return estimate_grakel(pc1_embd, pc2_embd)

In [None]:
X_train['grakel'] = X_train.apply(rowwise_grakel, axis=1)
X_train.head()

In [None]:
pc1 = X_train.iloc[0]['q1_ft']
pc2 = X_train.iloc[0]['q2_ft']
pc1_embd, pc2_embd = embed_pointclouds(pc1, pc2, method='LLE', dimensions=3)
print(len(pc1))
print(pc1_embd.shape)
print(len(pc2))
print(pc2_embd)

### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tokens('train'))

In [None]:
X_trfmd

In [None]:
# dimension reduction using SVD
from sklearn.decomposition import TruncatedSVD
import time
start = time.time()
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
X_svd.shape

In [None]:
# split back into two
X1 = X_svd[:len(X_train), :]
X2 = X_svd[len(X_train):, :]

##### Test set

In [None]:
X_test_trfmd = tfidf.transform(get_tokens('test'))

In [None]:
X_test_trfmd

In [None]:
# dimension reduction using SVD
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
# split back into two
X1_test = X_test_svd[:len(X_test), :]
X2_test = X_test_svd[len(X_test):, :]

In [None]:
# build complete feature dataframe
X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
X_test_temp.head()

### Fuzzy-wuzzy

In [None]:
# difference in text size
compute_size_diff = lambda row: abs(len(str(row['question1'])) - len(str(row['question2'])))
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
X_train.head()

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# ratio
compute_ratio = lambda row: fuzz.ratio(str(row['question1']), str(row['question2']))
X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
X_train.head()

In [None]:
# partial ratio
compute_partial_ratio = lambda row: fuzz.partial_ratio(str(row['question1']), str(row['question2']))
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
X_train.head()

In [None]:
# token_sort_ratio
compute_token_sort_ratio = lambda row: fuzz.token_sort_ratio(str(row['question1']), str(row['question2']))
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
X_train.head()

In [None]:
# token_set_ratio
compute_token_set_ratio = lambda row: fuzz.token_set_ratio(str(row['question1']), str(row['question2']))
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
X_train.head()

In [None]:
# build complete feature dataframe
X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
X_train_temp.head()

In [None]:
X_train = pd.concat([X_train_temp, X_train], axis=1)
del X_train_temp
X_train = X_train.drop(columns=['qid1', 'qid2','question1','question2'])
X_train.head()

In [None]:
X_train.info()

##### Test set

In [None]:
# difference in text size
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
X_test.head()

In [None]:
# ratio
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
X_test.head()

In [None]:
# partial ratio
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
X_test.head()

In [None]:
# token_sort_ratio
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
X_test.head()

In [None]:
# token_set_ratio
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
X_test.head()

In [None]:
X_test = pd.concat([X_test_temp, X_test], axis=1)
del X_test_temp
X_test = X_test.drop(columns=['question1','question2', 'qid1', 'qid2'])
X_test.head()

# Modeling

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
logr_model = LogisticRegression(random_state=42)
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             #'solver': ['lbfgs']
             #'max_iter': np.linspace(10, 1000, 10)
             }
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1)
logr_cv.fit(X_train, y_train)

In [None]:
logr_cv.best_params_

In [None]:
logr_model = LogisticRegression(#solver=logr_cv.best_params_['solver'], 
                                random_state=42, 
                                C=logr_cv.best_params_['C'], 
                                tol=logr_cv.best_params_['tol'], 
                                #max_iter=logr_cv.best_params_['max_iter'], 
                                n_jobs=-1)
logr_model.fit(X_train, y_train)

In [None]:
logr_pred = logr_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
logr_acc_score = accuracy_score(y_test, logr_pred)
logr_prec_score = precision_score(y_test, logr_pred)
logr_rec_score = recall_score(y_test, logr_pred)
print('Logistic Regression')
print('accuracy score : {}'.format(logr_acc_score))
print('precision score : {}'.format(logr_prec_score))
print('recall score : {}'.format(logr_rec_score))