In [1]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np
import pickle
from scipy.io import mmwrite, mmread

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from toolz import partition_all

In [4]:
#INPUT_BUCKET = 'dq-data'
data_folder = '/media/siri/78C6823EC681FD1E/minio/data/dq-data/'
#HASH_BUCKET = 'dq-hashed'

In [None]:
#load train_set
data = 'train.csv'
#filestream = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
filestream = data_folder+data
dtypes = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
df = pd.read_csv(#urlpath=s3_in_url, 
                                     #storage_options=s3_options,
                                     filestream,
                                     header=0, 
                                     usecols=dtypes.keys(), 
                                     #names=dtypes.keys(),
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     encoding='utf-8')
df = df.set_index('id')
df = df.dropna()

In [None]:
#del df

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split
#shrink df to 150,000 records
#df = df.iloc[:100000]

X = df.drop(columns=['is_duplicate'])

y = df['is_duplicate']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
X_train.info()

In [None]:
import pickle
pickle.dump(X_train, open(data_folder+'X_train.p', 'wb'))
pickle.dump(y_train, open(data_folder+'y_train.p', 'wb'))
pickle.dump(X_test, open(data_folder+'X_test.p', 'wb'))
pickle.dump(y_test, open(data_folder+'y_test.p', 'wb'))

In [5]:
X_train = pickle.load(open(data_folder+'X_train.p', 'rb'))
X_test = pickle.load(open(data_folder+'X_test.p', 'rb'))

In [6]:
y_train = pickle.load(open(data_folder+'y_train.p', 'rb'))
y_test = pickle.load(open(data_folder+'y_test.p', 'rb'))

In [None]:
del X,y,df

In [None]:
del X_train,X_test

In [None]:
del y_train, y_test

### Memory Check

In [7]:
import sys
# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) 
        for x in dir() if not x.startswith('_') 
        and x not in sys.modules and x not in ipython_vars], 
       key=lambda x: x[1], reverse=True)

[('X_train', 70341065),
 ('X_test', 34641540),
 ('y_train', 4333976),
 ('y_test', 2134664),
 ('HTTPResponse', 1056),
 ('Dict', 888),
 ('List', 888),
 ('Tuple', 888),
 ('mmread', 136),
 ('mmwrite', 136),
 ('partition_all', 136),
 ('data_folder', 97),
 ('np', 80),
 ('pd', 80),
 ('ps', 80)]

In [None]:
#del y_train, y_test

# Feature Extraction

### Tokenizing and preprocessing

In [None]:
from gensim.parsing.preprocessing import preprocess_string
def get_tokens(process='train'):
    if process=='test':
        X = X_test
    else:
        X = X_train
    series = pd.Series(pd.concat([X['question1'], X['question2']]),dtype=str)
    series.dropna()
    for question in series:
        yield preprocess_string(question)

### Word2Vec (fasttext)

In [None]:
from gensim.models import FastText
model = FastText.load_fasttext_format(data_folder+'cc.en.300.bin')

In [None]:
def get_ft_vectors(model, process):
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vector = model.wv[token]
            except:
                continue
            vectors.append(vector)
        yield np.array(vectors)

In [None]:
def get_tfidf_for_valid_vectors(model, process):
    for tokens in get_tokens(process):
        tf_idf_tokens = []
        for token in tokens:
            try:
                vector = model.wv[token]
                tf_idf_tokens.append(token)
            except:
                continue
        yield np.array(tf_idf_tokens)

In [None]:
#X_ft = np.array([vectors for vectors in get_ft_vectors(model,'train')])
#X_ft.shape

In [None]:
# split back into two
#X1_ft = X_ft[:len(X_train)]
#X2_ft = X_ft[len(X_train):]

##### Test set

In [None]:
#X_ft_test = np.array([vectors for vectors in get_ft_vectors(model,'test')])

In [None]:
#X_ft_test.shape

In [None]:
#del model

In [None]:
# split back into two
#X1_ft_test = X_ft_test[:len(X_test)]
#X2_ft_test = X_ft_test[len(X_test):]

### TFIDF and Word2Vec

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tfidf_for_valid_vectors(model,'train'))

In [None]:
X_trfmd

In [None]:
# split back into two
X1_trfmd = X_trfmd[:len(X_train)]
X2_trfmd = X_trfmd[len(X_train):]

In [None]:
def get_weights_and_w2vectors(tfidf_matrix, tfidf_vectorizer, w2v_model):
    weights = []
    w2v = []
    rows = tfidf_matrix.shape[0]
    inverse_vocab_dict = {v: k for k, v in tfidf_vectorizer.vocabulary_.items()}
    for doc in range(rows):
        features = tfidf_matrix[doc,:].nonzero()[1]
        weights.append(np.array([tfidf_matrix[doc, x] for x in features]))
        w2v.append(np.array([w2v_model.wv[inverse_vocab_dict[x]] for x in features]))
    return np.array(weights), np.array(w2v)

In [None]:
X1_w, X1 = get_weights_and_w2vectors(X1_trfmd, tfidf, model)
X1_w.shape

In [None]:
X1.shape

In [None]:
X1_w.shape

In [None]:
X2_w, X2 = get_weights_and_w2vectors(X2_trfmd, tfidf, model)
X2_w.shape

In [None]:
pickle.dump(X1_w, open(data_folder+'X1_w.p','wb'))
pickle.dump(X2_w, open(data_folder+'X2_w.p','wb'))

In [None]:
pickle.dump(X1, open(data_folder+'X1.p','wb'))
pickle.dump(X2, open(data_folder+'X2.p','wb'))

In [None]:
X1_w[420].shape

In [None]:
X1[420].shape

In [None]:
del X1_w, X2_w, X1, X2

In [None]:
# dimension reduction using SVD
#from sklearn.decomposition import TruncatedSVD
#import time
#start = time.time()
#svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
#X_svd = svd.fit_transform(X_trfmd)
#end =  time.time()
#print('created SVD transform in time {}'.format(end-start))

In [None]:
#X_svd.shape

In [None]:
# split back into two
#X1 = X_svd[:len(X_train), :]
#X2 = X_svd[len(X_train):, :]

##### Test set

In [None]:
X_test_trfmd = tfidf.transform(get_tfidf_for_valid_vectors(model, 'test'))

In [None]:
X_test_trfmd

In [None]:
# split back into two
X1_test_trfmd = X_test_trfmd[:len(X_test)]
X2_test_trfmd = X_test_trfmd[len(X_test):]

In [None]:
X1_test_w, X1_test = get_weights_and_w2vectors(X1_test_trfmd, tfidf, model)
X1_test_w.shape

In [None]:
X2_test_w, X2_test = get_weights_and_w2vectors(X2_test_trfmd, tfidf, model)
X2_test_w.shape

In [None]:
pickle.dump(X1_test_w, open(data_folder+'X1_test_w.p','wb'))
pickle.dump(X2_test_w, open(data_folder+'X2_test_w.p','wb'))

In [None]:
pickle.dump(X1_test, open(data_folder+'X1_test.p','wb'))
pickle.dump(X2_test, open(data_folder+'X2_test.p','wb'))

In [None]:
del X1_test_w, X2_test_w, X1_test, X2_test

In [None]:
# dimension reduction using SVD
#start = time.time()
#X_test_svd = svd.transform(X_test_trfmd)
#end =  time.time()
#print('created SVD transform in time {}'.format(end-start))

In [None]:
# split back into two
#X1_test = X_test_svd[:len(X_test), :]
#X2_test = X_test_svd[len(X_test):, :]

In [None]:
# build complete feature dataframe
#X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
#                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
#X_test_temp.head()

### Pairwise Metrics

In [None]:
#def get_q_lengths(X):
#    #q_meta = []
#    for q in X:
#        #q_meta.append(len(q))
#        yield len(q)
#    #return q_meta

In [None]:
#q_meta_train = [(q1_len, q2_len) for q1_len, q2_len in zip(get_q_lengths(X1_ft), get_q_lengths(X2_ft))]

In [None]:
#pickle.dump(q_meta_train, open(data_folder+'q_meta_train.p', 'wb'))

In [None]:
#ps.copy_file(dest_bucket=INPUT_BUCKET, file='q_meta_train.p', source='q_meta_train.p')

In [None]:
#X_train_300 = np.concatenate( 
#    np.vstack( [np.array(np.vsplit(y, y.shape[0])) for y in (x for x in X_ft if x.size>0)] )
#)

In [None]:
#X_train_300.shape

In [None]:
#mmwrite( data_folder+'wor2vec_300_full_train.mtx', X_train_300 )
#np.savez(data_folder+'wor2vec_300_full_train', data=X_train_300)

In [None]:
#ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_full_train.mtx', source='wor2vec_300_full_train.mtx')

In [None]:
#del X_ft

In [None]:
#ps.get_file(bucket=INPUT_BUCKET, filename='embed2_full_train.mtx', filepath='embed2_full_train.mtx')

In [None]:
#X_rd = mmread('embed2_full_train.mtx')

In [None]:
#X_rd.shape

In [None]:
#ps.get_file(bucket=INPUT_BUCKET, filename='q_meta_train.p', filepath='q_meta_train.p')

In [None]:
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist, directed_hausdorff
from fastdtw import fastdtw
import similaritymeasures
from scipy.spatial import procrustes
def compute_pairwise_kernel(pc1, pc2, w1, w2, method='linear'):
    if pc1.size == 0 or pc2.size == 0:
        return np.nan
    if method=='polynomial':
        dist_mat = polynomial_kernel(pc1, pc2, 2)
    elif method=='rbf':
        dist_mat = rbf_kernel(pc1, pc2)
    elif method=='sigmoid':
        dist_mat = sigmoid_kernel(pc1, pc2)
    elif method=='laplacian':
        dist_mat = laplacian_kernel(pc1, pc2)
    else:
        dist_mat = linear_kernel(pc1, pc2)
    return np.average(dist_mat, weights=np.matmul(w1.reshape(-1,1),w2.reshape(-1,1).T))
    
def compute_pairwise_dist(pc1, pc2, w1, w2, method='euclidean'):
    if pc1.size == 0 or pc2.size == 0:
        return np.nan
    if method=='hausdorff':
        dist = directed_hausdorff(pc1, pc2)
        return dist[0]
    else:
        dist_mat = pairwise_distances(pc1, pc2, metric=method) 
    #dist_mat = cdist(pc1, pc2, metric=method)
    #return np.linalg.norm(dist_mat, ord='fro')
    return np.average(dist_mat, weights=np.matmul(w1.reshape(-1,1),w2.reshape(-1,1).T))

def compute_weighted_mean(pc, w):
    return np.average(pc, axis=0, weights=w)

def compute_pairwise_metric(pc1, pc2, method='dtw'):
    if pc1.size == 0 or pc2.size == 0:
        return np.nan
    #if method=='fdtw':
    #    dist, _ = fastdtw(pc1, pc2, dist=euclidean)
    if method=='pcm':
        dist = similaritymeasures.pcm(pc1[:,:2], pc2[:,:2])
    if method=='discrete_frechet':
        dist = similaritymeasures.frechet_dist(pc1[:,:2], pc2[:,:2])
    if method=='area':
        dist = similaritymeasures.area_between_two_curves(pc1[:,:2], pc2[:,:2])
    if method=='curve_length':
        dist = similaritymeasures.curve_length_measure(pc1[:,:2], pc2[:,:2])
    if method=='dtw':
        dist, _ = similaritymeasures.dtw(pc1[:,:2], pc2[:,:2])
    #if method=='procrustes': 
    #    mbox1 = MinimumBoundingBox([x[:2] for x in pc1.tolist()])
    #    mbox2 = MinimumBoundingBox([x[:2] for x in pc2.tolist()])
    #    _,_,dist = procrustes(make_array(mbox1.corner_points), make_array(mbox2.corner_points))
    return dist

        
def assign_pwmetric(df, method='euclidean'):
    #return compute_pairwise_kernel(pc1_embd, pc2_embd, method=method)
    return df.apply(compute_pairwise_dist, method, axis=1)

In [None]:
def compute_delayed(X1, X2, X1_w, X2_w, method):
    temp = []
    for q_tuple in zip(X1, X2, X1_w, X2_w):
        if q_tuple:
            q1_rd, q2_rd, q1_w, q2_w = q_tuple
            if method in ['polynomial', 'rbf', 'sigmoid', 'laplacian', 'linear']:
                temp.append(delayed(compute_pairwise_kernel)(q1_rd, q2_rd, q1_w, q2_w, method))
            else:
                temp.append(delayed(compute_pairwise_dist)(q1_rd, q2_rd, q1_w, q2_w, method))
        else:
            temp.append(delayed(np.nan))
    return compute(*temp)

In [None]:
def create_nan_array(r,c):
    arr = np.empty((r,c))
    arr[:] = np.nan
    return arr

In [None]:
def pickle_and_del(obj, file, data_folder=data_folder):
    pickle.dump(obj, open(data_folder+file+'.p', 'wb'))
    del obj

In [None]:
def compute_delayed_wmean(X, X_w, file, data_folder=data_folder):
    temp = []
    for q_tuple in zip(X, X_w):
        if q_tuple:
            q_rd, q_w = q_tuple
            if np.sum(q_w) != 0:
                temp.append(delayed(compute_weighted_mean)(q_rd, q_w))
            else:
                temp.append(delayed(create_nan_array)(1,300))                
        else:
            temp.append(delayed(create_nan_array)(1,300))
    temp_arr = np.array(temp)
    computed_obj = compute(*temp_arr)
    pickle_and_del(computed_obj, file, data_folder)    

In [None]:
def compute_and_save(X1, X2, X1_w, X2_w, method, file, data_folder=data_folder):
    computed_obj = compute_delayed(X1, X2, X1_w, X2_w, method)
    pickle_and_del(computed_obj, file, data_folder)    

In [None]:
#q_meta_train = pickle.load(open(data_folder+'q_meta_train.p','rb'))

In [None]:
#len(q_meta_train)

In [None]:
#mmwrite( data_folder+'wor2vec_300_full_train.mtx', X_train_300 )
#X_train_300 = np.load(data_folder+'wor2vec_300_full_train.npz')['data']

In [None]:
#X_train_300.shape

In [None]:
# rebuild X1_rd and X2_rd
#X1_list = []
#X2_list = []
#X1_rd_list = []
#X2_rd_list = []
#q1_ptr = 0
#for len_q1, _ in q_meta_train:
#    q1 = np.array(X_train_300[q1_ptr:q1_ptr+len_q1])
#    #q1_rd = np.array(X_rd[q1_ptr:q1_ptr+len_q1])
#    X1_list.append(q1)
#    #X1_rd_list.append(q1_rd)
#    q1_ptr = q1_ptr+len_q1
#q2_ptr = q1_ptr
#for _, len_q2 in q_meta_train:
#    q2 = np.array(X_train_300[q2_ptr:q2_ptr+len_q2])
#    #q2_rd = np.array(X_rd[q2_ptr:q2_ptr+len_q2])
#    X2_list.append(q2)
#    #X2_rd_list.append(q2_rd)
#    q2_ptr = q2_ptr+len_q2
#X1 = np.array(X1_list)
#X2 = np.array(X2_list)
#X1_rd = np.array(X1_rd_list)
#X2_rd = np.array(X2_rd_list)

In [None]:
#del X1_list, X2_list, X_train_300, q_meta_train

#### Initialize Dask

In [None]:
import dask.dataframe as dd
from dask import delayed, compute
from dask.distributed import Client
from utils import dask
client = dask.create_dask_client(num_workers=8)

#### Compute Features

In [None]:
X1_w = pickle.load(open(data_folder+'X1_w.p','rb'))
X2_w = pickle.load(open(data_folder+'X2_w.p','rb'))

In [None]:
X1 = pickle.load(open(data_folder+'X1.p','rb'))
X2 = pickle.load(open(data_folder+'X2.p','rb'))

In [None]:
def compute_wmean(X, X_w):
    temp = []
    for q_tuple in zip(X, X_w):
        if q_tuple:
            q_rd, q_w = q_tuple
            temp.append(compute_weighted_mean(q_rd, q_w))
        else:
            temp.append(create_nan_array(1,300))
    #computed_obj = compute(*temp)
    #pickle_and_del(computed_obj, file, data_folder) 
    return np.array(temp)

In [None]:
temp = compute_wmean(X1[:1],X1_w[:1])

In [None]:
temp.shape

In [None]:
#jaccard = compute(*jaccard)

In [None]:
#chebyshev = compute(*chebyshev)
#compute_and_save(X1, X2, 'chebyshev', 'chebyshev_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'chebyshev', 'chebyshev_train_w', data_folder)

In [None]:
#braycurtis = compute(*braycurtis)
#compute_and_save(X1, X2, 'braycurtis', 'braycurtis_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'braycurtis', 'braycurtis_train_w', data_folder)

In [None]:
#cosine = compute(*cosine)
#compute_and_save(X1, X2, 'cosine', 'cosine_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'cosine', 'cosine_train_w', data_folder)

In [None]:
#correlation = compute(*correlation)
#compute_and_save(X1, X2, 'correlation', 'correlation_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'correlation', 'correlation_train_w', data_folder)

In [None]:
#hamming = compute(*hamming)

In [None]:
#canberra = compute(*canberra)
#compute_and_save(X1, X2, 'canberra', 'canberra_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'canberra', 'canberra_train_w', data_folder)

In [None]:
#hausdorff = compute(*hausdorff)
#compute_and_save(X1, X2, 'hausdorff', 'hausdorff_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'hausdorff', 'hausdorff_train_w', data_folder)

In [None]:
#cityblock = compute(*cityblock)
#compute_and_save(X1, X2, 'cityblock', 'cityblock_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'cityblock', 'cityblock_train_w', data_folder)

In [None]:
#euclidean = compute(*euclidean)
#compute_and_save(X1, X2, 'euclidean', 'euclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'euclidean', 'euclidean_train_w', data_folder)

In [None]:
#l1 = compute(*l1)
#compute_and_save(X1, X2, 'l1', 'l1_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'l1', 'l1_train_w', data_folder)

In [None]:
#l2 = compute(*l2)
#compute_and_save(X1, X2, 'l2', 'l2_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'l2', 'l2_train_w', data_folder)

In [None]:
#manhattan = compute(*manhattan)
#compute_and_save(X1, X2, 'manhattan', 'manhattan_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'manhattan', 'manhattan_train_w', data_folder)

In [None]:
#dice = compute(*dice)

In [None]:
#kulsinski = compute(*kulsinski)

In [None]:
#rogerstanimoto = compute(*rogerstanimoto)

In [None]:
#russellrao = compute(*russellrao)

In [None]:
#sokalmichener = compute(*sokalmichener)

In [None]:
#minkowski = compute(*minkowski)
#compute_and_save(X1, X2, 'minkowski', 'minkowski_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'minkowski', 'minkowski_train_w', data_folder)

In [None]:
#seuclidean = compute(*seuclidean)
#compute_and_save(X1, X2, 'seuclidean', 'seuclidean_train', data_folder)

In [None]:
#sokalsneath = compute(*sokalsneath)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'sqeuclidean', 'sqeuclidean_train_w', data_folder)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'polynomial', 'polynomial_train_w', data_folder)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'rbf', 'rbf_train_w', data_folder)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'sigmoid', 'sigmoid_train_w', data_folder)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'linear', 'linear_train_w', data_folder)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'laplacian', 'laplacian_train_w', data_folder)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_delayed_wmean(X1, X1_w, 'weighted_mean1_train', data_folder)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_delayed_wmean(X2, X2_w, 'weighted_mean2_train', data_folder)

In [None]:
#fdtw = compute(*fdtw)

In [None]:
#dtw = compute(*dtw)

In [None]:
#pcm = compute(*pcm)

In [None]:
#area = compute(*area)

In [None]:
#curve_length = compute(*curve_length)

In [None]:
#discrete_frechet = compute(*discrete_frechet)

In [None]:
#procrustes = compute(*procrustes)

In [None]:
del X1, X2, X1_w, X2_w

#### add above metrics to X_train

In [8]:
def add_column(df, column, train_or_test, data_folder=data_folder):
    col_arr = pickle.load(open(data_folder+column+'_'+train_or_test+'_w.p', 'rb'))
    return pd.concat([df,
                     pd.Series(col_arr, name=column,index=df.index)
                      ], axis=1)

In [9]:
def add_d2v_columns(df, d2v, train_or_test, data_folder=data_folder):
    col_arr = pickle.load(open(data_folder+d2v+'_'+train_or_test+'_red.p', 'rb'))
    return pd.concat([df,
                     pd.DataFrame(col_arr, columns=[d2v+'_'+str(i) for i in range(col_arr.shape[1])],index=df.index)
                      ], axis=1)

In [10]:
X_train = add_column(X_train, 'chebyshev', 'train')

In [11]:
X_train = add_column(X_train, 'braycurtis', 'train')

In [12]:
X_train = add_column(X_train, 'cosine', 'train')

In [13]:
X_train = add_column(X_train, 'correlation', 'train')

In [14]:
X_train = add_column(X_train, 'canberra', 'train')

In [15]:
X_train = add_column(X_train, 'hausdorff', 'train')

In [16]:
X_train = add_column(X_train, 'cityblock', 'train')

In [17]:
X_train = add_column(X_train, 'euclidean', 'train')

In [18]:
X_train = add_column(X_train, 'l1', 'train')

In [19]:
X_train = add_column(X_train, 'l2', 'train')

In [20]:
X_train = add_column(X_train, 'manhattan', 'train')

In [21]:
X_train = add_column(X_train, 'minkowski', 'train')

In [22]:
X_train = add_column(X_train, 'sqeuclidean', 'train')

In [23]:
X_train = add_d2v_columns(X_train, 'd2v_1', 'train')

In [24]:
X_train = add_d2v_columns(X_train, 'd2v_2', 'train')

In [25]:
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_2_40,d2v_2_41,d2v_2_42,d2v_2_43,d2v_2_44,d2v_2_45,d2v_2_46,d2v_2_47,d2v_2_48,d2v_2_49
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
186150,284024,284025,"What is the difference between ""visa on arriva...",Do I need a visa to visit Ireland from England?,0.292102,0.825892,0.788286,0.788564,200.35101,1.615564,...,-0.752246,-0.765466,-0.337335,0.838323,-0.012084,0.85796,0.500386,-0.608321,-0.180905,-0.386673
208652,312703,312704,What are the chances of receiving invitation t...,What are the chances of receiving invitation t...,0.476484,0.732037,0.674718,0.674567,183.345052,0.0,...,-1.015671,-1.837046,-0.389282,1.504601,0.042996,0.853886,-0.149812,-0.064253,-0.020672,-0.304634
268163,385651,385652,"Is the phrase ""the pressure in an incompressib...",What is the use of Bash in Windows?,0.448201,0.963075,0.93944,0.939968,225.509213,1.700451,...,0.505762,-0.411201,-0.083322,-0.445053,-0.16891,0.786781,0.080461,0.344001,0.147636,-0.604211
155539,243655,243656,How do I buy goods from Amazon and ship to Vie...,How can I sell Vietnam traditional silk ties o...,0.430242,0.820454,0.786592,0.786038,194.753906,1.885054,...,-0.753396,-1.081508,-0.334577,0.733598,-0.077231,0.526569,-0.479279,-0.009617,-0.096563,-0.343635
212779,317931,317932,How do you read tarot cards?,How do I read tarot cards?,0.228653,0.468721,0.445017,0.446258,120.422093,0.0,...,0.22695,-0.722068,-0.116974,-0.689803,-0.044381,1.024583,0.623106,-0.302407,-0.08179,-0.973787


In [None]:
X_train[X_train.isnull().any(axis=1)]

##### Test set

In [None]:
#q_meta_test = [(q1_len, q2_len) for q1_len, q2_len in zip(get_q_lengths(X1_ft_test), get_q_lengths(X2_ft_test))]

In [None]:
#pickle.dump(q_meta_test, open(data_folder+'q_meta_test.p', 'wb'))

In [None]:
#ps.copy_file(dest_bucket=INPUT_BUCKET, file='q_meta_test.p', source='q_meta_test.p')

In [None]:
#X_test_300 = np.concatenate( 
#    np.vstack( [np.array(np.vsplit(y, y.shape[0])) for y in (x for x in X_ft_test if x.size>0)] )
#)

In [None]:
#X_test_300.shape

In [None]:
#from scipy.io import mmwrite, mmread
#mmwrite( 'wor2vec_300_full_test.mtx', X_test_300 )
#np.savez(data_folder+'wor2vec_300_full_test', data=X_test_300)

In [None]:
#ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_full_test.mtx', source='wor2vec_300_full_test.mtx')

In [None]:
#del X_test_300, q_meta_test, X_ft_test

In [None]:
#ps.get_file(bucket=INPUT_BUCKET, filename='embed2_full_test.mtx', filepath='embed2_full_test.mtx')

In [None]:
#from scipy.io import mmread
#X_rd_test = mmread('embed2_full_test.mtx')

In [None]:
#X_rd_test.shape

In [None]:
#ps.get_file(bucket=INPUT_BUCKET, filename='q_meta_test.p', filepath='q_meta_test.p')

In [None]:
#q_meta_test = pickle.load(open(data_folder+'q_meta_test.p','rb'))

In [None]:
#mmwrite( data_folder+'wor2vec_300_full_train.mtx', X_train_300 )
#X_test_300 = np.load(data_folder+'wor2vec_300_full_test.npz')['data']

In [None]:
#X_test_300.shape

In [None]:
# rebuild X1_rd_test and X2_rd_test
#X1_list = []
#X2_list = []
#X1_rd_list = []
#X2_rd_list = []
#q1_ptr = 0
#for len_q1, _ in q_meta_test:
#    q1 = np.array(X_test_300[q1_ptr:q1_ptr+len_q1])
#    #q1_rd = np.array(X_rd_test[q1_ptr:q1_ptr+len_q1])
#    X1_list.append(q1)
#    #X1_rd_list.append(q1_rd)
#    q1_ptr = q1_ptr+len_q1
#q2_ptr = q1_ptr
#for _, len_q2 in q_meta_test:
#    q2 = np.array(X_test_300[q2_ptr:q2_ptr+len_q2])
#    #q2_rd = np.array(X_rd_test[q2_ptr:q2_ptr+len_q2])
#    X2_list.append(q2)
#    #X2_rd_list.append(q2_rd)
#    q2_ptr = q2_ptr+len_q2
#X1_test = np.array(X1_list)
#X2_test = np.array(X2_list)
#X1_rd_test = np.array(X1_rd_list)
#X2_rd_test = np.array(X2_rd_list)

In [None]:
#X2_test.shape

In [None]:
#del X1_list, X2_list, q_meta_test, X_test_300, X1_rd_list, X2_rd_list

In [None]:
X1_test_w = pickle.load(open(data_folder+'X1_test_w.p','rb'))
X2_test_w = pickle.load(open(data_folder+'X2_test_w.p','rb'))

In [None]:
X1_test = pickle.load(open(data_folder+'X1_test.p','rb'))
X2_test = pickle.load(open(data_folder+'X2_test.p','rb'))

In [None]:
#jaccard = compute(*jaccard)

In [None]:
#chebyshev = compute(*chebyshev)
#compute_and_save(X1_test, X2_test, 'chebyshev', 'chebyshev_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'chebyshev', 'chebyshev_test_w', data_folder)

In [None]:
#braycurtis = compute(*braycurtis)
#compute_and_save(X1_test, X2_test, 'braycurtis', 'braycurtis_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'braycurtis', 'braycurtis_test_w', data_folder)

In [None]:
#cosine = compute(*cosine)
#compute_and_save(X1_test, X2_test, 'cosine', 'cosine_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'cosine', 'cosine_test_w', data_folder)

In [None]:
#correlation = compute(*correlation)
#compute_and_save(X1_test, X2_test, 'correlation', 'correlation_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'correlation', 'correlation_test_w', data_folder)

In [None]:
#hamming = compute(*hamming)

In [None]:
#canberra = compute(*canberra)
#compute_and_save(X1_test, X2_test, 'canberra', 'canberra_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'canberra', 'canberra_test_w', data_folder)

In [None]:
#hausdorff = compute(*hausdorff)
#compute_and_save(X1_test, X2_test, 'hausdorff', 'hausdorff_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'hausdorff', 'hausdorff_test_w', data_folder)

In [None]:
#cityblock = compute(*cityblock)
#compute_and_save(X1_test, X2_test, 'cityblock', 'cityblock_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'cityblock', 'cityblock_test_w', data_folder)

In [None]:
#euclidean = compute(*euclidean)
#compute_and_save(X1_test, X2_test, 'euclidean', 'euclidean_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'euclidean', 'euclidean_test_w', data_folder)

In [None]:
#l1 = compute(*l1)
#compute_and_save(X1_test, X2_test, 'l1', 'l1_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'l1', 'l1_test_w', data_folder)

In [None]:
#l2 = compute(*l2)
#compute_and_save(X1_test, X2_test, 'l2', 'l2_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'l2', 'l2_test_w', data_folder)

In [None]:
#manhattan = compute(*manhattan)
#compute_and_save(X1_test, X2_test, 'manhattan', 'manhattan_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'manhattan', 'manhattan_test_w', data_folder)

In [None]:
#dice = compute(*dice)

In [None]:
#kulsinski = compute(*kulsinski)

In [None]:
#rogerstanimoto = compute(*rogerstanimoto)

In [None]:
#russellrao = compute(*russellrao)

In [None]:
#sokalmichener = compute(*sokalmichener)

In [None]:
#minkowski = compute(*minkowski)
#compute_and_save(X1_test, X2_test, 'minkowski', 'minkowski_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'minkowski', 'minkowski_test_w', data_folder)

In [None]:
#seuclidean = compute(*seuclidean)

In [None]:
#sokalsneath = compute(*sokalsneath)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1_test, X2_test, 'sqeuclidean', 'sqeuclidean_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'sqeuclidean', 'sqeuclidean_test_w', data_folder)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_delayed_wmean(X1_test, X1_test_w, 'weighted_mean1_test', data_folder)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_delayed_wmean(X2_test, X2_test_w, 'weighted_mean2_test', data_folder)

In [None]:
#fdtw = compute(*fdtw)

In [None]:
#dtw = compute(*dtw)

In [None]:
#pcm = compute(*pcm)

In [None]:
#area = compute(*area)

In [None]:
#curve_length = compute(*curve_length)

In [None]:
#discrete_frechet = compute(*discrete_frechet)

In [None]:
del X1_test, X2_test, X1_test_w, X2_test_w

#### add above metrics to X_test

In [26]:
X_test = add_column(X_test, 'chebyshev', 'test')

In [27]:
X_test = add_column(X_test, 'braycurtis', 'test')

In [28]:
X_test = add_column(X_test, 'cosine', 'test')

In [29]:
X_test = add_column(X_test, 'correlation', 'test')

In [30]:
X_test = add_column(X_test, 'canberra', 'test')

In [31]:
X_test = add_column(X_test, 'hausdorff', 'test')

In [32]:
X_test = add_column(X_test, 'cityblock', 'test')

In [33]:
X_test = add_column(X_test, 'euclidean', 'test')

In [34]:
X_test = add_column(X_test, 'l1', 'test')

In [35]:
X_test = add_column(X_test, 'l2', 'test')

In [36]:
X_test = add_column(X_test, 'manhattan', 'test')

In [37]:
X_test = add_column(X_test, 'minkowski', 'test')

In [38]:
X_test = add_column(X_test, 'sqeuclidean', 'test')

In [39]:
X_test = add_d2v_columns(X_test, 'd2v_1', 'test')

In [40]:
X_test = add_d2v_columns(X_test, 'd2v_2', 'test')

In [41]:
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_2_40,d2v_2_41,d2v_2_42,d2v_2_43,d2v_2_44,d2v_2_45,d2v_2_46,d2v_2_47,d2v_2_48,d2v_2_49
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0.305258,0.664617,0.653823,0.651883,160.494443,1.346215,...,0.880558,0.659818,0.085401,-1.700012,0.220587,0.333161,-0.566693,-0.165682,-0.177057,0.18985
224279,332326,332327,Will a breathing treatment help a cough?,How can I help someone that is unconscious but...,0.198065,0.7686,0.749187,0.748293,183.113717,1.331043,...,-0.496634,-0.664818,0.042327,-1.121576,-0.163005,0.535517,-0.23826,-0.126078,0.011768,0.816518
252452,336023,366789,Is Kellyanne Conway annoying in your opinion?,Did Kellyanne Conway really imply that we shou...,0.23727,0.855352,0.847278,0.846889,197.037204,0.935798,...,-0.583868,-0.684443,-0.20319,-0.320953,-0.070671,0.322278,-0.204754,-0.10076,-0.158545,0.328979
174039,268330,268331,How do you rate (1-10) and review Maruti Baleno?,What career options does one have after comple...,0.304343,0.901249,0.866577,0.866264,212.224992,1.479857,...,-0.823866,-1.09213,-0.09032,0.610005,-0.035683,0.464907,0.034878,-0.34224,-0.077522,0.43133
384863,28901,233483,What are some good books on marketing?,What are some of the best books ever written a...,0.196309,0.704832,0.655896,0.655889,169.794796,1.05539,...,0.010735,-0.959856,-0.059919,-1.411303,-0.118132,0.407064,0.144557,-0.190387,0.011818,0.880851


In [None]:
X_test[X_test.isnull().any(axis=1)]

### Fuzzy-wuzzy

In [42]:
# difference in text size
compute_size_diff = lambda row: abs(len(str(row['question1'])) - len(str(row['question2'])))
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
#X_train.head()

In [43]:
from fuzzywuzzy import fuzz

In [44]:
# ratio
compute_ratio = lambda row: fuzz.ratio(str(row['question1']), str(row['question2']))
X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
#X_train.head()

In [45]:
# partial ratio
compute_partial_ratio = lambda row: fuzz.partial_ratio(str(row['question1']), str(row['question2']))
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
#X_train.head()

In [46]:
# token_sort_ratio
compute_token_sort_ratio = lambda row: fuzz.token_sort_ratio(str(row['question1']), str(row['question2']))
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
#X_train.head()

In [47]:
# token_set_ratio
compute_token_set_ratio = lambda row: fuzz.token_set_ratio(str(row['question1']), str(row['question2']))
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
#X_train.head()

In [48]:
# build complete feature dataframe
#X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
#                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
#X_train_temp.head()

In [49]:
#X_train = pd.concat([X_train_temp, X_train], axis=1)
#del X_train_temp
X_train_final = X_train.drop(columns=['qid1', 'qid2','question1','question2']).dropna()
X_train_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270404 entries, 186150 to 121959
Columns: 118 entries, chebyshev to token_set_ratio
dtypes: float64(113), int64(5)
memory usage: 245.5 MB


In [50]:
X_train_final.tail(20)

Unnamed: 0_level_0,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,cityblock,euclidean,l1,l2,...,d2v_2_45,d2v_2_46,d2v_2_47,d2v_2_48,d2v_2_49,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
262915,0.354437,0.841399,0.81166,0.811295,197.35666,2.789291,26.665156,1.945469,26.665156,1.945469,...,9.956107e-309,1.045739e-308,1.0621199999999997e-308,1.073064e-308,1.078415e-308,31,42,43,41,46
64820,0.32648,0.891661,0.848653,0.848096,210.551333,1.366273,24.67385,1.787294,24.67385,1.787294,...,9.274098e-309,1.062903e-308,1.774837e-310,4.175239e-310,8.175838e-310,8,69,70,70,79
329367,0.291575,0.908435,0.88397,0.883678,209.881889,2.865792,20.321822,1.483544,20.321822,1.483544,...,5.647734e-309,5.79133e-309,5.919265e-309,6.152663e-309,6.341669e-309,190,22,49,25,43
41090,0.212105,0.693298,0.646222,0.644558,173.490028,0.885681,15.826455,1.160013,15.826455,1.160013,...,6.800763e-309,7.159232e-309,7.251433e-309,7.422699e-309,7.739513e-309,18,75,84,67,91
278169,0.295629,0.80906,0.769617,0.770032,191.028426,1.233788,21.154065,1.547494,21.154065,1.547494,...,3.617069e-309,4.09942e-309,4.850246e-309,5.137224000000003e-309,5.754937e-309,22,45,53,60,65
191336,0.384656,0.726543,0.710085,0.709904,174.365754,1.345086,28.713537,2.114356,28.713537,2.114356,...,8.227551e-309,1.062193e-308,1.072705e-308,1.1400060000000003e-308,2.553185e-310,12,47,51,52,56
175204,0.294775,0.756318,0.713284,0.713169,184.057866,1.000259,18.531281,1.367763,18.531281,1.367763,...,6.426655e-309,7.531048e-309,8.483888e-309,8.674379e-309,9.305885e-309,9,78,82,74,88
388471,0.362091,0.901343,0.869831,0.869676,210.967917,1.800405,23.381786,1.739984,23.381786,1.739984,...,2.601991e-310,2.853236e-310,7.976794e-310,1.144541e-309,1.267001e-309,20,43,43,52,56
374874,0.366401,0.939084,0.917431,0.917606,220.728527,2.864478,25.076347,1.845196,25.076347,1.845196,...,5.441837e-309,6.223007e-309,7.559355e-309,7.976158e-309,9.621978e-309,29,37,39,43,43
87498,0.300929,0.750685,0.725615,0.724199,182.587097,1.183038,21.832395,1.60885,21.832395,1.60885,...,7.743354e-309,7.905856e-309,7.991054e-309,8.582454e-309,8.811142e-309,39,73,95,80,97


##### Test set

In [51]:
# difference in text size
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
#X_test.head()

In [52]:
# ratio
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
#X_test.head()

In [53]:
# partial ratio
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
#X_test.head()

In [54]:
# token_sort_ratio
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
#X_test.head()

In [55]:
# token_set_ratio
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
#X_test.head()

In [56]:
X_test_final = X_test.drop(columns=['question1','question2', 'qid1', 'qid2']).dropna()
X_test_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133032 entries, 8067 to 346580
Columns: 118 entries, chebyshev to token_set_ratio
dtypes: float64(113), int64(5)
memory usage: 120.8 MB


In [57]:
X_test_final.head()

Unnamed: 0_level_0,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,cityblock,euclidean,l1,l2,...,d2v_2_45,d2v_2_46,d2v_2_47,d2v_2_48,d2v_2_49,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,0.305258,0.664617,0.653823,0.651883,160.494443,1.346215,18.418028,1.375237,18.418028,1.375237,...,0.333161,-0.566693,-0.165682,-0.177057,0.18985,0,88,88,81,90
224279,0.198065,0.7686,0.749187,0.748293,183.113717,1.331043,14.162174,1.044846,14.162174,1.044846,...,0.535517,-0.23826,-0.126078,0.011768,0.816518,23,33,38,48,55
252452,0.23727,0.855352,0.847278,0.846889,197.037204,0.935798,16.121011,1.185337,16.121011,1.185337,...,0.322278,-0.204754,-0.10076,-0.158545,0.328979,73,40,56,43,53
174039,0.304343,0.901249,0.866577,0.866264,212.224992,1.479857,21.203832,1.558916,21.203832,1.558916,...,0.464907,0.034878,-0.34224,-0.077522,0.43133,79,30,42,32,33
384863,0.196309,0.704832,0.655896,0.655889,169.794796,1.05539,14.301773,1.041679,14.301773,1.041679,...,0.407064,0.144557,-0.190387,0.011818,0.880851,23,69,61,66,88


# Modeling

### Logistic Regression

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
logr_model = LogisticRegression(random_state=42)
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             #'solver': ['lbfgs']
             #'max_iter': np.linspace(10, 1000, 10)
             }
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1)
y_train_final = y_train.loc[X_train_final.index]
logr_cv.fit(X_train_final, y_train_final)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07]), 'tol': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01])},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [59]:
logr_cv.best_params_

{'tol': 1e-05, 'C': 100.0}

In [60]:
logr_model = LogisticRegression(#solver=logr_cv.best_params_['solver'], 
                                random_state=42, 
                                C=logr_cv.best_params_['C'], 
                                tol=logr_cv.best_params_['tol'], 
                                #max_iter=logr_cv.best_params_['max_iter'], 
                                n_jobs=-1)
logr_model.fit(X_train_final, y_train_final)

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=42, solver='warn', tol=1e-05,
          verbose=0, warm_start=False)

In [61]:
logr_pred = logr_model.predict(X_test_final)
y_test_final = y_test.loc[X_test_final.index]

In [62]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
logr_acc_score = accuracy_score(y_test_final, logr_pred)
logr_prec_score = precision_score(y_test_final, logr_pred)
logr_rec_score = recall_score(y_test_final, logr_pred)
print('Logistic Regression')
print('accuracy score : {}'.format(logr_acc_score))
print('precision score : {}'.format(logr_prec_score))
print('recall score : {}'.format(logr_rec_score))

Logistic Regression
accuracy score : 0.6962460159961513
precision score : 0.6005188067444877
recall score : 0.528183503432541


### XGBoost

In [63]:
import xgboost as xgb
# Model selection
params_xgb = {'n_estimators' : [1, 2, 4, 8, 16, 32, 64, 100, 200],
               'gamma':np.linspace(.01, 1, 10, endpoint=True), 
               'learning_rate' : np.linspace(.01, 1, 10, endpoint=True),
               'reg_lambda': np.linspace(0.01, 10, 20, endpoint=True),
               'max_depth' : np.linspace(1, 32, 32, endpoint=True, dtype=int)
                 }
cv_xgb = RandomizedSearchCV(xgb.XGBClassifier(objective='binary:logistic', random_state=42), param_distributions=params_xgb, cv=5, n_jobs=3, random_state=42)
cv_xgb.fit(X_train_final, y_train_final)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=10, n_jobs=3,
          param_distributions={'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200], 'gamma': array([0.01, 0.12, 0.23, 0.34, 0.45, 0.56, 0.67, 0.78, 0.89, 1.  ]), 'learning_rate': array([0.01, 0.12, 0.23, 0.34, 0.45, 0.56, 0.67, 0.78, 0.89, 1.  ]), 'reg_lambda': array([ 0.01   ,  0.53579,  1.06158,  1.58737,  2...10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32])},
          pre_dispatch='2*n_jobs', random_state=42, refit

In [64]:
cv_xgb.best_params_

{'reg_lambda': 6.319473684210527,
 'n_estimators': 200,
 'max_depth': 29,
 'learning_rate': 0.23,
 'gamma': 0.23}

In [65]:
clf_xgb_model = xgb.XGBClassifier(random_state=42,
                                  n_estimators=cv_xgb.best_params_['n_estimators'],
                                  gamma=cv_xgb.best_params_['gamma'],
                                  learning_rate=cv_xgb.best_params_['learning_rate'],
                                  reg_lambda=cv_xgb.best_params_['reg_lambda'],
                                  max_depth=cv_xgb.best_params_['max_depth'])
clf_xgb_model.fit(X_train_final, y_train_final)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.23, learning_rate=0.23,
       max_delta_step=0, max_depth=29, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=6.319473684210527, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [66]:
y_pred_xgb = clf_xgb_model.predict(X_test_final)
score_xgb = accuracy_score(y_test_final, y_pred_xgb)
rscore_xgb = recall_score(y_test_final, y_pred_xgb)
pscore_xgb = precision_score(y_test_final, y_pred_xgb)
print('Accuracy score for XGBoost ', score_xgb)
print('Recall score for XGBoost ', rscore_xgb)
print('Precision score for XGBoost ', pscore_xgb)

Accuracy score for XGBoost  0.8154504179445546
Recall score for XGBoost  0.736600867811526
Precision score for XGBoost  0.7567810799497697
