In [201]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np
import pickle
from scipy.io import mmwrite, mmread
from joblib import dump, load

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from toolz import partition_all

In [4]:
import os

#INPUT_BUCKET = 'dq-data'
# Root folder for the input CSV and every cached pickle written by this notebook.
# Other cells build file names by plain string concatenation (data_folder+'x.p'),
# so the value must end with a '/'.
# Set the DQ_DATA_FOLDER environment variable to point at another machine's copy;
# the original local path stays the default so existing runs behave the same.
data_folder = os.environ.get('DQ_DATA_FOLDER',
                             '/media/siri/78C6823EC681FD1E/minio/data/dq-data/')
if not data_folder.endswith('/'):
    data_folder += '/'
#HASH_BUCKET = 'dq-hashed'

In [None]:
# Load the labelled question pairs from the local data folder.
# (An earlier variant streamed the file with ps.get_file_stream(bucket=INPUT_BUCKET, filename=data).)
data = 'train.csv'
filestream = data_folder+data
# Columns to keep. Only the keys are used (as `usecols`); the dtype strings are
# not passed to read_csv, so pandas infers the column types itself.
dtypes = dict(
    id='int64',
    qid1='int64',
    qid2='int64',
    question1='object',
    question2='object',
    is_duplicate='int64',
)
# Read, index by the pair id, then discard every row that has a missing value.
df = (
    pd.read_csv(
        filestream,
        header=0,
        usecols=dtypes.keys(),
        skipinitialspace=True,
        skip_blank_lines=True,
        encoding='utf-8',
    )
    .set_index('id')
    .dropna()
)

In [None]:
#del df

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# The duplicate flag is the target; every other column stays in the feature frame.
y = df['is_duplicate']
X = df.drop('is_duplicate', axis=1)
# Hold out 33% of the pairs; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

In [None]:
X_train.info()

In [None]:
import pickle
# Cache the four splits so later sessions can reload them without re-reading the CSV.
# Each file is opened in a `with` block so it is flushed and closed right after the dump
# (the original left the handles returned by open() unclosed).
with open(data_folder+'X_train.p', 'wb') as _fh:
    pickle.dump(X_train, _fh)
with open(data_folder+'y_train.p', 'wb') as _fh:
    pickle.dump(y_train, _fh)
with open(data_folder+'X_test.p', 'wb') as _fh:
    pickle.dump(X_test, _fh)
with open(data_folder+'y_test.p', 'wb') as _fh:
    pickle.dump(y_test, _fh)

In [202]:
# Reload the cached feature splits; `with` closes each file as soon as it is read.
with open(data_folder+'X_train.p', 'rb') as _fh:
    X_train = pickle.load(_fh)
with open(data_folder+'X_test.p', 'rb') as _fh:
    X_test = pickle.load(_fh)

In [203]:
# Reload the cached label splits; `with` closes each file as soon as it is read.
with open(data_folder+'y_train.p', 'rb') as _fh:
    y_train = pickle.load(_fh)
with open(data_folder+'y_test.p', 'rb') as _fh:
    y_test = pickle.load(_fh)

In [None]:
del X,y,df

In [None]:
del X_train,X_test

In [None]:
del y_train, y_test

### Memory Check

In [204]:
import sys
# Names that IPython itself puts into the namespace, plus the name of this list.
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Display (name, size) pairs for the remaining globals, largest first.
# Skipped: names starting with '_', names that are keys of sys.modules, and ipython_vars.
# The size is whatever sys.getsizeof reports for the object (in bytes).
sorted([(x, sys.getsizeof(globals().get(x))) 
        for x in dir() if not x.startswith('_') 
        and x not in sys.modules and x not in ipython_vars], 
       key=lambda x: x[1], reverse=True)

[('X_train_final', 84366072),
 ('X_train', 70341145),
 ('X_test_final', 41506008),
 ('X_test', 34641564),
 ('y_train', 4333976),
 ('y_train_final', 4326488),
 ('y_test', 2134664),
 ('y_test_final', 2128536),
 ('logr_pred', 1064352),
 ('y_pred_xgb', 1064352),
 ('STOPWORDS', 8416),
 ('kbest_scores', 4440),
 ('variances', 4136),
 ('LogisticRegression', 1464),
 ('GridSearchCV', 1056),
 ('HTTPResponse', 1056),
 ('RandomizedSearchCV', 1056),
 ('SelectKBest', 1056),
 ('VarianceThreshold', 1056),
 ('Dict', 888),
 ('List', 888),
 ('Tuple', 888),
 ('var_df', 408),
 ('RE_PUNCT', 204),
 ('RE_TAGS', 196),
 ('RE_WHITESPACE', 176),
 ('RE_NUMERIC', 164),
 ('accuracy_score', 136),
 ('add_column', 136),
 ('add_d2v_columns', 136),
 ('chi2', 136),
 ('classification_report', 136),
 ('compute_partial_ratio', 136),
 ('compute_ratio', 136),
 ('compute_size_diff', 136),
 ('compute_token_set_ratio', 136),
 ('compute_token_sort_ratio', 136),
 ('dump', 136),
 ('f_classif', 136),
 ('load', 136),
 ('mmread', 136),


In [None]:
#del y_train, y_test

# Feature Extraction

### Tokenizing and preprocessing

In [None]:
from gensim.parsing.preprocessing import preprocess_string
def get_tokens(process='train'):
    """Yield the preprocessed token list of every question in one split.

    Reads the notebook globals ``X_test`` (when ``process == 'test'``) or
    ``X_train`` (any other value). All ``question1`` entries are emitted
    first, followed by all ``question2`` entries, so exactly ``2 * len(X)``
    token lists are produced.

    Parameters
    ----------
    process : str
        ``'test'`` selects the test split; anything else selects train.

    Yields
    ------
    The result of gensim's ``preprocess_string`` for each question.
    """
    X = X_test if process == 'test' else X_train
    series = pd.Series(pd.concat([X['question1'], X['question2']]), dtype=str)
    # The original called `series.dropna()` here and discarded the result, so no
    # row was ever dropped. The no-op is removed rather than "fixed": later cells
    # split the stacked output by position (first len(X) rows = question1), which
    # only works if every question produces exactly one item.
    for question in series:
        yield preprocess_string(question)

### Word2Vec (fasttext)

In [None]:
from gensim.models import FastText
# Load the pre-trained fastText binary `cc.en.300.bin` from the data folder.
# NOTE(review): `FastText.load_fasttext_format` is reported as deprecated in newer
# gensim releases (replacement: gensim.models.fasttext.load_facebook_model) —
# confirm against the installed gensim version before upgrading.
model = FastText.load_fasttext_format(data_folder+'cc.en.300.bin')

In [None]:
def get_ft_vectors(model, process):
    """Yield, per question, an array of the word vectors of its known tokens.

    Parameters
    ----------
    model : object exposing ``model.wv[token]`` (the loaded FastText model).
    process : str
        Split selector forwarded to ``get_tokens`` (``'test'`` or train).

    Yields
    ------
    numpy.ndarray
        One row per token that has a vector; an empty array when no token
        of the question could be looked up.
    """
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vectors.append(model.wv[token])
            except KeyError:
                # Token has no vector in the model: skip it.
                # Narrowed from a bare `except:` so that KeyboardInterrupt and
                # genuine errors are no longer silently swallowed.
                # NOTE(review): assumes the lookup signals a miss with KeyError — confirm for the installed gensim.
                continue
        yield np.array(vectors)

In [None]:
def get_tfidf_for_valid_vectors(model, process):
    """Yield, per question, an array of the tokens that have a word vector.

    Mirrors ``get_ft_vectors`` but keeps the token strings instead of the
    vectors, so the TF-IDF vocabulary only contains tokens that can later be
    looked up in ``model.wv``.

    Parameters
    ----------
    model : object exposing ``model.wv[token]`` (the loaded FastText model).
    process : str
        Split selector forwarded to ``get_tokens`` (``'test'`` or train).

    Yields
    ------
    numpy.ndarray
        The question's tokens for which the lookup succeeded (may be empty).
    """
    for tokens in get_tokens(process):
        tf_idf_tokens = []
        for token in tokens:
            try:
                # The lookup is done only to test availability; the vector is not kept.
                model.wv[token]
            except KeyError:
                # Narrowed from a bare `except:` so interrupts and real errors propagate.
                # NOTE(review): assumes the lookup signals a miss with KeyError — confirm for the installed gensim.
                continue
            tf_idf_tokens.append(token)
        yield np.array(tf_idf_tokens)

In [None]:
#X_ft = np.array([vectors for vectors in get_ft_vectors(model,'train')])
#X_ft.shape

In [None]:
# split back into two
#X1_ft = X_ft[:len(X_train)]
#X2_ft = X_ft[len(X_train):]

##### Test set

In [None]:
#X_ft_test = np.array([vectors for vectors in get_ft_vectors(model,'test')])

In [None]:
#X_ft_test.shape

In [None]:
#del model

In [None]:
# split back into two
#X1_ft_test = X_ft_test[:len(X_test)]
#X2_ft_test = X_ft_test[len(X_test):]

### TFIDF and Word2Vec

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def pass_through(x):
    """Identity analyzer: the documents fed to the vectorizer are already token arrays."""
    return x

# A named function (instead of a lambda bound to a name) can be pickled by
# reference, so the fitted vectorizer can be saved with pickle/joblib.
tfidf = TfidfVectorizer(analyzer=pass_through)
# Fit on the train questions only (all question1 followed by all question2).
X_trfmd = tfidf.fit_transform(get_tfidf_for_valid_vectors(model,'train'))

In [None]:
X_trfmd

In [None]:
# split back into two
# get_tokens() emits every question1 followed by every question2, so the first
# len(X_train) rows belong to question1 and the remaining rows to question2.
X1_trfmd = X_trfmd[:len(X_train)]
X2_trfmd = X_trfmd[len(X_train):]

In [None]:
def get_weights_and_w2vectors(tfidf_matrix, tfidf_vectorizer, w2v_model):
    """Collect, per document, its non-zero TF-IDF weights and the matching word vectors.

    Parameters
    ----------
    tfidf_matrix : scipy sparse matrix, one row per document.
    tfidf_vectorizer : fitted vectorizer; only ``vocabulary_`` (term -> column) is read.
    w2v_model : object exposing ``w2v_model.wv[term]``.

    Returns
    -------
    (weights, w2v) : tuple of 1-D object arrays of length ``n_documents``.
        ``weights[i]`` holds the non-zero TF-IDF values of document ``i`` and
        ``w2v[i]`` the word vectors of the same terms, in the same order.
        Both are empty arrays for a document with no non-zero entry.
    """
    inverse_vocab_dict = {v: k for k, v in tfidf_vectorizer.vocabulary_.items()}
    rows = tfidf_matrix.shape[0]
    # Documents have different numbers of terms, so the result is ragged.
    # Pre-allocated object arrays hold one ndarray per document; calling
    # np.array() on the ragged lists is rejected by current NumPy releases.
    weights = np.empty(rows, dtype=object)
    w2v = np.empty(rows, dtype=object)
    for doc in range(rows):
        # Slice the row once instead of indexing the full matrix per element.
        row = tfidf_matrix[doc, :]
        features = row.nonzero()[1]
        weights[doc] = np.array([row[0, x] for x in features])
        w2v[doc] = np.array([w2v_model.wv[inverse_vocab_dict[x]] for x in features])
    return weights, w2v

In [None]:
X1_w, X1 = get_weights_and_w2vectors(X1_trfmd, tfidf, model)
X1_w.shape

In [None]:
X1.shape

In [None]:
X1_w.shape

In [None]:
X2_w, X2 = get_weights_and_w2vectors(X2_trfmd, tfidf, model)
X2_w.shape

In [None]:
# Cache the per-question TF-IDF weights; `with` flushes and closes each file after the dump.
with open(data_folder+'X1_w.p','wb') as _fh:
    pickle.dump(X1_w, _fh)
with open(data_folder+'X2_w.p','wb') as _fh:
    pickle.dump(X2_w, _fh)

In [None]:
# Cache the per-question word vectors; `with` flushes and closes each file after the dump.
with open(data_folder+'X1.p','wb') as _fh:
    pickle.dump(X1, _fh)
with open(data_folder+'X2.p','wb') as _fh:
    pickle.dump(X2, _fh)

In [None]:
X1_w[420].shape

In [None]:
X1[420].shape

In [None]:
del X1_w, X2_w, X1, X2

In [None]:
# dimension reduction using SVD
#from sklearn.decomposition import TruncatedSVD
#import time
#start = time.time()
#svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
#X_svd = svd.fit_transform(X_trfmd)
#end =  time.time()
#print('created SVD transform in time {}'.format(end-start))

In [None]:
#X_svd.shape

In [None]:
# split back into two
#X1 = X_svd[:len(X_train), :]
#X2 = X_svd[len(X_train):, :]

##### Test set

In [None]:
X_test_trfmd = tfidf.transform(get_tfidf_for_valid_vectors(model, 'test'))

In [None]:
X_test_trfmd

In [None]:
# split back into two
# get_tokens('test') emits every question1 followed by every question2, so the first
# len(X_test) rows belong to question1 and the remaining rows to question2.
X1_test_trfmd = X_test_trfmd[:len(X_test)]
X2_test_trfmd = X_test_trfmd[len(X_test):]

In [None]:
X1_test_w, X1_test = get_weights_and_w2vectors(X1_test_trfmd, tfidf, model)
X1_test_w.shape

In [None]:
X2_test_w, X2_test = get_weights_and_w2vectors(X2_test_trfmd, tfidf, model)
X2_test_w.shape

In [None]:
# Cache the test-split TF-IDF weights; `with` flushes and closes each file after the dump.
with open(data_folder+'X1_test_w.p','wb') as _fh:
    pickle.dump(X1_test_w, _fh)
with open(data_folder+'X2_test_w.p','wb') as _fh:
    pickle.dump(X2_test_w, _fh)

In [None]:
# Cache the test-split word vectors; `with` flushes and closes each file after the dump.
with open(data_folder+'X1_test.p','wb') as _fh:
    pickle.dump(X1_test, _fh)
with open(data_folder+'X2_test.p','wb') as _fh:
    pickle.dump(X2_test, _fh)

In [None]:
del X1_test_w, X2_test_w, X1_test, X2_test

In [None]:
# dimension reduction using SVD
#start = time.time()
#X_test_svd = svd.transform(X_test_trfmd)
#end =  time.time()
#print('created SVD transform in time {}'.format(end-start))

In [None]:
# split back into two
#X1_test = X_test_svd[:len(X_test), :]
#X2_test = X_test_svd[len(X_test):, :]

In [None]:
# build complete feature dataframe
#X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
#                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
#X_test_temp.head()

### Pairwise Metrics

In [None]:
#def get_q_lengths(X):
#    #q_meta = []
#    for q in X:
#        #q_meta.append(len(q))
#        yield len(q)
#    #return q_meta

In [None]:
#q_meta_train = [(q1_len, q2_len) for q1_len, q2_len in zip(get_q_lengths(X1_ft), get_q_lengths(X2_ft))]

In [None]:
#pickle.dump(q_meta_train, open(data_folder+'q_meta_train.p', 'wb'))

In [None]:
#ps.copy_file(dest_bucket=INPUT_BUCKET, file='q_meta_train.p', source='q_meta_train.p')

In [None]:
#X_train_300 = np.concatenate( 
#    np.vstack( [np.array(np.vsplit(y, y.shape[0])) for y in (x for x in X_ft if x.size>0)] )
#)

In [None]:
#X_train_300.shape

In [None]:
#mmwrite( data_folder+'wor2vec_300_full_train.mtx', X_train_300 )
#np.savez(data_folder+'wor2vec_300_full_train', data=X_train_300)

In [None]:
#ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_full_train.mtx', source='wor2vec_300_full_train.mtx')

In [None]:
#del X_ft

In [None]:
#ps.get_file(bucket=INPUT_BUCKET, filename='embed2_full_train.mtx', filepath='embed2_full_train.mtx')

In [None]:
#X_rd = mmread('embed2_full_train.mtx')

In [None]:
#X_rd.shape

In [None]:
#ps.get_file(bucket=INPUT_BUCKET, filename='q_meta_train.p', filepath='q_meta_train.p')

In [None]:
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist, directed_hausdorff
from fastdtw import fastdtw
import similaritymeasures
from scipy.spatial import procrustes
def compute_pairwise_kernel(pc1, pc2, w1, w2, method='linear'):
    """Weighted average of a pairwise kernel between two sets of word vectors.

    Parameters
    ----------
    pc1, pc2 : arrays of word vectors, one row per token.
    w1, w2 : per-token weights for ``pc1`` and ``pc2``.
    method : 'polynomial' (degree 2), 'rbf', 'sigmoid' or 'laplacian';
        any other value selects the linear kernel.

    Returns
    -------
    ``np.nan`` when either vector set has no elements, otherwise the average
    of the kernel matrix with entry (i, j) weighted by ``w1[i] * w2[j]``.
    """
    if 0 in (pc1.size, pc2.size):
        return np.nan
    kernels = {
        'polynomial': lambda a, b: polynomial_kernel(a, b, 2),
        'rbf': rbf_kernel,
        'sigmoid': sigmoid_kernel,
        'laplacian': laplacian_kernel,
    }
    kernel_fn = kernels.get(method, linear_kernel)
    gram = kernel_fn(pc1, pc2)
    pair_weights = np.outer(w1, w2)
    return np.average(gram, weights=pair_weights)
    
def compute_pairwise_dist(pc1, pc2, w1, w2, method='euclidean'):
    """Distance between two sets of word vectors.

    Parameters
    ----------
    pc1, pc2 : arrays of word vectors, one row per token.
    w1, w2 : per-token weights for ``pc1`` and ``pc2`` (unused for 'hausdorff').
    method : 'hausdorff' for the directed Hausdorff distance from ``pc1`` to
        ``pc2``; any other value is passed as ``metric`` to ``pairwise_distances``.

    Returns
    -------
    ``np.nan`` when either vector set has no elements; the directed Hausdorff
    distance for 'hausdorff'; otherwise the average of the pairwise distance
    matrix with entry (i, j) weighted by ``w1[i] * w2[j]``.
    """
    if not pc1.size or not pc2.size:
        return np.nan
    if method == 'hausdorff':
        hausdorff_dist, _, _ = directed_hausdorff(pc1, pc2)
        return hausdorff_dist
    distances = pairwise_distances(pc1, pc2, metric=method)
    return np.average(distances, weights=np.outer(w1, w2))

def compute_weighted_mean(pc, w):
    """Return the average of the rows of ``pc``, each row weighted by its entry in ``w``."""
    weighted_centroid = np.average(pc, weights=w, axis=0)
    return weighted_centroid

def compute_pairwise_metric(pc1, pc2, method='dtw'):
    """Curve-similarity measure between the first two dimensions of two vector sets.

    Parameters
    ----------
    pc1, pc2 : 2-D arrays; only columns 0 and 1 are passed to the measure.
    method : one of 'pcm', 'discrete_frechet', 'area', 'curve_length', 'dtw'.
        ('fdtw' and 'procrustes' were sketched in the original but never implemented.)

    Returns
    -------
    ``np.nan`` when either input has no elements, otherwise the value
    returned by the selected ``similaritymeasures`` function.

    Raises
    ------
    ValueError
        For an unsupported ``method``. The original fell through to
        ``return dist`` with ``dist`` unassigned and raised UnboundLocalError.
    """
    if pc1.size == 0 or pc2.size == 0:
        return np.nan
    supported = ('pcm', 'discrete_frechet', 'area', 'curve_length', 'dtw')
    if method not in supported:
        raise ValueError(
            'unsupported method {!r}; expected one of {}'.format(method, supported))
    curve1, curve2 = pc1[:, :2], pc2[:, :2]
    if method == 'pcm':
        return similaritymeasures.pcm(curve1, curve2)
    if method == 'discrete_frechet':
        return similaritymeasures.frechet_dist(curve1, curve2)
    if method == 'area':
        return similaritymeasures.area_between_two_curves(curve1, curve2)
    if method == 'curve_length':
        return similaritymeasures.curve_length_measure(curve1, curve2)
    # method == 'dtw': the second returned value is discarded
    dist, _ = similaritymeasures.dtw(curve1, curve2)
    return dist

        
def assign_pwmetric(df, method='euclidean'):
    """Apply ``compute_pairwise_dist`` to every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        NOTE(review): assumed to hold exactly the four positional inputs of
        ``compute_pairwise_dist`` per row, in the order (pc1, pc2, w1, w2) —
        confirm against the caller; no caller is visible in this notebook.
    method : str
        Forwarded to ``compute_pairwise_dist``.

    Returns
    -------
    pandas.Series with one value per row of ``df``.
    """
    # The original was `df.apply(compute_pairwise_dist, method, axis=1)`: `method`
    # landed in DataFrame.apply's positional `axis` slot while `axis=1` was also
    # given by keyword, so the call always raised TypeError.
    return df.apply(lambda row: compute_pairwise_dist(*row, method=method), axis=1)

In [None]:
def compute_delayed(X1, X2, X1_w, X2_w, method):
    """Compute one pairwise score per question pair through dask.

    Parameters
    ----------
    X1, X2 : sequences of per-question word-vector arrays.
    X1_w, X2_w : sequences of the matching per-token weights.
    method : str
        'polynomial', 'rbf', 'sigmoid', 'laplacian' and 'linear' are routed to
        ``compute_pairwise_kernel``; every other value to ``compute_pairwise_dist``.

    Returns
    -------
    tuple
        The computed scores, in input order. The four inputs are zipped, so
        the result is as long as the shortest of them.

    Notes
    -----
    ``delayed`` and ``compute`` are imported from dask in a later cell, which
    must have been run before this function is called.
    """
    kernel_methods = ('polynomial', 'rbf', 'sigmoid', 'laplacian', 'linear')
    pair_fn = compute_pairwise_kernel if method in kernel_methods else compute_pairwise_dist
    # The original guarded each zipped tuple with `if q_tuple:`; a 4-tuple is
    # always truthy, so its `delayed(np.nan)` fallback was unreachable and is dropped.
    tasks = [
        delayed(pair_fn)(q1_rd, q2_rd, q1_w, q2_w, method)
        for q1_rd, q2_rd, q1_w, q2_w in zip(X1, X2, X1_w, X2_w)
    ]
    return compute(*tasks)

In [None]:
def create_nan_array(r,c):
    """Return a float array of shape ``(r, c)`` in which every entry is NaN."""
    return np.full((r, c), np.nan)

In [None]:
def pickle_and_del(obj, file, data_folder=data_folder):
    """Pickle ``obj`` to ``data_folder + file + '.p'``.

    Parameters
    ----------
    obj : any picklable object.
    file : str
        File name without the ``.p`` extension.
    data_folder : str
        Folder prefix, concatenated as-is (must end with a path separator).

    Notes
    -----
    Despite the name, this function cannot free the caller's object: the
    original ``del obj`` only removed the local name, which goes away on
    return anyway. Callers that want the memory back must drop their own
    references.
    """
    # `with` guarantees the file is flushed and closed even if pickling fails.
    with open(data_folder+file+'.p', 'wb') as fh:
        pickle.dump(obj, fh)
In [None]:
def compute_delayed_wmean(X, X_w, file, data_folder=data_folder, dim=300):
    """Compute the weighted mean vector of every question through dask and pickle the result.

    Parameters
    ----------
    X : sequence of per-question word-vector arrays.
    X_w : sequence of the matching per-token weights.
    file : str
        Output file name without extension (see ``pickle_and_del``).
    data_folder : str
        Folder prefix for the output file.
    dim : int
        Width of the NaN placeholder used when a question's weights sum to
        zero (which includes a question with no tokens). Defaults to 300,
        the value that was hard-coded before.

    Returns
    -------
    None. The tuple of computed results is written to disk.

    Notes
    -----
    Placeholders have shape ``(1, dim)`` while ``compute_weighted_mean``
    averages over axis 0, so consumers should reshape each entry
    (``add_d2v_columns`` does so with ``reshape(1, -1)``).
    ``delayed`` and ``compute`` come from dask and are imported in a later cell.
    """
    tasks = []
    # The original `if q_tuple:` guard was always true (a 2-tuple is truthy),
    # so only the weight-sum check is kept.
    for q_rd, q_w in zip(X, X_w):
        if np.sum(q_w) != 0:
            tasks.append(delayed(compute_weighted_mean)(q_rd, q_w))
        else:
            tasks.append(delayed(create_nan_array)(1, dim))
    # The task list is passed to compute() directly; wrapping the Delayed
    # objects in a numpy array first served no purpose.
    pickle_and_del(compute(*tasks), file, data_folder)

In [None]:
def compute_and_save(X1, X2, X1_w, X2_w, method, file, data_folder=data_folder):
    """Run ``compute_delayed`` for ``method`` and pickle its result as ``data_folder + file + '.p'``."""
    pickle_and_del(
        compute_delayed(X1, X2, X1_w, X2_w, method),
        file,
        data_folder,
    )

In [None]:
#q_meta_train = pickle.load(open(data_folder+'q_meta_train.p','rb'))

In [None]:
#len(q_meta_train)

In [None]:
#mmwrite( data_folder+'wor2vec_300_full_train.mtx', X_train_300 )
#X_train_300 = np.load(data_folder+'wor2vec_300_full_train.npz')['data']

In [None]:
#X_train_300.shape

In [None]:
# rebuild X1_rd and X2_rd
#X1_list = []
#X2_list = []
#X1_rd_list = []
#X2_rd_list = []
#q1_ptr = 0
#for len_q1, _ in q_meta_train:
#    q1 = np.array(X_train_300[q1_ptr:q1_ptr+len_q1])
#    #q1_rd = np.array(X_rd[q1_ptr:q1_ptr+len_q1])
#    X1_list.append(q1)
#    #X1_rd_list.append(q1_rd)
#    q1_ptr = q1_ptr+len_q1
#q2_ptr = q1_ptr
#for _, len_q2 in q_meta_train:
#    q2 = np.array(X_train_300[q2_ptr:q2_ptr+len_q2])
#    #q2_rd = np.array(X_rd[q2_ptr:q2_ptr+len_q2])
#    X2_list.append(q2)
#    #X2_rd_list.append(q2_rd)
#    q2_ptr = q2_ptr+len_q2
#X1 = np.array(X1_list)
#X2 = np.array(X2_list)
#X1_rd = np.array(X1_rd_list)
#X2_rd = np.array(X2_rd_list)

In [None]:
#del X1_list, X2_list, X_train_300, q_meta_train

#### Initialize Dask

In [None]:
import dask.dataframe as dd
# `delayed` and `compute` are the names used by compute_delayed() and
# compute_delayed_wmean(), which are defined in earlier cells; this cell has to
# run before either of them is called.
from dask import delayed, compute
from dask.distributed import Client
# NOTE(review): this rebinds the name `dask` to the project module utils.dask,
# hiding the dask package under that name for the rest of the session.
from utils import dask
# Start the dask client through the project helper, requesting 8 workers.
client = dask.create_dask_client(num_workers=8)

#### Compute Features

In [None]:
# Reload the cached train TF-IDF weights; `with` closes each file as soon as it is read.
with open(data_folder+'X1_w.p','rb') as _fh:
    X1_w = pickle.load(_fh)
with open(data_folder+'X2_w.p','rb') as _fh:
    X2_w = pickle.load(_fh)

In [None]:
# Reload the cached train word vectors; `with` closes each file as soon as it is read.
with open(data_folder+'X1.p','rb') as _fh:
    X1 = pickle.load(_fh)
with open(data_folder+'X2.p','rb') as _fh:
    X2 = pickle.load(_fh)

In [None]:
def compute_wmean(X, X_w):
    """Eager (non-dask) weighted mean vector per question, for quick checks.

    Parameters
    ----------
    X : sequence of per-question word-vector arrays.
    X_w : sequence of the matching per-token weights.

    Returns
    -------
    numpy.ndarray with one row per question. A question whose weights sum to
    zero (which includes a question with no tokens) yields a row of NaN, the
    same rule ``compute_delayed_wmean`` applies; the original raised
    ZeroDivisionError from ``np.average`` in that case.
    """
    rows = []
    # The original `if q_tuple:` guard was always true (a 2-tuple is truthy),
    # so its NaN branch could never run; the weight-sum check replaces it.
    for q_rd, q_w in zip(X, X_w):
        if np.sum(q_w) != 0:
            rows.append(compute_weighted_mean(q_rd, q_w))
        else:
            # Flattened so the placeholder stacks with the 1-D means.
            # 300 is the placeholder width used by compute_delayed_wmean as well.
            rows.append(create_nan_array(1, 300).ravel())
    return np.array(rows)

In [None]:
temp = compute_wmean(X1[:1],X1_w[:1])

In [None]:
temp.shape

In [None]:
#jaccard = compute(*jaccard)

In [None]:
#chebyshev = compute(*chebyshev)
#compute_and_save(X1, X2, 'chebyshev', 'chebyshev_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'chebyshev', 'chebyshev_train_w', data_folder)

In [None]:
#braycurtis = compute(*braycurtis)
#compute_and_save(X1, X2, 'braycurtis', 'braycurtis_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'braycurtis', 'braycurtis_train_w', data_folder)

In [None]:
#cosine = compute(*cosine)
#compute_and_save(X1, X2, 'cosine', 'cosine_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'cosine', 'cosine_train_w', data_folder)

In [None]:
#correlation = compute(*correlation)
#compute_and_save(X1, X2, 'correlation', 'correlation_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'correlation', 'correlation_train_w', data_folder)

In [None]:
#hamming = compute(*hamming)

In [None]:
#canberra = compute(*canberra)
#compute_and_save(X1, X2, 'canberra', 'canberra_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'canberra', 'canberra_train_w', data_folder)

In [None]:
#hausdorff = compute(*hausdorff)
#compute_and_save(X1, X2, 'hausdorff', 'hausdorff_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'hausdorff', 'hausdorff_train_w', data_folder)

In [None]:
#cityblock = compute(*cityblock)
#compute_and_save(X1, X2, 'cityblock', 'cityblock_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'cityblock', 'cityblock_train_w', data_folder)

In [None]:
#euclidean = compute(*euclidean)
#compute_and_save(X1, X2, 'euclidean', 'euclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'euclidean', 'euclidean_train_w', data_folder)

In [None]:
#l1 = compute(*l1)
#compute_and_save(X1, X2, 'l1', 'l1_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'l1', 'l1_train_w', data_folder)

In [None]:
#l2 = compute(*l2)
#compute_and_save(X1, X2, 'l2', 'l2_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'l2', 'l2_train_w', data_folder)

In [None]:
#manhattan = compute(*manhattan)
#compute_and_save(X1, X2, 'manhattan', 'manhattan_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'manhattan', 'manhattan_train_w', data_folder)

In [None]:
#dice = compute(*dice)

In [None]:
#kulsinski = compute(*kulsinski)

In [None]:
#rogerstanimoto = compute(*rogerstanimoto)

In [None]:
#russellrao = compute(*russellrao)

In [None]:
#sokalmichener = compute(*sokalmichener)

In [None]:
#minkowski = compute(*minkowski)
#compute_and_save(X1, X2, 'minkowski', 'minkowski_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'minkowski', 'minkowski_train_w', data_folder)

In [None]:
#seuclidean = compute(*seuclidean)
#compute_and_save(X1, X2, 'seuclidean', 'seuclidean_train', data_folder)

In [None]:
#sokalsneath = compute(*sokalsneath)

In [None]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'sqeuclidean', 'sqeuclidean_train_w', data_folder)

In [None]:
# polynomial kernel (degree 2), TF-IDF weighted, for the train question pairs
compute_and_save(X1, X2, X1_w, X2_w, 'polynomial', 'polynomial_train_w', data_folder)

In [None]:
# RBF kernel, TF-IDF weighted, for the train question pairs
compute_and_save(X1, X2, X1_w, X2_w, 'rbf', 'rbf_train_w', data_folder)

In [None]:
# sigmoid kernel, TF-IDF weighted, for the train question pairs
compute_and_save(X1, X2, X1_w, X2_w, 'sigmoid', 'sigmoid_train_w', data_folder)

In [None]:
# linear kernel, TF-IDF weighted, for the train question pairs
compute_and_save(X1, X2, X1_w, X2_w, 'linear', 'linear_train_w', data_folder)

In [None]:
# laplacian kernel, TF-IDF weighted, for the train question pairs
compute_and_save(X1, X2, X1_w, X2_w, 'laplacian', 'laplacian_train_w', data_folder)

In [None]:
# TF-IDF weighted mean word vector of every question1 in the train split
compute_delayed_wmean(X1, X1_w, 'weighted_mean1_train', data_folder)

In [None]:
# TF-IDF weighted mean word vector of every question2 in the train split
compute_delayed_wmean(X2, X2_w, 'weighted_mean2_train', data_folder)

In [None]:
#fdtw = compute(*fdtw)

In [None]:
#dtw = compute(*dtw)

In [None]:
#pcm = compute(*pcm)

In [None]:
#area = compute(*area)

In [None]:
#curve_length = compute(*curve_length)

In [None]:
#discrete_frechet = compute(*discrete_frechet)

In [None]:
#procrustes = compute(*procrustes)

In [None]:
del X1, X2, X1_w, X2_w

#### add above metrics to X_train

In [7]:
def add_column(df, column, train_or_test, data_folder=data_folder):
    """Return ``df`` with one cached pairwise metric appended as a new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; its index is reused for the new column, so the
        cached values must be in the same row order.
    column : str
        Metric name; also the name of the new column.
    train_or_test : str
        Split tag used in the file name.
    data_folder : str
        Folder prefix, concatenated as-is.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. The values are read from
        ``data_folder + column + '_' + train_or_test + '_w.p'``.
    """
    # `with` closes the pickle file as soon as it has been read.
    with open(data_folder+column+'_'+train_or_test+'_w.p', 'rb') as fh:
        col_arr = pickle.load(fh)
    new_col = pd.Series(col_arr, name=column, index=df.index)
    return pd.concat([df, new_col], axis=1)

In [266]:
def add_d2v_columns(df, d2v, train_or_test, red_type='umap', data_folder=data_folder):
    """Return ``df`` with a block of cached document-vector columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; its index is reused for the new columns.
    d2v : str
        Prefix of the new column names (``d2v + '_' + i``). For 'svd' and
        'umap' it is also the stem of the file that is loaded.
    train_or_test : str
        Split tag used in the file names.
    red_type : str
        'svd'  -> load ``d2v + '_' + train_or_test + '_svd_red.p'``;
        'umap' -> load ``d2v + '_' + train_or_test + '_red.p'``;
        anything else -> load ``weighted_mean1_<split>.p`` and
        ``weighted_mean2_<split>.p`` and place them side by side.
    data_folder : str
        Folder prefix, concatenated as-is.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    def _load(file):
        # `with` closes the pickle file as soon as it has been read.
        with open(data_folder+file, 'rb') as fh:
            return pickle.load(fh)

    if red_type in ['svd', 'umap']:
        suffix = '_svd_red.p' if red_type == 'svd' else '_red.p'
        col_arr = _load(d2v+'_'+train_or_test+suffix)
    else:
        halves = [_load('weighted_mean'+n+'_'+train_or_test+'.p') for n in ('1', '2')]
        # Each entry is reshaped to a single row (the cached means and the NaN
        # placeholders differ in shape), stacked per question side, then the
        # two sides are joined column-wise.
        col_arr = np.hstack([
            np.concatenate([x.reshape(1, -1) for x in half]) for half in halves
        ])
    features = pd.DataFrame(
        col_arr,
        columns=[d2v+'_'+str(i) for i in range(col_arr.shape[1])],
        index=df.index,
    )
    return pd.concat([df, features], axis=1)

In [206]:
X_train = add_column(X_train, 'chebyshev', 'train')

In [207]:
X_train = add_column(X_train, 'braycurtis', 'train')

In [208]:
X_train = add_column(X_train, 'cosine', 'train')

In [209]:
X_train = add_column(X_train, 'correlation', 'train')

In [210]:
X_train = add_column(X_train, 'canberra', 'train')

In [211]:
X_train = add_column(X_train, 'hausdorff', 'train')

In [212]:
X_train = add_column(X_train, 'cityblock', 'train')

In [213]:
X_train = add_column(X_train, 'euclidean', 'train')

In [214]:
X_train = add_column(X_train, 'l1', 'train')

In [215]:
X_train = add_column(X_train, 'l2', 'train')

In [216]:
X_train = add_column(X_train, 'manhattan', 'train')

In [217]:
X_train = add_column(X_train, 'minkowski', 'train')

In [218]:
X_train = add_column(X_train, 'sqeuclidean', 'train')

In [226]:
# Reload the cached question1 weighted means; `with` closes the file after reading.
with open(data_folder+'weighted_mean1_train.p','rb') as _fh:
    X1_temp = pickle.load(_fh)

In [223]:
# Reload the cached question2 weighted means; `with` closes the file after reading.
with open(data_folder+'weighted_mean2_train.p','rb') as _fh:
    X2_temp = pickle.load(_fh)

In [264]:
# Turn each cached mean into a single row, stack the rows per question side,
# then join the question1 and question2 blocks column-wise.
X_temp = np.hstack([
    np.concatenate([vec.reshape(1, -1) for vec in side])
    for side in (X1_temp, X2_temp)
])

In [265]:
X_temp.shape

(270872, 600)

In [267]:
#X_train = add_d2v_columns(X_train, 'd2v_1_10', 'train', 'svd')
X_train = add_d2v_columns(X_train, 'd2v_1_10', 'train', 'none')

In [268]:
#X_train = add_d2v_columns(X_train, 'd2v_2_10', 'train', 'svd')

In [269]:
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_1_10_590,d2v_1_10_591,d2v_1_10_592,d2v_1_10_593,d2v_1_10_594,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
186150,284024,284025,"What is the difference between ""visa on arriva...",Do I need a visa to visit Ireland from England?,0.292102,0.825892,0.788286,0.788564,200.35101,1.615564,...,0.047302,0.057037,-0.093061,0.059292,-0.044104,0.029644,0.058024,0.059764,-0.062192,0.004974
208652,312703,312704,What are the chances of receiving invitation t...,What are the chances of receiving invitation t...,0.476484,0.732037,0.674718,0.674567,183.345052,0.0,...,-0.039467,0.046083,0.032889,0.122521,0.002809,0.107034,0.027127,0.168911,0.033952,-0.124828
268163,385651,385652,"Is the phrase ""the pressure in an incompressib...",What is the use of Bash in Windows?,0.448201,0.963075,0.93944,0.939968,225.509213,1.700451,...,0.097501,0.082464,-0.196504,-0.005057,-0.079654,0.011361,-0.012546,0.24815,-0.080769,0.059498
155539,243655,243656,How do I buy goods from Amazon and ship to Vie...,How can I sell Vietnam traditional silk ties o...,0.430242,0.820454,0.786592,0.786038,194.753906,1.885054,...,0.035835,0.009841,-0.094877,0.044817,-0.034017,0.007981,-0.031795,0.12816,-0.039345,0.009957
212779,317931,317932,How do you read tarot cards?,How do I read tarot cards?,0.228653,0.468721,0.445017,0.446258,120.422093,0.0,...,0.03636,-0.119229,-0.125466,0.102856,0.011574,-0.062571,-0.038238,0.107485,-0.091161,0.018797


In [270]:
X_train[X_train.isnull().any(axis=1)]

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_1_10_590,d2v_1_10_591,d2v_1_10_592,d2v_1_10_593,d2v_1_10_594,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13587,26077,26078,Why did you cry?,"In ten words or less, what makes you cry?",,,,,,,...,0.123527,-0.050053,-0.073590,0.028933,0.004458,-0.038151,0.010032,0.018047,0.017813,-0.016061
239614,173260,3007,"In mathematics, how many zeros does a million ...","If 204=8, 503=13, 305=11 and 907=25, what is 705?",,,,,,,...,,,,,,,,,,
375111,506098,1896,Why is 1/0=∞?,Why does zero factorial (0!) equal one (1)?,,,,,,,...,0.056619,0.056430,-0.032141,0.008893,0.045101,-0.037878,0.059401,0.102472,0.022640,-0.025139
43536,78296,61064,What is 9*3?,What is 3+3*3+3?,,,,,,,...,,,,,,,,,,
253902,368527,368528,Yf v. B.?,Name written as x kr y till 12…nw its written ...,,,,,,,...,0.021585,-0.015178,-0.095069,0.048851,0.019086,0.000249,0.028259,0.041393,-0.037058,-0.046561
174113,268421,268422,Hi how are you doing?,Is this good to learn AS400..? how is the futu...,,,,,,,...,0.053815,-0.012664,-0.068203,-0.012380,0.019457,0.008989,0.005739,0.070869,0.013417,-0.021066
360644,490443,490444,Why am I here?,Have you ever asked to yourself why do you exist?,,,,,,,...,0.126702,0.065438,-0.125550,-0.027496,-0.043945,0.096968,0.107387,0.036555,0.013562,0.036042
348201,476742,476743,What is the best way to describe yourself?,How do you describe yourself?,,,,,,,...,,,,,,,,,,
178801,274516,1601,What are some good reasons for going back with...,How do I get my ex back?,,,,,,,...,,,,,,,,,,
303738,426917,346728,What is 3D?,What is 3d digitization?,,,,,,,...,0.065861,0.058634,-0.087141,0.110068,-0.012238,-0.133271,0.143346,0.068215,0.041022,-0.000155


##### Test set

In [None]:
#q_meta_test = [(q1_len, q2_len) for q1_len, q2_len in zip(get_q_lengths(X1_ft_test), get_q_lengths(X2_ft_test))]

In [None]:
#pickle.dump(q_meta_test, open(data_folder+'q_meta_test.p', 'wb'))

In [None]:
#ps.copy_file(dest_bucket=INPUT_BUCKET, file='q_meta_test.p', source='q_meta_test.p')

In [None]:
#X_test_300 = np.concatenate( 
#    np.vstack( [np.array(np.vsplit(y, y.shape[0])) for y in (x for x in X_ft_test if x.size>0)] )
#)

In [None]:
#X_test_300.shape

In [None]:
#from scipy.io import mmwrite, mmread
#mmwrite( 'wor2vec_300_full_test.mtx', X_test_300 )
#np.savez(data_folder+'wor2vec_300_full_test', data=X_test_300)

In [None]:
#ps.copy_file(dest_bucket=INPUT_BUCKET, file='wor2vec_300_full_test.mtx', source='wor2vec_300_full_test.mtx')

In [None]:
#del X_test_300, q_meta_test, X_ft_test

In [None]:
#ps.get_file(bucket=INPUT_BUCKET, filename='embed2_full_test.mtx', filepath='embed2_full_test.mtx')

In [None]:
#from scipy.io import mmread
#X_rd_test = mmread('embed2_full_test.mtx')

In [None]:
#X_rd_test.shape

In [None]:
#ps.get_file(bucket=INPUT_BUCKET, filename='q_meta_test.p', filepath='q_meta_test.p')

In [None]:
#q_meta_test = pickle.load(open(data_folder+'q_meta_test.p','rb'))

In [None]:
#mmwrite( data_folder+'wor2vec_300_full_train.mtx', X_train_300 )
#X_test_300 = np.load(data_folder+'wor2vec_300_full_test.npz')['data']

In [None]:
#X_test_300.shape

In [None]:
# rebuild X1_rd_test and X2_rd_test
#X1_list = []
#X2_list = []
#X1_rd_list = []
#X2_rd_list = []
#q1_ptr = 0
#for len_q1, _ in q_meta_test:
#    q1 = np.array(X_test_300[q1_ptr:q1_ptr+len_q1])
#    #q1_rd = np.array(X_rd_test[q1_ptr:q1_ptr+len_q1])
#    X1_list.append(q1)
#    #X1_rd_list.append(q1_rd)
#    q1_ptr = q1_ptr+len_q1
#q2_ptr = q1_ptr
#for _, len_q2 in q_meta_test:
#    q2 = np.array(X_test_300[q2_ptr:q2_ptr+len_q2])
#    #q2_rd = np.array(X_rd_test[q2_ptr:q2_ptr+len_q2])
#    X2_list.append(q2)
#    #X2_rd_list.append(q2_rd)
#    q2_ptr = q2_ptr+len_q2
#X1_test = np.array(X1_list)
#X2_test = np.array(X2_list)
#X1_rd_test = np.array(X1_rd_list)
#X2_rd_test = np.array(X2_rd_list)

In [None]:
#X2_test.shape

In [None]:
#del X1_list, X2_list, q_meta_test, X_test_300, X1_rd_list, X2_rd_list

In [None]:
# Reload the cached test TF-IDF weights; `with` closes each file as soon as it is read.
with open(data_folder+'X1_test_w.p','rb') as _fh:
    X1_test_w = pickle.load(_fh)
with open(data_folder+'X2_test_w.p','rb') as _fh:
    X2_test_w = pickle.load(_fh)

In [None]:
# Reload the cached test word vectors; `with` closes each file as soon as it is read.
with open(data_folder+'X1_test.p','rb') as _fh:
    X1_test = pickle.load(_fh)
with open(data_folder+'X2_test.p','rb') as _fh:
    X2_test = pickle.load(_fh)

In [None]:
#jaccard = compute(*jaccard)

In [None]:
# Pairwise distance features for the test set.
# For every metric below, compute_and_save (defined earlier in this notebook) is called with the
# two question arrays, their "_w" companions, the metric name, the output name '<metric>_test_w'
# and the data folder.  Presumably it persists one result per metric -- TODO confirm.
# This loop replaces thirteen copy-pasted cells; the call order and arguments are unchanged.
# Metrics that were only ever stubbed out in this section (hamming, dice, kulsinski,
# rogerstanimoto, russellrao, sokalmichener, seuclidean, sokalsneath) are intentionally skipped.
TEST_DISTANCE_METRICS = [
    'chebyshev',
    'braycurtis',
    'cosine',
    'correlation',
    'canberra',
    'hausdorff',
    'cityblock',
    'euclidean',
    'l1',
    'l2',
    'manhattan',
    'minkowski',
    'sqeuclidean',
]
for metric in TEST_DISTANCE_METRICS:
    compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, metric, metric + '_test_w', data_folder)

In [None]:
# Judging by the helper's name, this builds a weighted mean of X1_test using X1_test_w and
# stores it under the name 'weighted_mean1_test' (presumably in data_folder -- TODO confirm
# against compute_delayed_wmean, which is defined earlier in this notebook).
compute_delayed_wmean(X1_test, X1_test_w, 'weighted_mean1_test', data_folder)

In [None]:
# Judging by the helper's name, this builds a weighted mean of X2_test using X2_test_w and
# stores it under the name 'weighted_mean2_test' (presumably in data_folder -- TODO confirm
# against compute_delayed_wmean, which is defined earlier in this notebook).
compute_delayed_wmean(X2_test, X2_test_w, 'weighted_mean2_test', data_folder)

In [None]:
#fdtw = compute(*fdtw)

In [None]:
#dtw = compute(*dtw)

In [None]:
#pcm = compute(*pcm)

In [None]:
#area = compute(*area)

In [None]:
#curve_length = compute(*curve_length)

In [None]:
#discrete_frechet = compute(*discrete_frechet)

In [None]:
# The test-set arrays are not used past this point; unbind them so their memory can be reclaimed.
del X1_test
del X2_test
del X1_test_w
del X2_test_w

#### Add the metrics computed above to X_test

In [271]:
# Attach every distance metric to X_test as a feature column, one add_column call per metric.
# add_column is defined earlier in this notebook; presumably it loads the values saved for the
# given metric and the 'test' split and returns the frame with the new column -- TODO confirm.
# This loop replaces thirteen copy-pasted cells; the call order is unchanged.
# NOTE(review): X_test is rebound on every call, so re-running this cell on an already
# augmented frame may not be idempotent -- confirm how add_column handles existing columns.
for metric in [
    'chebyshev',
    'braycurtis',
    'cosine',
    'correlation',
    'canberra',
    'hausdorff',
    'cityblock',
    'euclidean',
    'l1',
    'l2',
    'manhattan',
    'minkowski',
    'sqeuclidean',
]:
    X_test = add_column(X_test, metric, 'test')

In [284]:
# Append the 'd2v_1_10' feature columns for the 'test' split to X_test
# (they show up as d2v_1_10_<i> in the preview below).
# The last argument is 'none' here; the 'svd' alternative is kept for reference
# (presumably a dimensionality-reduction switch -- TODO confirm in add_d2v_columns).
#X_test = add_d2v_columns(X_test, 'd2v_1_10', 'test', 'svd')
X_test = add_d2v_columns(X_test, 'd2v_1_10', 'test', 'none')

In [285]:
#X_test = add_d2v_columns(X_test, 'd2v_2_10', 'test', 'svd')

In [286]:
# Preview the first rows of X_test after the feature columns were added.
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_1_10_590,d2v_1_10_591,d2v_1_10_592,d2v_1_10_593,d2v_1_10_594,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0.305258,0.664617,0.653823,0.651883,160.494443,1.346215,...,-0.022502,0.022948,-0.00659,-0.010511,0.026114,-0.032092,-0.026524,0.07714,0.008119,0.048658
224279,332326,332327,Will a breathing treatment help a cough?,How can I help someone that is unconscious but...,0.198065,0.7686,0.749187,0.748293,183.113717,1.331043,...,0.076291,0.021852,-0.009984,-0.02006,-0.048285,-0.019296,0.009989,0.043566,0.053058,-0.007563
252452,336023,366789,Is Kellyanne Conway annoying in your opinion?,Did Kellyanne Conway really imply that we shou...,0.23727,0.855352,0.847278,0.846889,197.037204,0.935798,...,0.070254,-0.017391,-0.082422,0.057816,0.033491,-0.023722,-0.01835,0.060198,0.017798,-0.018267
174039,268330,268331,How do you rate (1-10) and review Maruti Baleno?,What career options does one have after comple...,0.304343,0.901249,0.866577,0.866264,212.224992,1.479857,...,0.019796,0.042134,-0.065417,0.030623,0.052441,-0.005064,-0.002095,0.07108,-0.014265,-0.020331
384863,28901,233483,What are some good books on marketing?,What are some of the best books ever written a...,0.196309,0.704832,0.655896,0.655889,169.794796,1.05539,...,0.015662,-0.004726,-0.047093,0.018822,-0.025694,0.045619,-0.022476,0.077497,-0.066463,0.013118


In [287]:
# Show every test row that has at least one missing value in any column.
X_test.loc[X_test.isnull().any(axis=1)]

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_1_10_590,d2v_1_10_591,d2v_1_10_592,d2v_1_10_593,d2v_1_10_594,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
245880,358814,358815,sss,What is sss?,,,,,,,...,,,,,,,,,,
394708,47433,527619,When and how will atrocities on Hindu dalits s...,If (x+y) =7 and xy=127 find x^2-y^2?,,,,,,,...,,,,,,,,,,
204991,308036,308037,What is pilaf?,Why is pilaf called pilaf?,,,,,,,...,0.192447,0.082672,-0.018892,0.021294,-0.056245,0.093795,-0.040314,-0.036908,0.016061,0.034877
393288,3007,193368,"If 204=8, 503=13, 305=11 and 907=25, what is 705?",Can you add 5 odd numbers to get 30?,,,,,,,...,0.162578,-0.027279,-0.095742,0.082652,-0.068494,-0.027259,-0.031793,0.060851,-0.027516,-0.008244
110983,68218,181872,Why do we say hi?,"Why do we say ""hi"" when we talk with others?",,,,,,,...,-0.037276,-0.054328,-0.175421,0.022517,0.009102,0.043732,-0.005332,0.167264,-0.077245,0.003345
318371,443715,35958,What is)'(?,"What is ""what is""?",,,,,,,...,,,,,,,,,,
321352,326142,165846,What is 4 2/3-1 1/3÷2?,What is 1/ (3^1/2)?,,,,,,,...,,,,,,,,,,
166112,257824,257825,"What are B2B, B2C, C2B and C2C?","If B2B has ""business customers"", then what doe...",,,,,,,...,-0.033328,0.026473,-0.066705,0.042385,0.017349,-0.048965,-0.074524,0.119320,-0.061360,0.015579
316274,99469,441288,"I have no interest in anything, what should I do?","Don't have interest in anything, What to do?",,,,,,,...,,,,,,,,,,
295593,58523,326142,What is 2^ (1/2!) ^ (1/3!) ^ (1/4!)^...?,What is 4 2/3-1 1/3÷2?,,,,,,,...,,,,,,,,,,


### Fuzzy-wuzzy

In [288]:
def compute_size_diff(row):
    """Return the absolute difference in character length between a row's two questions.

    `row` is anything indexable by 'question1' and 'question2' (for example a DataFrame row).
    Both values are passed through str() first, so a non-string value is measured by the
    length of its string form.
    """
    first_len = len(str(row['question1']))
    second_len = len(str(row['question2']))
    return abs(first_len - second_len)
# Apply compute_size_diff to every row (axis=1) and store the result as a new 'size_diff'
# column on X_train itself.
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)

In [289]:
from fuzzywuzzy import fuzz

In [290]:
def compute_ratio(row):
    """Score a row's two questions with fuzz.ratio; both values are coerced to str first."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.ratio(first, second)

# Row-wise apply (axis=1); the scores become a new 'ratio' column on X_train.
X_train['ratio'] = X_train.apply(compute_ratio, axis=1)

In [291]:
def compute_partial_ratio(row):
    """Score a row's two questions with fuzz.partial_ratio; both values are coerced to str first."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.partial_ratio(first, second)

# Row-wise apply (axis=1); the scores become a new 'partial_ratio' column on X_train.
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)

In [292]:
def compute_token_sort_ratio(row):
    """Score a row's two questions with fuzz.token_sort_ratio; both values are coerced to str first."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.token_sort_ratio(first, second)

# Row-wise apply (axis=1); the scores become a new 'token_sort_ratio' column on X_train.
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)

In [293]:
def compute_token_set_ratio(row):
    """Score a row's two questions with fuzz.token_set_ratio; both values are coerced to str first."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.token_set_ratio(first, second)

# Row-wise apply (axis=1); the scores become a new 'token_set_ratio' column on X_train.
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)

In [294]:
# build complete feature dataframe
#X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
#                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
#X_train_temp.head()

In [295]:
# Training feature matrix: remove the question-id and raw-text columns, then discard every
# row that still contains a missing value in any remaining column.
X_train_final = X_train.drop(columns=['qid1', 'qid2','question1','question2'])
X_train_final = X_train_final.dropna()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270404 entries, 186150 to 121959
Columns: 618 entries, chebyshev to token_set_ratio
dtypes: float64(613), int64(5)
memory usage: 1.2 GB


In [296]:
# Spot-check the last five rows of the training feature matrix.
X_train_final.tail(5)

Unnamed: 0_level_0,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,cityblock,euclidean,l1,l2,...,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
259180,0.179109,0.720196,0.690909,0.690372,167.869835,0.0,12.122265,0.886436,12.122265,0.886436,...,0.007481,-0.010785,0.014501,0.008827,0.001853,13,75,87,74,88
365841,0.457809,0.806038,0.74921,0.748635,194.065797,1.224629,27.730834,2.085522,27.730834,2.085522,...,0.017472,0.085011,0.100675,-0.008494,-0.069221,3,95,95,95,99
131933,0.280458,0.849733,0.818295,0.817711,203.945924,0.653896,20.406913,1.494367,20.406913,1.494367,...,-0.020402,0.000372,0.057352,-0.003456,-0.000172,42,56,75,56,73
146868,0.316919,0.913964,0.894774,0.894553,213.237735,1.657228,22.573019,1.665006,22.573019,1.665006,...,0.05092,-0.007397,0.023598,0.008791,-0.007472,13,36,39,43,43
121959,0.311823,0.8083,0.789728,0.789644,192.657182,1.773726,22.48727,1.641493,22.48727,1.641493,...,0.001344,0.023961,0.06071,0.083313,-0.037359,19,39,42,56,64


In [182]:
# Summary statistics for every column of the training feature matrix.
# NOTE(review): the saved output lists d2v_2_10_* columns and carries an older execution
# count (In [182]) than the surrounding cells -- it looks stale; re-run to confirm.
X_train_final.describe()

Unnamed: 0,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,cityblock,euclidean,l1,l2,...,d2v_2_10_5,d2v_2_10_6,d2v_2_10_7,d2v_2_10_8,d2v_2_10_9,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
count,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0,...,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0,270404.0
mean,0.274382,0.723928,0.692657,0.692444,174.086482,1.266665,18.622987,1.367701,18.622987,1.367701,...,-0.013156,-0.001577,-0.006141,-0.000203,-0.003482,20.167949,61.282873,64.942408,64.076212,73.239542
std,0.109293,0.150107,0.148843,0.1488,34.018752,0.936567,6.073456,0.452109,6.073456,0.452109,...,0.106823,0.098718,0.09282,0.090845,0.088015,25.554385,18.566539,16.782432,16.855906,18.172722
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.470849,-0.962353,-0.479112,-1.091656,-0.525072,0.0,0.0,0.0,0.0,0.0
25%,0.207306,0.651325,0.619027,0.618843,158.797341,0.812605,14.837361,1.085528,14.837361,1.085528,...,-0.080603,-0.060035,-0.062895,-0.05039,-0.058618,5.0,46.0,51.0,52.0,60.0
50%,0.260703,0.748781,0.715934,0.71573,180.608489,1.265624,18.275128,1.338528,18.275128,1.338528,...,-0.024164,-0.000478,-0.00992,-0.002248,-0.008811,12.0,60.0,64.0,63.0,75.0
75%,0.325509,0.823729,0.791532,0.791305,196.622411,1.694931,22.048823,1.619104,22.048823,1.619104,...,0.041081,0.055425,0.045524,0.046331,0.044541,26.0,76.0,78.0,77.0,89.0
max,1.491804,1.074981,1.07024,1.070946,250.658354,9.786334,72.14203,5.498763,72.14203,5.498763,...,1.456854,0.89084,1.215859,1.152105,0.99845,1080.0,100.0,100.0,100.0,100.0


##### Test set

In [297]:
# Absolute difference in character length between question1 and question2, computed with
# compute_size_diff from the training-set section and stored as 'size_diff' on X_test.
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)

In [298]:
# fuzz.ratio score per row, via compute_ratio from the training-set section;
# stored as 'ratio' on X_test.
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)

In [299]:
# fuzz.partial_ratio score per row, via compute_partial_ratio from the training-set section;
# stored as 'partial_ratio' on X_test.
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)

In [300]:
# fuzz.token_sort_ratio score per row, via compute_token_sort_ratio from the training-set
# section; stored as 'token_sort_ratio' on X_test.
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)

In [301]:
# fuzz.token_set_ratio score per row, via compute_token_set_ratio from the training-set
# section; stored as 'token_set_ratio' on X_test.
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)

In [302]:
# Test feature matrix: remove the raw-text and question-id columns, then discard every row
# that still contains a missing value; finish with a size/dtype summary.
X_test_final = X_test.drop(columns=['question1','question2', 'qid1', 'qid2'])
X_test_final = X_test_final.dropna()
X_test_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133032 entries, 8067 to 346580
Columns: 618 entries, chebyshev to token_set_ratio
dtypes: float64(613), int64(5)
memory usage: 628.3 MB


In [303]:
# Preview the first rows of the test feature matrix.
X_test_final.head()

Unnamed: 0_level_0,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,cityblock,euclidean,l1,l2,...,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,0.305258,0.664617,0.653823,0.651883,160.494443,1.346215,18.418028,1.375237,18.418028,1.375237,...,-0.032092,-0.026524,0.07714,0.008119,0.048658,0,88,88,81,90
224279,0.198065,0.7686,0.749187,0.748293,183.113717,1.331043,14.162174,1.044846,14.162174,1.044846,...,-0.019296,0.009989,0.043566,0.053058,-0.007563,23,33,38,48,55
252452,0.23727,0.855352,0.847278,0.846889,197.037204,0.935798,16.121011,1.185337,16.121011,1.185337,...,-0.023722,-0.01835,0.060198,0.017798,-0.018267,73,40,56,43,53
174039,0.304343,0.901249,0.866577,0.866264,212.224992,1.479857,21.203832,1.558916,21.203832,1.558916,...,-0.005064,-0.002095,0.07108,-0.014265,-0.020331,79,30,42,32,33
384863,0.196309,0.704832,0.655896,0.655889,169.794796,1.05539,14.301773,1.041679,14.301773,1.041679,...,0.045619,-0.022476,0.077497,-0.066463,0.013118,23,69,61,66,88


# Feature Selection
#### (Assuming only 10 dimensions for the doc2vec vectors)

In [82]:
from sklearn.feature_selection import VarianceThreshold

In [90]:
# Variance cut-off of the form p * (1 - p) with p = 0.8, i.e. the variance of a Bernoulli(0.8)
# variable.  NOTE(review): this formula is the scikit-learn example for boolean features;
# the features here are continuous, so confirm the cut-off is meaningful for them.
threshold = 0.8 * (1 - 0.8)

In [91]:
# Variance-based feature selector using the cut-off defined in the previous cell.
sel = VarianceThreshold(threshold=threshold)

In [101]:
# Fit the selector on the training features.  The transformed array is only displayed here,
# not stored; the cells below work with the fitted sel.variances_.
sel.fit_transform(X_train_final)

array([[200.35100996,   1.61556404,  21.58206122, ...,  47.        ,
         44.        ,  43.        ],
       [183.34505199,   0.        ,  36.69303501, ...,  99.        ,
         99.        ,  99.        ],
       [225.50921255,   1.70045108,  27.44077803, ...,  51.        ,
         25.        ,  42.        ],
       ...,
       [203.94592403,   0.65389598,  20.40691333, ...,  75.        ,
         56.        ,  73.        ],
       [213.23773494,   1.65722777,  22.57301894, ...,  39.        ,
         43.        ,  43.        ],
       [192.65718182,   1.77372589,  22.48727018, ...,  42.        ,
         56.        ,  64.        ]])

In [102]:
# Number of per-feature variances learned by the selector.
sel.variances_.shape

(38,)

In [104]:
# Label each learned variance with its feature name so it can be filtered and read by column.
variances = pd.Series(sel.variances_, index=X_train_final.columns)

In [107]:
# Features whose variance is strictly above the cut-off.
variances.loc[variances > threshold]

canberra            1.157271e+03
hausdorff           8.771537e-01
cityblock           3.688673e+01
euclidean           2.044017e-01
l1                  3.688673e+01
l2                  2.044017e-01
manhattan           3.688673e+01
minkowski           2.044140e-01
sqeuclidean         3.466815e+00
d2v_1_10_0          1.672534e+00
d2v_1_10_1          2.120373e+00
d2v_1_10_2          2.631686e+00
d2v_1_10_3          2.232898e+00
d2v_1_10_4          1.835181e+00
d2v_1_10_5          2.513549e+00
d2v_1_10_6          1.857024e+00
d2v_1_10_7          1.933012e+00
d2v_1_10_8          1.981471e+00
d2v_1_10_9          2.207737e+00
d2v_2_10_0          3.937075e+03
d2v_2_10_1          2.331074e+00
d2v_2_10_2          6.858201e+04
d2v_2_10_3          2.172293e+01
d2v_2_10_4          7.324084e+01
d2v_2_10_5          6.255328e+01
d2v_2_10_6          2.006884e+00
d2v_2_10_7          1.587619e+04
d2v_2_10_8          5.070692e+08
d2v_2_10_9          6.224597e+06
size_diff           6.530242e+02
ratio     

In [110]:
# Keep only the labels whose index survived the dropna() that produced X_train_final,
# so features and labels stay row-aligned.
y_train_final = y_train.loc[X_train_final.index]

In [120]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif

# Univariate selection: keep the 10 features with the highest f_classif score against the labels.
kbest = SelectKBest(score_func=f_classif, k=10)
X_new = kbest.fit_transform(X_train_final, y_train_final)
# Shape of the reduced matrix (rows, selected features).
X_new.shape

(270404, 10)

In [128]:
# One row per feature: its univariate score and the matching p-value from the fitted selector.
kbest_scores = pd.DataFrame(
    np.column_stack((kbest.scores_, kbest.pvalues_)),
    index=X_train_final.columns,
    columns=['score','p-value'],
)

In [132]:
# Rank the features from highest to lowest score.
kbest_scores.sort_values('score', ascending=False)

Unnamed: 0,score,p-value
token_set_ratio,50683.819352,0.0
token_sort_ratio,43676.343872,0.0
ratio,43444.154506,0.0
partial_ratio,39940.512411,0.0
cosine,21171.860695,0.0
correlation,21171.322556,0.0
braycurtis,21083.430843,0.0
canberra,20172.7803,0.0
hausdorff,18943.235557,0.0
size_diff,12524.841479,0.0


# Modeling

### Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Randomized hyper-parameter search for logistic regression (5-fold CV, all available cores).
logr_model = LogisticRegression(random_state=42)
# Candidate values: C over 10 log-spaced points from 1e-2 to 1e7, tol over 5 from 1e-5 to 1e-1.
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             #'solver': ['lbfgs']
             #'max_iter': np.linspace(10, 1000, 10)
             }
# random_state pins which candidates get sampled, so the search is reproducible across runs;
# this matches the seeded XGBoost search further down.
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1, random_state=42)
logr_cv.fit(X_train_final, y_train_final)

In [None]:
# Persist the fitted search object with joblib so the search does not have to be re-run.
# (The 'ramdomcv' spelling in the file name is kept because the load cell uses the same name.)
dump(logr_cv, data_folder+'logr_ramdomcv.joblib')

In [57]:
# Reload the persisted search instead of re-running it.
# NOTE(review): the execution count (In [57]) is far older than the cells that build the
# current feature frame -- confirm the stored best_params_ were found on the same features.
logr_cv = load(data_folder+'logr_ramdomcv.joblib')

In [190]:
# Hyper-parameters of the best candidate found by the randomized search.
logr_cv.best_params_

{'tol': 1e-05, 'C': 100.0}

In [304]:
# Align the labels with the rows of X_train_final by selecting on its index
# (repeated here so the modeling section does not depend on the feature-selection cells).
y_train_final = y_train.loc[X_train_final.index]

In [305]:
# Final logistic-regression model: same seed and parallelism as before, with the searched
# hyper-parameters (the search tuned exactly 'C' and 'tol') unpacked from best_params_.
logr_model = LogisticRegression(random_state=42, n_jobs=-1, **logr_cv.best_params_)
logr_model.fit(X_train_final, y_train_final)

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=42, solver='warn', tol=1e-05,
          verbose=0, warm_start=False)

In [306]:
# Labels restricted to the test rows that survived dropna(), then the model's predictions
# for those same rows.
y_test_final = y_test.loc[X_test_final.index]
logr_pred = logr_model.predict(X_test_final)

In [307]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Accuracy, precision and recall of the logistic-regression predictions on the test set.
logr_acc_score, logr_prec_score, logr_rec_score = (
    scorer(y_test_final, logr_pred)
    for scorer in (accuracy_score, precision_score, recall_score)
)
print('Logistic Regression')
print('accuracy score : {}'.format(logr_acc_score))
print('precision score : {}'.format(logr_prec_score))
print('recall score : {}'.format(logr_rec_score))

Logistic Regression
accuracy score : 0.7201425220999459
precision score : 0.634703196347032
recall score : 0.5691499113854428


In [308]:
from sklearn.metrics import classification_report

# Display names for the two classes, in label order; reused by the XGBoost report below.
target_names = ['not duplicate', 'duplicate']
# Per-class precision / recall / F1 for the logistic-regression predictions.
print(classification_report(y_true=y_test_final, y_pred=logr_pred, target_names=target_names))

               precision    recall  f1-score   support

not duplicate       0.76      0.81      0.78     83943
    duplicate       0.63      0.57      0.60     49089

    micro avg       0.72      0.72      0.72    133032
    macro avg       0.70      0.69      0.69    133032
 weighted avg       0.72      0.72      0.72    133032



### XGBoost

In [66]:
import xgboost as xgb

In [None]:
# Model selection: seeded randomized search over XGBoost hyper-parameters
# (5-fold CV, 3 parallel jobs).
params_xgb = {
    'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'gamma': np.linspace(.01, 1, 10, endpoint=True),
    'learning_rate': np.linspace(.01, 1, 10, endpoint=True),
    'reg_lambda': np.linspace(0.01, 10, 20, endpoint=True),
    # integer depths 1 through 32
    'max_depth': np.linspace(1, 32, 32, endpoint=True, dtype=int),
}
cv_xgb = RandomizedSearchCV(
    xgb.XGBClassifier(objective='binary:logistic', random_state=42),
    param_distributions=params_xgb,
    cv=5,
    n_jobs=3,
    random_state=42,
)
cv_xgb.fit(X_train_final, y_train_final)

In [None]:
# Persist the fitted XGBoost search with joblib so the search does not have to be re-run.
# (The 'ramdomcv' spelling in the file name is kept because the load cell uses the same name.)
dump(cv_xgb, data_folder+'xgb_ramdomcv.joblib')

In [67]:
# Reload the persisted XGBoost search instead of re-running it.
# NOTE(review): the execution count (In [67]) is far older than the cells that build the
# current feature frame -- confirm the stored best_params_ were found on the same features.
cv_xgb = load(data_folder+'xgb_ramdomcv.joblib')

In [309]:
# Hyper-parameters of the best candidate found by the XGBoost randomized search.
cv_xgb.best_params_

{'reg_lambda': 6.319473684210527,
 'n_estimators': 200,
 'max_depth': 29,
 'learning_rate': 0.23,
 'gamma': 0.23}

In [310]:
# Final XGBoost classifier: same seed, with the five searched hyper-parameters
# (n_estimators, gamma, learning_rate, reg_lambda, max_depth) unpacked from best_params_.
clf_xgb_model = xgb.XGBClassifier(random_state=42, **cv_xgb.best_params_)
clf_xgb_model.fit(X_train_final, y_train_final)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.23, learning_rate=0.23,
       max_delta_step=0, max_depth=29, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=6.319473684210527, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [311]:
# Predict on the test set and report accuracy, recall and precision for XGBoost.
y_pred_xgb = clf_xgb_model.predict(X_test_final)
score_xgb, rscore_xgb, pscore_xgb = (
    scorer(y_test_final, y_pred_xgb)
    for scorer in (accuracy_score, recall_score, precision_score)
)
print('Accuracy score for XGBoost ', score_xgb)
print('Recall score for XGBoost ', rscore_xgb)
print('Precision score for XGBoost ', pscore_xgb)

Accuracy score for XGBoost  0.8321080642251488
Recall score for XGBoost  0.7472142435168775
Precision score for XGBoost  0.7870231300690899


In [312]:
# Per-class precision / recall / F1 for the XGBoost predictions, using the class display
# names defined in the logistic-regression section.
print(classification_report(y_test_final, y_pred_xgb, target_names=target_names))

               precision    recall  f1-score   support

not duplicate       0.86      0.88      0.87     83943
    duplicate       0.79      0.75      0.77     49089

    micro avg       0.83      0.83      0.83    133032
    macro avg       0.82      0.81      0.82    133032
 weighted avg       0.83      0.83      0.83    133032

