In [4]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np
import pickle
from scipy.io import mmwrite, mmread
from joblib import dump, load

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
from toolz import partition_all

In [7]:
#INPUT_BUCKET = 'dq-data'
data_folder = '/media/siri/78C6823EC681FD1E/minio/data/dq-data/v3/'
input_folder = '/media/siri/78C6823EC681FD1E/minio/data/dq-data/'
#HASH_BUCKET = 'dq-hashed'

In [None]:
# Load the raw training set from the local data directory.
data = 'train.csv'
filestream = input_folder + data

# Columns to read. Only the keys are used (as `usecols`); the dtype strings
# are not passed to read_csv, so pandas infers the column types itself.
dtypes = dict(
    id='int64',
    qid1='int64',
    qid2='int64',
    question1='object',
    question2='object',
    is_duplicate='int64',
)

# Read, index by question-pair id, and drop every row with a missing value.
df = (
    pd.read_csv(
        filestream,
        header=0,
        usecols=dtypes.keys(),
        skipinitialspace=True,
        skip_blank_lines=True,
        encoding='utf-8',
    )
    .set_index('id')
    .dropna()
)

In [None]:
#del df

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label from the features, then hold out 33% of the pairs.
# random_state is fixed so the split is reproducible.
y = df['is_duplicate']
X = df.drop(columns=['is_duplicate'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
X_train.info()

In [None]:
# Persist the split so later sessions can skip the CSV load.
# Each file is written inside a `with` block so the handle is flushed and
# closed deterministically.
with open(input_folder+'X_train.p', 'wb') as fh:
    pickle.dump(X_train, fh)
with open(input_folder+'y_train.p', 'wb') as fh:
    pickle.dump(y_train, fh)
with open(input_folder+'X_test.p', 'wb') as fh:
    pickle.dump(X_test, fh)
with open(input_folder+'y_test.p', 'wb') as fh:
    pickle.dump(y_test, fh)

In [8]:
# Reload the feature frames written by the split cell.
# pickle.load executes arbitrary code from the file: only load files that
# this notebook produced itself.
with open(input_folder+'X_train.p', 'rb') as fh:
    X_train = pickle.load(fh)
with open(input_folder+'X_test.p', 'rb') as fh:
    X_test = pickle.load(fh)

In [9]:
# Reload the label series written by the split cell (trusted, self-produced
# pickles only).
with open(input_folder+'y_train.p', 'rb') as fh:
    y_train = pickle.load(fh)
with open(input_folder+'y_test.p', 'rb') as fh:
    y_test = pickle.load(fh)

In [None]:
del X,y,df

In [None]:
# Disabled on purpose: X_train / X_test are read again further down (by
# get_tokens, by the len(X_train) / len(X_test) splits of the TF-IDF matrix
# and by the add_column calls), so deleting them here breaks a
# top-to-bottom run. Re-enable only for ad-hoc memory clean-up.
#del X_train,X_test

In [None]:
del y_train, y_test

### Memory Check

In [10]:
# Memory check: list every user-defined global together with the size that
# sys.getsizeof reports for it, largest first.
import sys
# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
# Names starting with '_' and names that are keys of sys.modules are skipped;
# import aliases such as np / pd are not module names, so they still show up.
# The bare expression is the cell's displayed output.
sorted([(x, sys.getsizeof(globals().get(x))) 
        for x in dir() if not x.startswith('_') 
        and x not in sys.modules and x not in ipython_vars], 
       key=lambda x: x[1], reverse=True)

[('X_train', 70341065),
 ('X_test', 34641540),
 ('y_train', 4333976),
 ('y_test', 2134664),
 ('HTTPResponse', 1056),
 ('Dict', 888),
 ('List', 888),
 ('Tuple', 888),
 ('dump', 136),
 ('load', 136),
 ('mmread', 136),
 ('mmwrite', 136),
 ('partition_all', 136),
 ('data_folder', 100),
 ('input_folder', 97),
 ('np', 80),
 ('pd', 80),
 ('ps', 80)]

In [None]:
#del y_train, y_test

# Feature Extraction

### Tokenizing and preprocessing

In [None]:
from gensim.parsing.preprocessing import (
    preprocess_string,
    remove_stopwords,
    stem_text,
    strip_multiple_whitespaces,
    strip_tags,
)

# Filters applied, in this order, to every question string.
custom_filters = [strip_tags, strip_multiple_whitespaces, remove_stopwords, stem_text]


def get_tokens(process='train'):
    """Yield the preprocessed token list of every question in a split.

    All `question1` values are yielded first, followed by all `question2`
    values, so the generator produces exactly 2 * len(X) items. Callers rely
    on that ordering to split the results back into the two question columns.

    Parameters
    ----------
    process : str
        'test' reads the global `X_test`; any other value reads `X_train`.

    Yields
    ------
    list of str
        Tokens returned by `preprocess_string` with `custom_filters`.
    """
    X = X_test if process == 'test' else X_train
    # No rows are dropped here on purpose: removing items would break the
    # positional alignment described above. (The previous `series.dropna()`
    # call discarded its result and therefore never dropped anything.)
    series = pd.Series(pd.concat([X['question1'], X['question2']]), dtype=str)
    for question in series:
        yield preprocess_string(question, custom_filters)

In [None]:
def get_tokens_for_valid_vectors(model, process):
    """Yield, per question, only the tokens that `model` has a vector for.

    Parameters
    ----------
    model : object
        Embedding model exposing `model.wv[token]`; a missing token is
        expected to raise KeyError.
    process : str
        Split selector forwarded to `get_tokens` ('train' or 'test').

    Yields
    ------
    numpy.ndarray
        Array of the in-vocabulary tokens of one question (possibly empty).
    """
    for tokens in get_tokens(process):
        tf_idf_tokens = []
        for token in tokens:
            try:
                # Lookup is done only to test membership; the vector itself
                # is not needed here.
                model.wv[token]
            except KeyError:
                # Out-of-vocabulary token: skip it. Any other exception is a
                # real error and is allowed to propagate.
                continue
            tf_idf_tokens.append(token)
        yield np.array(tf_idf_tokens)

### Word vectors (fastText)

In [None]:
from gensim.models import FastText
model = FastText.load_fasttext_format(data_folder+'cc.en.300.bin')

In [None]:
def get_ft_vectors(model, process):
    """Yield, per question, the stacked embedding vectors of its tokens.

    Parameters
    ----------
    model : object
        Embedding model exposing `model.wv[token]`; a missing token is
        expected to raise KeyError.
    process : str
        Split selector forwarded to `get_tokens` ('train' or 'test').

    Yields
    ------
    numpy.ndarray
        One row per token that has a vector; empty when none has.
    """
    for tokens in get_tokens(process):
        vectors = []
        for token in tokens:
            try:
                vector = model.wv[token]
            except KeyError:
                # Token has no vector: leave it out. Other exceptions are
                # real errors and propagate.
                continue
            vectors.append(vector)
        yield np.array(vectors)

### Word2Vec (trained on the training questions, initialised from pretrained GoogleNews vectors)

In [None]:
import gensim

In [None]:
model_w2v = gensim.models.Word2Vec([token for token in get_tokens('train')], size=300)

In [None]:
model_w2v.intersect_word2vec_format(input_folder+'GoogleNews-vectors-negative300.bin',
                                lockf=1.0,
                                binary=True)

In [None]:
model_w2v.train([token for token in get_tokens('train')],total_examples=model_w2v.corpus_count, epochs=10)

### TF-IDF and Word2Vec

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tokens_for_valid_vectors(model_w2v,'train'))

In [None]:
X_trfmd

In [None]:
# split back into two
X1_trfmd = X_trfmd[:len(X_train)]
X2_trfmd = X_trfmd[len(X_train):]

In [None]:
def get_weights_and_w2vectors(tfidf_matrix, tfidf_vectorizer, w2v_model):
    """Extract per-document TF-IDF weights and the matching word vectors.

    Parameters
    ----------
    tfidf_matrix : scipy sparse matrix
        One row per document, one column per vocabulary entry.
    tfidf_vectorizer : object
        Fitted vectorizer exposing `vocabulary_` (token -> column index).
    w2v_model : object
        Embedding model exposing `w2v_model.wv[token]`.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Two 1-D object arrays of length n_docs. Element i of the first holds
        the non-zero weights of document i; element i of the second holds the
        word vectors of the same tokens, in the same order. Documents without
        any non-zero entry get empty arrays.
    """
    index_to_token = {idx: tok for tok, idx in tfidf_vectorizer.vocabulary_.items()}
    rows = tfidf_matrix.shape[0]

    # Documents have different numbers of tokens, so the results are ragged.
    # They are stored in explicit object arrays: calling np.array() on a
    # ragged list raises ValueError on current NumPy releases.
    weights = np.empty(rows, dtype=object)
    w2v = np.empty(rows, dtype=object)

    for doc in range(rows):
        # Read the row's stored entries once instead of indexing the sparse
        # matrix element by element.
        row = tfidf_matrix[doc, :].tocoo()
        keep = row.data != 0  # same filter as sparse .nonzero()
        features = row.col[keep]
        weights[doc] = np.asarray(row.data[keep])
        w2v[doc] = np.array([w2v_model.wv[index_to_token[x]] for x in features])
    return weights, w2v

In [None]:
X1_w, X1 = get_weights_and_w2vectors(X1_trfmd, tfidf, model_w2v)
X1_w.shape

In [None]:
X1.shape

In [None]:
X1_w.shape

In [None]:
X2_w, X2 = get_weights_and_w2vectors(X2_trfmd, tfidf, model_w2v)
X2_w.shape

In [None]:
# Save the per-question TF-IDF weights of the training split; `with` closes
# each file as soon as it is written.
with open(data_folder+'X1_w.p','wb') as fh:
    pickle.dump(X1_w, fh)
with open(data_folder+'X2_w.p','wb') as fh:
    pickle.dump(X2_w, fh)

In [None]:
# Save the per-question word vectors of the training split; `with` closes
# each file as soon as it is written.
with open(data_folder+'X1.p','wb') as fh:
    pickle.dump(X1, fh)
with open(data_folder+'X2.p','wb') as fh:
    pickle.dump(X2, fh)

In [None]:
X1_w[420].shape

In [None]:
X1[420].shape

In [None]:
del X1_w, X2_w, X1, X2

##### Test set

In [None]:
X_test_trfmd = tfidf.transform(get_tokens_for_valid_vectors(model_w2v, 'test'))

In [None]:
X_test_trfmd

In [None]:
# split back into two
X1_test_trfmd = X_test_trfmd[:len(X_test)]
X2_test_trfmd = X_test_trfmd[len(X_test):]

In [None]:
X1_test_w, X1_test = get_weights_and_w2vectors(X1_test_trfmd, tfidf, model_w2v)
X1_test_w.shape

In [None]:
X2_test_w, X2_test = get_weights_and_w2vectors(X2_test_trfmd, tfidf, model_w2v)
X2_test_w.shape

In [None]:
# Save the per-question TF-IDF weights of the test split; `with` closes each
# file as soon as it is written.
with open(data_folder+'X1_test_w.p','wb') as fh:
    pickle.dump(X1_test_w, fh)
with open(data_folder+'X2_test_w.p','wb') as fh:
    pickle.dump(X2_test_w, fh)

In [None]:
# Save the per-question word vectors of the test split; `with` closes each
# file as soon as it is written.
with open(data_folder+'X1_test.p','wb') as fh:
    pickle.dump(X1_test, fh)
with open(data_folder+'X2_test.p','wb') as fh:
    pickle.dump(X2_test, fh)

In [None]:
del X1_test_w, X2_test_w, X1_test, X2_test

In [None]:
del model_w2v

### Pairwise Metrics

In [5]:
from sklearn.metrics.pairwise import linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, rbf_kernel
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist, directed_hausdorff
from fastdtw import fastdtw
import similaritymeasures
from scipy.spatial import procrustes
def compute_pairwise_kernel(pc1, pc2, w1, w2, method='linear'):
    """Weighted average of a pairwise kernel between two sets of vectors.

    Parameters
    ----------
    pc1, pc2 : numpy.ndarray
        The two sets of vectors (one vector per row).
    w1, w2 : numpy.ndarray
        One weight per row of `pc1` / `pc2`; entry (i, j) of the kernel
        matrix is weighted by w1[i] * w2[j].
    method : str
        'polynomial' (degree 2), 'rbf', 'sigmoid' or 'laplacian'; any other
        value selects the linear kernel.

    Returns
    -------
    float
        The weighted mean kernel value, or NaN when either set is empty.
    """
    if 0 in (pc1.size, pc2.size):
        return np.nan

    # Kernel lookup; names not listed here fall back to the linear kernel.
    kernels = {
        'polynomial': lambda a, b: polynomial_kernel(a, b, 2),
        'rbf': rbf_kernel,
        'sigmoid': sigmoid_kernel,
        'laplacian': laplacian_kernel,
    }
    kernel_mat = kernels.get(method, linear_kernel)(pc1, pc2)

    # Outer product of the two weight vectors: one weight per (i, j) pair.
    pair_weights = np.outer(w1, w2)
    return np.average(kernel_mat, weights=pair_weights)
    
def compute_pairwise_dist(pc1, pc2, w1, w2, method='euclidean'):
    """Distance between two sets of vectors.

    Parameters
    ----------
    pc1, pc2 : numpy.ndarray
        The two sets of vectors (one vector per row).
    w1, w2 : numpy.ndarray
        One weight per row of `pc1` / `pc2`; ignored for 'hausdorff'.
    method : str
        'hausdorff' returns the directed Hausdorff distance from `pc1` to
        `pc2`. Any other value is passed to `pairwise_distances` as the
        metric, and the resulting matrix is averaged with weights
        w1[i] * w2[j].

    Returns
    -------
    float
        The distance, or NaN when either set is empty.
    """
    if not pc1.size or not pc2.size:
        return np.nan

    if method == 'hausdorff':
        # directed_hausdorff returns (distance, index_1, index_2).
        return directed_hausdorff(pc1, pc2)[0]

    dist_mat = pairwise_distances(pc1, pc2, metric=method)
    pair_weights = np.outer(w1, w2)
    return np.average(dist_mat, weights=pair_weights)

def compute_weighted_mean(pc, w):
    """Return the weighted mean of the rows of `pc`, using one weight per row."""
    mean_vector = np.average(pc, weights=w, axis=0)
    return mean_vector

def compute_pairwise_metric(pc1, pc2, method='dtw'):
    """Curve-similarity measure between two point sequences.

    Only the first two columns of each input are used.

    Parameters
    ----------
    pc1, pc2 : numpy.ndarray
        2-D arrays, one point per row.
    method : str
        One of 'pcm', 'discrete_frechet', 'area', 'curve_length', 'dtw'.
        ('fdtw' and 'procrustes' were sketched in earlier versions but are
        not implemented.)

    Returns
    -------
    float
        The requested measure, or NaN when either input is empty.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names. Previously this
        surfaced as an UnboundLocalError on the return statement.
    """
    if pc1.size == 0 or pc2.size == 0:
        return np.nan

    supported = ('pcm', 'discrete_frechet', 'area', 'curve_length', 'dtw')
    if method not in supported:
        raise ValueError(
            'Unsupported method %r; expected one of %s' % (method, ', '.join(supported))
        )

    curve1, curve2 = pc1[:, :2], pc2[:, :2]
    if method == 'pcm':
        dist = similaritymeasures.pcm(curve1, curve2)
    elif method == 'discrete_frechet':
        dist = similaritymeasures.frechet_dist(curve1, curve2)
    elif method == 'area':
        dist = similaritymeasures.area_between_two_curves(curve1, curve2)
    elif method == 'curve_length':
        dist = similaritymeasures.curve_length_measure(curve1, curve2)
    else:
        # 'dtw' returns (distance, cumulative-distance matrix); keep the distance.
        dist, _ = similaritymeasures.dtw(curve1, curve2)
    return dist

        
def assign_pwmetric(df, method='euclidean'):
    """Apply `compute_pairwise_dist` to every row of `df`.

    The previous call, `df.apply(compute_pairwise_dist, method, axis=1)`,
    passed `axis` twice (once positionally as `method`, once by keyword) and
    therefore always raised TypeError.

    NOTE(review): assumes each row holds exactly the four values
    (pc1, pc2, w1, w2), in that column order, matching the signature of
    `compute_pairwise_dist`. No call site is visible in this notebook, so
    confirm the layout before relying on this helper.

    Parameters
    ----------
    df : pandas.DataFrame
        One question pair per row.
    method : str
        Metric name forwarded to `compute_pairwise_dist`.

    Returns
    -------
    pandas.Series
        One distance per row of `df`.
    """
    return df.apply(lambda row: compute_pairwise_dist(*row, method=method), axis=1)

In [6]:
def compute_delayed(X1, X2, X1_w, X2_w, method):
    """Compute one pairwise score per question pair.

    Despite the name, the computation is eager; the dask `delayed` variant
    is no longer used.

    Parameters
    ----------
    X1, X2 : sequence of numpy.ndarray
        Word vectors of the first / second question of every pair.
    X1_w, X2_w : sequence of numpy.ndarray
        Matching per-token weights.
    method : str
        'polynomial', 'rbf', 'sigmoid', 'laplacian' and 'linear' are routed
        to `compute_pairwise_kernel`; everything else goes to
        `compute_pairwise_dist`.

    Returns
    -------
    list
        One score per pair, in input order. Pairs with an empty question get
        NaN from the scoring function itself.
    """
    kernel_methods = ('polynomial', 'rbf', 'sigmoid', 'laplacian', 'linear')
    use_kernel = method in kernel_methods  # loop-invariant, decided once

    scores = []
    # The earlier `if q_tuple:` guard was always true (a 4-tuple is never
    # falsy), so its NaN fallback branch was unreachable and has been removed.
    for q1_rd, q2_rd, q1_w, q2_w in zip(X1, X2, X1_w, X2_w):
        if use_kernel:
            scores.append(compute_pairwise_kernel(q1_rd, q2_rd, q1_w, q2_w, method))
        else:
            scores.append(compute_pairwise_dist(q1_rd, q2_rd, q1_w, q2_w, method))
    return scores

In [7]:
def create_nan_array(r,c):
    """Return a float array of shape (r, c) in which every entry is NaN."""
    return np.full((r, c), np.nan)

In [8]:
def pickle_and_del(obj, file, data_folder=data_folder):
    """Pickle `obj` to `data_folder + file + '.p'`.

    Parameters
    ----------
    obj : object
        Anything picklable.
    file : str
        File name without the '.p' extension.
    data_folder : str
        Directory prefix; concatenated as-is, so it must end with a path
        separator.

    Notes
    -----
    The former `del obj` only removed this function's local name and never
    released the caller's reference, so it has been dropped. Callers that
    want the memory back must `del` their own variable.
    """
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way.
    with open(data_folder+file+'.p', 'wb') as fh:
        pickle.dump(obj, fh)

In [9]:
def compute_delayed_wmean(X, X_w, file, data_folder=data_folder, dim=300):
    """Compute the weighted mean vector of every question and pickle the result.

    Parameters
    ----------
    X : sequence of numpy.ndarray
        Word vectors of each question (one vector per row).
    X_w : sequence of numpy.ndarray
        Matching per-token weights.
    file : str
        Output file name without the '.p' extension.
    data_folder : str
        Directory prefix passed on to `pickle_and_del`.
    dim : int
        Length of the NaN placeholder used for questions whose weights sum
        to zero (including questions without tokens). Defaults to 300, the
        value that was previously hard-coded.

    Notes
    -----
    Placeholders are 1-D vectors of length `dim`, the same shape that
    `compute_weighted_mean` returns when each word vector has `dim` entries.
    The earlier (1, 300) placeholder made the collected list ragged, which
    `np.array` rejects on current NumPy releases. `add_d2v_columns` reshapes
    every row with `reshape(1, -1)`, so it reads both layouts.
    """
    means = []
    # The earlier `if q_tuple:` guard was always true (a 2-tuple is never
    # falsy), so its fallback branch was unreachable and has been removed.
    for q_rd, q_w in zip(X, X_w):
        if np.sum(q_w) != 0:
            means.append(compute_weighted_mean(q_rd, q_w))
        else:
            # A weighted average is undefined when the weights sum to zero.
            means.append(np.full(dim, np.nan))
    pickle_and_del(np.array(means), file, data_folder)

In [15]:
def compute_and_save(X1, X2, X1_w, X2_w, method, file, data_folder=data_folder):
    """Score every question pair with `method` and pickle the list of scores.

    The scores come from `compute_delayed`; the result is written by
    `pickle_and_del` to `data_folder + file + '.p'`.
    """
    pickle_and_del(
        compute_delayed(X1, X2, X1_w, X2_w, method),
        file,
        data_folder,
    )

#### Initialize Dask

In [None]:
# Dask set-up. The feature code in this notebook currently runs eagerly: its
# delayed/compute calls are commented out.
import dask.dataframe as dd
from dask import delayed, compute
from dask.distributed import Client
# NOTE(review): this binds the name `dask` to the project's utils.dask
# module, not to the dask package itself; consider importing it under an alias.
from utils import dask
# create_dask_client is a project helper; presumably it starts or connects to
# a distributed client with 8 workers. Confirm in utils.dask.
client = dask.create_dask_client(num_workers=8)

#### Compute Features

In [11]:
# Reload the training-split TF-IDF weights (trusted, self-produced pickles
# only); `with` closes each file after reading.
with open(data_folder+'X1_w.p','rb') as fh:
    X1_w = pickle.load(fh)
with open(data_folder+'X2_w.p','rb') as fh:
    X2_w = pickle.load(fh)

In [12]:
# Reload the training-split word vectors (trusted, self-produced pickles
# only); `with` closes each file after reading.
with open(data_folder+'X1.p','rb') as fh:
    X1 = pickle.load(fh)
with open(data_folder+'X2.p','rb') as fh:
    X2 = pickle.load(fh)

In [13]:
#jaccard = compute(*jaccard)

In [16]:
#chebyshev = compute(*chebyshev)
#compute_and_save(X1, X2, 'chebyshev', 'chebyshev_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'chebyshev', 'chebyshev_train_w', data_folder)

In [17]:
#braycurtis = compute(*braycurtis)
#compute_and_save(X1, X2, 'braycurtis', 'braycurtis_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'braycurtis', 'braycurtis_train_w', data_folder)

In [18]:
#cosine = compute(*cosine)
#compute_and_save(X1, X2, 'cosine', 'cosine_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'cosine', 'cosine_train_w', data_folder)

In [19]:
#correlation = compute(*correlation)
#compute_and_save(X1, X2, 'correlation', 'correlation_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'correlation', 'correlation_train_w', data_folder)

In [20]:
#hamming = compute(*hamming)

In [21]:
#canberra = compute(*canberra)
#compute_and_save(X1, X2, 'canberra', 'canberra_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'canberra', 'canberra_train_w', data_folder)

In [22]:
#hausdorff = compute(*hausdorff)
#compute_and_save(X1, X2, 'hausdorff', 'hausdorff_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'hausdorff', 'hausdorff_train_w', data_folder)

In [23]:
#cityblock = compute(*cityblock)
#compute_and_save(X1, X2, 'cityblock', 'cityblock_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'cityblock', 'cityblock_train_w', data_folder)

In [24]:
#euclidean = compute(*euclidean)
#compute_and_save(X1, X2, 'euclidean', 'euclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'euclidean', 'euclidean_train_w', data_folder)

In [25]:
#l1 = compute(*l1)
#compute_and_save(X1, X2, 'l1', 'l1_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'l1', 'l1_train_w', data_folder)

In [26]:
#l2 = compute(*l2)
#compute_and_save(X1, X2, 'l2', 'l2_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'l2', 'l2_train_w', data_folder)

In [27]:
#manhattan = compute(*manhattan)
#compute_and_save(X1, X2, 'manhattan', 'manhattan_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'manhattan', 'manhattan_train_w', data_folder)

In [None]:
#dice = compute(*dice)

In [None]:
#kulsinski = compute(*kulsinski)

In [None]:
#rogerstanimoto = compute(*rogerstanimoto)

In [None]:
#russellrao = compute(*russellrao)

In [None]:
#sokalmichener = compute(*sokalmichener)

In [28]:
#minkowski = compute(*minkowski)
#compute_and_save(X1, X2, 'minkowski', 'minkowski_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'minkowski', 'minkowski_train_w', data_folder)

In [29]:
#seuclidean = compute(*seuclidean)
#compute_and_save(X1, X2, 'seuclidean', 'seuclidean_train', data_folder)

In [30]:
#sokalsneath = compute(*sokalsneath)

In [31]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'sqeuclidean', 'sqeuclidean_train_w', data_folder)

In [32]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'polynomial', 'polynomial_train_w', data_folder)

In [33]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'rbf', 'rbf_train_w', data_folder)

In [34]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'sigmoid', 'sigmoid_train_w', data_folder)

In [35]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'linear', 'linear_train_w', data_folder)

In [36]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_and_save(X1, X2, X1_w, X2_w, 'laplacian', 'laplacian_train_w', data_folder)

In [37]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_delayed_wmean(X1, X1_w, 'weighted_mean1_train', data_folder)

In [38]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_delayed_wmean(X2, X2_w, 'weighted_mean2_train', data_folder)

In [None]:
#fdtw = compute(*fdtw)

In [None]:
#dtw = compute(*dtw)

In [None]:
#pcm = compute(*pcm)

In [None]:
#area = compute(*area)

In [None]:
#curve_length = compute(*curve_length)

In [None]:
#discrete_frechet = compute(*discrete_frechet)

In [None]:
#procrustes = compute(*procrustes)

In [39]:
del X1, X2, X1_w, X2_w

#### Add the above metrics to X_train

In [11]:
def add_column(df, column, train_or_test, data_folder=data_folder):
    """Return `df` with one pickled feature appended as a new column.

    The values are read from
    `data_folder + column + '_' + train_or_test + '_w.p'` and aligned to
    `df` by position: they are given `df.index` as their index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is not modified in place.
    column : str
        Feature name; used for both the file name and the new column name.
    train_or_test : str
        Split tag used in the file name ('train' or 'test').
    data_folder : str
        Directory prefix; concatenated as-is.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column. Calling this twice with the same
        `column` yields a duplicated column name.
    """
    path = data_folder+column+'_'+train_or_test+'_w.p'
    # Context manager so the handle is closed right after loading.
    with open(path, 'rb') as fh:
        col_arr = pickle.load(fh)
    new_col = pd.Series(col_arr, name=column, index=df.index)
    return pd.concat([df, new_col], axis=1)

In [12]:
def add_d2v_columns(df, d2v, train_or_test, red_type='umap', data_folder=data_folder):
    """Return `df` with a block of pickled document-vector columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is not modified in place.
    d2v : str
        Prefix of the new column names (`d2v + '_' + i`). For 'svd' / 'umap'
        it is also the prefix of the file to load.
    train_or_test : str
        Split tag used in the file names ('train' or 'test').
    red_type : str
        'svd'  -> loads `<d2v>_<split>_svd_red.p`;
        'umap' -> loads `<d2v>_<split>_red.p`;
        anything else -> loads `weighted_mean1_<split>.p` and
        `weighted_mean2_<split>.p` and places them side by side.
    data_folder : str
        Directory prefix; concatenated as-is.

    Returns
    -------
    pandas.DataFrame
        `df` plus one column per feature dimension, aligned by position.
    """
    def _load(name):
        # Close each pickle file as soon as it has been read.
        with open(data_folder+name, 'rb') as fh:
            return pickle.load(fh)

    if red_type in ['svd','umap']:
        suffix = '_svd_red.p' if red_type == 'svd' else '_red.p'
        col_arr = _load(d2v+'_'+train_or_test+suffix)
    else:
        halves = []
        for i in (1, 2):
            rows = _load('weighted_mean'+str(i)+'_'+train_or_test+'.p')
            # Each stored row may be shaped (dim,) or (1, dim); normalise to
            # (1, dim) before stacking into one 2-D block.
            halves.append(np.concatenate([x.reshape(1,-1) for x in rows]))
        col_arr = np.hstack(halves)

    columns = [d2v+'_'+str(i) for i in range(col_arr.shape[1])]
    return pd.concat([df, pd.DataFrame(col_arr, columns=columns, index=df.index)], axis=1)

In [13]:
X_train = add_column(X_train, 'chebyshev', 'train')

In [14]:
X_train = add_column(X_train, 'braycurtis', 'train')

In [15]:
X_train = add_column(X_train, 'cosine', 'train')

In [16]:
X_train = add_column(X_train, 'correlation', 'train')

In [17]:
X_train = add_column(X_train, 'canberra', 'train')

In [18]:
X_train = add_column(X_train, 'hausdorff', 'train')

In [19]:
X_train = add_column(X_train, 'cityblock', 'train')

In [20]:
X_train = add_column(X_train, 'euclidean', 'train')

In [21]:
X_train = add_column(X_train, 'l1', 'train')

In [22]:
X_train = add_column(X_train, 'l2', 'train')

In [23]:
X_train = add_column(X_train, 'manhattan', 'train')

In [24]:
X_train = add_column(X_train, 'minkowski', 'train')

In [25]:
X_train = add_column(X_train, 'sqeuclidean', 'train')

In [26]:
#X_train = add_d2v_columns(X_train, 'd2v_1_10', 'train', 'svd')
X_train = add_d2v_columns(X_train, 'd2v_1_10', 'train', 'none')

In [27]:
#X_train = add_d2v_columns(X_train, 'd2v_2_10', 'train', 'svd')

In [28]:
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_1_10_590,d2v_1_10_591,d2v_1_10_592,d2v_1_10_593,d2v_1_10_594,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
186150,284024,284025,"What is the difference between ""visa on arriva...",Do I need a visa to visit Ireland from England?,3.009628,0.850558,0.823387,0.82236,201.812626,16.390596,...,-0.415902,0.458732,0.032973,-0.493698,0.248707,0.529565,-0.941626,-0.502443,-0.207765,-0.126182
208652,312703,312704,What are the chances of receiving invitation t...,What are the chances of receiving invitation t...,2.459311,0.763014,0.740781,0.741421,181.337538,0.0,...,-0.455677,0.184503,-0.245574,0.072377,0.31952,0.566395,-0.330956,0.098119,-0.517079,-0.082095
268163,385651,385652,"Is the phrase ""the pressure in an incompressib...",What is the use of Bash in Windows?,2.08011,0.967896,0.966104,0.966124,222.834354,13.742155,...,0.186224,0.629503,0.270513,0.020527,-0.40359,0.209928,0.686775,-0.24526,-0.122667,0.065093
155539,243655,243656,How do I buy goods from Amazon and ship to Vie...,How can I sell Vietnam traditional silk ties o...,2.41289,0.828076,0.810841,0.810741,192.414478,14.124109,...,-0.04772,0.040462,0.244607,0.298988,0.285106,0.110609,0.09811,-0.294825,0.063957,0.110872
212779,317931,317932,How do you read tarot cards?,How do I read tarot cards?,2.068734,0.500189,0.496738,0.49697,110.118314,0.0,...,0.636884,0.982756,0.098122,-1.263094,-0.408385,-0.332053,0.276793,-1.450347,-1.392726,-0.100079


In [29]:
X_train[X_train.isnull().any(axis=1)]

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_1_10_590,d2v_1_10_591,d2v_1_10_592,d2v_1_10_593,d2v_1_10_594,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13587,26077,26078,Why did you cry?,"In ten words or less, what makes you cry?",,,,,,,...,-0.742878,-0.201852,0.168679,0.273762,0.391834,0.221135,0.889142,-0.398619,-0.179211,0.040091
239614,173260,3007,"In mathematics, how many zeros does a million ...","If 204=8, 503=13, 305=11 and 907=25, what is 705?",,,,,,,...,,,,,,,,,,
375111,506098,1896,Why is 1/0=∞?,Why does zero factorial (0!) equal one (1)?,,,,,,,...,-0.103980,0.026666,0.045165,0.185024,0.303186,0.613329,-0.088575,0.725432,0.056811,-0.286494
43536,78296,61064,What is 9*3?,What is 3+3*3+3?,,,,,,,...,,,,,,,,,,
253902,368527,368528,Yf v. B.?,Name written as x kr y till 12…nw its written ...,,,,,,,...,-0.467990,-0.086419,0.334974,-0.242673,-0.198193,0.133573,-0.550290,-0.444515,-0.312405,0.184293
174113,268421,268422,Hi how are you doing?,Is this good to learn AS400..? how is the futu...,,,,,,,...,0.362841,-0.076976,-0.862404,0.191627,-0.799764,0.233135,0.048831,0.049959,-0.085095,-0.064086
322698,448559,448560,How is the carrer of neuroscientist related to...,Who is tribesman?,,,,,,,...,,,,,,,,,,
360644,490443,490444,Why am I here?,Have you ever asked to yourself why do you exist?,,,,,,,...,0.350030,0.497604,-0.209575,0.218461,-0.178335,0.624382,-0.320465,-0.291778,-0.284331,0.721787
348201,476742,476743,What is the best way to describe yourself?,How do you describe yourself?,,,,,,,...,,,,,,,,,,
178801,274516,1601,What are some good reasons for going back with...,How do I get my ex back?,,,,,,,...,,,,,,,,,,


##### Test set

In [40]:
# Reload the test-split TF-IDF weights (trusted, self-produced pickles
# only); `with` closes each file after reading.
with open(data_folder+'X1_test_w.p','rb') as fh:
    X1_test_w = pickle.load(fh)
with open(data_folder+'X2_test_w.p','rb') as fh:
    X2_test_w = pickle.load(fh)

In [41]:
# Reload the test-split word vectors (trusted, self-produced pickles only);
# `with` closes each file after reading.
with open(data_folder+'X1_test.p','rb') as fh:
    X1_test = pickle.load(fh)
with open(data_folder+'X2_test.p','rb') as fh:
    X2_test = pickle.load(fh)

In [None]:
#jaccard = compute(*jaccard)

In [42]:
#chebyshev = compute(*chebyshev)
#compute_and_save(X1_test, X2_test, 'chebyshev', 'chebyshev_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'chebyshev', 'chebyshev_test_w', data_folder)

In [43]:
#braycurtis = compute(*braycurtis)
#compute_and_save(X1_test, X2_test, 'braycurtis', 'braycurtis_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'braycurtis', 'braycurtis_test_w', data_folder)

In [44]:
#cosine = compute(*cosine)
#compute_and_save(X1_test, X2_test, 'cosine', 'cosine_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'cosine', 'cosine_test_w', data_folder)

In [45]:
#correlation = compute(*correlation)
#compute_and_save(X1_test, X2_test, 'correlation', 'correlation_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'correlation', 'correlation_test_w', data_folder)

In [None]:
#hamming = compute(*hamming)

In [46]:
#canberra = compute(*canberra)
#compute_and_save(X1_test, X2_test, 'canberra', 'canberra_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'canberra', 'canberra_test_w', data_folder)

In [47]:
#hausdorff = compute(*hausdorff)
#compute_and_save(X1_test, X2_test, 'hausdorff', 'hausdorff_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'hausdorff', 'hausdorff_test_w', data_folder)

In [48]:
#cityblock = compute(*cityblock)
#compute_and_save(X1_test, X2_test, 'cityblock', 'cityblock_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'cityblock', 'cityblock_test_w', data_folder)

In [49]:
#euclidean = compute(*euclidean)
#compute_and_save(X1_test, X2_test, 'euclidean', 'euclidean_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'euclidean', 'euclidean_test_w', data_folder)

In [50]:
#l1 = compute(*l1)
#compute_and_save(X1_test, X2_test, 'l1', 'l1_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'l1', 'l1_test_w', data_folder)

In [51]:
#l2 = compute(*l2)
#compute_and_save(X1_test, X2_test, 'l2', 'l2_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'l2', 'l2_test_w', data_folder)

In [52]:
#manhattan = compute(*manhattan)
#compute_and_save(X1_test, X2_test, 'manhattan', 'manhattan_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'manhattan', 'manhattan_test_w', data_folder)

In [53]:
#dice = compute(*dice)

In [54]:
#kulsinski = compute(*kulsinski)

In [55]:
#rogerstanimoto = compute(*rogerstanimoto)

In [56]:
#russellrao = compute(*russellrao)

In [57]:
#sokalmichener = compute(*sokalmichener)

In [58]:
#minkowski = compute(*minkowski)
#compute_and_save(X1_test, X2_test, 'minkowski', 'minkowski_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'minkowski', 'minkowski_test_w', data_folder)

In [59]:
#seuclidean = compute(*seuclidean)

In [60]:
#sokalsneath = compute(*sokalsneath)

In [61]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1_test, X2_test, 'sqeuclidean', 'sqeuclidean_test', data_folder)
compute_and_save(X1_test, X2_test, X1_test_w, X2_test_w, 'sqeuclidean', 'sqeuclidean_test_w', data_folder)

In [62]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_delayed_wmean(X1_test, X1_test_w, 'weighted_mean1_test', data_folder)

In [63]:
#sqeuclidean = compute(*sqeuclidean)
#compute_and_save(X1, X2, 'sqeuclidean', 'sqeuclidean_train', data_folder)
compute_delayed_wmean(X2_test, X2_test_w, 'weighted_mean2_test', data_folder)

In [None]:
#fdtw = compute(*fdtw)

In [None]:
#dtw = compute(*dtw)

In [None]:
#pcm = compute(*pcm)

In [None]:
#area = compute(*area)

In [None]:
#curve_length = compute(*curve_length)

In [None]:
#discrete_frechet = compute(*discrete_frechet)

In [64]:
del X1_test, X2_test, X1_test_w, X2_test_w

#### Add the above metrics to X_test

In [30]:
X_test = add_column(X_test, 'chebyshev', 'test')

In [31]:
X_test = add_column(X_test, 'braycurtis', 'test')

In [32]:
X_test = add_column(X_test, 'cosine', 'test')

In [33]:
X_test = add_column(X_test, 'correlation', 'test')

In [34]:
X_test = add_column(X_test, 'canberra', 'test')

In [35]:
X_test = add_column(X_test, 'hausdorff', 'test')

In [36]:
X_test = add_column(X_test, 'cityblock', 'test')

In [37]:
X_test = add_column(X_test, 'euclidean', 'test')

In [38]:
X_test = add_column(X_test, 'l1', 'test')

In [39]:
X_test = add_column(X_test, 'l2', 'test')

In [40]:
X_test = add_column(X_test, 'manhattan', 'test')

In [41]:
X_test = add_column(X_test, 'minkowski', 'test')

In [42]:
X_test = add_column(X_test, 'sqeuclidean', 'test')

In [43]:
#X_test = add_d2v_columns(X_test, 'd2v_1_10', 'test', 'svd')
X_test = add_d2v_columns(X_test, 'd2v_1_10', 'test', 'none')

In [44]:
#X_test = add_d2v_columns(X_test, 'd2v_2_10', 'test', 'svd')

In [45]:
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_1_10_590,d2v_1_10_591,d2v_1_10_592,d2v_1_10_593,d2v_1_10_594,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,2.324107,0.684826,0.668119,0.667194,162.296297,14.440866,...,0.426299,-0.007877,0.022327,-0.149624,0.003377,0.637098,0.126302,-0.015207,-0.44221,0.224155
224279,332326,332327,Will a breathing treatment help a cough?,How can I help someone that is unconscious but...,1.687532,0.754933,0.696495,0.696357,185.567222,10.447112,...,-0.266867,-0.198401,-0.257828,-0.523029,-0.430051,0.121519,-0.0642,0.409033,-0.501404,0.238251
252452,336023,366789,Is Kellyanne Conway annoying in your opinion?,Did Kellyanne Conway really imply that we shou...,1.713,0.816789,0.771746,0.771643,201.519622,9.628905,...,-0.019247,0.035912,0.358948,-0.14347,-0.14654,0.048287,0.100115,0.2138,0.156566,-0.248331
174039,268330,268331,How do you rate (1-10) and review Maruti Baleno?,What career options does one have after comple...,2.667595,1.004311,0.99603,0.995986,222.078077,16.518101,...,0.358588,0.304108,0.02911,0.449796,0.165839,-0.234513,-0.416389,0.129206,0.092125,0.000758
384863,28901,233483,What are some good books on marketing?,What are some of the best books ever written a...,2.487492,0.739946,0.745139,0.744625,167.726263,10.1662,...,-0.412181,-0.054335,0.134291,0.109921,0.313826,0.153791,0.189489,0.461311,-0.20149,0.195373


In [46]:
X_test[X_test.isnull().any(axis=1)]

Unnamed: 0_level_0,qid1,qid2,question1,question2,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,...,d2v_1_10_590,d2v_1_10_591,d2v_1_10_592,d2v_1_10_593,d2v_1_10_594,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
245880,358814,358815,sss,What is sss?,,,,,,,...,,,,,,,,,,
394708,47433,527619,When and how will atrocities on Hindu dalits s...,If (x+y) =7 and xy=127 find x^2-y^2?,,,,,,,...,,,,,,,,,,
204991,308036,308037,What is pilaf?,Why is pilaf called pilaf?,,,,,,,...,-1.212181,0.258687,0.670402,-1.354472,0.651107,-0.330969,-0.335130,0.520469,-2.273302,-0.620540
199885,43630,301493,How is Disaronno made?,Where and how is Disaronno made?,,,,,,,...,,,,,,,,,,
153926,241502,241503,How can you get ayahuasca?,How do you get Ayahuasca?,,,,,,,...,,,,,,,,,,
393288,3007,193368,"If 204=8, 503=13, 305=11 and 907=25, what is 705?",Can you add 5 odd numbers to get 30?,,,,,,,...,0.701202,0.053624,-0.190019,-0.704138,-0.286697,-0.069459,-0.021796,-0.285034,-0.697746,-0.497329
375469,506508,506509,What is an Xpath?,What is XPath?,,,,,,,...,,,,,,,,,,
110983,68218,181872,Why do we say hi?,"Why do we say ""hi"" when we talk with others?",,,,,,,...,0.029769,0.173697,-0.562766,-1.347661,-0.886163,0.025577,-1.170673,0.173140,0.179374,0.250299
318371,443715,35958,What is)'(?,"What is ""what is""?",,,,,,,...,,,,,,,,,,
122261,197925,197926,"How did Aswatthama, Kripacharya, Satyaki, Krit...",Who was kripacharya?,,,,,,,...,,,,,,,,,,


### FuzzyWuzzy string-similarity features

In [47]:
# difference in text size
def compute_size_diff(row):
    """Return the absolute difference in character count between the two questions.

    `row` must expose 'question1' and 'question2'; both values are converted
    with str() before their lengths are compared.
    """
    return abs(len(str(row['question1'])) - len(str(row['question2'])))


# The callback only reads the two question columns, so the row-wise apply is
# run on that two-column slice instead of building a full-width row per record.
X_train['size_diff'] = X_train[['question1', 'question2']].apply(compute_size_diff, axis=1)

In [48]:
from fuzzywuzzy import fuzz

In [49]:
# ratio
def compute_ratio(row):
    """Return fuzz.ratio computed on the str() of 'question1' and 'question2' of `row`."""
    return fuzz.ratio(str(row['question1']), str(row['question2']))


# The callback only reads the two question columns, so the row-wise apply is
# run on that two-column slice instead of building a full-width row per record.
X_train['ratio'] = X_train[['question1', 'question2']].apply(compute_ratio, axis=1)

In [50]:
# partial ratio
def compute_partial_ratio(row):
    """Return fuzz.partial_ratio computed on the str() of 'question1' and 'question2' of `row`."""
    return fuzz.partial_ratio(str(row['question1']), str(row['question2']))


# The callback only reads the two question columns, so the row-wise apply is
# run on that two-column slice instead of building a full-width row per record.
X_train['partial_ratio'] = X_train[['question1', 'question2']].apply(compute_partial_ratio, axis=1)

In [51]:
# token_sort_ratio
def compute_token_sort_ratio(row):
    """Return fuzz.token_sort_ratio computed on the str() of 'question1' and 'question2' of `row`."""
    return fuzz.token_sort_ratio(str(row['question1']), str(row['question2']))


# The callback only reads the two question columns, so the row-wise apply is
# run on that two-column slice instead of building a full-width row per record.
X_train['token_sort_ratio'] = X_train[['question1', 'question2']].apply(compute_token_sort_ratio, axis=1)

In [52]:
# token_set_ratio
def compute_token_set_ratio(row):
    """Return fuzz.token_set_ratio computed on the str() of 'question1' and 'question2' of `row`."""
    return fuzz.token_set_ratio(str(row['question1']), str(row['question2']))


# The callback only reads the two question columns, so the row-wise apply is
# run on that two-column slice instead of building a full-width row per record.
X_train['token_set_ratio'] = X_train[['question1', 'question2']].apply(compute_token_set_ratio, axis=1)

In [53]:
# build complete feature dataframe
#X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
#                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
#X_train_temp.head()

In [54]:
#X_train = pd.concat([X_train_temp, X_train], axis=1)
#del X_train_temp
# Remove the id/text columns, then discard every row that still has a missing value.
train_non_feature_cols = ['qid1', 'qid2', 'question1', 'question2']
X_train_final = X_train.drop(columns=train_non_feature_cols)
X_train_final = X_train_final.dropna()
#X_train_final.info()

In [55]:
# Display the last five rows of the final training feature frame.
X_train_final.tail(5)

Unnamed: 0_level_0,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,cityblock,euclidean,l1,l2,...,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
259180,2.194194,0.771118,0.773916,0.77414,174.254869,0.0,169.116307,12.240939,169.116307,12.240939,...,-0.143487,0.113841,0.102538,0.237537,-0.017723,13,75,87,74,88
365841,2.937816,0.743547,0.737101,0.736085,175.605209,0.0,230.535119,16.654572,230.535119,16.654572,...,0.16791,-0.220139,-0.250944,-0.375932,0.000104,3,95,95,95,99
131933,2.867637,0.837482,0.821158,0.8211,194.252501,13.228487,225.539666,16.315674,225.539666,16.315674,...,0.259581,0.41754,0.41717,0.386213,-0.134999,42,56,75,56,73
146868,2.889351,1.012091,1.016023,1.016376,219.819752,16.233082,224.468312,16.181117,224.468312,16.181117,...,0.278654,0.010108,-0.363677,-0.150386,0.01739,13,36,39,43,43
121959,2.412579,0.803323,0.748405,0.748091,197.0717,15.682854,194.512371,13.995753,194.512371,13.995753,...,-0.039284,0.263121,0.004159,0.754585,-0.017696,19,39,42,56,64


In [56]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training features.
X_train_final.describe()

Unnamed: 0,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,cityblock,euclidean,l1,l2,...,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
count,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0,...,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0,269873.0
mean,2.228935,0.730076,0.708471,0.708501,172.343243,11.052976,173.260439,12.522485,173.260439,12.522485,...,0.039093,0.016098,-0.043034,-0.01774,-0.002788,20.157971,61.300967,64.949973,64.09769,73.255531
std,0.582775,0.165485,0.169809,0.169793,35.94208,6.722824,44.733199,3.229626,44.733199,3.229626,...,0.386565,0.450485,0.393833,0.433924,0.35053,25.544811,18.555553,16.775879,16.838796,18.149578
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-2.179219,-2.333282,-2.733609,-2.657901,-2.368119,0.0,0.0,0.0,0.0,0.0
25%,1.908021,0.648536,0.621711,0.621862,156.380509,5.861563,148.674327,10.746959,148.674327,10.746959,...,-0.205598,-0.276915,-0.281478,-0.286411,-0.206999,5.0,46.0,51.0,52.0,60.0
50%,2.31071,0.746437,0.722927,0.722898,177.909491,13.751679,179.572672,12.976942,179.572672,12.976942,...,0.029497,0.00474,-0.033602,-0.024494,0.00075,12.0,60.0,64.0,63.0,75.0
75%,2.635688,0.832049,0.812694,0.81274,195.265,16.005096,204.247657,14.761279,204.247657,14.761279,...,0.27437,0.296651,0.203732,0.241519,0.206528,26.0,76.0,78.0,77.0,89.0
max,4.843883,1.17677,1.167233,1.165183,273.051165,27.026993,330.136085,23.90193,330.136085,23.90193,...,2.647588,2.934361,1.997286,3.083834,2.020134,1080.0,100.0,100.0,100.0,100.0


##### Test set

In [57]:
# difference in text size
# compute_size_diff only reads the two question columns, so the row-wise apply
# is run on that two-column slice rather than on the full-width frame.
X_test['size_diff'] = X_test[['question1', 'question2']].apply(compute_size_diff, axis=1)

In [58]:
# ratio
# compute_ratio only reads the two question columns, so the row-wise apply
# is run on that two-column slice rather than on the full-width frame.
X_test['ratio'] = X_test[['question1', 'question2']].apply(compute_ratio, axis=1)

In [59]:
# partial ratio
# compute_partial_ratio only reads the two question columns, so the row-wise
# apply is run on that two-column slice rather than on the full-width frame.
X_test['partial_ratio'] = X_test[['question1', 'question2']].apply(compute_partial_ratio, axis=1)

In [60]:
# token_sort_ratio
# compute_token_sort_ratio only reads the two question columns, so the row-wise
# apply is run on that two-column slice rather than on the full-width frame.
X_test['token_sort_ratio'] = X_test[['question1', 'question2']].apply(compute_token_sort_ratio, axis=1)

In [61]:
# token_set_ratio
# compute_token_set_ratio only reads the two question columns, so the row-wise
# apply is run on that two-column slice rather than on the full-width frame.
X_test['token_set_ratio'] = X_test[['question1', 'question2']].apply(compute_token_set_ratio, axis=1)

In [62]:
# Remove the id/text columns, then discard every row that still has a missing value.
test_non_feature_cols = ['question1', 'question2', 'qid1', 'qid2']
X_test_final = X_test.drop(columns=test_non_feature_cols)
X_test_final = X_test_final.dropna()
X_test_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132808 entries, 8067 to 346580
Columns: 618 entries, chebyshev to token_set_ratio
dtypes: float64(613), int64(5)
memory usage: 627.2 MB


In [63]:
# Display the first rows of the final test feature frame.
X_test_final.head()

Unnamed: 0_level_0,chebyshev,braycurtis,cosine,correlation,canberra,hausdorff,cityblock,euclidean,l1,l2,...,d2v_1_10_595,d2v_1_10_596,d2v_1_10_597,d2v_1_10_598,d2v_1_10_599,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,2.324107,0.684826,0.668119,0.667194,162.296297,14.440866,188.669054,13.637453,188.669054,13.637453,...,0.637098,0.126302,-0.015207,-0.44221,0.224155,0,88,88,81,90
224279,1.687532,0.754933,0.696495,0.696357,185.567222,10.447112,137.174149,9.841997,137.174149,9.841997,...,0.121519,-0.0642,0.409033,-0.501404,0.238251,23,33,38,48,55
252452,1.713,0.816789,0.771746,0.771643,201.519622,9.628905,135.093273,9.777494,135.093273,9.777494,...,0.048287,0.100115,0.2138,0.156566,-0.248331,73,40,56,43,53
174039,2.667595,1.004311,0.99603,0.995986,222.078077,16.518101,213.619918,15.382088,213.619918,15.382088,...,-0.234513,-0.416389,0.129206,0.092125,0.000758,79,30,42,32,33
384863,2.487492,0.739946,0.745139,0.744625,167.726263,10.1662,195.564936,14.286634,195.564936,14.286634,...,0.153791,0.189489,0.461311,-0.20149,0.195373,23,69,61,66,88


# Feature Selection
#### (Assuming only 10 dimensions for the doc2vec vectors)

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Variance cut-off of the form p * (1 - p), evaluated at p = 0.8.
threshold = 0.8 * (1 - 0.8)

In [None]:
# Variance-based feature selector configured with the cut-off stored in `threshold`.
sel = VarianceThreshold(threshold=threshold)

In [None]:
# Only the fitted per-feature variances (sel.variances_) are used afterwards and
# the transformed matrix was never stored, so fitting alone is enough; this
# avoids building a reduced copy of the whole training feature matrix.
sel.fit(X_train_final)

In [None]:
# Shape of the per-feature variance array stored on the fitted selector.
sel.variances_.shape

In [None]:
# Label each fitted variance with the name of the training column it belongs to.
variances = pd.Series(sel.variances_, index=X_train_final.columns)

In [None]:
# Features whose variance is strictly above the cut-off, largest variance first.
high_variance = variances[variances > threshold]
high_variance.sort_values(ascending=False)

In [None]:
# Select the labels whose index entries are present in X_train_final, in the same order.
y_train_final = y_train.loc[X_train_final.index]

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif, f_classif

# Score every feature with f_classif and keep the ten highest-scoring ones.
kbest = SelectKBest(score_func=f_classif, k=10)
kbest.fit(X_train_final, y_train_final)
X_new = kbest.transform(X_train_final)
X_new.shape

In [None]:
# One row per feature: its univariate score and the matching p-value.
kbest_scores = pd.DataFrame(
    {'score': kbest.scores_, 'p-value': kbest.pvalues_},
    index=X_train_final.columns,
)

In [None]:
# Show the features ordered from the highest to the lowest score.
kbest_scores.sort_values(by=['score'],ascending=False)

# Modeling

### Logistic Regression

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Randomized hyper-parameter search for the logistic regression baseline.
logr_model = LogisticRegression(random_state=42)
# Candidate values: ten log-spaced C values (1e-2 .. 1e7) and five log-spaced
# tolerances (1e-5 .. 1e-1).
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             #'solver': ['lbfgs']
             #'max_iter': np.linspace(10, 1000, 10)
             }
# random_state fixes which parameter combinations are sampled so that re-running
# this cell repeats the same search (the XGBoost search uses the same seed).
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1, random_state=42)
logr_cv.fit(X_train_final, y_train_final)

In [None]:
# Write the fitted logistic-regression search object to disk with joblib.
dump(logr_cv, input_folder+'logr_ramdomcv.joblib')

In [65]:
# Read the saved logistic-regression search object back from disk.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you produced yourself.
logr_cv = load(input_folder+'logr_ramdomcv.joblib')

In [66]:
# Parameter combination selected by the logistic-regression search.
logr_cv.best_params_

{'tol': 1e-05, 'C': 100.0}

In [67]:
# Select the labels whose index entries are present in X_train_final, in the same order.
y_train_final = y_train.loc[X_train_final.index]

In [68]:
# Rebuild the logistic regression with the C and tol values chosen by the search
# and fit it on the full training feature set.
best_logr_params = logr_cv.best_params_
logr_model = LogisticRegression(
    C=best_logr_params['C'],
    tol=best_logr_params['tol'],
    random_state=42,
    n_jobs=-1,
)
logr_model.fit(X_train_final, y_train_final)

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=42, solver='warn', tol=1e-05,
          verbose=0, warm_start=False)

In [69]:
# Test labels restricted to the index entries present in X_test_final.
y_test_final = y_test.loc[X_test_final.index]
# Class predictions of the fitted logistic regression on the test features.
logr_pred = logr_model.predict(X_test_final)

In [70]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Accuracy, precision and recall of the logistic regression on the test set.
logr_acc_score = accuracy_score(y_test_final, logr_pred)
logr_prec_score = precision_score(y_test_final, logr_pred)
logr_rec_score = recall_score(y_test_final, logr_pred)

print('Logistic Regression')
for template, value in (
    ('accuracy score : {}', logr_acc_score),
    ('precision score : {}', logr_prec_score),
    ('recall score : {}', logr_rec_score),
):
    print(template.format(value))

Logistic Regression
accuracy score : 0.7442849828323594
precision score : 0.6729010520525338
recall score : 0.5985565160658947


In [71]:
from sklearn.metrics import classification_report

# Display names for the two classes, passed to the report in this order.
target_names = ['not duplicate', 'duplicate']
logr_report = classification_report(y_test_final, logr_pred, target_names=target_names)
print(logr_report)

               precision    recall  f1-score   support

not duplicate       0.78      0.83      0.80     83760
    duplicate       0.67      0.60      0.63     49048

    micro avg       0.74      0.74      0.74    132808
    macro avg       0.73      0.71      0.72    132808
 weighted avg       0.74      0.74      0.74    132808



### XGBoost

In [72]:
import xgboost as xgb

In [None]:
# Model selection: randomized 5-fold search over the XGBoost hyper-parameters.
params_xgb = {
    'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'gamma': np.linspace(.01, 1, 10),
    'learning_rate': np.linspace(.01, 1, 10),
    'reg_lambda': np.linspace(0.01, 10, 20),
    'max_depth': np.linspace(1, 32, 32, dtype=int),
}
xgb_base = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
cv_xgb = RandomizedSearchCV(
    xgb_base,
    param_distributions=params_xgb,
    cv=5,
    n_jobs=3,
    random_state=42,
)
cv_xgb.fit(X_train_final, y_train_final)

In [None]:
# Write the fitted XGBoost search object to disk with joblib.
dump(cv_xgb, input_folder+'xgb_ramdomcv.joblib')

In [74]:
# Read the saved XGBoost search object back from disk.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you produced yourself.
cv_xgb = load(input_folder+'xgb_ramdomcv.joblib')

In [75]:
# Parameter combination selected by the XGBoost search.
cv_xgb.best_params_

{'reg_lambda': 6.319473684210527,
 'n_estimators': 200,
 'max_depth': 29,
 'learning_rate': 0.23,
 'gamma': 0.23}

In [76]:
# Rebuild the classifier with the hyper-parameters chosen by the search and
# fit it on the full training feature set.
best_xgb_params = cv_xgb.best_params_
clf_xgb_model = xgb.XGBClassifier(
    n_estimators=best_xgb_params['n_estimators'],
    max_depth=best_xgb_params['max_depth'],
    learning_rate=best_xgb_params['learning_rate'],
    gamma=best_xgb_params['gamma'],
    reg_lambda=best_xgb_params['reg_lambda'],
    random_state=42,
)
clf_xgb_model.fit(X_train_final, y_train_final)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.23, learning_rate=0.23,
       max_delta_step=0, max_depth=29, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=6.319473684210527, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [77]:
# Class predictions of the fitted XGBoost model on the test features.
y_pred_xgb = clf_xgb_model.predict(X_test_final)

# Accuracy, recall and precision against the aligned test labels.
score_xgb = accuracy_score(y_test_final, y_pred_xgb)
rscore_xgb = recall_score(y_test_final, y_pred_xgb)
pscore_xgb = precision_score(y_test_final, y_pred_xgb)

for label, value in (
    ('Accuracy score for XGBoost ', score_xgb),
    ('Recall score for XGBoost ', rscore_xgb),
    ('Precision score for XGBoost ', pscore_xgb),
):
    print(label, value)

Accuracy score for XGBoost  0.8357553761821577
Recall score for XGBoost  0.7569931495677703
Precision score for XGBoost  0.7895923271590498


In [78]:
# Per-class precision / recall / F1 report for the XGBoost predictions.
print(classification_report(y_test_final, y_pred_xgb, target_names=target_names))

               precision    recall  f1-score   support

not duplicate       0.86      0.88      0.87     83760
    duplicate       0.79      0.76      0.77     49048

    micro avg       0.84      0.84      0.84    132808
    macro avg       0.83      0.82      0.82    132808
 weighted avg       0.83      0.84      0.84    132808

