In [1]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from toolz import partition_all

In [4]:
INPUT_BUCKET: str = 'dq-data'
HASH_BUCKET: str = 'dq-hashed'

In [5]:
#load train_set
data: str = 'train.csv'
filestream: HTTPResponse = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
dtypes: Dict[str, str] = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
df: pd.DataFrame = pd.read_csv(#urlpath=s3_in_url, 
                                     #storage_options=s3_options,
                                     filestream,
                                     header=0, 
                                     usecols=dtypes.keys(), 
                                     names=dtypes.keys(),
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     encoding='utf-8')
df = df.set_index('id')

In [6]:
df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404289 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


#### Train-test split

In [8]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['is_duplicate'])
y = df['is_duplicate']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270874 entries, 316451 to 121958
Data columns (total 4 columns):
qid1         270874 non-null int64
qid2         270874 non-null int64
question1    270873 non-null object
question2    270873 non-null object
dtypes: int64(2), object(2)
memory usage: 10.3+ MB


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import time

In [11]:
import pickle
ps.create_bucket(bucket=HASH_BUCKET)
import os
import shutil
from gensim.utils import simple_preprocess
tmp_train_path = '/tmp/train'
try:
    shutil.rmtree(tmp_train_path)
except:
    pass
try:    
    os.mkdir(tmp_train_path)
except:
    pass

def tokenize(series, batch_id, output_dir):
    print('processing batch {}'.format(batch_id))
    # save transformed batch
    out_file = ('%d' % batch_id) 
    out_path = output_dir+'/'+out_file
    with open(out_path, 'w') as handle:
        for q in series:    
            for token in simple_preprocess(q):
                handle.write(token)
            handle.write('\n')
    ps.copy_file(dest_bucket=HASH_BUCKET, file='train/'+out_file, source=out_path)
    os.remove(out_path)

In [12]:
from toolz import partition_all
from multiprocessing import Process, cpu_count, Pool
# empty HASH_BUCKET
ps.remove_all_files(bucket=HASH_BUCKET, path='train/')
series = pd.Series(pd.concat([X_train['question1'], X_train['question2']]),dtype=str)
series.dropna()
partitions = partition_all(10000, series.tolist())
#trying multiprocessing
#processes = []
pool = Pool(processes=6)
args = []
for i, batch in enumerate(partitions):
    args.append((
                   batch,
                   i, 
                   tmp_train_path))
pool.starmap(tokenize, args)

all files in bucket dq-hashed at path train/ are []
processing batch 0
processing batch 3
processing batch 6
processing batch 9
processing batch 12
processing batch 15
pushed file train/0 from /tmp/train/0 to minio bucket dq-hashed
processing batch 1
pushed file train/3 from /tmp/train/3 to minio bucket dq-hashed
processing batch 4
pushed file train/6 from /tmp/train/6 to minio bucket dq-hashed
processing batch 7
pushed file train/9 from /tmp/train/9 to minio bucket dq-hashed
processing batch 10
pushed file train/12 from /tmp/train/12 to minio bucket dq-hashed
processing batch 13
pushed file train/15 from /tmp/train/15 to minio bucket dq-hashed
processing batch 16
pushed file train/1 from /tmp/train/1 to minio bucket dq-hashed
processing batch 2
pushed file train/4 from /tmp/train/4 to minio bucket dq-hashed
processing batch 5
pushed file train/13 from /tmp/train/13 to minio bucket dq-hashed
processing batch 14
pushed file train/16 from /tmp/train/16 to minio bucket dq-hashed
processin

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

Process ForkPoolWorker-1:
Process ForkPoolWorker-6:
Process ForkPoolWorker-4:
Process ForkPoolWorker-5:
Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstr

In [13]:
def get_tokens():
    files = ps.get_all_filenames(bucket=HASH_BUCKET, path='train/')
    
    for file in files:
        ps.get_file(bucket=HASH_BUCKET, filename=file, filepath=tmp_train_path+file)
        with open(tmp_train_path+file, 'r') as handle:
            
            for line in handle:
                yield line
                
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through)
X_trfmd = tfidf.fit_transform(get_tokens())
try:
    shutil.rmtree(tmp_train_path)
except:
    pass

In [14]:
X_trfmd

<541748x799 sparse matrix of type '<class 'numpy.float64'>'
	with 9620941 stored elements in Compressed Sparse Row format>

In [15]:
# dimension reduction using SVD
start = time.time()
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

created SVD transform in time 20.509034633636475


In [16]:
X_svd.shape

(541748, 100)

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270874 entries, 316451 to 121958
Data columns (total 4 columns):
qid1         270874 non-null int64
qid2         270874 non-null int64
question1    270873 non-null object
question2    270873 non-null object
dtypes: int64(2), object(2)
memory usage: 10.3+ MB


In [18]:
# split back into two
X1 = X_svd[:len(X_train), :]
X2 = X_svd[len(X_train):, :]

In [19]:
# difference in text size
compute_size_diff = lambda row: abs(len(str(row['question1'])) - len(str(row['question2'])))
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17


In [20]:
from fuzzywuzzy import fuzz

In [21]:
# ratio
compute_ratio = lambda row: fuzz.ratio(str(row['question1']), str(row['question2']))
X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4,49
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10,64
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26,35
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20,52
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17,45


In [22]:
# partial ratio
compute_partial_ratio = lambda row: fuzz.partial_ratio(str(row['question1']), str(row['question2']))
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4,49,49
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10,64,63
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26,35,43
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20,52,70
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17,45,50


In [23]:
# token_sort_ratio
compute_token_sort_ratio = lambda row: fuzz.token_sort_ratio(str(row['question1']), str(row['question2']))
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio,token_sort_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4,49,49,55
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10,64,63,82
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26,35,43,41
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20,52,70,55
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17,45,50,50


In [24]:
# token_set_ratio
compute_token_set_ratio = lambda row: fuzz.token_set_ratio(str(row['question1']), str(row['question2']))
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
X_train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
316451,441499,441500,What is the tattoo written on DMC's head?,Were tattoos part of the 60's hippie culture?,4,49,49,55,55
398368,531584,531585,What are monocot plants and what are some exam...,What are some examples of monocot plants? What...,10,64,63,82,93
218253,324807,324808,Why do we see colors?,What is your review of Weightless (2017 movie)?,26,35,43,41,41
282919,402950,402951,"Is Reform Judaism becoming more ""orthodox""?",What is Reform Judaism?,20,52,70,55,87
243365,355786,355787,Is there any culture that has dessert before d...,What are my options for desserts after dinner ...,17,45,50,50,50


In [25]:
X2.shape

(270874, 100)

In [26]:
# build complete feature dataframe
X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
X_train_temp.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_90,q2_91,q2_92,q2_93,q2_94,q2_95,q2_96,q2_97,q2_98,q2_99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316451,0.83417,-0.146918,-0.016684,0.281899,0.025486,-0.208559,-0.171485,0.155565,-0.122456,-0.022247,...,4.678337e-07,4.849435e-06,-2e-06,4e-06,3e-06,-8.097317e-07,2.646273e-06,4.110308e-07,-2e-06,7e-06
398368,0.859723,-0.02162,-0.143052,0.050104,0.286142,0.05273,0.005876,0.060243,-0.186494,0.165187,...,1.09574e-05,1.903781e-05,-1.4e-05,-2e-06,-9e-06,7.747259e-06,-6.469354e-06,-2.886128e-06,-9e-06,-2e-06
218253,0.719515,0.305061,-0.359525,-0.039574,-0.139228,0.11168,-0.004213,0.090227,0.100102,0.112338,...,9.369365e-06,1.001712e-05,-1.4e-05,1e-06,-4e-06,3.02569e-06,-3.158781e-06,-1.167244e-05,-3e-06,-9e-06
282919,0.727909,0.262527,-0.069804,-0.059214,0.002341,-0.194169,0.158961,-0.081014,0.002503,0.062071,...,-1.364421e-06,5.011465e-06,4e-06,-3e-06,-1e-06,-1.43941e-06,-6.572667e-07,1.831954e-06,-3e-06,1e-06
243365,0.881564,-0.253718,-0.218576,-0.035157,-0.020854,-0.04455,-0.003412,-0.090654,0.197271,-0.110386,...,-2.221566e-05,3.615819e-07,2e-06,-3e-06,2e-06,9.655713e-07,5.186554e-07,-1.61763e-05,-3e-06,-6e-06


In [27]:
X_train = pd.concat([X_train_temp, X_train], axis=1)
del X_train_temp
X_train = X_train.drop(columns=['qid1', 'qid2','question1','question2'])
X_train.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_95,q2_96,q2_97,q2_98,q2_99,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316451,0.83417,-0.146918,-0.016684,0.281899,0.025486,-0.208559,-0.171485,0.155565,-0.122456,-0.022247,...,-8.097317e-07,2.646273e-06,4.110308e-07,-2e-06,7e-06,4,49,49,55,55
398368,0.859723,-0.02162,-0.143052,0.050104,0.286142,0.05273,0.005876,0.060243,-0.186494,0.165187,...,7.747259e-06,-6.469354e-06,-2.886128e-06,-9e-06,-2e-06,10,64,63,82,93
218253,0.719515,0.305061,-0.359525,-0.039574,-0.139228,0.11168,-0.004213,0.090227,0.100102,0.112338,...,3.02569e-06,-3.158781e-06,-1.167244e-05,-3e-06,-9e-06,26,35,43,41,41
282919,0.727909,0.262527,-0.069804,-0.059214,0.002341,-0.194169,0.158961,-0.081014,0.002503,0.062071,...,-1.43941e-06,-6.572667e-07,1.831954e-06,-3e-06,1e-06,20,52,70,55,87
243365,0.881564,-0.253718,-0.218576,-0.035157,-0.020854,-0.04455,-0.003412,-0.090654,0.197271,-0.110386,...,9.655713e-07,5.186554e-07,-1.61763e-05,-3e-06,-6e-06,17,45,50,50,50


In [28]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270874 entries, 316451 to 121958
Columns: 205 entries, q1_0 to token_set_ratio
dtypes: float64(200), int64(5)
memory usage: 425.7 MB


#### Test set vectorization

In [30]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133416 entries, 8067 to 231389
Data columns (total 4 columns):
qid1         133416 non-null int64
qid2         133416 non-null int64
question1    133416 non-null object
question2    133415 non-null object
dtypes: int64(2), object(2)
memory usage: 5.1+ MB


In [31]:
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?


In [32]:
tmp_test_path = '/tmp/test/'
try:
    shutil.rmtree(tmp_test_path)
except:
    pass
try:    
    os.mkdir(tmp_test_path)
except:
    pass

def tokenize_test(series, batch_id, output_dir):
    print('processing batch {}'.format(batch_id))
    # save transformed batch
    out_file = ('%d' % batch_id) 
    out_path = output_dir+'/'+out_file
    with open(out_path, 'w') as handle:
        for q in series:       
            for token in simple_preprocess(q):
                handle.write(token)
            handle.write('\n')
    ps.copy_file(dest_bucket=HASH_BUCKET, file='test/'+out_file, source=out_path)
    os.remove(out_path)

In [33]:
# empty HASH_BUCKET
ps.remove_all_files(bucket=HASH_BUCKET, path='test/')
series = pd.Series(pd.concat([X_test['question1'], X_test['question2']]),dtype=str)
series.dropna()
partitions = partition_all(10000, series.tolist())
#trying multiprocessing
#processes = []
pool = Pool(processes=6)
args = []
for i, batch in enumerate(partitions):
    args.append((
                   batch,
                   i, 
                   tmp_test_path))
pool.starmap(tokenize_test, args)

all files in bucket dq-hashed at path test/ are ['test/0', 'test/1', 'test/10', 'test/100', 'test/101', 'test/102', 'test/103', 'test/104', 'test/105', 'test/106', 'test/107', 'test/108', 'test/109', 'test/11', 'test/110', 'test/111', 'test/112', 'test/113', 'test/114', 'test/115', 'test/116', 'test/117', 'test/118', 'test/119', 'test/12', 'test/120', 'test/121', 'test/122', 'test/123', 'test/124', 'test/125', 'test/126', 'test/127', 'test/128', 'test/129', 'test/13', 'test/130', 'test/131', 'test/132', 'test/133', 'test/134', 'test/135', 'test/136', 'test/137', 'test/138', 'test/139', 'test/14', 'test/140', 'test/141', 'test/142', 'test/143', 'test/144', 'test/145', 'test/146', 'test/147', 'test/148', 'test/149', 'test/15', 'test/150', 'test/151', 'test/152', 'test/153', 'test/154', 'test/155', 'test/156', 'test/157', 'test/158', 'test/159', 'test/16', 'test/160', 'test/161', 'test/162', 'test/163', 'test/164', 'test/165', 'test/166', 'test/167', 'test/168', 'test/169', 'test/17', 'te

processing batch 0
processing batch 2
processing batch 4
processing batch 6
processing batch 8
processing batch 10
pushed file test/2 from /tmp/test//2 to minio bucket dq-hashed
processing batch 3
pushed file test/6 from /tmp/test//6 to minio bucket dq-hashed
processing batch 7
pushed file test/0 from /tmp/test//0 to minio bucket dq-hashed
processing batch 1
pushed file test/4 from /tmp/test//4 to minio bucket dq-hashed
processing batch 5
pushed file test/10 from /tmp/test//10 to minio bucket dq-hashed
processing batch 11
pushed file test/8 from /tmp/test//8 to minio bucket dq-hashed
processing batch 9
pushed file test/1 from /tmp/test//1 to minio bucket dq-hashed
processing batch 12
pushed file test/3 from /tmp/test//3 to minio bucket dq-hashed
processing batch 14
pushed file test/11 from /tmp/test//11 to minio bucket dq-hashed
processing batch 16
pushed file test/7 from /tmp/test//7 to minio bucket dq-hashed
processing batch 18
pushed file test/5 from /tmp/test//5 to minio bucket dq-

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

Process ForkPoolWorker-7:
Process ForkPoolWorker-8:
Process ForkPoolWorker-11:
Process ForkPoolWorker-10:
Process ForkPoolWorker-9:
Process ForkPoolWorker-12:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
 

In [34]:
def get_test_tokens():
    files = ps.get_all_filenames(bucket=HASH_BUCKET, path='test/')
    #files = os.listdir(tmp_train_path)
    for file in files:
        ps.get_file(bucket=HASH_BUCKET, filename=file, filepath=tmp_test_path+file)
        with open(tmp_test_path+file, 'r') as handle:
            #data.append(pickle.load(handle))
            for line in handle:
                yield line
                
X_test_trfmd = tfidf.transform(get_test_tokens())
try:
    shutil.rmtree(tmp_test_path)
except:
    pass


In [35]:
X_test_trfmd

<266832x799 sparse matrix of type '<class 'numpy.float64'>'
	with 4738262 stored elements in Compressed Sparse Row format>

In [36]:
# dimension reduction using SVD
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

created SVD transform in time 0.28557920455932617


In [37]:
# split back into two
X1_test = X_test_svd[:len(X_test), :]
X2_test = X_test_svd[len(X_test):, :]

In [38]:
# difference in text size
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9


In [39]:
# ratio
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,88
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2,73
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51,40
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8,73
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9,63


In [40]:
# partial ratio
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,88,88
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2,73,73
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51,40,44
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8,73,68
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9,63,59


In [41]:
# token_sort_ratio
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio,token_sort_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,88,88,81
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2,73,73,73
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51,40,44,39
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8,73,68,89
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9,63,59,71


In [42]:
# token_set_ratio
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
X_test.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8067,15738,15739,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0,88,88,81,90
368101,12736,104117,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,2,73,73,73,73
70497,121486,121487,Which is more advisable and better material fo...,What is the best server setup for buddypress?,51,40,44,39,41
226567,254474,258192,How do I improve logical programming skills?,How can I improve my logical skills for progra...,8,73,68,89,96
73186,48103,3062,How close we are to see 3rd world war?,How close is a World War III?,9,63,59,71,81


In [43]:
# build complete feature dataframe
X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
X_test_temp.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_90,q2_91,q2_92,q2_93,q2_94,q2_95,q2_96,q2_97,q2_98,q2_99
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,0.519165,0.431756,0.064319,-0.016481,0.087038,0.002285,-0.008425,0.015336,-0.261355,0.121395,...,2.3e-05,1.5e-05,-1.821504e-05,3.216443e-06,-9e-06,-1.5e-05,-4.836909e-06,-8e-06,-1.309566e-05,-1.962198e-05
368101,0.85107,-0.171522,-0.255921,0.071621,0.022943,-0.00472,0.07001,0.069973,0.104416,0.061926,...,1.3e-05,8e-06,-1.039981e-05,-9.153098e-07,-1.5e-05,-7e-06,-1.4114e-06,-8e-06,1.632647e-06,-8.720784e-06
70497,0.912409,-0.152898,-0.020779,0.057625,-0.037292,0.108542,0.037998,0.028389,0.061418,-0.133019,...,-1.2e-05,2e-06,-3.847522e-06,3.611121e-07,-5e-06,-9e-06,-1.192275e-06,1e-06,7.260083e-07,-2.985969e-08
226567,0.718929,0.31318,0.216385,-0.093333,-0.036749,0.212129,0.161706,-0.131936,-0.290491,0.083805,...,-1.1e-05,8e-06,-3.781779e-07,5.320937e-06,-2e-06,-5e-06,3.50814e-06,9e-06,-6.072785e-06,-2.81859e-06
73186,0.76592,0.142573,-0.35204,-0.06137,0.031293,0.111417,-0.016486,-0.040281,-0.03856,-0.087537,...,-2e-06,1e-05,-8.842913e-06,-4.107623e-06,-5e-06,-1.1e-05,4.488547e-07,2e-06,-2.317846e-06,5.121606e-07


In [44]:
X_test = pd.concat([X_test_temp, X_test], axis=1)
del X_test_temp
X_test = X_test.drop(columns=['question1','question2', 'qid1', 'qid2'])
X_test.head()

Unnamed: 0_level_0,q1_0,q1_1,q1_2,q1_3,q1_4,q1_5,q1_6,q1_7,q1_8,q1_9,...,q2_95,q2_96,q2_97,q2_98,q2_99,size_diff,ratio,partial_ratio,token_sort_ratio,token_set_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8067,0.519165,0.431756,0.064319,-0.016481,0.087038,0.002285,-0.008425,0.015336,-0.261355,0.121395,...,-1.5e-05,-4.836909e-06,-8e-06,-1.309566e-05,-1.962198e-05,0,88,88,81,90
368101,0.85107,-0.171522,-0.255921,0.071621,0.022943,-0.00472,0.07001,0.069973,0.104416,0.061926,...,-7e-06,-1.4114e-06,-8e-06,1.632647e-06,-8.720784e-06,2,73,73,73,73
70497,0.912409,-0.152898,-0.020779,0.057625,-0.037292,0.108542,0.037998,0.028389,0.061418,-0.133019,...,-9e-06,-1.192275e-06,1e-06,7.260083e-07,-2.985969e-08,51,40,44,39,41
226567,0.718929,0.31318,0.216385,-0.093333,-0.036749,0.212129,0.161706,-0.131936,-0.290491,0.083805,...,-5e-06,3.50814e-06,9e-06,-6.072785e-06,-2.81859e-06,8,73,68,89,96
73186,0.76592,0.142573,-0.35204,-0.06137,0.031293,0.111417,-0.016486,-0.040281,-0.03856,-0.087537,...,-1.1e-05,4.488547e-07,2e-06,-2.317846e-06,5.121606e-07,9,63,59,71,81


### Modeling

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
logr_model = LogisticRegression(random_state=42)
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             #'solver': ['lbfgs']
             #'max_iter': np.linspace(10, 1000, 10)
             }
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1)
logr_cv.fit(X_train, y_train)

In [None]:
logr_cv.best_params_

In [None]:
logr_model = LogisticRegression(#solver=logr_cv.best_params_['solver'], 
                                random_state=42, 
                                C=logr_cv.best_params_['C'], 
                                tol=logr_cv.best_params_['tol'], 
                                #max_iter=logr_cv.best_params_['max_iter'], 
                                n_jobs=-1)
logr_model.fit(X_train, y_train)

In [None]:
logr_pred = logr_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
logr_acc_score = accuracy_score(y_test, logr_pred)
logr_prec_score = precision_score(y_test, logr_pred)
logr_rec_score = recall_score(y_test, logr_pred)
print('Logistic Regression')
print('accuracy score : {}'.format(logr_acc_score))
print('precision score : {}'.format(logr_prec_score))
print('recall score : {}'.format(logr_rec_score))