In [1]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from toolz import partition_all
#from joblib import Parallel, delayed

In [4]:
INPUT_BUCKET: str = 'dq-data'
HASH_BUCKET: str = 'dq-hashed'

In [5]:
#load train_set
data: str = 'train.csv'
filestream: HTTPResponse = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
#s3_in_prefix: str = 's3://' + INPUT_BUCKET + '/'
#s3_in_url: str = s3_in_prefix + train_data
#s3_options: Dict = ps.fetch_s3_options()
dtypes: Dict[str, str] = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
df: pd.DataFrame = pd.read_csv(#urlpath=s3_in_url, 
                                     #storage_options=s3_options,
                                     filestream,
                                     header=0, 
                                     usecols=dtypes.keys(), 
                                     names=dtypes.keys(),
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     encoding='utf-8')
df = df.set_index('id')

In [6]:
df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404289 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


#### Train-test split

In [8]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['is_duplicate'])
y = df['is_duplicate']
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270874 entries, 316451 to 121958
Data columns (total 4 columns):
qid1         270874 non-null int64
qid2         270874 non-null int64
question1    270873 non-null object
question2    270873 non-null object
dtypes: int64(2), object(2)
memory usage: 10.3+ MB


In [26]:
import spacy
nlp = spacy.load('en_core_web_md')

In [27]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [28]:
#from spacy.lemmatizer import Lemmatizer
#from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
#lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
#nlp.add_pipe(lemmatizer)
#nlp.pipe_names

In [29]:
#tagger = nlp.get_pipe('tagger')
#tagger.cfg

In [30]:
#parser = nlp.get_pipe('parser')
#parser.cfg

In [31]:
#ner = nlp.get_pipe('ner')
#ner.cfg

In [32]:
# tokenize, pos-tag, parse dependencies, recognize entities (pipeline)
#pipeline = ['tagger', 'parser', 'ner']
#for name in pipeline:
#    component = nlp.create_pipe(name)   # 3. create the pipeline components
#    nlp.add_pipe(component)             # 4. add the component to the pipeline

#preprocess_q1 = lambda row: nlp(row['question1'])
#x_df1['pr_question1'] = x_df1.apply(preprocess_q1, axis=1)
#preprocess_q2 = lambda row: nlp(row['question2'])
#x_df1['pr_question2'] = x_df1.apply(preprocess_q2, axis=1)
#x_df1.head()

In [33]:
# tokenizer
import nltk
def tokenize(text):
        tokens = [word for word in nlp(text) if len(word) > 1] #if len(word) > 1 because I only want to retain words that are at least two characters before stemming, although I can't think of any such words that are not also stopwords
        #stems = [stemmer.stem(item) for item in tokens]
        return tokens

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import time

In [35]:
import pickle
ps.create_bucket(bucket=HASH_BUCKET)
import os
import shutil
tmp_train_path = '/tmp/train'
try:
    shutil.rmtree(tmp_train_path)
except:
    pass
try:    
    os.mkdir(tmp_train_path)
except:
    pass

def tokenize(pipeline, series, batch_id, output_dir):
    #tfidf = TfidfVectorizer(tokenizer=tokenizer, binary=True, stop_words='english', use_idf=True, max_features=max_features)
    #series1 = df[col1]
    #series2 = df[col2]
    #series = pd.concat([series1, series2])
    #start = time.time()
    print('processing batch {}'.format(batch_id))
    #trnsfmd = pipeline(series)
    #end =  time.time()
    # save transformed batch
    out_file = ('%d' % batch_id)
    out_path = output_dir+'/'+out_file 
    with open(out_path, 'wb') as handle:
        for doc in pipeline.pipe(series):
            #pickle.dump(doc, handle)
            handle.write(' '.join(w for w in doc if not w.is_space))
            handle.write('\n')
            
    ps.copy_file(dest_bucket=HASH_BUCKET, file='train/'+out_file, source=out_path)
    os.remove(out_path)
    #print('created TF-IDF vectors in time {}'.format(end-start))

In [36]:
from toolz import partition_all
from joblib import Parallel, delayed
from multiprocessing import Process, cpu_count
# empty HASH_BUCKET
ps.remove_all_files(bucket=HASH_BUCKET, path='train/')
series = pd.Series(pd.concat([X_train['question1'], X_train['question2']]),dtype=str)
series.dropna()
partitions = partition_all(10000, series.tolist())
#hashvect = HashingVectorizer(tokenizer=nlp, binary=True, stop_words='english')
#Parallel(n_jobs=8)(delayed(tokenize)(nlp, batch, i, tmp_train_path)
#         for i, batch in enumerate(partitions))
#executor(tasks)

# parallel joblib and spacy dont work together, trying serial
#for i, batch in enumerate(partitions):
#    tokenize(nlp, batch, i, tmp_train_path)

#trying multiprocessing
processes = []
for i, batch in enumerate(partitions):
    p = Process(target=tokenize, name=i, kwargs={'pipeline':nlp, 
                                                 'series':batch, 
                                                 'batch_id':i, 
                                                 'output_dir':tmp_train_path})
    p.start()
    print('started process {}'.format(p.name))
    processes.append(p)
for p in processes:
    p.join()
    print('finished process {}'.format(p.name))

all files in bucket dq-hashed at path train/ are ['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '9']
processing batch 0
started process Process-166
started process 1
processing batch 1
processing batch 2
started process 2
processing batch 3
started process 3
processing batch 4
started process 4
processing batch 5
started process 5
started process 6
processing batch 6
started process 7
processing batch 7
started process 8
processing batch 8
started process 9
processing batch 9
processing batch 10
started process 10
processing batch 11
started process 11
processing batch

Process Process-166:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
Process 2:
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found


finished process Process-166


Process 12:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 1:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
Process 3:
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Traceback (most recent call last):

finished process 1
finished process 2


Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))


finished process 3


TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 4:
Process 9:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
TypeError: sequence item 0: expected str insta

finished process 4
finished process 5


Process 8:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)


finished process 6


  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
Process 7:
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 11:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", lin

finished process 7
finished process 8
finished process 9


Process 14:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 10:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
Process 17:
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Traceback (most recent call last

finished process 10
finished process 11
finished process 12


Process 19:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 16:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 13:
Process 18:
Traceback (most rece

finished process 13
finished process 14


Process 21:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()


finished process 15
finished process 16
finished process 17
finished process 18
finished process 19


  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 20:
Process 27:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process 22:
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.i

finished process 20
finished process 21
finished process 22


Process 23:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found


finished process 23
finished process 24


Process 26:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 31:
Process 25:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in do

finished process 25
finished process 26
finished process 27


  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 28:
Process 34:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35

finished process 28
finished process 29


Process 30:
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 35:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process 33:
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))


finished process 30
finished process 31
finished process 32


Traceback (most recent call last):
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found


finished process 33
finished process 34
finished process 35
finished process 36


Process 41:
Traceback (most recent call last):
Process 40:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 37:
Traceback (most recent call last

finished process 37


Process 45:
Traceback (most recent call last):
Process 38:
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found


finished process 38
finished process 39
finished process 40
finished process 41
finished process 42


Process 49:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-35-c79d5582a068>", line 30, in tokenize
    handle.write(' '.join(w for w in doc if not w.is_space))
Process 44:
TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found
Process 47:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._

finished process 43
finished process 44
finished process 45
finished process 46
finished process 47
finished process 48
finished process 49
finished process 50
finished process 51
finished process 52
finished process 53
finished process 54


In [None]:
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfTransformer
data = []
#files = ps.get_all_filenames(bucket=HASH_BUCKET, path='train/')
files = os.listdir(tmp_train_path)
for file in files:
    #ps.get_file(bucket=HASH_BUCKET, filename='train/'+file, filepath=tmp_train_path+file)
    with open(tmp_train_path+file, 'rb') as handle:
        data.append(pickle.load(handle))

#tfidf = TfidfTransformer()
pass_through = lambda x:x
tfidf = TfidfVectorizer(analyzer=pass_through, binary=True)
X_trfmd = tfidf.fit_transform(vstack(data))

In [None]:
print(vstack(data).shape)

In [None]:
X_trfmd

In [None]:
# dimension reduction using SVD
start = time.time()
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
X_svd.shape

In [None]:
X_train.info()

In [None]:
# split back into two
X1 = X_svd[:len(X_train), :]
X2 = X_svd[len(X_train):, :]
## find pair-wise cosine similarity
#start = time.time()
#X_sim = cosine_similarity(X1, X2)
#end =  time.time()
#print('computed cosine similarity in time {}'.format(end-start))

In [None]:
#svd_feature_length = X_sim.shape[1]
#start = time.time()
#temp_df = pd.DataFrame(X_sim)
#x_df1 = pd.concat([x_df1,temp_df], axis=1)
#end =  time.time()
#print('rebuilt dataframe with new tf_svd feature columns in time {}'.format(end-start))

In [None]:
#temp_df.head()

In [None]:
#x_df1.head(20)

In [None]:
#x_df1 = tfidf_svd_vectorize(x_df1, 'question1', 'question2', 10000, 100)

In [None]:
# difference in text size
compute_size_diff = lambda row: abs(len(str(row['question1'])) - len(str(row['question2'])))
X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
X_train.head()

In [None]:
## vector norm diff (distance)
#compute_spacy_distance = lambda row: abs(row['question1'].vector_norm - row['question2'].vector_norm)
#x_df1['spacy_distance'] = x_df1.apply(compute_spacy_distance, axis=1)
#x_df1.head()

In [None]:
# function to return mean distance between tokens and document centroid
def compute_mean_distance(doc):
    mean_distance = 0.0
    centroid = doc.vector
    count = 0
    for token in doc:
        if not token.is_stop:
            mean_distance += np.inner(token.vector,centroid)
            count += 1
    if count == 0:
        count = 1
    return mean_distance / count

In [None]:
## mean distance from centroid for question1
#compute_q1_mean_dist = lambda row: compute_mean_distance(row['question1'])
#x_df1['q1_mean_dist'] = x_df1.apply(compute_q1_mean_dist, axis=1)
#x_df1.head()

In [None]:
## mean distance from centroid for question1
#compute_q2_mean_dist = lambda row: compute_mean_distance(row['question2'])
#x_df1['q2_mean_dist'] = x_df1.apply(compute_q2_mean_dist, axis=1)
#x_df1.head()

In [None]:
## difference in mean distance
#compute_mean_dist_diff = lambda row: abs(row['q1_mean_dist'] - row['q2_mean_dist'])
#x_df1['mean_dist_diff'] = x_df1.apply(compute_mean_dist_diff, axis=1)
#x_df1.head()

In [None]:
## centroid similarity
#compute_centroid_similarity = lambda row: np.inner(row['question1'].vector, row['question2'].vector)
#x_df1['centroid_similarity'] = x_df1.apply(compute_centroid_similarity, axis=1)
#x_df1.head()

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# ratio
compute_ratio = lambda row: fuzz.ratio(str(row['question1']), str(row['question2']))
X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
X_train.head()

In [None]:
# partial ratio
compute_partial_ratio = lambda row: fuzz.partial_ratio(str(row['question1']), str(row['question2']))
X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
X_train.head()

In [None]:
# token_sort_ratio
compute_token_sort_ratio = lambda row: fuzz.token_sort_ratio(str(row['question1']), str(row['question2']))
X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
X_train.head()

In [None]:
# token_set_ratio
compute_token_set_ratio = lambda row: fuzz.token_set_ratio(str(row['question1']), str(row['question2']))
X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
X_train.head()

In [None]:
X2.shape

In [None]:
# build complete feature dataframe
X_train_temp = pd.concat([pd.DataFrame(X1, columns=['q1_'+str(i) for i in range(X1.shape[1])], index=X_train.index), 
                     pd.DataFrame(X2, columns=['q2_'+str(i) for i in range(X2.shape[1])], index=X_train.index)], axis=1)
X_train_temp.head()

In [None]:
X_train = pd.concat([X_train_temp, X_train], axis=1)
del X_train_temp
X_train = X_train.drop(columns=['qid1', 'qid2','question1','question2'])
X_train.head()

In [None]:
X_train.info()

#### Test set vectorization

In [None]:
##load test_set
#test_data: str = 'test.csv'
#filestream: HTTPResponse = ps.get_file_stream(bucket=INPUT_BUCKET, filename=test_data)
#dtypes: Dict[str, str] = {
#    'id': 'int64',
#    'question1': 'object',
#    'question2': 'object'
#}
#test_df: pd.DataFrame = pd.read_csv(#urlpath=s3_in_url, 
#                                     #storage_options=s3_options,
#                                     filestream,
#                                     header=0, 
#                                     usecols=dtypes.keys(), 
#                                     names=dtypes.keys(),
#                                     skipinitialspace=True,
#                                     skip_blank_lines=True,
#                                     encoding='utf-8')
#test_df = test_df.set_index('id')

In [None]:
X_test.info()

In [None]:
X_test.head()

In [None]:
#ps.create_bucket(bucket=HASH_BUCKET)
tmp_test_path = '/tmp/test/'
if os.path.isdir(tmp_test_path):
    shutil.rmtree(tmp_test_path)
os.mkdir(tmp_test_path)
def transform(transformer, series, batch_id, output_dir, max_features=10000):
    #tfidf = TfidfVectorizer(tokenizer=tokenizer, binary=True, stop_words='english', use_idf=True, max_features=max_features)
    #series1 = df[col1]
    #series2 = df[col2]
    #series = pd.concat([series1, series2])
    #start = time.time()
    print('processing batch {}'.format(batch_id))
    X = transformer.transform(series)
    #end =  time.time()
    # save transformed batch
    out_file = ('%d' % batch_id)
    out_path = output_dir+'/'+out_file 
    with open(out_path, 'wb') as handle:
        pickle.dump(X, handle)
    #ps.copy_file(dest_bucket=HASH_BUCKET, file='test/'+out_file, source=out_path)
    #print('created TF-IDF vectors in time {}'.format(end-start))

In [None]:
# empty HASH_BUCKET
#ps.remove_all_files(bucket=HASH_BUCKET, path='test/')
series = pd.Series(pd.concat([X_test['question1'], X_test['question2']]),dtype=str)
partitions = partition_all(10000, series.tolist())
Parallel(n_jobs=8)(delayed(transform)(hashvect, batch, i, tmp_test_path, 10000)
         for i, batch in enumerate(partitions))
#executor(tasks)

In [None]:
data = []
files = ps.get_all_filenames(bucket=HASH_BUCKET, path='test/')
files = os.listdir(tmp_test_path)
for file in files:
    #ps.get_file(bucket=HASH_BUCKET, filename='test/'+file, filepath=tmp_test_path+file)
    with open(tmp_test_path+file, 'rb') as handle:
        data.append(pickle.load(handle))
X_test_trfmd = tfidf.transform(vstack(data))

In [None]:
X_test_trfmd

In [None]:
# dimension reduction using SVD
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

In [None]:
# split back into two
X1_test = X_test_svd[:len(X_test), :]
X2_test = X_test_svd[len(X_test):, :]
## find pair-wise cosine similarity
#start = time.time()
#X_sim = cosine_similarity(X1, X2)
#end =  time.time()
#print('computed cosine similarity in time {}'.format(end-start))

In [None]:
# difference in text size
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
X_test.head()

In [None]:
# ratio
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
X_test.head()

In [None]:
# partial ratio
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
X_test.head()

In [None]:
# token_sort_ratio
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
X_test.head()

In [None]:
# token_set_ratio
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
X_test.head()

In [None]:
# build complete feature dataframe
X_test_temp = pd.concat([pd.DataFrame(X1_test, columns=['q1_'+str(i) for i in range(X1_test.shape[1])], index=X_test.index), 
                    pd.DataFrame(X2_test, columns=['q2_'+str(i) for i in range(X2_test.shape[1])], index=X_test.index)], axis=1)
X_test_temp.head()

In [None]:
X_test = pd.concat([X_test_temp, X_test], axis=1)
del X_test_temp
X_test = X_test.drop(columns=['question1','question2', 'qid1', 'qid2'])
X_test.head()

### Modeling

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
logr_model = LogisticRegression(random_state=42)
param_grid = {'C': np.logspace(-2, 7, 10),
             #'penalty': ['l1','l2'],
             'tol': np.logspace(-5, -1, 5),
             'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
             #'max_iter': np.linspace(10, 1000, 10)
             }
logr_cv = RandomizedSearchCV(logr_model, param_distributions=param_grid, cv=5, n_jobs=-1)
#y_train = X['is_duplicate']
#X_train = X.drop(columns=['is_duplicate'])
logr_cv.fit(X_train, y_train)

In [None]:
logr_cv.best_params_

In [None]:
logr_model = LogisticRegression(solver=logr_cv.best_params_['solver'], 
                                random_state=42, 
                                C=logr_cv.best_params_['C'], 
                                tol=logr_cv.best_params_['tol'], 
                                #max_iter=logr_cv.best_params_['max_iter'], 
                                n_jobs=-1)
logr_model.fit(X_train, y_train)

In [None]:
logr_pred = logr_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
logr_acc_score = accuracy_score(y_test, logr_pred)
logr_prec_score = precision_score(y_test, logr_pred)
logr_rec_score = recall_score(y_test, logr_pred)
print('Logistic Regression')
print('accuracy score : {}'.format(logr_acc_score))
print('precision score : {}'.format(logr_prec_score))
print('recall score : {}'.format(logr_rec_score))