In [None]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [None]:
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation and convergence warnings emitted by later cells.
warnings.filterwarnings('ignore')

In [None]:
from toolz import partition_all
#from joblib import Parallel, delayed

In [None]:
# Bucket the raw input CSV is streamed from (passed to ps.get_file_stream).
INPUT_BUCKET: str = 'dq-data'
# Bucket the pickled hashed-feature batches are copied to and fetched back from.
HASH_BUCKET: str = 'dq-hashed'

In [None]:
# Load the training set from INPUT_BUCKET into a DataFrame indexed by 'id'.
data: str = 'train.csv'
filestream: HTTPResponse = ps.get_file_stream(bucket=INPUT_BUCKET, filename=data)
# Column names and the dtype each column is parsed as.
dtypes: Dict[str, str] = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
# header=0 together with names= replaces the header row found in the file with
# the names above. dtype=dtypes actually enforces the declared column types
# (the mapping was previously declared but never handed to read_csv).
df: pd.DataFrame = pd.read_csv(filestream,
                               header=0,
                               names=list(dtypes),
                               usecols=list(dtypes),
                               dtype=dtypes,
                               skipinitialspace=True,
                               skip_blank_lines=True,
                               encoding='utf-8')
df = df.set_index('id')

In [None]:
df.head()

In [None]:
df.info()

#### Train-test split

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label column 'is_duplicate' (y) from the remaining columns (X).
# After this point X / X_train / X_test no longer contain 'is_duplicate'.
X = df.drop(columns=['is_duplicate'])
y = df['is_duplicate']
# Hold out 33% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
import en_core_web_lg
# Load the en_core_web_lg pipeline once; `nlp` is used by tokenize() below.
nlp = en_core_web_lg.load()

In [None]:
#nlp.pipe_names

In [None]:
#tagger = nlp.get_pipe('tagger')
#tagger.cfg

In [None]:
#parser = nlp.get_pipe('parser')
#parser.cfg

In [None]:
#ner = nlp.get_pipe('ner')
#ner.cfg

In [None]:
# tokenize, pos-tag, parse dependencies, recognize entities (pipeline)
#pipeline = ['tagger', 'parser', 'ner']
#for name in pipeline:
#    component = nlp.create_pipe(name)   # 3. create the pipeline components
#    nlp.add_pipe(component)             # 4. add the component to the pipeline

#preprocess_q1 = lambda row: nlp(row['question1'])
#x_df1['pr_question1'] = x_df1.apply(preprocess_q1, axis=1)
#preprocess_q2 = lambda row: nlp(row['question2'])
#x_df1['pr_question2'] = x_df1.apply(preprocess_q2, axis=1)
#x_df1.head()

In [None]:
# tokenizer
import nltk
def tokenize(text):
    """Run ``text`` through ``nlp`` and keep only tokens longer than one character.

    Returns a list of the token objects produced by ``nlp`` (not plain
    strings); no stemming is applied.
    """
    kept = []
    for token in nlp(text):
        # Single-character tokens are discarded.
        if len(token) > 1:
            kept.append(token)
    return kept

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import time

In [None]:
import pickle
#ps.create_bucket(bucket=HASH_BUCKET)
import os
# Local staging directory for the pickled training batches.
tmp_train_path = '/tmp/train'
# exist_ok=True tolerates a directory left over from a previous run, while real
# failures (e.g. permissions) surface here instead of being silently swallowed
# by a bare `except` and resurfacing later as a confusing file error.
os.makedirs(tmp_train_path, exist_ok=True)
def fit_transform(transformer, series, batch_id, output_dir, max_features=10000):
    """Vectorize one batch, pickle the result locally and copy it to HASH_BUCKET.

    Args:
        transformer: object exposing ``fit_transform``; applied to ``series``.
        series: the batch of documents handed to the transformer.
        batch_id: integer batch number; formatted with ``%d`` to name the file.
        output_dir: local directory the pickle is written into.
        max_features: not used by the body; kept so existing calls still work.

    Returns:
        None. The transformed batch is persisted rather than returned.
    """
    print('processing batch {}'.format(batch_id))
    vectors = transformer.fit_transform(series)

    # The batch number doubles as the file name, locally and under 'train/'.
    batch_name = '%d' % batch_id
    local_path = output_dir + '/' + batch_name
    with open(local_path, 'wb') as sink:
        pickle.dump(vectors, sink)

    ps.copy_file(dest_bucket=HASH_BUCKET, file='train/' + batch_name, source=local_path)

In [None]:
from toolz import partition_all
from joblib import Parallel, delayed
# Clear the 'train/' path of HASH_BUCKET (only that path is passed, not the
# whole bucket) — presumably so stale batches from an earlier run are not read
# back by the next cell.
ps.remove_all_files(bucket=HASH_BUCKET, path='train/')
# Stack all question1 texts on top of all question2 texts. The later split
# into X1 / X2 relies on this layout: first len(X_train) rows are question1.
# NOTE(review): confirm missing questions end up as strings here; the
# vectorizer is fed series.tolist() directly.
series = pd.Series(pd.concat([X_train['question1'], X_train['question2']]),dtype=str)
# Lazily cut the documents into batches of at most 10000.
partitions = partition_all(10000, series.tolist())
hashvect = HashingVectorizer(binary=True, stop_words='english')
# Process the batches with 8 workers; the enumerate index i becomes both the
# batch id and the output file name. The trailing 10000 is max_features, which
# fit_transform accepts but does not use.
Parallel(n_jobs=8)(delayed(fit_transform)(hashvect, batch, i, tmp_train_path, 10000)
         for i, batch in enumerate(partitions))
#executor(tasks)

In [None]:
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfTransformer
# Fetch every hashed training batch and stack them back in batch order.
data = []
# Batch files are named by their integer batch id. Sorting numerically is
# required: listing order (or lexicographic order '0', '1', '10', '2', ...)
# would shuffle the rows, and the later split of the stacked matrix into
# question1 / question2 halves depends on the original row order.
# NOTE(review): assumes get_all_filenames returns bare names such as '0', '1'.
files = sorted(ps.get_all_filenames(bucket=HASH_BUCKET, path='train/'), key=int)
for file in files:
    # Join with a separator so downloads land inside tmp_train_path rather
    # than next to it (plain concatenation produced e.g. '/tmp/train0').
    local_path = os.path.join(tmp_train_path, file)
    ps.get_file(bucket=HASH_BUCKET, filename='train/'+file, filepath=local_path)
    with open(local_path, 'rb') as handle:
        # pickle.load executes arbitrary code from the file; acceptable only
        # because these files were written by this notebook itself.
        data.append(pickle.load(handle))

tfidf = TfidfTransformer()
X_trfmd = tfidf.fit_transform(vstack(data))
# One row per question1 plus one per question2 is expected.
assert X_trfmd.shape[0] == 2 * len(X_train), 'unexpected number of hashed rows'

In [None]:
X_trfmd

In [None]:
# Project the TF-IDF matrix onto 100 truncated-SVD components and report how
# long the fit took.
start = time.time()
svd = TruncatedSVD(
    n_components=100,
    n_iter=7,
    random_state=42,
)
X_svd = svd.fit_transform(X_trfmd)
end = time.time()
print(f'created SVD transform in time {end-start}')

In [None]:
# Split the stacked SVD output back into its two halves: the first
# len(X_train) rows are taken as question1 and the remainder as question2.
# This is only valid if the rows of X_svd are still in the order of the
# concatenated question series — TODO confirm the batches were stacked in order.
X1 = X_svd[:len(X_train), :]
X2 = X_svd[len(X_train):, :]
## find pair-wise cosine similarity
#start = time.time()
#X_sim = cosine_similarity(X1, X2)
#end =  time.time()
#print('computed cosine similarity in time {}'.format(end-start))

In [None]:
#svd_feature_length = X_sim.shape[1]
#start = time.time()
#temp_df = pd.DataFrame(X_sim)
#x_df1 = pd.concat([x_df1,temp_df], axis=1)
#end =  time.time()
#print('rebuilt dataframe with new tf_svd feature columns in time {}'.format(end-start))

In [None]:
#temp_df.head()

In [None]:
#x_df1.head(20)

In [None]:
#x_df1 = tfidf_svd_vectorize(x_df1, 'question1', 'question2', 10000, 100)

In [None]:
# Feature: absolute difference between the lengths of the two question texts.
def compute_size_diff(row):
    """Return |len(str(question1)) - len(str(question2))| for one row."""
    first = str(row['question1'])
    second = str(row['question2'])
    return abs(len(first) - len(second))

X_train['size_diff'] = X_train.apply(compute_size_diff, axis=1)
X_train.head()

In [None]:
## vector norm diff (distance)
#compute_spacy_distance = lambda row: abs(row['question1'].vector_norm - row['question2'].vector_norm)
#x_df1['spacy_distance'] = x_df1.apply(compute_spacy_distance, axis=1)
#x_df1.head()

In [None]:
# Mean inner product between each non-stop token vector and the document vector.
def compute_mean_distance(doc):
    """Average ``np.inner(token.vector, doc.vector)`` over the non-stop tokens.

    Despite the name, the value is a mean inner product with the document
    vector rather than a geometric distance.

    Args:
        doc: iterable of tokens exposing ``.vector`` and ``.is_stop``; the
            document itself must expose ``.vector``.

    Returns:
        The mean inner product, or 0.0 when there are no non-stop tokens.
    """
    centroid = doc.vector
    products = [np.inner(tok.vector, centroid) for tok in doc if not tok.is_stop]
    # max(..., 1) guards the division when every token is a stop word.
    return sum(products, 0.0) / max(len(products), 1)

In [None]:
## mean distance from centroid for question1
#compute_q1_mean_dist = lambda row: compute_mean_distance(row['question1'])
#x_df1['q1_mean_dist'] = x_df1.apply(compute_q1_mean_dist, axis=1)
#x_df1.head()

In [None]:
## mean distance from centroid for question1
#compute_q2_mean_dist = lambda row: compute_mean_distance(row['question2'])
#x_df1['q2_mean_dist'] = x_df1.apply(compute_q2_mean_dist, axis=1)
#x_df1.head()

In [None]:
## difference in mean distance
#compute_mean_dist_diff = lambda row: abs(row['q1_mean_dist'] - row['q2_mean_dist'])
#x_df1['mean_dist_diff'] = x_df1.apply(compute_mean_dist_diff, axis=1)
#x_df1.head()

In [None]:
## centroid similarity
#compute_centroid_similarity = lambda row: np.inner(row['question1'].vector, row['question2'].vector)
#x_df1['centroid_similarity'] = x_df1.apply(compute_centroid_similarity, axis=1)
#x_df1.head()

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# Feature: fuzzywuzzy simple ratio between the two question texts.
def compute_ratio(row):
    """Return fuzz.ratio of the row's two questions, each cast to str first."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.ratio(first, second)

X_train['ratio'] = X_train.apply(compute_ratio, axis=1)
X_train.head()

In [None]:
# Feature: fuzzywuzzy partial ratio between the two question texts.
def compute_partial_ratio(row):
    """Return fuzz.partial_ratio of the row's two questions, each cast to str."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.partial_ratio(first, second)

X_train['partial_ratio'] = X_train.apply(compute_partial_ratio, axis=1)
X_train.head()

In [None]:
# Feature: fuzzywuzzy token-sort ratio between the two question texts.
def compute_token_sort_ratio(row):
    """Return fuzz.token_sort_ratio of the row's two questions, each cast to str."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.token_sort_ratio(first, second)

X_train['token_sort_ratio'] = X_train.apply(compute_token_sort_ratio, axis=1)
X_train.head()

In [None]:
# Feature: fuzzywuzzy token-set ratio between the two question texts.
def compute_token_set_ratio(row):
    """Return fuzz.token_set_ratio of the row's two questions, each cast to str."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.token_set_ratio(first, second)

X_train['token_set_ratio'] = X_train.apply(compute_token_set_ratio, axis=1)
X_train.head()

In [None]:
# Wrap the SVD components of each question in a DataFrame aligned on
# X_train's index (columns q1_0.. and q2_0..), then place them side by side.
X_train_temp = pd.concat(
    [
        pd.DataFrame(X1, index=X_train.index,
                     columns=[f'q1_{i}' for i in range(X1.shape[1])]),
        pd.DataFrame(X2, index=X_train.index,
                     columns=[f'q2_{i}' for i in range(X2.shape[1])]),
    ],
    axis=1,
)
X_train_temp.head()

In [None]:
# Put the SVD columns in front of the hand-built features, then discard the
# id and raw-text columns so only numeric features remain.
# Note: this cell rebinds X_train, so it is not safe to run twice.
X_train = pd.concat([X_train_temp, X_train], axis=1)
del X_train_temp
raw_columns = ['qid1', 'qid2','question1','question2']
X_train = X_train.drop(columns=raw_columns)
X_train.head()

In [None]:
# Peek at the feature rows of duplicate pairs. 'is_duplicate' was removed from
# X before the split, so the label has to come from y_train (same index as
# X_train); indexing X_train['is_duplicate'] raised a KeyError.
X_train.loc[y_train == 1].head()

In [None]:
X_train.info()

#### Test set vectorization

In [None]:
##load test_set
#test_data: str = 'test.csv'
#filestream: HTTPResponse = ps.get_file_stream(bucket=INPUT_BUCKET, filename=test_data)
#dtypes: Dict[str, str] = {
#    'id': 'int64',
#    'question1': 'object',
#    'question2': 'object'
#}
#test_df: pd.DataFrame = pd.read_csv(#urlpath=s3_in_url, 
#                                     #storage_options=s3_options,
#                                     filestream,
#                                     header=0, 
#                                     usecols=dtypes.keys(), 
#                                     names=dtypes.keys(),
#                                     skipinitialspace=True,
#                                     skip_blank_lines=True,
#                                     encoding='utf-8')
#test_df = test_df.set_index('id')

In [None]:
X_test.info()

In [None]:
X_test.head()

In [1]:
#ps.create_bucket(bucket=HASH_BUCKET)
# Local staging directory for the pickled test batches.
tmp_test_path = '/tmp/test'
# exist_ok=True tolerates a directory left over from a previous run, while real
# failures (e.g. permissions) surface here instead of being silently swallowed
# by a bare `except`.
os.makedirs(tmp_test_path, exist_ok=True)
def transform(transformer, series, batch_id, output_dir, max_features=10000):
    """Transform one batch, pickle the result locally and copy it to HASH_BUCKET.

    Args:
        transformer: object exposing ``transform``; applied to ``series``.
        series: the batch of documents handed to the transformer.
        batch_id: integer batch number; formatted with ``%d`` to name the file.
        output_dir: local directory the pickle is written into.
        max_features: not used by the body; kept so existing calls still work.

    Returns:
        None. The transformed batch is persisted rather than returned.
    """
    print('processing batch {}'.format(batch_id))
    vectors = transformer.transform(series)

    # The batch number doubles as the file name, locally and under 'test/'.
    batch_name = '%d' % batch_id
    local_path = output_dir + '/' + batch_name
    with open(local_path, 'wb') as sink:
        pickle.dump(vectors, sink)

    ps.copy_file(dest_bucket=HASH_BUCKET, file='test/' + batch_name, source=local_path)

In [2]:
# Clear the 'test/' path of HASH_BUCKET (only that path is passed, not the
# whole bucket) — presumably so stale batches from an earlier run are not read
# back by the next cell.
# This cell needs ps, partition_all, Parallel, delayed and hashvect from
# earlier cells; on a fresh kernel it fails with a NameError.
ps.remove_all_files(bucket=HASH_BUCKET, path='test/')
# Stack all question1 texts on top of all question2 texts; the later split
# into X1_test / X2_test relies on this layout.
series = pd.Series(pd.concat([X_test['question1'], X_test['question2']]),dtype=str)
# Lazily cut the documents into batches of at most 10000.
partitions = partition_all(10000, series.tolist())
# Process the batches with 8 workers; the enumerate index i becomes both the
# batch id and the output file name. The trailing 10000 is max_features, which
# transform accepts but does not use.
Parallel(n_jobs=8)(delayed(transform)(hashvect, batch, i, tmp_test_path, 10000)
         for i, batch in enumerate(partitions))
#executor(tasks)

NameError: name 'ps' is not defined

In [None]:
# Fetch every hashed test batch and stack them back in batch order.
data = []
# Batch files are named by their integer batch id. Sorting numerically is
# required: listing order (or lexicographic order '0', '1', '10', '2', ...)
# would shuffle the rows, and the later split of the stacked matrix into
# question1 / question2 halves depends on the original row order.
# NOTE(review): assumes get_all_filenames returns bare names such as '0', '1'.
files = sorted(ps.get_all_filenames(bucket=HASH_BUCKET, path='test/'), key=int)
for file in files:
    # Join with a separator so downloads land inside tmp_test_path rather
    # than next to it (plain concatenation produced e.g. '/tmp/test0').
    local_path = os.path.join(tmp_test_path, file)
    ps.get_file(bucket=HASH_BUCKET, filename='test/'+file, filepath=local_path)
    with open(local_path, 'rb') as handle:
        # pickle.load executes arbitrary code from the file; acceptable only
        # because these files were written by this notebook itself.
        data.append(pickle.load(handle))
# Reuse the TF-IDF weights fitted on the training batches.
X_test_trfmd = tfidf.transform(vstack(data))
# One row per question1 plus one per question2 is expected.
assert X_test_trfmd.shape[0] == 2 * len(X_test), 'unexpected number of hashed rows'

In [None]:
X_test_trfmd

In [None]:
# Project the test TF-IDF matrix with the already-fitted SVD (transform only,
# no refit) and report how long it took.
start = time.time()
X_test_svd = svd.transform(X_test_trfmd)
end = time.time()
print(f'created SVD transform in time {end-start}')

In [None]:
# Split the stacked SVD output back into its two halves: the first
# len(X_test) rows are taken as question1 and the remainder as question2.
# This is only valid if the rows of X_test_svd are still in the order of the
# concatenated question series — TODO confirm the batches were stacked in order.
X1_test = X_test_svd[:len(X_test), :]
X2_test = X_test_svd[len(X_test):, :]
## find pair-wise cosine similarity
#start = time.time()
#X_sim = cosine_similarity(X1, X2)
#end =  time.time()
#print('computed cosine similarity in time {}'.format(end-start))

In [None]:
# Difference in text size; reuses compute_size_diff defined in the
# training-feature section, so that cell must have been run first.
X_test['size_diff'] = X_test.apply(compute_size_diff, axis=1)
X_test.head()

In [None]:
# Fuzzy ratio; reuses compute_ratio defined in the training-feature section.
X_test['ratio'] = X_test.apply(compute_ratio, axis=1)
X_test.head()

In [None]:
# Fuzzy partial ratio; reuses compute_partial_ratio defined in the
# training-feature section.
X_test['partial_ratio'] = X_test.apply(compute_partial_ratio, axis=1)
X_test.head()

In [None]:
# Fuzzy token-sort ratio; reuses compute_token_sort_ratio defined in the
# training-feature section.
X_test['token_sort_ratio'] = X_test.apply(compute_token_sort_ratio, axis=1)
X_test.head()

In [None]:
# Fuzzy token-set ratio; reuses compute_token_set_ratio defined in the
# training-feature section.
X_test['token_set_ratio'] = X_test.apply(compute_token_set_ratio, axis=1)
X_test.head()

In [None]:
# Wrap the SVD components of each test question in a DataFrame aligned on
# X_test's index (columns q1_0.. and q2_0..), then place them side by side.
X_test_temp = pd.concat(
    [
        pd.DataFrame(X1_test, index=X_test.index,
                     columns=[f'q1_{i}' for i in range(X1_test.shape[1])]),
        pd.DataFrame(X2_test, index=X_test.index,
                     columns=[f'q2_{i}' for i in range(X2_test.shape[1])]),
    ],
    axis=1,
)
X_test_temp.head()

In [None]:
# Put the SVD columns in front of the hand-built features, then discard the
# id and raw-text columns.
X_test = pd.concat([X_test_temp, X_test], axis=1)
del X_test_temp
# X_test comes from the same frame as X_train, so it also carries qid1/qid2.
# They must be dropped here exactly as for the training set; otherwise the
# model is asked to predict on a different set of features than it was fit on.
X_test = X_test.drop(columns=['qid1', 'qid2', 'question1', 'question2'])
assert list(X_test.columns) == list(X_train.columns), 'train/test feature columns differ'
X_test.head()

### Modeling

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# 5-fold grid search over the inverse regularization strength C
# (10 values, log-spaced from 1e-2 to 1e7).
logr_model = LogisticRegression(solver='lbfgs', random_state=42)
param_grid = {'C': np.logspace(-2, 7, 10)}
# The keyword is `n_jobs`; `njobs` is not accepted and raised a TypeError.
logr_cv = GridSearchCV(logr_model, param_grid=param_grid, cv=5, n_jobs=-1)
logr_cv.fit(X_train, y_train)

In [None]:
logr_cv.best_params_

In [None]:
# Refit a single model on the full training set with the best C from the grid
# search. The former `njobs=-1` argument is not a LogisticRegression parameter
# and raised a TypeError, so it is removed.
logr_model = LogisticRegression(solver='lbfgs', random_state=42, C=logr_cv.best_params_['C'])
logr_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows; X_test must have the same feature
# columns, in the same order, as the X_train the model was fit on.
logr_pred = logr_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# Score the held-out predictions against the true labels and print a summary.
logr_acc_score = accuracy_score(y_test, logr_pred)
logr_prec_score = precision_score(y_test, logr_pred)
logr_rec_score = recall_score(y_test, logr_pred)
print(
    'Logistic Regression',
    'accuracy score : {}'.format(logr_acc_score),
    'precision score : {}'.format(logr_prec_score),
    'recall score : {}'.format(logr_rec_score),
    sep='\n',
)