In [61]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [62]:
import warnings
# Install a blanket 'ignore' filter: no category/module is given, so every
# warning raised after this point is suppressed.
warnings.filterwarnings('ignore')

In [71]:
import dask.dataframe as dd
from dask.distributed import Client
from distributed.deploy.local import LocalCluster

In [72]:
def create_dask_client(num_workers: int) -> Client:
    """Start a local Dask cluster and return a client connected to it.

    Args:
        num_workers: Number of worker processes handed to ``LocalCluster``
            as ``n_workers``.

    Returns:
        A ``Client`` attached to the freshly created ``LocalCluster``.
    """
    # NOTE(review): ip='' is forwarded as-is; presumably this binds the
    # scheduler to all interfaces -- confirm that is intended.
    return Client(LocalCluster(n_workers=num_workers, ip=''))

In [73]:
# Create the notebook-wide Dask client, asking for 8 workers.
client: Client = create_dask_client(num_workers=8)

In [63]:
# Bucket name used to build the s3:// URL of the input files.
INPUT_BUCKET: str = 'dq-data'

In [None]:
# Load the training set from S3 into a Dask dataframe indexed by `id`.
train_data: str = 'train.csv'
s3_in_prefix: str = 's3://' + INPUT_BUCKET + '/'
s3_in_url: str = s3_in_prefix + train_data
s3_options: Dict = ps.fetch_s3_options()
# Column name -> dtype for every column read from the CSV. The same keys are
# used both to select the columns (usecols) and to name them (names).
dtypes: Dict[str, str] = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
train_df: dd.DataFrame = dd.read_csv(urlpath=s3_in_url,
                                     storage_options=s3_options,
                                     header=0,
                                     usecols=dtypes.keys(),
                                     names=dtypes.keys(),
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     encoding='utf-8',
                                     dtype=dtypes)
# set_index returns a new dataframe instead of modifying train_df, so the
# result has to be bound back to the name; otherwise `id` stays a plain column.
# NOTE(review): if the ids are already sorted in the file, passing sorted=True
# would presumably avoid a shuffle -- confirm against the data before adding it.
train_df = train_df.set_index('id')
train_df

In [65]:
# Preview the first rows of the loaded training data.
train_df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [66]:
#from spacy.pipeline import EntityRecognizer
# Load the en_core_web_lg model package; `nlp` is the pipeline object that the
# feature cells call on each question string.
import en_core_web_lg
nlp = en_core_web_lg.load()
# Disabled entity-recognition experiment, left commented out.
#ner = EntityRecognizer(nlp.vocab)
#q1_ner = [nlp(q) for _,q in train_df['question1'].items()]

In [67]:
# x_df is a second name for the same object as train_df (plain assignment,
# no copy); the feature columns below are added through this name.
x_df = train_df

In [69]:
# spaCy similarity score between question1 and question2.
def compute_spacy_similarity(row):
    """Return the similarity of the row's two questions as scored by spaCy.

    Both ``row['question1']`` and ``row['question2']`` are parsed with ``nlp``
    and the first document's ``similarity`` method is called on the second.
    """
    doc_q1 = nlp(row['question1'])
    doc_q2 = nlp(row['question2'])
    return doc_q1.similarity(doc_q2)


x_df['spacy_similarity'] = x_df.apply(compute_spacy_similarity, axis=1)
x_df.head()

KeyboardInterrupt: 

In [70]:
# Difference in text size: absolute gap between the lengths of the two parsed
# questions (len() of the object returned by nlp for each question).
def compute_size_diff(row):
    """Return |len(nlp(question1)) - len(nlp(question2))| for one row."""
    return abs(len(nlp(row['question1'])) - len(nlp(row['question2'])))


# Pass the helper defined above; the previous version referenced an undefined
# name (`compute_size`) and failed with NameError.
x_df['size_diff'] = x_df.apply(compute_size_diff, axis=1)
x_df.head()

KeyboardInterrupt: 

In [None]:
# Absolute difference between the vector norms of the two questions.
def compute_spacy_distance(row):
    """Return |vector_norm(question1) - vector_norm(question2)| for one row.

    Note that this compares the two norms only; it is not the norm of the
    difference between the two vectors.
    """
    norm_q1 = nlp(row['question1']).vector_norm
    norm_q2 = nlp(row['question2']).vector_norm
    return abs(norm_q1 - norm_q2)


x_df['spacy_distance'] = x_df.apply(compute_spacy_distance, axis=1)
x_df.head()

In [None]:
# Average inner product between each non-stop-word token vector and the
# document vector (used as the centroid).
def compute_mean_distance(doc):
    """Return the mean inner product of non-stop tokens with ``doc.vector``.

    Args:
        doc: An iterable of tokens exposing ``is_stop`` and ``vector``, which
            itself exposes a ``vector`` attribute used as the centroid.

    Returns:
        The sum of ``np.inner(token.vector, doc.vector)`` over all tokens whose
        ``is_stop`` is false, divided by the number of such tokens. When there
        are none, the result is ``0.0``.
    """
    centroid = doc.vector
    total = 0.0
    kept = 0
    for token in doc:
        if token.is_stop:
            continue  # stop words do not contribute to the average
        total += np.inner(token.vector, centroid)
        kept += 1
    # Divide by at least 1 so an empty / all-stop-word document yields 0.0.
    return total / max(kept, 1)

In [None]:
# compute_mean_distance applied to question1.
def compute_q1_mean_dist(row):
    """Parse ``row['question1']`` with ``nlp`` and return its compute_mean_distance value."""
    parsed_q1 = nlp(row['question1'])
    return compute_mean_distance(parsed_q1)


x_df['q1_mean_dist'] = x_df.apply(compute_q1_mean_dist, axis=1)
x_df.head()

In [None]:
# compute_mean_distance applied to question2.
def compute_q2_mean_dist(row):
    """Parse ``row['question2']`` with ``nlp`` and return its compute_mean_distance value."""
    parsed_q2 = nlp(row['question2'])
    return compute_mean_distance(parsed_q2)


x_df['q2_mean_dist'] = x_df.apply(compute_q2_mean_dist, axis=1)
x_df.head()

In [None]:
# Absolute gap between the q1_mean_dist and q2_mean_dist columns.
def compute_mean_dist_diff(row):
    """Return |q1_mean_dist - q2_mean_dist| for one row."""
    q1_value = row['q1_mean_dist']
    q2_value = row['q2_mean_dist']
    return abs(q1_value - q2_value)


x_df['mean_dist_diff'] = x_df.apply(compute_mean_dist_diff, axis=1)
x_df.head()

In [None]:
# Centroid similarity: inner product of the two questions' document vectors.
def compute_centroid_similarity(row):
    """Return ``np.inner`` of the ``vector`` of question1 and of question2.

    The value is the raw inner product; no normalisation is applied here.
    """
    vec_q1 = nlp(row['question1']).vector
    vec_q2 = nlp(row['question2']).vector
    return np.inner(vec_q1, vec_q2)


x_df['centroid_similarity'] = x_df.apply(compute_centroid_similarity, axis=1)
x_df.head()

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# fuzz.ratio score for each question pair.
def compute_ratio(row):
    """Return ``fuzz.ratio`` of the row's question1 and question2 strings."""
    first, second = row['question1'], row['question2']
    return fuzz.ratio(first, second)


x_df['ratio'] = x_df.apply(compute_ratio, axis=1)
x_df.head()

In [None]:
# fuzz.partial_ratio score for each question pair.
# The helper has its own name so it does not rebind `compute_ratio`, which an
# earlier cell already uses for the plain fuzz.ratio feature.
def compute_partial_ratio(row):
    """Return ``fuzz.partial_ratio`` of the row's question1 and question2 strings."""
    return fuzz.partial_ratio(row['question1'], row['question2'])


x_df['partial_ratio'] = x_df.apply(compute_partial_ratio, axis=1)
x_df.head()

In [None]:
# fuzz.token_sort_ratio score for each question pair.
def compute_token_sort_ratio(row):
    """Return ``fuzz.token_sort_ratio`` of the row's question1 and question2 strings."""
    first, second = row['question1'], row['question2']
    return fuzz.token_sort_ratio(first, second)


x_df['token_sort_ratio'] = x_df.apply(compute_token_sort_ratio, axis=1)
x_df.head()

In [None]:
# fuzz.token_set_ratio score for each question pair.
def compute_token_set_ratio(row):
    """Return ``fuzz.token_set_ratio`` of the row's question1 and question2 strings."""
    first, second = row['question1'], row['question2']
    return fuzz.token_set_ratio(first, second)


x_df['token_set_ratio'] = x_df.apply(compute_token_set_ratio, axis=1)
x_df.head()