In [1]:
import pandas as pd
from utils import persistence as ps
from urllib3.response import HTTPResponse
from typing import List, Dict, Tuple
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from toolz import partition_all
#from joblib import Parallel, delayed

In [28]:
INPUT_BUCKET: str = 'dq-data'
HASH_BUCKET: str = 'dq-hashed'

In [6]:
#load train_set
train_data: str = 'train.csv'
filestream: HTTPResponse = ps.get_file_stream(bucket=INPUT_BUCKET, filename=train_data)
#s3_in_prefix: str = 's3://' + INPUT_BUCKET + '/'
#s3_in_url: str = s3_in_prefix + train_data
#s3_options: Dict = ps.fetch_s3_options()
dtypes: Dict[str, str] = {
    'id': 'int64',
    'qid1': 'int64',
    'qid2': 'int64',
    'question1': 'object',
    'question2': 'object',
    'is_duplicate': 'int64'
}
train_df: pd.DataFrame = pd.read_csv(#urlpath=s3_in_url, 
                                     #storage_options=s3_options,
                                     filestream,
                                     header=0, 
                                     usecols=dtypes.keys(), 
                                     names=dtypes.keys(),
                                     skipinitialspace=True,
                                     skip_blank_lines=True,
                                     encoding='utf-8')
train_df = train_df.set_index('id')

In [7]:
train_df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404289 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


In [9]:
x_df1=train_df[:50000]
x_df2=train_df[50000:100000]
x_df3=train_df[100000:150000]
x_df4=train_df[150000:200000]
x_df4=train_df[200000:250000]
x_df4=train_df[250000:300000]
x_df4=train_df[300000:350000]
x_df4=train_df[350000:]

In [10]:
import en_core_web_lg
nlp = en_core_web_lg.load()

In [11]:
#nlp.pipe_names

In [12]:
#tagger = nlp.get_pipe('tagger')
#tagger.cfg

In [13]:
#parser = nlp.get_pipe('parser')
#parser.cfg

In [14]:
#ner = nlp.get_pipe('ner')
#ner.cfg

In [15]:
# tokenize, pos-tag, parse dependencies, recognize entities (pipeline)
#pipeline = ['tagger', 'parser', 'ner']
#for name in pipeline:
#    component = nlp.create_pipe(name)   # 3. create the pipeline components
#    nlp.add_pipe(component)             # 4. add the component to the pipeline

#preprocess_q1 = lambda row: nlp(row['question1'])
#x_df1['pr_question1'] = x_df1.apply(preprocess_q1, axis=1)
#preprocess_q2 = lambda row: nlp(row['question2'])
#x_df1['pr_question2'] = x_df1.apply(preprocess_q2, axis=1)
#x_df1.head()

In [16]:
# tokenizer
import nltk
def tokenize(text):
        tokens = [word for word in nlp(text) if len(word) > 1] #if len(word) > 1 because I only want to retain words that are at least two characters before stemming, although I can't think of any such words that are not also stopwords
        #stems = [stemmer.stem(item) for item in tokens]
        return tokens

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import time

In [34]:
import pickle
hashvect = HashingVectorizer(tokenizer=tokenize, binary=True, stop_words='english')
#ps.create_bucket(bucket=HASH_BUCKET)
def transform(transformer, tokenizer, series, batch_id, output_dir, max_features=10000):
    #tfidf = TfidfVectorizer(tokenizer=tokenizer, binary=True, stop_words='english', use_idf=True, max_features=max_features)
    #series1 = df[col1]
    #series2 = df[col2]
    #series = pd.concat([series1, series2])
    #start = time.time()
    X = transformer.fit_transform(series)
    #end =  time.time()
    # save transformed batch
    out_file = ('%d' % batch_id)
    out_path = output_dir+'/'+out_file 
    with open(out_path, 'wb') as handle:
        pickle.dump(X, handle)
    ps.copy_file(dest_bucket=HASH_BUCKET, file=out_file, source=out_file,source_folder=output_dir)
    #print('created TF-IDF vectors in time {}'.format(end-start))

In [35]:
from toolz import partition_all
from joblib import Parallel, delayed
from scipy.sparse import vstack
series = pd.concat([x_df1['question1'], x_df1['question2']], axis=1)
partitions = partition_all(1000, series)
executor = Parallel(n_jobs=8)
do = delayed(transform)
tasks = (do(tokenize, batch, 10000)
         for i, batch in enumerate(partitions))
executor(tasks)

ModuleNotFoundError: No module named 'joblib'

In [25]:
# dimension reduction using SVD
start = time.time()
svd = TruncatedSVD(n_components=10, n_iter=7, random_state=42)
X_svd = svd.fit_transform(X)
end =  time.time()
print('created SVD transform in time {}'.format(end-start))

created SVD transform in time 0.285386323928833


In [26]:
# split back into two
X1 = X_svd[:len(x_df1), :]
X2 = X_svd[len(x_df1):, :]
# find pair-wise cosine similarity
start = time.time()
X_sim = cosine_similarity(X1, X2)
end =  time.time()
print('computed cosine similarity in time {}'.format(end-start))

MemoryError: 

In [22]:
svd_feature_length = X_sim.shape[1]
start = time.time()
temp_df = pd.DataFrame(X_sim)
x_df1 = pd.concat([x_df1,temp_df], axis=1)
end =  time.time()
print('rebuilt dataframe with new tf_svd feature columns in time {}'.format(end-start))

MemoryError: 

In [None]:
temp_df.head()

In [None]:
x_df1.head(20)

In [21]:
svd_feature_length = X_sim.shape[1]
start = time.time()
for feature_index in range(1,svd_feature_length+1):
    x_df1['tf_svd_'+str(feature_index)] = X_sim[:,feature_index-1]
end =  time.time()
print('rebuilt dataframe with new tf_svd feature columns in time {}'.format(end-start))

KeyboardInterrupt: 

In [None]:
#x_df1 = tfidf_svd_vectorize(x_df1, 'question1', 'question2', 10000, 100)

In [69]:
# similarity between question1 and question2
compute_spacy_similarity = lambda row: row['pr_question1'].similarity(row['pr_question2'])
x_df1['spacy_similarity'] = x_df1.apply(compute_spacy_similarity, axis=1)
x_df1.head()

KeyboardInterrupt: 

In [70]:
# difference in text size
compute_size_diff = lambda row: abs(len((row['question1']) - len((row['question2'])))
x_df1['size_diff'] = x_df1.apply(compute_size_diff, axis=1)
x_df1.head()

KeyboardInterrupt: 

In [None]:
# vector norm diff (distance)
compute_spacy_distance = lambda row: abs(row['question1'].vector_norm - row['question2'].vector_norm)
x_df1['spacy_distance'] = x_df1.apply(compute_spacy_distance, axis=1)
x_df1.head()

In [None]:
# function to return mean distance between tokens and document centroid
def compute_mean_distance(doc):
    mean_distance = 0.0
    centroid = doc.vector
    count = 0
    for token in doc:
        if not token.is_stop:
            mean_distance += np.inner(token.vector,centroid)
            count += 1
    if count == 0:
        count = 1
    return mean_distance / count

In [None]:
# mean distance from centroid for question1
compute_q1_mean_dist = lambda row: compute_mean_distance(row['question1'])
x_df1['q1_mean_dist'] = x_df1.apply(compute_q1_mean_dist, axis=1)
x_df1.head()

In [None]:
# mean distance from centroid for question1
compute_q2_mean_dist = lambda row: compute_mean_distance(row['question2'])
x_df1['q2_mean_dist'] = x_df1.apply(compute_q2_mean_dist, axis=1)
x_df1.head()

In [None]:
# difference in mean distance
compute_mean_dist_diff = lambda row: abs(row['q1_mean_dist'] - row['q2_mean_dist'])
x_df1['mean_dist_diff'] = x_df1.apply(compute_mean_dist_diff, axis=1)
x_df1.head()

In [None]:
# centroid similarity
compute_centroid_similarity = lambda row: np.inner(row['question1'].vector, row['question2'].vector)
x_df1['centroid_similarity'] = x_df1.apply(compute_centroid_similarity, axis=1)
x_df1.head()

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# ratio
compute_ratio = lambda row: fuzz.ratio(row['question1'], row['question2'])
x_df1['ratio'] = x_df1.apply(compute_ratio, axis=1)
x_df1.head()

In [None]:
# partial ratio
compute_ratio = lambda row: fuzz.partial_ratio(row['question1'], row['question2'])
x_df1['partial_ratio'] = x_df1.apply(compute_ratio, axis=1)
x_df1.head()

In [None]:
# token_sort_ratio
compute_token_sort_ratio = lambda row: fuzz.token_sort_ratio(row['question1'], row['question2'])
x_df1['token_sort_ratio'] = x_df1.apply(compute_token_sort_ratio, axis=1)
x_df1.head()

In [None]:
# token_set_ratio
compute_token_set_ratio = lambda row: fuzz.token_set_ratio(row['question1'], row['question2'])
x_df1['token_set_ratio'] = x_df1.apply(compute_token_set_ratio, axis=1)
x_df1.head()