In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# are re-imported before each cell execution.
%load_ext autoreload
%autoreload 2
# Language code: it selects the raw-docs and features sub-directories below
# and is passed to every feature extractor.
lang = "eng"

import os
import sys
sys.path.insert(0, "../src/")
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
from feature_extraction.doc2vec_chunk_vectorizer import Doc2VecChunkVectorizer
from feature_extraction.doc_based_feature_extractor import DocBasedFeatureExtractor
from feature_extraction.corpus_based_feature_extractor import CorpusBasedFeatureExtractor
from utils import get_doc_paths
from chunk import Chunk
import time

# Input/output locations, given relative to the notebook's working directory.
raw_docs_dir = f"../data/raw_docs/{lang}/"
labels_dir = "../data/labels/"
features_dir = f"../data/features/{lang}/"

# exist_ok=True creates the directory only when it is missing and is a no-op
# otherwise, which avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(features_dir, exist_ok=True)

doc_paths = get_doc_paths(raw_docs_dir, lang)

# Value handed to the extractors as `sentences_per_chunk`; the full-text runs
# further down pass None instead.
sentences_per_chunk = 200

In [2]:
# Sanity check: number of document paths returned by get_doc_paths.
len(doc_paths)

605

In [3]:
# Create doc2vec embeddings
# d2vcv = Doc2VecChunkVectorizer(lang, sentences_per_chunk)
# d2vcv.fit_transform(doc_paths)

In [4]:
# Wall-clock timestamp taken before feature extraction; the final cell
# subtracts it from `end` to report the total elapsed seconds.
start = time.time()

In [5]:
## Document-based features
# Run one DocBasedFeatureExtractor per document. Each call to
# get_all_features() yields a collection of per-chunk features plus one
# per-book feature object; they are accumulated in two flat lists.
document_chunk_features = []
document_book_features = []

for doc_path in tqdm(doc_paths):
    extractor = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk)
    per_chunk, per_book = extractor.get_all_features()
    document_chunk_features += per_chunk
    document_book_features += [per_book]
# Report how many book-level and chunk-level entries were collected.
print(len(document_book_features), len(document_chunk_features))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 605/605 [25:52<00:00,  2.57s/it]  

605 14128





In [6]:
# Second pass with sentences_per_chunk=None: the chunk-level features are
# recomputed with the whole book treated as a single chunk. The book-level
# half of the result is discarded here.
document_chunk_features_fulltext = []  # chunk features calculated for the whole book

for doc_path in tqdm(doc_paths):
    fulltext_extractor = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk=None)
    fulltext_features, _ = fulltext_extractor.get_all_features()
    document_chunk_features_fulltext += fulltext_features
print(len(document_chunk_features_fulltext))

100%|██████████| 605/605 [19:20<00:00,  1.92s/it]

605





In [7]:
# Pickle document-based features
# Each object is written to <features_dir><name>.pkl using the highest pickle
# protocol available (the same protocol a negative protocol number selects).
document_feature_dumps = {
    'document_chunk_features': document_chunk_features,
    'document_book_features': document_book_features,
    'document_chunk_features_fulltext': document_chunk_features_fulltext,
}
for file_stem, payload in document_feature_dumps.items():
    with open(f"{features_dir}{file_stem}.pkl", 'wb') as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)


In [8]:
# Load document-based features
# Read back the three pickles written by the previous cell so the later cells
# can run without repeating the extraction. features_dir ends with a path
# separator, so os.path.join yields the same paths as plain concatenation.
with open(os.path.join(features_dir, 'document_chunk_features.pkl'), 'rb') as pkl_file:
    document_chunk_features = pickle.load(pkl_file)

with open(os.path.join(features_dir, 'document_book_features.pkl'), 'rb') as pkl_file:
    document_book_features = pickle.load(pkl_file)

with open(os.path.join(features_dir, 'document_chunk_features_fulltext.pkl'), 'rb') as pkl_file:
    document_chunk_features_fulltext = pickle.load(pkl_file)

In [9]:
## Corpus-based features
# Build the corpus-level extractor over all documents, using the same
# sentences_per_chunk value as the document-based pass.
cbfe = CorpusBasedFeatureExtractor(
    lang,
    doc_paths,
    sentences_per_chunk,
    nr_features=100,
)

100%|██████████| 605/605 [04:07<00:00,  2.45it/s]


In [10]:
# Compute the corpus-based chunk-level and book-level features, then pickle
# both results to <features_dir><name>.pkl with the highest pickle protocol.
corpus_chunk_features, corpus_book_features = cbfe.get_all_features()

corpus_feature_dumps = {
    'corpus_chunk_features': corpus_chunk_features,
    'corpus_book_features': corpus_book_features,
}
for file_stem, payload in corpus_feature_dumps.items():
    with open(f"{features_dir}{file_stem}.pkl", 'wb') as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

<bound method CorpusBasedFeatureExtractor.get_unigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>


100%|██████████| 605/605 [07:13<00:00,  1.39it/s]



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_unigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>: 434.1084065437317
<bound method CorpusBasedFeatureExtractor.get_unigram_distance_limited of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>


100%|██████████| 605/605 [08:28<00:00,  1.19it/s]
  0%|          | 0/605 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_unigram_distance_limited of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>: 508.86294078826904
<bound method CorpusBasedFeatureExtractor.get_bigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>


100%|██████████| 605/605 [06:02<00:00,  1.67it/s]
  0%|          | 0/605 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_bigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>: 362.446186542511
<bound method CorpusBasedFeatureExtractor.get_trigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>


100%|██████████| 605/605 [05:59<00:00,  1.68it/s]
  0%|          | 0/605 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_trigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>: 360.54180121421814
<bound method CorpusBasedFeatureExtractor.get_tag_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>


100%|██████████| 605/605 [04:29<00:00,  2.25it/s]
100%|██████████| 605/605 [04:57<00:00,  2.03it/s]
100%|██████████| 605/605 [04:45<00:00,  2.12it/s]



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_tag_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>: 856.1508932113647
<bound method CorpusBasedFeatureExtractor.get_production_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>


100%|██████████| 605/605 [41:55<00:00,  4.16s/it]  



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_production_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>: 2518.2515552043915
Time for <bound method CorpusBasedFeatureExtractor.get_overlap_score_doc2vec of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>: 36.9691698551178
Time for <bound method CorpusBasedFeatureExtractor.get_overlap_score_sbert of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>: 39.99323391914368
Time for <bound method CorpusBasedFeatureExtractor.get_outlier_score_doc2vec of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f2138944070>>: 1.8568933010101318
Time for <bound method CorpusBasedFeatureExtractor.get_outlier_score_sbert of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureEx

In [11]:
# Recalculate the chunk features for the whole book, which is considered as
# one chunk: same extractor as above, but with sentences_per_chunk=None.
cbfe_fulltext = CorpusBasedFeatureExtractor(
    lang,
    doc_paths,
    sentences_per_chunk=None,
    nr_features=100,
)

100%|██████████| 605/605 [04:13<00:00,  2.39it/s]


In [12]:
# Only the chunk-level half of the full-text result is kept; the book-level
# half is discarded.
corpus_chunk_features_fulltext, _ = cbfe_fulltext.get_all_features()
fulltext_pickle_path = os.path.join(features_dir, 'corpus_chunk_features_fulltext.pkl')
with open(fulltext_pickle_path, 'wb') as pkl_file:
    pickle.dump(corpus_chunk_features_fulltext, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

<bound method CorpusBasedFeatureExtractor.get_unigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>


100%|██████████| 605/605 [06:00<00:00,  1.68it/s]



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_unigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>: 360.3566679954529
<bound method CorpusBasedFeatureExtractor.get_unigram_distance_limited of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>


100%|██████████| 605/605 [06:03<00:00,  1.66it/s]
  0%|          | 0/605 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_unigram_distance_limited of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>: 364.0487651824951
<bound method CorpusBasedFeatureExtractor.get_bigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>


100%|██████████| 605/605 [05:54<00:00,  1.71it/s]
  0%|          | 0/605 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_bigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>: 354.84729743003845
<bound method CorpusBasedFeatureExtractor.get_trigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>


100%|██████████| 605/605 [05:53<00:00,  1.71it/s]
  0%|          | 0/605 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_trigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>: 354.0057940483093
<bound method CorpusBasedFeatureExtractor.get_tag_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>


100%|██████████| 605/605 [04:22<00:00,  2.31it/s]
100%|██████████| 605/605 [04:36<00:00,  2.19it/s]
100%|██████████| 605/605 [04:40<00:00,  2.15it/s]



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_tag_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>: 819.3665556907654
<bound method CorpusBasedFeatureExtractor.get_production_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>


100%|██████████| 605/605 [40:53<00:00,  4.06s/it]  



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_production_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7f20e62cd940>>: 2455.9816596508026


In [13]:
# Load corpus-based features
# Read back the three corpus-feature pickles written by the cells above.
# features_dir ends with a path separator, so os.path.join yields the same
# paths as plain concatenation.
with open(os.path.join(features_dir, 'corpus_chunk_features.pkl'), 'rb') as pkl_file:
    corpus_chunk_features = pickle.load(pkl_file)

with open(os.path.join(features_dir, 'corpus_book_features.pkl'), 'rb') as pkl_file:
    corpus_book_features = pickle.load(pkl_file)

with open(os.path.join(features_dir, 'corpus_chunk_features_fulltext.pkl'), 'rb') as pkl_file:
    corpus_chunk_features_fulltext = pickle.load(pkl_file)

In [14]:
# Inspect the reloaded full-text corpus features (one row per book; the
# notebook display truncates the middle rows and columns).
corpus_chunk_features_fulltext

Unnamed: 0,book_name,unigram_distance,unigram_distance_limited,bigram_distance,trigram_distance,pos_unigram_PUNCT,pos_unigram_NOUN,pos_unigram_VERB,pos_unigram_PRON,pos_unigram_ADP,...,VP->VBG_RB,NP->DT_VBG_NN,VP->VBG_VBN,NP->NN_NNS,VP->VBZ_TO_VB,NP->DT_NN_NN_NN,S->NP_VP,VP->VBP_CC,NP->PRP_DT,VP->VBD_VBN_VBN
0,Radcliffe_Ann_The-Italian_1797,0.028470,0.964958,0.022108,0.002255,0.168691,0.141964,0.128439,0.100368,0.098242,...,0.000587,0.000948,0.001236,0.000412,0.000412,0.000597,0.000247,0.000330,0.000185,0.000886
1,Gissing_George_In-the-Year-of-Jubilee_1894,0.040656,0.958549,0.012670,0.001200,0.177138,0.129638,0.127511,0.123034,0.091255,...,0.000384,0.000384,0.000410,0.000576,0.000640,0.000512,0.000909,0.000256,0.000576,0.000269
2,Collins_Wilkie_Armadale_1864,0.024837,0.959111,0.006647,0.001034,0.153036,0.143237,0.122004,0.119838,0.108400,...,0.000847,0.000747,0.000653,0.000423,0.000923,0.000670,0.000776,0.000423,0.000635,0.000706
3,Conrad_Joseph_Lord-Jim_1899,0.029921,0.919140,0.009482,0.002558,0.160312,0.151091,0.119694,0.110728,0.111518,...,0.001062,0.001221,0.000358,0.000756,0.000398,0.000610,0.001712,0.000664,0.000637,0.001102
4,Shelley_Mary_Perkin-Warbeck_1830,0.041606,0.910397,0.017097,0.003477,0.150419,0.163175,0.116985,0.097747,0.104800,...,0.000358,0.001185,0.000317,0.000685,0.000409,0.000705,0.000317,0.000439,0.000225,0.000736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,Hardy_Thomas_The-Whithered-Arm_1888,0.051106,0.948614,0.015059,0.003504,0.163645,0.149969,0.118623,0.102566,0.097572,...,0.000839,0.001343,0.000839,0.000671,0.000168,0.000671,0.000504,0.000336,0.000504,0.001511
601,Eliot_George_The-Mill-on-the-Floss_1860,0.027631,1.263697,0.010010,0.001831,0.156654,0.135442,0.124457,0.108586,0.098747,...,0.000842,0.000815,0.000438,0.000535,0.000640,0.000894,0.000193,0.000482,0.000456,0.000798
602,Edgeworth_Maria_The-Modern-Griselda_1804,0.050171,0.990300,0.021133,0.002066,0.181984,0.137695,0.126285,0.123066,0.083641,...,0.000572,0.000163,0.000735,0.000817,0.000817,0.000490,0.000163,0.000817,0.000408,0.000327
603,Baldwin_Louisa_My-Next-Door-Neighour_1894,0.036336,0.963451,0.028839,0.005279,0.131734,0.155846,0.115032,0.121813,0.109381,...,0.000264,0.000264,0.000264,0.000264,0.000527,0.001318,0.000000,0.000791,0.000527,0.000527


In [15]:
# Book features
# Join every per-book table into one frame keyed by 'book_name'.
# validate='one_to_one' makes pandas raise if a book_name is duplicated on
# either side of a merge.
# NOTE(review): the first merge is an outer join while the next two use the
# default inner join - confirm that this asymmetry is intended.
document_book_features = pd.DataFrame(document_book_features)
document_chunk_features_fulltext = pd.DataFrame(document_chunk_features_fulltext)
book_df = (
    document_book_features
    .merge(right=document_chunk_features_fulltext, on='book_name', how='outer', validate='one_to_one')
    .merge(right=corpus_book_features, on='book_name', validate='one_to_one')
    .merge(right=corpus_chunk_features_fulltext, on='book_name', validate='one_to_one')
)

# Chunk features
# At this point 'book_name' still carries the chunk id, so the key is unique
# per chunk and the merge can be validated as one-to-one.
document_chunk_features = pd.DataFrame(document_chunk_features)
chunk_df = document_chunk_features.merge(
    right=corpus_chunk_features, on='book_name', how='outer', validate='one_to_one'
)

# Remove chunk id from book_name: keep only the first four '_'-separated fields.
chunk_df['book_name'] = chunk_df['book_name'].str.split('_').str[:4].str.join('_')

# Combine book features with the per-book averages of the chunk features.
# numeric_only=True is passed explicitly because the default of
# GroupBy.mean() differs between pandas versions; this keeps the averaging
# restricted to numeric columns everywhere.
averaged_chunk_df = chunk_df.groupby("book_name").mean(numeric_only=True).reset_index(drop=False)
book_and_averaged_chunk_df = book_df.merge(averaged_chunk_df, on="book_name")

# Attach a copy of the book-level features to every chunk row of that book.
chunk_and_copied_book_df = chunk_df.merge(right=book_df, on='book_name', how='outer', validate='many_to_one')

print(book_df.shape, chunk_df.shape, book_and_averaged_chunk_df.shape, chunk_and_copied_book_df.shape)

dfs = {'book_df': book_df, 'book_and_averaged_chunk_df': book_and_averaged_chunk_df, 'chunk_df': chunk_df, 'chunk_and_copied_book_df': chunk_and_copied_book_df}

# Write each table to <features_dir><name>.csv sorted by book name, then
# report whether it contains any missing values and in which columns.
for name, df in dfs.items():
    df = df.sort_values(by='book_name', axis=0, ascending=True, na_position='first')
    df.to_csv(f"{features_dir}{name}.csv", index=False)

    print(df.isnull().values.any())
    print(df.columns[df.isna().any()].tolist())
# Closing timestamp for the whole run; the next cell reports end - start.
end = time.time()

(605, 1219) (14128, 1211) (605, 2429) (14128, 2429)
True
['doc2vec_stepwise_distance', 'sbert_stepwise_distance']
True
['doc2vec_stepwise_distance', 'sbert_stepwise_distance']
False
[]
True
['doc2vec_stepwise_distance', 'sbert_stepwise_distance']


In [16]:
# Elapsed wall-clock seconds between the `start` and `end` timestamps.
end-start

13138.919528961182