In [1]:
%load_ext autoreload
%autoreload 2
# Corpus language code; interpolated into the raw-docs and features directory
# paths and passed to every feature extractor in this notebook.
lang = "eng"

import os
import sys
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

sys.path.insert(0, "../src/")
from doc2vec_chunk_vectorizer import Doc2VecChunkVectorizer
from production_rule_extractor import ProductionRuleExtractor
from doc_based_feature_extractor import DocBasedFeatureExtractor
from corpus_based_feature_extractor import CorpusBasedFeatureExtractor
from utils import get_doc_paths

# Input/output locations (relative paths, resolved against the working directory).
raw_docs_dir = f"../data/raw_docs/{lang}/"
labels_dir = "../data/labels/"
features_dir = f"../data/features/{lang}/"

# exist_ok=True makes the directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() guard.
os.makedirs(features_dir, exist_ok=True)

# NOTE(review): only the first three documents are kept — this looks like a
# quick-run limit; remove the slice to process the whole corpus.
doc_paths = get_doc_paths(raw_docs_dir, lang)[:3]

# Chunk size, in sentences, handed to the feature extractors.
sentences_per_chunk = 200

In [2]:
# for lang in ["eng", "ger"]:
#     doc_paths = get_doc_paths(raw_docs_dir, lang)
#     d2vcv = Doc2VecChunkVectorizer(lang, sentences_per_chunk)
#     d2vcv.fit_transform(doc_paths)

In [3]:
## Document-based features
# Accumulators filled while iterating over doc_paths.
all_chunk_features = []  # chunk features of all documents, flattened into one list
all_book_features = []  # features that must be calculated on the whole book
all_average_sbert_sentence_embeddings = []  # one entry per document
all_doc2vec_chunk_embeddings = []  # one entry per document

for doc_path in tqdm(doc_paths):
    fe = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk)
    (
        doc_chunk_features,
        doc_book_features,
        doc_sbert_embeddings,
        doc_doc2vec_embeddings,
    ) = fe.get_all_features()
    # Chunk features are concatenated across documents; everything else is
    # stored as a single entry per document.
    all_chunk_features += doc_chunk_features
    all_book_features.append(doc_book_features)
    all_average_sbert_sentence_embeddings.append(doc_sbert_embeddings)
    all_doc2vec_chunk_embeddings.append(doc_doc2vec_embeddings)

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/33 [00:00<?, ?it/s][A
 15%|█▌        | 5/33 [00:00<00:00, 49.70it/s][A
 30%|███       | 10/33 [00:00<00:00, 49.75it/s][A
 52%|█████▏    | 17/33 [00:00<00:00, 52.85it/s][A
 70%|██████▉   | 23/33 [00:00<00:00, 54.32it/s][A
100%|██████████| 33/33 [00:00<00:00, 57.28it/s][A
 33%|███▎      | 1/3 [00:04<00:08,  4.19s/it]
  0%|          | 0/50 [00:00<?, ?it/s][A
 26%|██▌       | 13/50 [00:00<00:00, 123.39it/s][A
 52%|█████▏    | 26/50 [00:00<00:00, 123.33it/s][A
100%|██████████| 50/50 [00:00<00:00, 124.90it/s][A
 67%|██████▋   | 2/3 [00:07<00:04,  4.05s/it]
  0%|          | 0/84 [00:00<?, ?it/s][A
 11%|█         | 9/84 [00:00<00:00, 85.07it/s][A
 23%|██▎       | 19/84 [00:00<00:00, 88.43it/s][A
 35%|███▍      | 29/84 [00:00<00:00, 88.89it/s][A
 46%|████▋     | 39/84 [00:00<00:00, 91.62it/s][A
 58%|█████▊    | 49/84 [00:00<00:00, 93.34it/s][A
 71%|███████▏  | 60/84 [00:00<00:00, 95.25it/s][A
 83%|████████▎ | 70/84 [00:00

In [4]:
# Recompute the chunk features with sentences_per_chunk=None, i.e. with the
# whole book considered as one chunk.
all_chunk_features_fulltext = []  # chunk features calculated for the whole book
all_average_sbert_sentence_embeddings_fulltext = []  # one entry per document
all_doc2vec_chunk_embeddings_fulltext = []  # one entry per document

for doc_path in tqdm(doc_paths):
    fe = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk=None)
    # The second return value is not needed in this pass and is discarded.
    (
        fulltext_chunk_features,
        _,
        fulltext_sbert_embeddings,
        fulltext_doc2vec_embeddings,
    ) = fe.get_all_features()
    all_chunk_features_fulltext += fulltext_chunk_features
    all_average_sbert_sentence_embeddings_fulltext.append(fulltext_sbert_embeddings)
    all_doc2vec_chunk_embeddings_fulltext.append(fulltext_doc2vec_embeddings)

100%|██████████| 3/3 [00:09<00:00,  3.13s/it]


In [5]:
# # Pickle document-based features
# f = open(features_dir + 'all_chunk_features' + '.pkl', 'wb')  
# pickle.dump(all_chunk_features, f, -1)
# f.close()

# f = open(features_dir + 'all_book_features' + '.pkl', 'wb')  
# pickle.dump(all_book_features, f, -1)
# f.close()

# f = open(features_dir + 'all_average_sbert_sentence_embeddings' + '.pkl', 'wb')  
# pickle.dump(all_average_sbert_sentence_embeddings, f, -1)
# f.close()

# f = open(features_dir + 'all_doc2vec_chunk_embeddings' + '.pkl', 'wb')  
# pickle.dump(all_doc2vec_chunk_embeddings, f, -1)
# f.close()

# f = open(features_dir + 'all_chunk_features_fulltext' + '.pkl', 'wb')  
# pickle.dump(all_chunk_features_fulltext, f, -1)
# f.close()

# # Load document-based features
# f = open(features_dir + 'all_chunk_features' + '.pkl', 'rb')  
# all_chunk_features = pickle.load(f)
# f.close()

# f = open(features_dir + 'all_book_features' + '.pkl', 'rb')  
# all_book_features = pickle.load(f)
# f.close()

# f = open(features_dir + 'all_average_sbert_sentence_embeddings' + '.pkl', 'rb')  
# all_average_sbert_sentence_embeddings = pickle.load(f)
# f.close()

# f = open(features_dir + 'all_doc2vec_chunk_embeddings' + '.pkl', 'rb')  
# all_doc2vec_chunk_embeddings = pickle.load(f)
# f.close()

# f = open(features_dir + 'all_chunk_features_fulltext' + '.pkl', 'rb')  
# all_chunk_features_fulltext = pickle.load(f)
# f.close()

In [6]:
## Corpus-based features
# Extractor over the selected documents, fed with the per-document SBERT and
# doc2vec embeddings gathered by the chunked document-based pass.
cbfe = CorpusBasedFeatureExtractor(
    lang,
    doc_paths,
    all_average_sbert_sentence_embeddings,
    all_doc2vec_chunk_embeddings,
    sentences_per_chunk,
    nr_features=100,
)

100%|██████████| 33/33 [00:00<00:00, 56.57it/s]
100%|██████████| 50/50 [00:00<00:00, 118.21it/s]
100%|██████████| 84/84 [00:00<00:00, 88.22it/s]


In [7]:
# get_all_features() returns a pair: the first element is later merged into
# the chunk-level table and the second into the book-level table, both on
# 'book_name'.
corpus_chunk_features, corpus_book_features = cbfe.get_all_features()

In [8]:
# Idea being checked: aggregate embeddings from chunks in cbfe instead of
# returning them via functions from dbfe.
# For each pair of entries, print both lengths and whether the SBERT
# embeddings held by cbfe equal the ones in cbfe.new_sbert.
for passed_in_sbert, recomputed_sbert in zip(cbfe.all_average_sbert_sentence_embeddings, cbfe.new_sbert):
    print(len(passed_in_sbert), len(recomputed_sbert))
    print(np.array_equal(passed_in_sbert, recomputed_sbert))

32 32
True
49 49
True
83 83
True


In [9]:
# Idea being checked: aggregate embeddings from chunks in cbfe instead of
# returning them via functions from dbfe.
# For each pair of entries, print both lengths and whether the doc2vec
# embeddings held by cbfe equal the ones in cbfe.new_doc2vec.
for passed_in_doc2vec, recomputed_doc2vec in zip(cbfe.all_doc2vec_chunk_embeddings, cbfe.new_doc2vec):
    print(len(passed_in_doc2vec), len(recomputed_doc2vec))
    print(np.array_equal(passed_in_doc2vec, recomputed_doc2vec))

32 32
True
49 49
True
83 83
True


In [10]:
# Rebuild the corpus-based extractor on the full-text embeddings
# (sentences_per_chunk=None, i.e. the whole book considered as one chunk).
# NOTE(review): this rebinds `cbfe`, so cells that read `cbfe` behave
# differently depending on whether they run before or after this one.
cbfe = CorpusBasedFeatureExtractor(
    lang,
    doc_paths,
    all_average_sbert_sentence_embeddings_fulltext,
    all_doc2vec_chunk_embeddings_fulltext,
    sentences_per_chunk=None,
    nr_features=100,
)

In [11]:
# Only the first element of the returned pair is kept for the full-text run;
# the second element is discarded.
corpus_chunk_features_fulltext, _ = cbfe.get_all_features()

In [12]:
# # Pickle corpus-based features
# f = open(features_dir + 'corpus_chunk_features' + '.pkl', 'wb')
# pickle.dump(corpus_chunk_features, f, -1)
# f.close()

# f = open(features_dir + 'corpus_book_features' + '.pkl', 'wb')
# pickle.dump(corpus_book_features, f, -1)
# f.close()

# f = open(features_dir + 'corpus_chunk_features_fulltext' + '.pkl', 'wb')
# pickle.dump(corpus_chunk_features_fulltext, f, -1)
# f.close()

# # Load corpus-based features
# f = open(features_dir + 'corpus_chunk_features' + '.pkl', 'rb')  
# corpus_chunk_features = pickle.load(f)
# f.close()

# f = open(features_dir + 'corpus_book_features' + '.pkl', 'rb')  
# corpus_book_features = pickle.load(f)
# f.close()

# f = open(features_dir + 'corpus_chunk_features_fulltext' + '.pkl', 'rb')  
# corpus_chunk_features_fulltext = pickle.load(f)
# f.close()

In [13]:
# Book-level feature table, assembled by joining on 'book_name'.
all_book_features = pd.DataFrame(all_book_features)
all_chunk_features_fulltext = pd.DataFrame(all_chunk_features_fulltext)

# The first join is an outer join; the two corpus-based joins use pandas'
# default join type. validate='one_to_one' makes pandas raise if 'book_name'
# is not unique on both sides of a join.
book_df = (
    all_book_features
    .merge(right=all_chunk_features_fulltext, on='book_name', how='outer', validate='one_to_one')
    .merge(right=corpus_book_features, on='book_name', validate='one_to_one')
    .merge(right=corpus_chunk_features_fulltext, on='book_name', validate='one_to_one')
)

In [14]:
# Chunk-level feature table: document-based chunk features outer-joined with
# the corpus-based chunk features. 'book_name' is the join key and must be
# unique on both sides (validate='one_to_one').
all_chunk_features = pd.DataFrame(all_chunk_features)
chunk_df = all_chunk_features.merge(
    right=corpus_chunk_features,
    on='book_name',
    how='outer',
    validate='one_to_one',
)
chunk_df

Unnamed: 0,book_name,ratio_of_punctuation_marks,ratio_of_whitespaces,ratio_of_digits,ratio_of_exclamation_marks,ratio_of_question_marks,ratio_of_commas,ratio_of_uppercase_letters,average_number_of_words_in_sentence,maximum_number_of_words_in_sentence,...,NP->PRP$_JJ_NNS,VP->VBG_TO_VB,VP->VBP_RB_VBN,NP->DT_NN_NN_NN,NP->NN_DT_NN,NP->PRP$_JJ,NP->PRP$_JJ_NN_NN,VP->MD_VB_TO_VB,NP->JJ_NNP_NNP,VP->VBP_VBG
0,Radcliffe_Ann_The-Italian_1797_0,0.029752,0.173061,0.000216,0.000540,0.000405,0.017927,0.012941,32.105,97,...,0.000000,0.001917,0.000000,0.000548,0.001917,0.000548,0.000274,0.000274,0.000000,0.000000
1,Radcliffe_Ann_The-Italian_1797_1,0.035566,0.171857,0.000000,0.000792,0.000627,0.017684,0.016401,26.175,82,...,0.000000,0.000673,0.001010,0.000000,0.000673,0.001010,0.000673,0.000337,0.000337,0.000337
2,Radcliffe_Ann_The-Italian_1797_2,0.031726,0.170552,0.000000,0.000926,0.000449,0.017616,0.015894,30.450,106,...,0.001481,0.000592,0.000000,0.000592,0.001185,0.000889,0.001185,0.001185,0.000592,0.000000
3,Radcliffe_Ann_The-Italian_1797_3,0.035779,0.173319,0.000000,0.001205,0.000551,0.018492,0.019114,25.310,79,...,0.000000,0.000359,0.000718,0.000359,0.001436,0.000359,0.000718,0.000000,0.000000,0.000000
4,Radcliffe_Ann_The-Italian_1797_4,0.029484,0.169726,0.000000,0.000561,0.000171,0.018769,0.011917,34.850,112,...,0.000791,0.001055,0.000000,0.000791,0.000000,0.000264,0.001055,0.000791,0.000000,0.000264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,Collins_Wilkie_Armadale_1864_78,0.038712,0.177045,0.000000,0.001502,0.000985,0.014077,0.018788,18.880,143,...,0.000000,0.000000,0.000489,0.000979,0.000000,0.000000,0.000979,0.000000,0.000489,0.000000
160,Collins_Wilkie_Armadale_1864_79,0.037697,0.179425,0.000000,0.001107,0.001309,0.013035,0.023722,17.900,60,...,0.000000,0.000000,0.000000,0.000000,0.000965,0.000000,0.000965,0.000000,0.001448,0.000000
161,Collins_Wilkie_Armadale_1864_80,0.044330,0.186087,0.000000,0.001190,0.001315,0.012523,0.026361,14.890,56,...,0.000556,0.000556,0.000556,0.000556,0.000000,0.001112,0.000000,0.000000,0.000556,0.000000
162,Collins_Wilkie_Armadale_1864_81,0.033757,0.183519,0.000000,0.000775,0.000672,0.012045,0.019814,17.850,77,...,0.000956,0.000000,0.000000,0.000478,0.000000,0.001912,0.000478,0.000000,0.000000,0.000000


In [15]:
# Reduce each name to its first four '_'-separated fields, so that every chunk
# row carries the name of its book (e.g. 'Radcliffe_Ann_The-Italian_1797_0'
# becomes 'Radcliffe_Ann_The-Italian_1797').
# NOTE(review): assumes the part before the chunk index always has exactly
# four fields — confirm no name contains extra underscores.
chunk_df['book_name'] = chunk_df['book_name'].str.split('_').str[:4].str.join('_')

In [16]:
# Combine the book features with the per-book averages of the chunk features.
# Columns that exist in both frames receive pandas' default suffixes:
# '_x' for book_df and '_y' for the averaged chunk features.
chunk_means_per_book = chunk_df.groupby("book_name").mean().reset_index()
book_and_averaged_chunk_df = book_df.merge(chunk_means_per_book, on="book_name")
book_and_averaged_chunk_df

Unnamed: 0,book_name,doc2vec_intra_textual_variance,sbert_intra_textual_variance,doc2vec_stepwise_distance,sbert_stepwise_distance,ratio_of_punctuation_marks_x,ratio_of_whitespaces_x,ratio_of_digits_x,ratio_of_exclamation_marks_x,ratio_of_question_marks_x,...,NP->PRP$_JJ_NNS_y,VP->VBG_TO_VB_y,VP->VBP_RB_VBN_y,NP->DT_NN_NN_NN_y,NP->NN_DT_NN_y,NP->PRP$_JJ_y,NP->PRP$_JJ_NN_NN,VP->MD_VB_TO_VB_y,NP->JJ_NNP_NNP,VP->VBP_VBG
0,Radcliffe_Ann_The-Italian_1797,9.107221,0.511742,11.5068,0.647907,0.035949,0.171615,2.4e-05,0.001263,0.0006,...,0.000725,0.000641,0.00071,0.000611,0.00069,0.000448,0.00039,0.000484,0.000279,0.000307
1,Gissing_George_In-the-Year-of-Jubilee_1894,7.386115,0.391544,9.869623,0.512462,0.046069,0.183475,3.5e-05,0.000567,0.001835,...,0.000476,0.000645,0.000455,0.000511,0.000682,0.000636,0.000387,0.000779,0.00051,0.000643
2,Collins_Wilkie_Armadale_1864,8.981559,0.494673,10.862341,0.549614,0.038922,0.18399,0.000139,0.001029,0.001101,...,0.000694,0.000649,0.000716,0.000684,0.000499,0.000615,0.000709,0.000488,0.000739,0.000661


In [17]:
# Attach the book-level features to every chunk row of that book.
# validate='many_to_one' makes pandas raise if 'book_name' is not unique in
# book_df.
chunk_and_copied_book_df = pd.merge(
    chunk_df,
    book_df,
    on='book_name',
    how='outer',
    validate='many_to_one',
)
chunk_and_copied_book_df

Unnamed: 0,book_name,ratio_of_punctuation_marks_x,ratio_of_whitespaces_x,ratio_of_digits_x,ratio_of_exclamation_marks_x,ratio_of_question_marks_x,ratio_of_commas_x,ratio_of_uppercase_letters_x,average_number_of_words_in_sentence_x,maximum_number_of_words_in_sentence_x,...,NP->PRP$_JJ_NNS_y,VP->VBG_TO_VB_y,VP->VBP_RB_VBN_y,NP->DT_NN_NN_NN_y,NP->NN_DT_NN_y,NP->PRP$_JJ_y,VP->MD_VB_TO_VB_y,NP->NNP_NN_NN,NP->PRP$_JJ_JJ_NN,S->NP_VP_NP_''
0,Radcliffe_Ann_The-Italian_1797,0.029752,0.173061,0.000216,0.000540,0.000405,0.017927,0.012941,32.105,97,...,0.000721,0.000659,0.000669,0.000597,0.000711,0.000474,0.000505,0.000618,0.000185,0.000000
1,Radcliffe_Ann_The-Italian_1797,0.035566,0.171857,0.000000,0.000792,0.000627,0.017684,0.016401,26.175,82,...,0.000721,0.000659,0.000669,0.000597,0.000711,0.000474,0.000505,0.000618,0.000185,0.000000
2,Radcliffe_Ann_The-Italian_1797,0.031726,0.170552,0.000000,0.000926,0.000449,0.017616,0.015894,30.450,106,...,0.000721,0.000659,0.000669,0.000597,0.000711,0.000474,0.000505,0.000618,0.000185,0.000000
3,Radcliffe_Ann_The-Italian_1797,0.035779,0.173319,0.000000,0.001205,0.000551,0.018492,0.019114,25.310,79,...,0.000721,0.000659,0.000669,0.000597,0.000711,0.000474,0.000505,0.000618,0.000185,0.000000
4,Radcliffe_Ann_The-Italian_1797,0.029484,0.169726,0.000000,0.000561,0.000171,0.018769,0.011917,34.850,112,...,0.000721,0.000659,0.000669,0.000597,0.000711,0.000474,0.000505,0.000618,0.000185,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,Collins_Wilkie_Armadale_1864,0.038712,0.177045,0.000000,0.001502,0.000985,0.014077,0.018788,18.880,143,...,0.000699,0.000646,0.000729,0.000670,0.000511,0.000605,0.000499,0.000535,0.000893,0.000076
160,Collins_Wilkie_Armadale_1864,0.037697,0.179425,0.000000,0.001107,0.001309,0.013035,0.023722,17.900,60,...,0.000699,0.000646,0.000729,0.000670,0.000511,0.000605,0.000499,0.000535,0.000893,0.000076
161,Collins_Wilkie_Armadale_1864,0.044330,0.186087,0.000000,0.001190,0.001315,0.012523,0.026361,14.890,56,...,0.000699,0.000646,0.000729,0.000670,0.000511,0.000605,0.000499,0.000535,0.000893,0.000076
162,Collins_Wilkie_Armadale_1864,0.033757,0.183519,0.000000,0.000775,0.000672,0.012045,0.019814,17.850,77,...,0.000699,0.000646,0.000729,0.000670,0.000511,0.000605,0.000499,0.000535,0.000893,0.000076


In [18]:
# Print the (rows, columns) shape of the four feature tables on one line.
print(*(frame.shape for frame in (book_df, chunk_df, book_and_averaged_chunk_df, chunk_and_copied_book_df)))

(3, 1120) (164, 1211) (3, 2330) (164, 2330)


In [19]:
# Write the four feature tables to CSV files in features_dir, without the
# DataFrame index. os.path.join is used because features_dir already ends
# with "/", so an f"{features_dir}/..." path would contain a doubled slash.
frames_to_save = {
    "book_df": book_df,
    "book_and_averaged_chunk_df": book_and_averaged_chunk_df,
    "chunk_df": chunk_df,
    "chunk_and_copied_book_df": chunk_and_copied_book_df,
}
for frame_name, frame in frames_to_save.items():
    frame.to_csv(os.path.join(features_dir, f"{frame_name}.csv"), index=False)