In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

sys.path.insert(0, "../src/")
from doc2vec_chunk_vectorizer import Doc2VecChunkVectorizer
from production_rule_extractor import ProductionRuleExtractor
from doc_based_feature_extractor import DocBasedFeatureExtractor
from corpus_based_feature_extractor import CorpusBasedFeatureExtractor
from utils import get_doc_paths

# Input/output locations, relative to the notebook's working directory.
raw_docs_dir = "../data/raw_docs/"
labels_dir = "../data/labels/"
extracted_features_dir = "../data/extracted_features/"

# Language code handed to get_doc_paths and to the feature extractors.
lang = "eng"

# Upper bound on the number of documents processed in this run.
# Set to None to use every path returned by get_doc_paths (a [:None] slice
# keeps the whole sequence).
max_docs = 3
doc_paths = get_doc_paths(raw_docs_dir, lang)[:max_docs]

# Value passed as sentences_per_chunk to the feature extractors.
sentences_per_chunk = 200

In [2]:
# for lang in ["eng", "ger"]:
#     doc_paths = get_doc_paths(raw_docs_dir, lang)
#     d2vcv = Doc2VecChunkVectorizer(lang, sentences_per_chunk)
#     d2vcv.fit_transform(doc_paths)

In [3]:
# Document-based features
# Accumulators filled while iterating over the documents.
all_chunk_features = []  # chunk-level features, flattened across all documents
all_book_features = []  # features that must be calculated on the whole book
all_average_sbert_sentence_embeddings = []  # one entry per document
all_doc2vec_chunk_embeddings = []  # one entry per document

for doc_path in tqdm(doc_paths):
    fe = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk)
    (
        chunk_features,
        book_features,
        average_sbert_sentence_embeddings,
        doc2vec_chunk_embeddings,
    ) = fe.get_all_features()

    # Chunk features are merged into one flat list (extend); the other three
    # results are stored as one element per document (append).
    all_chunk_features.extend(chunk_features)
    all_book_features.append(book_features)
    all_average_sbert_sentence_embeddings.append(average_sbert_sentence_embeddings)
    all_doc2vec_chunk_embeddings.append(doc2vec_chunk_embeddings)

100%|██████████| 3/3 [00:14<00:00,  4.92s/it]


In [5]:
# Book features
# Re-run the chunk-level extraction with sentences_per_chunk=None so that each
# whole book is considered as one chunk.
all_chunk_book_features = []  # Chunk features calculated for whole book

for doc_path in tqdm(doc_paths):
    fe = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk=None)
    # Only the first returned element is used here. The other three are bound
    # to underscore-prefixed throwaway names so that this loop does not
    # overwrite book_features / average_sbert_sentence_embeddings /
    # doc2vec_chunk_embeddings left in the namespace by the chunked extraction.
    (
        chunk_book_features,
        _book_features,
        _sbert_embeddings,
        _doc2vec_embeddings,
    ) = fe.get_all_features()

    all_chunk_book_features.extend(chunk_book_features)

100%|██████████| 3/3 [00:09<00:00,  3.33s/it]


In [6]:
# f = open(extracted_features_dir + lang + '/all_chunk_features' + '.pkl', 'wb')  
# pickle.dump(all_chunk_features, f, -1)
# f.close()

# f = open(extracted_features_dir + lang + '/all_book_features' + '.pkl', 'wb')  
# pickle.dump(all_book_features, f, -1)
# f.close()

# f = open(extracted_features_dir + lang + '/all_average_sbert_sentence_embeddings' + '.pkl', 'wb')  
# pickle.dump(all_average_sbert_sentence_embeddings, f, -1)
# f.close()

# f = open(extracted_features_dir + lang + '/all_doc2vec_chunk_embeddings' + '.pkl', 'wb')  
# pickle.dump(all_doc2vec_chunk_embeddings, f, -1)
# f.close()

# f = open(extracted_features_dir + lang + '/all_chunk_features' + '.pkl', 'rb')  
# all_chunk_features = pickle.load(f)
# f.close()

# f = open(extracted_features_dir + lang + '/all_book_features' + '.pkl', 'rb')  
# all_book_features = pickle.load(f)
# f.close()

# f = open(extracted_features_dir + lang + '/all_average_sbert_sentence_embeddings' + '.pkl', 'rb')  
# all_average_sbert_sentence_embeddings = pickle.load(f)
# f.close()

# f = open(extracted_features_dir + lang + '/all_doc2vec_chunk_embeddings' + '.pkl', 'rb')  
# all_doc2vec_chunk_embeddings = pickle.load(f)
# f.close()

In [7]:
# f = open(extracted_features_dir + lang + '/all_corpus_based_features' + '.pkl', 'rb')  
# all_corpus_based_features = pickle.load(f)
# f.close()

In [8]:
# Build the corpus-level extractor from the document paths and the
# per-document embeddings collected during document-based extraction.
cbfe = CorpusBasedFeatureExtractor(
    lang,
    doc_paths,
    all_average_sbert_sentence_embeddings,
    all_doc2vec_chunk_embeddings,
    sentences_per_chunk,
    nr_features=100,
)

100%|██████████| 3/3 [00:00<00:00,  5.94it/s]
100%|██████████| 3/3 [00:01<00:00,  2.17it/s]
100%|██████████| 3/3 [00:03<00:00,  1.19s/it]


In [9]:
# Keep the combined result under one name, then split it into its two parts.
all_corpus_based_features = cbfe.get_all_features()
(corpus_chunk_features, corpus_book_features) = all_corpus_based_features

# Leave the pair as the cell's last expression so the notebook displays both.
(corpus_chunk_features, corpus_book_features)

100%|██████████| 3/3 [00:00<00:00, 1087.17it/s]
100%|██████████| 3/3 [00:00<00:00, 873.93it/s]


(                            book_name  unigram_distance  \
 0    Radcliffe_Ann_The-Italian_1797_0          0.053512   
 1    Radcliffe_Ann_The-Italian_1797_1          0.032266   
 2    Radcliffe_Ann_The-Italian_1797_2          0.037827   
 3    Radcliffe_Ann_The-Italian_1797_3          0.029868   
 4    Radcliffe_Ann_The-Italian_1797_4          0.051704   
 ..                                ...               ...   
 159   Collins_Wilkie_Armadale_1864_78          0.042406   
 160   Collins_Wilkie_Armadale_1864_79          0.043421   
 161   Collins_Wilkie_Armadale_1864_80          0.042596   
 162   Collins_Wilkie_Armadale_1864_81          0.076752   
 163   Collins_Wilkie_Armadale_1864_82          0.035168   
 
      unigram_distance_limited  bigram_distance  trigram_distance  \
 0                    0.626328         0.079167          0.009183   
 1                    0.648189         0.034697          0.004494   
 2                    0.598874         0.046354          0.005383   
 3

In [10]:
# Free-form note kept as a bare string literal; because it is the cell's last
# expression, the notebook echoes its repr as the cell output.
# NOTE(review): the note reports that the two boundary trigrams listed below
# carry the same count and calls the entry "doubled" in the three named
# mappings -- confirm against the corpus-based extractor before relying on
# trigram features.
'''
entry is doubled in total_trigram_counts and book_trigram_mapping_abs and book_trigram_mapping_rel
{'<EOS> <EOS> <BOS>': 33086,
 '<EOS> <BOS> <BOS>': 33086,
 '''

"\nentry is doubled in total_trigram_counts and book_trigram_mapping_abs and book_trigram_mapping_rel\n{'<EOS> <EOS> <BOS>': 33086,\n '<EOS> <BOS> <BOS>': 33086,\n "

In [11]:
# f = open(extracted_features_dir + lang + '/all_corpus_based_features' + '.pkl', 'wb')
# pickle.dump(all_corpus_based_features, f, -1)
# f.close()

In [12]:
# import os
# import pandas as pd

# book_df = pd.DataFrame(all_book_features)
# #book_df = book_df.merge(all_corpus_based_features, on="book_name")
# book_and_averaged_chunk_df = book_df.merge(pd.DataFrame(all_chunk_features).groupby("book_name").mean().reset_index(drop=False), on="book_name")

# chunk_df = pd.DataFrame(all_chunk_features)
# chunk_and_copied_book_df = chunk_df.merge(pd.DataFrame(all_book_features), on="book_name")
# #chunk_and_copied_book_df = chunk_and_copied_book_df.merge(all_corpus_based_features, on="book_name")

# os.makedirs(f"{extracted_features_dir}/{lang}", exist_ok=True)
# book_df.to_csv(f"{extracted_features_dir}/{lang}/book_df.csv", index=False)
# book_and_averaged_chunk_df.to_csv(f"{extracted_features_dir}/{lang}/book_and_averaged_chunk_df.csv", index=False)
# chunk_df.to_csv(f"{extracted_features_dir}/{lang}/chunk_df.csv", index=False)
# chunk_and_copied_book_df.to_csv(f"{extracted_features_dir}/{lang}/chunk_and_copied_book_df.csv", index=False)