In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to the project
# modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Language code; it is interpolated into the raw-docs and features paths and
# passed to the feature extractors further down.
lang = "eng"

import os
import sys
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

sys.path.insert(0, "../src/")
from doc2vec_chunk_vectorizer import Doc2VecChunkVectorizer
from production_rule_extractor import ProductionRuleExtractor
from doc_based_feature_extractor import DocBasedFeatureExtractor
from corpus_based_feature_extractor import CorpusBasedFeatureExtractor
from utils import get_doc_paths

# Input/output locations, relative to the notebook's working directory.
raw_docs_dir = f"../data/raw_docs/{lang}/"
labels_dir = "../data/labels/"
features_dir = f"../data/features/{lang}/"

# Create the output directory if it is missing. exist_ok=True replaces the
# separate "exists?" check, so there is no window between checking and creating.
os.makedirs(features_dir, exist_ok=True)

# NOTE(review): the [:1] slice keeps only the first path returned by
# get_doc_paths, so everything below runs on a single document. This looks
# like a debugging shortcut -- confirm before running on the full corpus.
doc_paths = get_doc_paths(raw_docs_dir, lang)[:1]

# Chunk size handed to the document- and corpus-based feature extractors.
sentences_per_chunk = 200

In [None]:
# sentences_per_chunk = None
# for lang in ["eng", "ger"]:
#     doc_paths = get_doc_paths(raw_docs_dir, lang)
#     d2vcv = Doc2VecChunkVectorizer(lang, sentences_per_chunk)
#     d2vcv.fit_transform(doc_paths)

In [None]:
## Document-based features
# Accumulators, filled one document at a time by the loop below.
all_chunk_features, all_book_features = [], []  # book features are computed on the whole book
all_average_sbert_sentence_embeddings, all_doc2vec_chunk_embeddings = [], []

for doc_path in tqdm(doc_paths):
    extractor = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk)
    (
        doc_chunk_features,
        doc_book_features,
        doc_sbert_embeddings,
        doc_doc2vec_embeddings,
    ) = extractor.get_all_features()

    # Chunk features are flattened into one list across documents (extend);
    # the other three results are kept as one entry per document (append).
    all_chunk_features.extend(doc_chunk_features)
    all_book_features.append(doc_book_features)
    all_average_sbert_sentence_embeddings.append(doc_sbert_embeddings)
    all_doc2vec_chunk_embeddings.append(doc_doc2vec_embeddings)

In [None]:
# Second pass over the documents with sentences_per_chunk=None, i.e. the chunk
# features are recalculated with the whole book considered as one chunk.
all_chunk_features_fulltext, all_average_sbert_sentence_embeddings_fulltext = [], []
all_doc2vec_chunk_embeddings_fulltext = []

for doc_path in tqdm(doc_paths):
    extractor = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk=None)
    # The second return value (book-level features) is not needed here and is discarded.
    (
        fulltext_chunk_features,
        _,
        fulltext_sbert_embeddings,
        fulltext_doc2vec_embeddings,
    ) = extractor.get_all_features()

    all_chunk_features_fulltext.extend(fulltext_chunk_features)
    all_average_sbert_sentence_embeddings_fulltext.append(fulltext_sbert_embeddings)
    all_doc2vec_chunk_embeddings_fulltext.append(fulltext_doc2vec_embeddings)

In [None]:
# # Load document-based features
# f = open(features_dir + 'all_chunk_features' + '.pkl', 'rb')  
# all_chunk_features = pickle.load(f)
# f.close()

# f = open(features_dir + 'all_book_features' + '.pkl', 'rb')  
# all_book_features = pickle.load(f)
# f.close()

# f = open(features_dir + 'all_average_sbert_sentence_embeddings' + '.pkl', 'rb')  
# all_average_sbert_sentence_embeddings = pickle.load(f)
# f.close()

# f = open(features_dir + 'all_doc2vec_chunk_embeddings' + '.pkl', 'rb')  
# all_doc2vec_chunk_embeddings = pickle.load(f)
# f.close()

# f = open(features_dir + 'all_chunk_features_fulltext' + '.pkl', 'rb')  
# all_chunk_features_fulltext = pickle.load(f)
# f.close()

In [None]:
## Corpus-based features
# Build the corpus-level extractor from the per-document embeddings collected above.
cbfe = CorpusBasedFeatureExtractor(
    lang,
    doc_paths,
    all_average_sbert_sentence_embeddings,
    all_doc2vec_chunk_embeddings,
    sentences_per_chunk,
    nr_features=100,
)

In [None]:
# Run the corpus-based extraction; the result is unpacked into a chunk-level
# and a book-level part.
(corpus_chunk_features, corpus_book_features) = cbfe.get_all_features()

In [None]:
# Sanity check for a planned refactor: aggregate embeddings from chunks inside
# cbfe instead of returning them via functions from dbfe. Each loop prints the
# lengths of the paired entries and whether they are element-wise equal.
# NOTE(review): `new_sbert` / `new_doc2vec` are presumably the embeddings
# aggregated inside cbfe -- confirm against CorpusBasedFeatureExtractor.

# SBERT sentence embeddings
for passed_in, recomputed in zip(cbfe.all_average_sbert_sentence_embeddings, cbfe.new_sbert):
    print(len(passed_in), len(recomputed))
    print(np.array_equal(passed_in, recomputed))

# doc2vec chunk embeddings
for passed_in, recomputed in zip(cbfe.all_doc2vec_chunk_embeddings, cbfe.new_doc2vec):
    print(len(passed_in), len(recomputed))
    print(np.array_equal(passed_in, recomputed))

In [None]:
# Corpus-level extractor for the full-text pass: it receives the embeddings
# computed with the whole book as one chunk, and sentences_per_chunk=None.
cbfe_fulltext = CorpusBasedFeatureExtractor(
    lang,
    doc_paths,
    all_average_sbert_sentence_embeddings_fulltext,
    all_doc2vec_chunk_embeddings_fulltext,
    sentences_per_chunk=None,
    nr_features=100,
)

In [None]:
# Only the first (chunk-level) result of the full-text extraction is kept;
# the second return value is discarded.
(corpus_chunk_features_fulltext, _) = cbfe_fulltext.get_all_features()

In [None]:
# Same sanity check as for the chunked extractor, run on the full-text one:
# aggregate embeddings from chunks inside cbfe instead of returning them via
# functions from dbfe, and compare the two versions entry by entry.
# NOTE(review): `new_sbert` / `new_doc2vec` are presumably the embeddings
# aggregated inside the extractor -- confirm against CorpusBasedFeatureExtractor.

# SBERT sentence embeddings (the two lengths are printed on separate lines)
for passed_in, recomputed in zip(cbfe_fulltext.all_average_sbert_sentence_embeddings, cbfe_fulltext.new_sbert):
    print(len(passed_in), len(recomputed), sep="\n")
    print(np.array_equal(passed_in, recomputed))

# doc2vec chunk embeddings
for passed_in, recomputed in zip(cbfe_fulltext.all_doc2vec_chunk_embeddings, cbfe_fulltext.new_doc2vec):
    print(len(passed_in), len(recomputed))
    print(np.array_equal(passed_in, recomputed))

In [None]:
# Pickle corpus-based features
# Each object is written to <features_dir><name>.pkl with the highest pickle
# protocol available (equivalent to the protocol value -1).
# The `with` block guarantees the file handle is closed even if pickle.dump
# raises, which the manual open()/close() pairs did not.
corpus_features_to_pickle = (
    ('corpus_chunk_features', corpus_chunk_features),
    ('corpus_book_features', corpus_book_features),
    ('corpus_chunk_features_fulltext', corpus_chunk_features_fulltext),
)

for pickle_name, pickle_obj in corpus_features_to_pickle:
    with open(features_dir + pickle_name + '.pkl', 'wb') as f:
        pickle.dump(pickle_obj, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# # Load corpus-based features
# f = open(features_dir + 'corpus_chunk_features' + '.pkl', 'rb')  
# corpus_chunk_features = pickle.load(f)
# f.close()

# f = open(features_dir + 'corpus_book_features' + '.pkl', 'rb')  
# corpus_book_features = pickle.load(f)
# f.close()

# f = open(features_dir + 'corpus_chunk_features_fulltext' + '.pkl', 'rb')  
# corpus_chunk_features_fulltext = pickle.load(f)
# f.close()

In [None]:
# Book-level table: one row per book, joined on 'book_name'.
# The list results from the loops above are converted to DataFrames first.
all_book_features = pd.DataFrame(all_book_features)
all_chunk_features_fulltext = pd.DataFrame(all_chunk_features_fulltext)

# validate='one_to_one' makes pandas raise if 'book_name' is duplicated on
# either side of a merge. Only the first merge is an outer join; the two
# corpus-based merges use the default join type.
book_df = (
    all_book_features
    .merge(right=all_chunk_features_fulltext, on='book_name', how='outer', validate='one_to_one')
    .merge(right=corpus_book_features, on='book_name', validate='one_to_one')
    .merge(right=corpus_chunk_features_fulltext, on='book_name', validate='one_to_one')
)

In [None]:
# Write the merged book-level table to disk.
# The previous version first rebound book_df to
# pd.DataFrame(all_chunk_features_fulltext), which discarded the merge built in
# the cell above: the CSV and every later cell using book_df then lacked the
# book-level and corpus-based features. The overwrite has been removed.
# NOTE(review): if the overwrite was a deliberate temporary workaround,
# restore it explicitly and document why.
book_df.to_csv(f"{features_dir}/book_df.csv", index=False)

In [None]:
# Chunk-level table: document-based chunk features joined with the
# corpus-based chunk features on 'book_name'.
all_chunk_features = pd.DataFrame(all_chunk_features)
chunk_df = all_chunk_features.merge(
    right=corpus_chunk_features,
    on='book_name',
    how='outer',
    validate='one_to_one',  # raises if 'book_name' is duplicated on either side
)
chunk_df

In [None]:
# Keep only the first four underscore-separated parts of every 'book_name'.
# NOTE(review): presumably this strips a per-chunk suffix so that all chunks of
# a book share the book's name -- confirm against the naming scheme.
chunk_df['book_name'] = (
    chunk_df['book_name']
    .str.split('_')
    .str[:4]
    .str.join('_')
)

In [None]:
# Combine the book features with the per-book averages of the chunk features.
book_and_averaged_chunk_df = book_df.merge(
    chunk_df.groupby("book_name").mean().reset_index(),
    on="book_name",
)
book_and_averaged_chunk_df

In [None]:
# Attach the book-level features to every chunk row of the same book.
# validate='many_to_one' makes pandas raise if 'book_name' is not unique in book_df.
chunk_and_copied_book_df = chunk_df.merge(
    right=book_df,
    on='book_name',
    how='outer',
    validate='many_to_one',
)
chunk_and_copied_book_df

In [None]:
# Print the (rows, columns) shape of the four feature tables on one line.
print(*(
    frame.shape
    for frame in (book_df, chunk_df, book_and_averaged_chunk_df, chunk_and_copied_book_df)
))

In [None]:
# Name -> DataFrame lookup for the four feature tables built above.
dfs = {
    'book_df': book_df,
    'book_and_averaged_chunk_df': book_and_averaged_chunk_df,
    'chunk_df': chunk_df,
    'chunk_and_copied_book_df': chunk_and_copied_book_df,
}

In [None]:
# Show the distinct column dtypes present in book_df.
book_df.dtypes.unique()