In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

sys.path.insert(0, "../src/")
sys.path.insert(0, "../src/feature_extraction/")
from doc2vec_chunk_vectorizer import Doc2VecChunkVectorizer
from production_rule_extractor import ProductionRuleExtractor
from doc_based_feature_extractor import DocBasedFeatureExtractor
from corpus_based_feature_extractor import CorpusBasedFeatureExtractor
from utils import get_doc_paths, read_labels

# --- Locations (relative paths; the trailing "/" is relied upon by later cells
# that build file names by plain string concatenation) ---
raw_docs_dir = "../data/raw_docs/"
labels_dir = "../data/labels/"
extracted_features_dir = "../data/extracted_features/"

# --- Run parameters ---
lang = "eng"
# Forwarded unchanged to the feature extractors in the cells below.
sentences_per_chunk = 200

# --- Derived inputs ---
# Paths of the documents to process for the selected language.
doc_paths = get_doc_paths(raw_docs_dir, lang)

In [None]:
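# Optional step, currently disabled: fit the Doc2Vec chunk vectorizer on the
# documents of every language.
# NOTE(review): presumably this has to have been run once before the doc2vec
# chunk embeddings can be extracted below -- confirm.
# NOTE: if uncommented as-is, the loop rebinds `lang` and `doc_paths`, so after
# it finishes they hold the values for "ger" rather than the ones configured above.
# for lang in ["eng", "ger"]:
#     doc_paths = get_doc_paths(raw_docs_dir, lang)
#     d2vcv = Doc2VecChunkVectorizer(lang, sentences_per_chunk)
#     d2vcv.fit_transform(doc_paths)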

In [None]:
# Accumulators for the whole corpus, filled one document at a time.
all_chunk_based_features, all_book_based_features = [], []
all_average_sbert_sentence_embeddings, all_doc2vec_chunk_embeddings = [], []

for doc_path in tqdm(doc_paths):
    fe = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk)
    (
        chunk_based_features,
        book_based_features,
        average_sbert_sentence_embeddings,
        doc2vec_chunk_embeddings,
    ) = fe.get_all_features()

    # Chunk-level features are flattened into one long list across documents ...
    all_chunk_based_features += chunk_based_features
    # ... whereas the remaining results are kept as one entry per document.
    all_book_based_features.append(book_based_features)
    all_average_sbert_sentence_embeddings.append(average_sbert_sentence_embeddings)
    all_doc2vec_chunk_embeddings.append(doc2vec_chunk_embeddings)

In [None]:
# Persist the document-based extraction results, one pickle file per object,
# as <extracted_features_dir>/<lang>/<name>.pkl.
features_to_save = {
    "all_chunk_based_features": all_chunk_based_features,
    "all_book_based_features": all_book_based_features,
    "all_average_sbert_sentence_embeddings": all_average_sbert_sentence_embeddings,
    "all_doc2vec_chunk_embeddings": all_doc2vec_chunk_embeddings,
}

# Create the per-language output directory up front; without it the first
# open() fails with FileNotFoundError on a fresh checkout.
features_out_dir = os.path.join(extracted_features_dir, lang)
os.makedirs(features_out_dir, exist_ok=True)

for feature_name, feature_obj in features_to_save.items():
    # `with` guarantees the file is closed even if pickling raises.
    with open(os.path.join(features_out_dir, feature_name + ".pkl"), "wb") as f:
        # HIGHEST_PROTOCOL is what the previously used protocol value -1 selects.
        pickle.dump(feature_obj, f, pickle.HIGHEST_PROTOCOL)

In [2]:
def _load_pickled_features(feature_name):
    """Load `<extracted_features_dir>/<lang>/<feature_name>.pkl` and return its content.

    Reads the notebook-level `extracted_features_dir` and `lang` at call time.
    Raises FileNotFoundError if the file has not been written yet.

    SECURITY: pickle.load can execute arbitrary code embedded in the file --
    only use this on files produced by this notebook / a trusted source.
    """
    pickle_path = os.path.join(extracted_features_dir, lang, feature_name + ".pkl")
    # `with` guarantees the file is closed even if unpickling raises.
    with open(pickle_path, "rb") as f:
        return pickle.load(f)


# Restore the document-based extraction results saved by an earlier run, so the
# per-document extraction does not have to be repeated.
all_chunk_based_features = _load_pickled_features("all_chunk_based_features")
all_book_based_features = _load_pickled_features("all_book_based_features")
all_average_sbert_sentence_embeddings = _load_pickled_features("all_average_sbert_sentence_embeddings")
all_doc2vec_chunk_embeddings = _load_pickled_features("all_doc2vec_chunk_embeddings")

In [None]:
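# Optional step, currently disabled: load previously computed corpus-based
# features instead of recomputing them in the next cell.
# NOTE(review): the file name used here ends in an underscore
# ('all_corpus_based_features_.pkl'), whereas this notebook saves
# 'all_corpus_based_features.pkl' -- adjust the name before re-enabling.
# f = open(extracted_features_dir + lang + '/all_corpus_based_features_' + '.pkl', 'rb')  
# all_corpus_based_features = pickle.load(f)
# f.close()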

In [None]:
# Corpus-level features, computed from all documents together with the
# per-document SBERT and doc2vec embeddings gathered earlier.
# Long-running: the recorded progress output shows several multi-minute passes
# over the documents.
cbfe = CorpusBasedFeatureExtractor(
    lang,
    doc_paths,
    all_average_sbert_sentence_embeddings,
    all_doc2vec_chunk_embeddings,
)
# NOTE(review): k presumably is the number of most common n-grams to use -- confirm.
all_corpus_based_features = cbfe.get_all_features(k=100)

100%|██████████| 606/606 [00:56<00:00, 10.64it/s]
100%|██████████| 606/606 [03:32<00:00,  2.85it/s]
  0%|          | 0/606 [00:00<?, ?it/s]

call bigram function from k most common ngrams
call bigram function from k most common ngrams


100%|██████████| 606/606 [00:44<00:00, 13.76it/s]
100%|██████████| 606/606 [00:47<00:00, 12.70it/s]
606it [05:47,  1.74it/s]
606it [05:46,  1.75it/s]
606it [00:04, 133.80it/s]
606it [00:04, 132.74it/s]


# Persist the corpus-based features so the expensive computation above does not
# have to be repeated.
corpus_features_dir = os.path.join(extracted_features_dir, lang)
# Make sure the target directory exists before writing; losing the result of a
# long computation to a FileNotFoundError would be costly.
os.makedirs(corpus_features_dir, exist_ok=True)

# `with` guarantees the file is closed even if pickling raises.
with open(os.path.join(corpus_features_dir, "all_corpus_based_features.pkl"), "wb") as f:
    # HIGHEST_PROTOCOL is what the previously used protocol value -1 selects.
    pickle.dump(all_corpus_based_features, f, pickle.HIGHEST_PROTOCOL)

In [None]:
import os
import pandas as pd

# Assemble the four feature tables and export each of them as CSV.
# Every merge below uses pandas' default inner join on "book_name", so a book
# that is missing from either side is silently dropped from the result.

# Build the two base frames once instead of re-creating them for every merge.
book_only_df = pd.DataFrame(all_book_based_features)
chunk_df = pd.DataFrame(all_chunk_based_features)

# Book level: book-based + corpus-based features.
book_df = book_only_df.merge(all_corpus_based_features, on="book_name")

# Book level, extended with the per-book mean of the chunk-based features.
averaged_chunk_df = chunk_df.groupby("book_name").mean().reset_index(drop=False)
book_and_averaged_chunk_df = book_df.merge(averaged_chunk_df, on="book_name")

# Chunk level, with the book-based and corpus-based features of the chunk's
# book copied onto every chunk row.
chunk_and_copied_book_df = (
    chunk_df
    .merge(book_only_df, on="book_name")
    .merge(all_corpus_based_features, on="book_name")
)

# os.path.join avoids the doubled "/" that plain f-string concatenation
# produces because extracted_features_dir already ends with a slash.
csv_output_dir = os.path.join(extracted_features_dir, lang)
os.makedirs(csv_output_dir, exist_ok=True)

# File stem -> table; each table is written to <csv_output_dir>/<stem>.csv.
csv_outputs = {
    "book_df": book_df,
    "book_and_averaged_chunk_df": book_and_averaged_chunk_df,
    "chunk_df": chunk_df,
    "chunk_and_copied_book_df": chunk_and_copied_book_df,
}
for csv_stem, feature_table in csv_outputs.items():
    feature_table.to_csv(os.path.join(csv_output_dir, csv_stem + ".csv"), index=False)