In [1]:
%load_ext autoreload
%autoreload 2
# Language code of the corpus; interpolated into the data and feature paths below.
lang = "ger"

import os
import sys
# Prepend ../src/ to the module search path (presumably where the project
# modules imported below live — confirm against the repository layout).
sys.path.insert(0, "../src/")
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
from feature_extraction.doc2vec_chunk_vectorizer import Doc2VecChunkVectorizer
from feature_extraction.doc_based_feature_extractor import DocBasedFeatureExtractor
from feature_extraction.corpus_based_feature_extractor import CorpusBasedFeatureExtractor
from utils import get_doc_paths
from chunk import Chunk
import time

# Input/output locations, relative to the notebook's working directory.
raw_docs_dir = f"../data/raw_docs/{lang}/"
labels_dir = "../data/labels/"
features_dir = f"../data/features/{lang}/"

# Create the output directory (including missing parents). exist_ok=True
# replaces the check-then-create pattern, which can race with another process
# and is redundant when the cell is re-run.
os.makedirs(features_dir, exist_ok=True)

# NOTE: only the first three paths returned by get_doc_paths are processed;
# drop the slice to run on every document.
doc_paths = get_doc_paths(raw_docs_dir, lang)[:3]

# Passed as `sentences_per_chunk` to the feature extractors in later cells.
sentences_per_chunk = 200

In [2]:
# Create doc2vec embeddings
# d2vcv = Doc2VecChunkVectorizer(lang, sentences_per_chunk)
# d2vcv.fit_transform(doc_paths)

In [3]:
# Wall-clock timestamp taken before feature extraction; subtracted from `end`
# in the last cell to report the total run time in seconds.
start = time.time()

In [4]:
## Document-based features
# One extractor is built per document. get_all_features() returns a pair whose
# first element is appended item-by-item to the chunk-level list and whose
# second element is appended as a single entry to the book-level list.
document_chunk_features, document_book_features = [], []

for doc_path in tqdm(doc_paths):
    fe = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk)
    chunk_features, book_features = fe.get_all_features()
    document_chunk_features += chunk_features
    document_book_features += [book_features]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 3/3 [00:10<00:00,  3.34s/it]


In [5]:
# Second pass over the same documents with sentences_per_chunk=None, so that
# the whole book is considered as one chunk. Only the first (chunk-level)
# element of the returned pair is kept; the second is discarded.
document_chunk_features_fulltext = []  # chunk features calculated for the whole book

for doc_path in tqdm(doc_paths):
    fe = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk=None)
    chunk_features_fulltext, _ = fe.get_all_features()
    document_chunk_features_fulltext += chunk_features_fulltext

100%|██████████| 3/3 [00:08<00:00,  2.74s/it]


In [6]:
# Pickle document-based features
# Each object is written to <features_dir><name>.pkl. A protocol of -1 selects
# the highest pickle protocol supported by the running interpreter.
for name, features in [
    ('document_chunk_features', document_chunk_features),
    ('document_book_features', document_book_features),
    ('document_chunk_features_fulltext', document_chunk_features_fulltext),
]:
    with open(features_dir + name + '.pkl', 'wb') as f:
        pickle.dump(features, f, -1)


In [7]:
# Load document-based features
# Read each <features_dir><name>.pkl back into memory. These files are the ones
# written by the pickling cell above; never unpickle files from untrusted sources.
loaded = {}
for name in (
    'document_chunk_features',
    'document_book_features',
    'document_chunk_features_fulltext',
):
    with open(features_dir + name + '.pkl', 'rb') as f:
        loaded[name] = pickle.load(f)

# Bind the loaded objects to the names used by later cells; pop() leaves no
# extra reference behind in the temporary dict.
document_chunk_features = loaded.pop('document_chunk_features')
document_book_features = loaded.pop('document_book_features')
document_chunk_features_fulltext = loaded.pop('document_chunk_features_fulltext')

In [8]:
## Corpus-based features
# Build one extractor over all selected documents, with the same chunk size as
# the document-based pass. The meaning of nr_features=100 is defined inside
# CorpusBasedFeatureExtractor — TODO confirm and document here.
cbfe = CorpusBasedFeatureExtractor(lang, doc_paths, sentences_per_chunk, nr_features=100) 

100%|██████████| 3/3 [00:04<00:00,  1.37s/it]


In [9]:
# Compute all corpus-based features. The call returns a pair, kept here as the
# chunk-level and the book-level result; each is pickled to its own file
# <features_dir><name>.pkl with the highest available protocol (-1).
corpus_chunk_features, corpus_book_features = cbfe.get_all_features()

for name, features in [
    ('corpus_chunk_features', corpus_chunk_features),
    ('corpus_book_features', corpus_book_features),
]:
    with open(features_dir + name + '.pkl', 'wb') as f:
        pickle.dump(features, f, -1)

<bound method CorpusBasedFeatureExtractor.get_unigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>


100%|██████████| 3/3 [00:08<00:00,  2.98s/it]



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_unigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>: 42.8352530002594
<bound method CorpusBasedFeatureExtractor.get_unigram_distance_limited of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>


100%|██████████| 3/3 [00:05<00:00,  1.67s/it]



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_unigram_distance_limited of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>: 40.16433596611023
<bound method CorpusBasedFeatureExtractor.get_bigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>


100%|██████████| 3/3 [00:04<00:00,  1.50s/it]
  0%|          | 0/3 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_bigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>: 4.7468955516815186
<bound method CorpusBasedFeatureExtractor.get_trigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>


100%|██████████| 3/3 [00:04<00:00,  1.59s/it]
  0%|          | 0/3 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_trigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>: 4.976939678192139
<bound method CorpusBasedFeatureExtractor.get_tag_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>


100%|██████████| 3/3 [00:04<00:00,  1.38s/it]
100%|██████████| 3/3 [00:04<00:00,  1.39s/it]
100%|██████████| 3/3 [00:04<00:00,  1.41s/it]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_tag_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>: 12.568570613861084
Time for <bound method CorpusBasedFeatureExtractor.get_overlap_score_doc2vec of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>: 0.001676321029663086
Time for <bound method CorpusBasedFeatureExtractor.get_overlap_score_sbert of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>: 0.003639698028564453
Time for <bound method CorpusBasedFeatureExtractor.get_outlier_score_doc2vec of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe501743ca0>>: 0.0029480457305908203
Time for <bound method CorpusBasedFeatureExtractor.get_outlier_score_sbert of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatur




In [10]:
# Recalculate the chunk features for the whole book, which is considered as one chunk
# (sentences_per_chunk=None); all other arguments match the chunked extractor.
cbfe_fulltext = CorpusBasedFeatureExtractor(lang, doc_paths, sentences_per_chunk=None, nr_features=100)

100%|██████████| 3/3 [00:04<00:00,  1.40s/it]


In [11]:
# Only the first (chunk-level) element of the returned pair is kept; it is then
# pickled with the highest available protocol (-1).
corpus_chunk_features_fulltext, _ = cbfe_fulltext.get_all_features()

fulltext_pickle_path = features_dir + 'corpus_chunk_features_fulltext' + '.pkl'
with open(fulltext_pickle_path, 'wb') as f:
    pickle.dump(corpus_chunk_features_fulltext, f, -1)

<bound method CorpusBasedFeatureExtractor.get_unigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>


100%|██████████| 3/3 [00:05<00:00,  1.70s/it]



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_unigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>: 39.06505727767944
<bound method CorpusBasedFeatureExtractor.get_unigram_distance_limited of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>


100%|██████████| 3/3 [00:04<00:00,  1.59s/it]



Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_unigram_distance_limited of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>: 40.08614683151245
<bound method CorpusBasedFeatureExtractor.get_bigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>


100%|██████████| 3/3 [00:04<00:00,  1.58s/it]
  0%|          | 0/3 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_bigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>: 4.98028302192688
<bound method CorpusBasedFeatureExtractor.get_trigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>


100%|██████████| 3/3 [00:04<00:00,  1.57s/it]
  0%|          | 0/3 [00:00<?, ?it/s]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_trigram_distance of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>: 4.917870044708252
<bound method CorpusBasedFeatureExtractor.get_tag_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>


100%|██████████| 3/3 [00:04<00:00,  1.39s/it]
100%|██████████| 3/3 [00:04<00:00,  1.44s/it]
100%|██████████| 3/3 [00:04<00:00,  1.35s/it]


Time for corpus_chunk_feature_mapping <bound method CorpusBasedFeatureExtractor.get_tag_distribution of <feature_extraction.corpus_based_feature_extractor.CorpusBasedFeatureExtractor object at 0x7fe4fece0c70>>: 12.559510707855225





In [12]:
# Load corpus-based features
# Read each <features_dir><name>.pkl back into memory. These files are the ones
# written by the corpus-feature cells above; never unpickle untrusted files.
loaded = {}
for name in (
    'corpus_chunk_features',
    'corpus_book_features',
    'corpus_chunk_features_fulltext',
):
    with open(features_dir + name + '.pkl', 'rb') as f:
        loaded[name] = pickle.load(f)

# Bind the loaded objects to the names used by later cells; pop() leaves no
# extra reference behind in the temporary dict.
corpus_chunk_features = loaded.pop('corpus_chunk_features')
corpus_book_features = loaded.pop('corpus_book_features')
corpus_chunk_features_fulltext = loaded.pop('corpus_chunk_features_fulltext')

In [13]:
# Preview only the first rows: displaying the whole frame would flood the
# notebook once the document limit is lifted.
corpus_chunk_features_fulltext.head()

Unnamed: 0,book_name,unigram_distance,unigram_distance_limited,bigram_distance,trigram_distance,pos_unigram_PUNCT,pos_unigram_NOUN,pos_unigram_ADV,pos_unigram_PRON,pos_unigram_DET,...,pos_trigram_NOUN_ADV_ADV,pos_trigram_PUNCT_SCONJ_DET,pos_trigram_VERB_PUNCT_PUNCT,pos_trigram_PUNCT_ADV_PUNCT,pos_trigram_PRON_ADV_VERB,pos_trigram_ADP_NOUN_PUNCT,pos_trigram_BOS_BOS_ADP,pos_trigram_ADP_PRON_VERB,pos_trigram_VERB_PRON_DET,pos_trigram_VERB_PUNCT_CCONJ
0,Raabe_Wilhelm_Eine-Grabrede-aus-dem-Jahr-1609_...,0.082029,0.785397,0.011187,0.00248,0.14123,0.176958,0.094498,0.071845,0.116246,...,0.003267,0.004021,0.002765,0.001257,0.004021,0.007288,0.011561,0.004021,0.005026,0.003518
1,Moellhausen_Balduin_Die-Mandanen-Waise_1865,0.045865,0.625736,0.005807,0.000905,0.151232,0.150216,0.103993,0.106444,0.115176,...,0.004431,0.006359,0.004172,0.001398,0.004476,0.004183,0.004859,0.004893,0.005152,0.005457
2,Conradi_Hermann_Adam-Mensch_1889,0.049122,0.858093,0.00368,0.00102,0.200356,0.127888,0.138919,0.105587,0.088104,...,0.00487,0.002371,0.005166,0.008709,0.004573,0.004743,0.003627,0.004009,0.003571,0.003261


In [14]:
# Book features
# The two document-based collections are converted to DataFrames first.
document_book_features = pd.DataFrame(document_book_features)
document_chunk_features_fulltext = pd.DataFrame(document_chunk_features_fulltext)

# Join all book-level tables on 'book_name'. validate='one_to_one' makes pandas
# raise if a key is duplicated on either side. The first join is an outer
# join; the other two use merge's default (inner) join.
book_df = (
    document_book_features
    .merge(document_chunk_features_fulltext, on='book_name', how='outer', validate='one_to_one')
    .merge(corpus_book_features, on='book_name', validate='one_to_one')
    .merge(corpus_chunk_features_fulltext, on='book_name', validate='one_to_one')
)

# Chunk features
# Outer-join the document-based and corpus-based chunk tables on 'book_name'.
# validate='one_to_one' makes pandas raise if a key is duplicated on either side.
document_chunk_features = pd.DataFrame(document_chunk_features)
chunk_df = document_chunk_features.merge(
    corpus_chunk_features, on='book_name', how='outer', validate='one_to_one'
)

# Remove chunk id from book_name: keep only the first four '_'-separated fields.
# NOTE(review): assumes everything after the fourth field is the chunk id and
# that no book name has more than four fields — confirm against the name format.
chunk_df['book_name'] = chunk_df['book_name'].str.split('_').str[:4].str.join('_')

# Combine the book features with the per-book averages of the chunk features:
# one row per book. Columns present in both tables get pandas' default
# '_x' / '_y' suffixes.
averaged_chunk_df = chunk_df.groupby("book_name").mean().reset_index(drop=False)
book_and_averaged_chunk_df = book_df.merge(averaged_chunk_df, on="book_name")

# Attach the book features to every chunk of that book: one row per chunk.
# validate='many_to_one' makes pandas raise if a book appears twice in book_df.
chunk_and_copied_book_df = chunk_df.merge(
    book_df, on='book_name', how='outer', validate='many_to_one'
)

# Report the shape of each assembled table.
print(book_df.shape, chunk_df.shape, book_and_averaged_chunk_df.shape, chunk_and_copied_book_df.shape)

# Tables to export, keyed by the stem of the CSV file each is written to.
dfs = {
    'book_df': book_df,
    'book_and_averaged_chunk_df': book_and_averaged_chunk_df,
    'chunk_df': chunk_df,
    'chunk_and_copied_book_df': chunk_and_copied_book_df,
}

# Write each table, sorted by book_name (missing names first), to
# <features_dir><name>.csv. Then print whether it contains any missing value
# and the list of columns in which missing values occur.
for name, unsorted_df in dfs.items():
    df = unsorted_df.sort_values(by='book_name', na_position='first')
    df.to_csv(f"{features_dir}{name}.csv", index=False)

    nan_mask = df.isna()
    print(nan_mask.values.any())
    print(df.columns[nan_mask.any()].tolist())

# Stop the wall-clock timer started before feature extraction.
end = time.time()

(3, 1118) (60, 1110) (3, 2227) (60, 2227)
True
['doc2vec_stepwise_distance', 'sbert_stepwise_distance']
True
['doc2vec_stepwise_distance', 'sbert_stepwise_distance']
False
[]
True
['doc2vec_stepwise_distance', 'sbert_stepwise_distance']


In [15]:
# Total elapsed wall-clock time in seconds between the `start` and `end` timestamps.
end-start

238.25999474525452