In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.insert(0, "../src/")
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from handcrafted_features import DocBasedFeatureExtractor, Doc2VecChunkVectorizer, CorpusBasedFeatureExtractor
from utils import get_doc_paths, read_labels

raw_docs_dir = "../data/raw_docs/"
labels_dir = "../data/labels/"
extracted_features_dir = "../data/extracted_features/"

lang = "eng"
doc_paths = get_doc_paths(raw_docs_dir, lang)[:3]

sentences_per_chunk = 200
# d2vcv = Doc2VecChunkVectorizer(lang, sentences_per_chunk)
# d2vcv.fit_transform(doc_paths)


all_chunk_based_features = []
all_book_based_features = []
all_average_sbert_sentence_embeddings = []
all_doc2vec_chunk_embeddings = []
for doc_path in tqdm(doc_paths):
    fe = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk)
    chunk_based_features, book_based_features, average_sbert_sentence_embeddings, doc2vec_chunk_embeddings = fe.get_all_features()
    all_chunk_based_features.extend(chunk_based_features)
    all_book_based_features.append(book_based_features)
    all_average_sbert_sentence_embeddings.append(average_sbert_sentence_embeddings)
    all_doc2vec_chunk_embeddings.append(doc2vec_chunk_embeddings)


100%|██████████| 3/3 [00:15<00:00,  5.01s/it]


In [2]:
cbfe = CorpusBasedFeatureExtractor(lang, doc_paths, all_average_sbert_sentence_embeddings=None, all_doc2vec_chunk_embeddings=None)
all_corpus_based_features = cbfe.get_all_features(k=50)


 33%|███▎      | 1/3 [00:00<00:00,  7.21it/s]

../data/raw_docs/eng/Radcliffe_Ann_The-Italian_1797.txt
../data/raw_docs/eng/Gissing_George_In-the-Year-of-Jubilee_1894.txt


100%|██████████| 3/3 [00:00<00:00,  7.83it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

../data/raw_docs/eng/Collins_Wilkie_Armadale_1864.txt


100%|██████████| 3/3 [00:00<00:00,  8.07it/s]


In [3]:
# book name x words
mfws = all_corpus_based_features
print(mfws.std(axis=1))

0    0.010713
1    0.010713
2    0.010713
dtype: float64


In [None]:
# absolute values

In [5]:
class MFWTable():
    def __init__(self, mfws):
        # book name x words
        self.mfws = mfws.set_index('book_name', drop=True)
        self.normalized = {}

    def get_normalization(self, normalization):
        
        def zscore(self):
            normalized_mfws = (self.mfws - self.mfws.mean(axis=0))/self.mfws.std(axis=0)
            return normalized_mfws
        
        if normalization in self.normalized:
            normalized_mfws = self.normalized[normalization]
        else:
            if normalization == 'zscore':
                normalized_mfws = zscore()
        
        self.normalized[normalization] = normalized_mfws
        return normalized_mfws
    
n = MFWTable(mfws)
n.get_normalization('zscore')

TypeError: zscore() missing 1 required positional argument: 'self'

In [None]:
class _FunctionRegistry():
    def __init__():
        self.deltas = {}
        
    def add_delta(self, delta):
        self.deltas[delta.name] = delta
        
    
registry = _FunctionRegistry()

class Delta():
    def __init__(self, name, normalization, distance):
        self.name = name
        self.normalization = normalization
        self.distance = distance
        # register when instantiated
        registry.add_delta(self)
        
    # call from registry with argument
    def __call__(self, mfws):
        df = pd.DataFrame(index=corpus.index, columns=corpus.index)
        for a, b in combinations(df.index, 2):
            delta = self.distance(corpus.loc[a,:], corpus.loc[b,:], *args, **kwargs)
            df.at[a, b] = delta
            df.at[b, a] = delta
        return df.fillna(0)
    
    def prepare(self, corpus):
        for normalization in self.normalizations:
            corpus = normalization(corpus)
        return corpus

    def __call__(self, corpus):
        return self.create_result(self.basis(self.prepare(corpus)), corpus)

        
Delta('Burrows', 'zscore', 'manhattan')
for delta in registry:
    delta(mfws)

In [None]:
import os
import pandas as pd

book_df = pd.DataFrame(all_book_based_features)
book_df = book_df.merge(all_corpus_based_features, on="book_name")
book_and_averaged_chunk_df = book_df.merge(pd.DataFrame(all_chunk_based_features).groupby("book_name").mean().reset_index(drop=False), on="book_name")

chunk_df = pd.DataFrame(all_chunk_based_features)
chunk_and_copied_book_df = chunk_df.merge(pd.DataFrame(all_book_based_features), on="book_name")
chunk_and_copied_book_df = chunk_and_copied_book_df.merge(all_corpus_based_features, on="book_name")

os.makedirs(f"{extracted_features_dir}/{lang}", exist_ok=True)
book_df.to_csv(f"{extracted_features_dir}/{lang}/book_df.csv", index=False)
book_and_averaged_chunk_df.to_csv(f"{extracted_features_dir}/{lang}/book_and_averaged_chunk_df.csv", index=False)
chunk_df.to_csv(f"{extracted_features_dir}/{lang}/chunk_df.csv", index=False)
chunk_and_copied_book_df.to_csv(f"{extracted_features_dir}/{lang}/chunk_and_copied_book_df.csv", index=False)


In [None]:
import sys
sys.path.insert(0, "../src/")
import numpy as np
import pandas as pd

extracted_features_dir = "../data/extracted_features/"
labels_dir = "../data/labels/"
lang = "eng"

book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_df.csv")
book_and_averaged_chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_and_averaged_chunk_df.csv")
chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_df.csv")
chunk_and_copied_book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_and_copied_book_df.csv")
