# Corpus Modification

In [41]:
import json
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from stop_words import get_stop_words
import seaborn as sns

from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import LabelEncoder

from utils import add_epoch_division, linkage_matrix, plot_dendrogram, remove_noise_poet, text_cleaning

In [42]:
corpus = text_cleaning(pd.read_csv("../corpora/german_poems.csv"))

In [44]:
with open("epochs.json") as f:
    epochs = json.loads(f.read())

epochs = epochs["brenner"]
epoch_exception = "Klassik_Romantik"
corpus = add_epoch_division(corpus, epochs, epoch_exception=epoch_exception)

In [4]:
LOWERCASE = True
MAX_FEATURES = 10000
STOP_WORDS = get_stop_words("de")

In [74]:
def merge_corpus_poets(corpus, min_count=6):
    """ Merge poems in corpus by poet. Epoch with the most entries will be chosen.
    """
    df = corpus.copy()
    poets = [k for k, v in dict(df.poet.value_counts()).items() if v >= min_count]
    df = df[df.poet.isin(poets)]
    new_poems = {}

    for idx, poet in enumerate(list(np.unique(df.poet))):
        pcorpus = df[df.poet == poet]
        epochs = dict(pcorpus.epoch.value_counts())
        s = " ".join(pcorpus.poem)
        new_poems[idx] = [idx, poet, s, max(epochs)]
        
    mod_c = pd.DataFrame.from_dict(new_poems).T
    mod_c.columns = ["id", "poet", "poem", "epoch"]
    
    return mod_c

In [76]:
mod_c = merge_corpus_poets(corpus)

In [77]:
mod_c.epoch.value_counts()

Realismus            69
Frühaufklärung       41
Aufklärung           39
Barock               36
Biedermeier          35
Moderne               7
                      4
Weimarer_Republik     2
Name: epoch, dtype: int64