# Corpus Modification

In [42]:
import json
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from stop_words import get_stop_words
import seaborn as sns

from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import LabelEncoder

from utils import add_epoch_division, linkage_matrix, plot_dendrogram, remove_noise_poet, text_cleaning

In [31]:
# Load the raw poem table from CSV and pass it through the project's
# text_cleaning helper (imported from utils; its exact behaviour is not
# visible in this notebook).
corpus = text_cleaning(pd.read_csv("../corpora/german_poems.csv"))

In [32]:
# Load the epoch definitions and keep only the entry stored under "amann".
# The encoding is stated explicitly so that reading the file does not depend
# on the platform's default encoding.
with open("epochs.json", encoding="utf-8") as f:
    epochs = json.load(f)["amann"]

# "Sturm_Drang" is handed to add_epoch_division as an exception; how exceptions
# are treated is defined in utils and not visible here.
epoch_exceptions = ["Sturm_Drang"]
corpus = add_epoch_division(corpus, epochs, epoch_exceptions=epoch_exceptions)

In [33]:
# Text-vectorisation settings.
# NOTE(review): presumably passed to TfidfVectorizer in later cells — confirm
# against the cells that actually use them.
LOWERCASE = True
MAX_FEATURES = 10000
# Stop-word list for language code "de" from the stop_words package.
STOP_WORDS = get_stop_words("de")

In [34]:
def merge_corpus_poets(corpus, min_count=6):
    """Merge all poems of a poet within one epoch into a single document.

    Poets with fewer than ``min_count`` poems are dropped first; the count is
    taken over all poems of a poet, not per epoch. For every remaining
    (poet, epoch) combination the poems are joined with single spaces, so a
    poet who appears in several epochs yields several rows, one per epoch.
    Poems attributed to the anonymous label ``"N. N.,"`` are skipped.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Needs the columns ``poet``, ``poem``, ``year`` and ``epoch``.
        The frame is not modified.
    min_count : int, default 6
        Minimum number of poems a poet must have to be kept.

    Returns
    -------
    pandas.DataFrame
        One row per (poet, epoch) with the columns ``id`` (running number
        starting at 0), ``poet``, ``poem`` (joined text), ``year`` (mean
        publication year, truncated to int) and ``epoch``. Rows are ordered
        by poet name; the epochs of a poet keep their order of first
        appearance. If no poet qualifies, an empty frame with the same
        columns is returned.

    Raises
    ------
    ValueError
        If the mean year of a group is NaN (``int`` cannot convert it).
    """
    columns = ["id", "poet", "poem", "year", "epoch"]

    # Keep only poets with at least min_count poems and drop the anonymous label.
    poem_counts = corpus.poet.value_counts()
    frequent_poets = poem_counts[poem_counts >= min_count].index
    df = corpus[corpus.poet.isin(frequent_poets) & (corpus.poet != "N. N.,")]

    # Collect plain records and build the frame once at the end; this keeps
    # integer dtypes for id/year and also works when nothing is left.
    records = []
    for poet, poet_poems in df.groupby("poet"):
        # sort=False keeps the epochs in their order of first appearance.
        # Note that groupby leaves out rows whose epoch is missing (NaN).
        for epoch, epoch_poems in poet_poems.groupby("epoch", sort=False):
            merged_text = " ".join(epoch_poems.poem)
            mean_year = int(epoch_poems.year.mean())
            records.append([len(records), poet, merged_text, mean_year, epoch])

    return pd.DataFrame(records, columns=columns)

In [35]:
# Collapse the corpus to one merged document per poet and epoch, using the
# default threshold of at least 6 poems per poet.
mod_c = merge_corpus_poets(corpus)

In [36]:
# Number of merged documents per epoch.
# NOTE(review): the stored output lists one epoch with an empty label —
# presumably poems that add_epoch_division could not assign; confirm.
mod_c.epoch.value_counts()

Aufklärung         64
Klassik            61
Realismus          50
Barock             45
                   34
Biedermeier        28
Naturalismus       22
Expressionismus    15
Romantik           13
Name: epoch, dtype: int64

In [41]:
# Run the project's cleaning helper again, this time on the merged corpus.
# NOTE(review): the single-letter name `c` says little about its content; a
# descriptive name would be clearer, but later cells may depend on it.
c = text_cleaning(mod_c)

In [None]:
def replace_poets(text):
    """Normalise known spelling variants of poet names in ``text``.

    Each rule rewrites one variant to the spelling used for the same poet
    elsewhere in the corpus. The rules are applied in order, and applying the
    function to its own output changes nothing.

    Parameters
    ----------
    text : str
        String that may contain one or more poet names.

    Returns
    -------
    str
        ``text`` with every known variant replaced.
    """
    substitutions = (
        ('Assmann', 'Aßmann'),
        ('Czepko, Daniel von', 'Czepko von Reigersfeld, Daniel'),
        # The lookahead skips names that already end in "von"; without it an
        # already-correct name would turn into "... Wolfgang von von".
        (r'Goethe, Johann Wolfgang(?! von\b)', 'Goethe, Johann Wolfgang von'),
        ('Hoffmannswaldau, Christian Hoffmann von', 'Hofmann von Hofmannswaldau, Christian'),
        ('Hofmannswaldau, Christian Hofmann von', 'Hofmann von Hofmannswaldau, Christian'),
        ('Karsch, Anna Luise', 'Karsch, Anna Louisa'),
        ('Kosegarten, Gotthard Ludwig', 'Kosegarten, Ludwig Gotthard'),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

In [40]:
# List every distinct poet name to spot spelling variants of the same person.
# NOTE(review): the stored output contains such variants (e.g. 'Assmann' vs.
# 'Aßmann'); replace_poets is defined above but not applied in the cells
# visible here.
list(mod_c.poet.unique())

['Abschatz, Hans Assmann von',
 'Abschatz, Hans Aßmann von',
 'Ahlefeld, Charlotte von',
 'Angelus Silesius',
 'Arndt, Ernst Moritz',
 'Arnim, Ludwig Achim von',
 'Arnold, Gottfried',
 'Aston, Louise',
 'Ball, Hugo',
 'Baudelaire, Charles',
 'Bechstein, Ludwig',
 'Bierbaum, Otto Julius',
 'Birken, Sigmund von',
 'Blumauer, Aloys',
 'Bodenstedt, Friedrich von',
 'Bodmer, Johann Jacob',
 'Boie, Heinrich Christian',
 'Brentano, Clemens',
 'Brinckman, John',
 'Brockes, Barthold Heinrich',
 'Busch, Wilhelm',
 'Büchner, Luise',
 'Bürger, Gottfried August',
 'Candidus, Karl',
 'Canitz, Friedrich Rudolph Ludwig von',
 'Celander (auch Johann Georg Gressel)',
 'Chamisso, Adelbert von',
 'Christen, Ada',
 'Conradi, Hermann',
 'Cronegk, Johann Friedrich von',
 'Czepko von Reigersfeld, Daniel',
 'Czepko, Daniel von',
 'Dach, Simon',
 'Dahn, Felix',
 'Daumer, Georg Friedrich',
 'Dauthendey, Max',
 'Dehmel, Richard Fedor Leopold',
 'Denis, Michael',
 'Dingelstedt, Franz von',
 'Dranmor, (Schmid, Ludw