# Test

In [5]:
import json
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from stop_words import get_stop_words
import seaborn as sns

from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, v_measure_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, Normalizer

from yellowbrick.text import UMAPVisualizer
from yellowbrick.style import set_palette

from utils import add_epoch_division, linkage_matrix, merge_corpus_poets, plot_dendrogram, remove_noise_poet, text_cleaning



In [6]:
# Vectorizer settings consumed by the TfidfVectorizer cell further down.
LOWERCASE = True  # passed as TfidfVectorizer(lowercase=...)
MAX_FEATURES = 10000  # passed as TfidfVectorizer(max_features=...)
STOP_WORDS = get_stop_words("de")  # stop-word list for language code "de"

In [4]:
import numpy as np
import pandas as pd
import utils

from sklearn.decomposition import LatentDirichletAllocation

In [3]:
# Load the poem corpus from CSV.
# NOTE(review): later cells read the `poem`, `epoch`, `year` and `pid` columns, but the
# `corpus.head(3)` preview stored in this notebook shows no `epoch` column — confirm
# which version of the file the `epoch` labels come from.
corpus = pd.read_csv("../corpora/amann_poems.csv")

In [7]:
# Raw poem texts and integer-encoded epoch labels (one id per distinct epoch string).
text = corpus["poem"].values
labels = LabelEncoder().fit_transform(corpus["epoch"].values)
unique_epochs = list(np.unique(corpus["epoch"]))  # sorted distinct epoch names


# TF-IDF document-term matrix; max_df=0.5 ignores terms that occur in more than
# half of the poems.
vectorizer = TfidfVectorizer(max_df=0.5,
                             lowercase=LOWERCASE,
                             max_features=MAX_FEATURES,
                             stop_words=STOP_WORDS)
vector = vectorizer.fit_transform(text)

In [14]:
# Fit a 3-topic LDA model on the document-term matrix; random_state is pinned to 0.
# NOTE(review): `vector` holds TF-IDF weights (it comes from TfidfVectorizer); LDA is
# usually fit on raw term counts — confirm this is intended.
lda = LatentDirichletAllocation(n_components=3, random_state=0)
lda.fit(vector)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [15]:
# Look the vocabulary up once instead of once per printed word.
# `get_feature_names` is not available in newer scikit-learn releases, where
# `get_feature_names_out` replaces it; use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# For each topic, print the 15 highest-weighted terms (ascending by weight).
for index, topic in enumerate(lda.components_):
    print(f'Top 15 words for Topic #{index}')
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

Top 15 words for Topic #0
['seggt', 'heww', 'sick', 'vœr', 'hadd', 'ęr', 'weer', 'dor', 'dunn', 'vnd', 'sik', 'oll', 'vun', 'auß', 'auff']


Top 15 words for Topic #1
['tal', 'jhr', 'mut', 'rot', 'diß', 'muth', 'not', 'wol', 'hertz', 'auff', 'dieß', 'ew', 'itzt', 'tränen', 'sey']


Top 15 words for Topic #2
['wil', 'frewd', 'unnd', 'ewer', 'dier', 'drumb', 'vil', 'selbs', 'vmb', 'vns', 'dan', 'wan', 'auß', 'auff', 'vnd']




In [12]:
# Preview the first three rows of the corpus.
corpus.head(3)

Unnamed: 0,pid,filename,poet,title,year,poem,poemlength
0,38237,"Dahn, Felix_Ein Königsspiel_1873","Dahn, Felix",Ein Königsspiel,1873,Saß der König Artaxerxes In dem goldnen Haus z...,594
1,25612,"Tieck, Ludwig_[So wie ein Weiser schloß er sei...","Tieck, Ludwig",[So wie ein Weiser schloß er seinen Lauf],1813,"So wie ein Weiser schloß er seinen Lauf, Wohlt...",113
2,61526,"Rückert, Friedrich_UNTITLED_1837","Rückert, Friedrich",UNTITLED,1837,"Bei einem Lehrer ist von Schuelern eine Gilde,...",287


In [10]:
# Run the project's text-cleaning helper on the corpus and preview the result.
# NOTE(review): the three rows in the stored preview look identical to the raw corpus
# preview — confirm what `utils.text_cleaning` actually changes.
corpus2 = utils.text_cleaning(corpus)
corpus2.head(3)

Unnamed: 0,pid,filename,poet,title,year,poem,poemlength
0,38237,"Dahn, Felix_Ein Königsspiel_1873","Dahn, Felix",Ein Königsspiel,1873,Saß der König Artaxerxes In dem goldnen Haus z...,594
1,25612,"Tieck, Ludwig_[So wie ein Weiser schloß er sei...","Tieck, Ludwig",[So wie ein Weiser schloß er seinen Lauf],1813,"So wie ein Weiser schloß er seinen Lauf, Wohlt...",113
2,61526,"Rückert, Friedrich_UNTITLED_1837","Rückert, Friedrich",UNTITLED,1837,"Bei einem Lehrer ist von Schuelern eine Gilde,...",287


# JSON with epoch splitting

In [13]:
# Two alternative epoch schemes ("random" and "brenner"). Each maps an epoch name to
# its first year ("b") and last year ("e"); `add_epoch_division` treats both bounds
# as inclusive. Ranges of neighbouring epochs overlap in both schemes.
# NOTE(review): a later cell rebinds `epochs` to the "brenner" scheme loaded from
# epochs.json — confirm that file and this literal are kept in sync.
epochs = {"random": {
            "Barock": {"b": 1600, "e": 1720},
            "Aufklärung": {"b": 1720, "e": 1790},
            "Empfindsamkeit": {"b": 1740, "e": 1790},
            "Sturm und Drang": {"b": 1765, "e": 1790},
            "Klassik": {"b": 1786, "e": 1882},  # NOTE(review): 1882 may be a typo for 1832 (cf. "Klassik_Romantik" in "brenner") — confirm
            "Romantik": {"b": 1798, "e": 1835},
            "Biedermeier": {"b": 1815, "e": 1848},
            "Vormärz_Junges Deutschland": {"b": 1825, "e": 1848},
            "Realismus": {"b": 1848, "e": 1890},
            "Naturalismus": {"b": 1880, "e": 1900},
            "Moderne": {"b": 1890, "e": 1920},
            "Expressionismus": {"b": 1910, "e": 1925},
            "Avantgarde_Dadaismus": {"b": 1915, "e": 1925},
            "Literatur der Weimarer Republik_Neue Sachlichkeit": {"b": 1919, "e": 1932}},
         "brenner": {
            "Barock": {"b": 1600, "e": 1700},
            "Frühaufklärung": {"b": 1700, "e": 1755},
            "Aufklärung": {"b": 1755, "e": 1810},
            "Klassik_Romantik": {"b": 1786, "e": 1832},
            "Biedermeier": {"b": 1815, "e": 1848},
            "Realismus": {"b": 1848, "e": 1900},
            "Moderne": {"b": 1880, "e": 1918},
            "Weimarer Republik": {"b": 1918, "e": 1933}
         }}

In [38]:
import json

# Load the epoch schemes from disk and keep only the "brenner" scheme.
# The encoding is pinned because the epoch names contain non-ASCII characters
# (e.g. "Frühaufklärung") that a platform-default encoding may mis-decode.
with open("epochs.json", encoding="utf-8") as f:
    d = json.load(f)
epochs = d["brenner"]

In [39]:
# Keep a copy of the full scheme before an entry is removed from `epochs`.
# dict.copy() is shallow: the inner {"b", "e"} dicts are shared by both names.
epochs_d = epochs.copy()

In [40]:
# Drop the combined epoch from `epochs`; the membership guard keeps the cell
# idempotent, so re-running it does not raise KeyError.
if "Klassik_Romantik" in epochs:
    del epochs["Klassik_Romantik"]

In [41]:
# Show the copy taken before the deletion; it still lists "Klassik_Romantik".
epochs_d

{'Barock': {'b': 1600, 'e': 1700},
 'Frühaufklärung': {'b': 1700, 'e': 1755},
 'Aufklärung': {'b': 1755, 'e': 1810},
 'Klassik_Romantik': {'b': 1786, 'e': 1832},
 'Biedermeier': {'b': 1815, 'e': 1848},
 'Realismus': {'b': 1848, 'e': 1900},
 'Moderne': {'b': 1880, 'e': 1918},
 'Weimarer Republik': {'b': 1918, 'e': 1933}}

In [47]:
def add_epoch_division(corpus, epochs, epoch_exception=""):
    """Return a copy of ``corpus`` with an added ``epoch`` column.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must provide a ``year`` column.
    epochs : dict
        Maps epoch name -> ``{"b": first_year, "e": last_year}``; both
        bounds are treated as inclusive.
    epoch_exception : str, optional
        Name of one epoch to leave out of the assignment. The default ``""``
        only excludes an epoch literally named ``""``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``corpus``; ``epoch`` holds whatever ``utils.get_epoch``
        returns for each year and the per-epoch year lists built here.
        The input frame is not modified.

    NOTE(review): a function with this name is also imported from ``utils``
    in the notebook's import cell; running this cell rebinds the name.
    """
    df = corpus.copy()

    # Expand every retained epoch into the explicit list of its years.
    epochs_d = {
        epoch: list(range(bounds["b"], bounds["e"] + 1))
        for epoch, bounds in epochs.items()
        if epoch != epoch_exception
    }

    # Map the year column directly instead of building a Series per row;
    # this also behaves sensibly for an empty corpus.
    df["epoch"] = df["year"].apply(lambda year: utils.get_epoch(year, epochs_d))
    return df

In [48]:
# Assign epochs using the full scheme, but leave "Klassik_Romantik" out of the assignment.
c2 = add_epoch_division(corpus, epochs_d, epoch_exception="Klassik_Romantik")

In [57]:
# Spot check: epoch assigned to the poem with pid 25612 (year 1813 in the corpus preview).
# Selecting by mask and position avoids depending on that row's index label being 1.
c2.loc[c2.pid == 25612, "epoch"].iloc[0]

''

In [18]:
# Attempt to load the epoch schemes straight into a DataFrame.
# NOTE(review): the error stored with this cell mentions `read_dict`, which this cell
# does not call — the output looks stale; re-run the cell or remove it.
pd.read_json("epochs.json")

AttributeError: module 'pandas' has no attribute 'read_dict'

In [3]:
from itertools import combinations, product

In [9]:
# Candidate values that the next cell combines into a cartesian product.
# NOTE(review): the names match DBSCAN's `eps`, `min_samples` and `metric` arguments —
# presumably a DBSCAN parameter search; confirm against the cell that consumes the grid.
eps_search = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
min_samples = [2, 3, 4, 5]
metrics = ["cosine", "euclidean"]

In [10]:
# Every (eps, min_samples, metric) combination: 6 * 4 * 2 = 48 tuples.
cartesian_inputs = list(product(eps_search, min_samples, metrics))

In [13]:
# Show the generated parameter combinations.
cartesian_inputs

[(0.1, 2, 'cosine'),
 (0.1, 2, 'euclidean'),
 (0.1, 3, 'cosine'),
 (0.1, 3, 'euclidean'),
 (0.1, 4, 'cosine'),
 (0.1, 4, 'euclidean'),
 (0.1, 5, 'cosine'),
 (0.1, 5, 'euclidean'),
 (0.3, 2, 'cosine'),
 (0.3, 2, 'euclidean'),
 (0.3, 3, 'cosine'),
 (0.3, 3, 'euclidean'),
 (0.3, 4, 'cosine'),
 (0.3, 4, 'euclidean'),
 (0.3, 5, 'cosine'),
 (0.3, 5, 'euclidean'),
 (0.5, 2, 'cosine'),
 (0.5, 2, 'euclidean'),
 (0.5, 3, 'cosine'),
 (0.5, 3, 'euclidean'),
 (0.5, 4, 'cosine'),
 (0.5, 4, 'euclidean'),
 (0.5, 5, 'cosine'),
 (0.5, 5, 'euclidean'),
 (0.7, 2, 'cosine'),
 (0.7, 2, 'euclidean'),
 (0.7, 3, 'cosine'),
 (0.7, 3, 'euclidean'),
 (0.7, 4, 'cosine'),
 (0.7, 4, 'euclidean'),
 (0.7, 5, 'cosine'),
 (0.7, 5, 'euclidean'),
 (0.9, 2, 'cosine'),
 (0.9, 2, 'euclidean'),
 (0.9, 3, 'cosine'),
 (0.9, 3, 'euclidean'),
 (0.9, 4, 'cosine'),
 (0.9, 4, 'euclidean'),
 (0.9, 5, 'cosine'),
 (0.9, 5, 'euclidean'),
 (1.0, 2, 'cosine'),
 (1.0, 2, 'euclidean'),
 (1.0, 3, 'cosine'),
 (1.0, 3, 'euclidean'),
 (1.0, 4, 