# Hierarchical Clustering

In [12]:
import json
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from nltk import word_tokenize
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage, ward, fcluster
from stop_words import get_stop_words
import seaborn as sns

from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

from utils import add_epoch_division, alter_epoch_division, linkage_matrix, plot_dendrogram, remove_noise_poet, text_cleaning

In [2]:
# --- Corpus -------------------------------------------------------------
CORPUS_NAME = "amann"                # base name of the saved figure
PATH = "../corpora/amann_poems.csv"  # CSV file the corpus is read from

# --- Switches -----------------------------------------------------------
DOWNSAMPLED = False    # True: cap every class via random_downsampling
DIM_RED = False        # not referenced in the cells visible in this notebook excerpt
REDUCE_CORPUS = False  # not referenced in the cells visible in this notebook excerpt

# --- TF-IDF vectorizer settings -----------------------------------------
LOWERCASE = True
MAX_FEATURES = 10_000
STOP_WORDS = get_stop_words("de")

In [4]:
# Read the poems from disk, pass them through the project's cleaning helper,
# and leave out every row whose epoch is "Sturm_Drang".
poems_raw = pd.read_csv(PATH)
poems_cleaned = text_cleaning(poems_raw)
corpus = poems_cleaned.loc[poems_cleaned["epoch"] != "Sturm_Drang"]

In [8]:
def random_downsampling(corpus, class_col="epoch", max_value=14, random_state=None):
    """Randomly cap the number of rows per class.

    For every distinct value in ``corpus[class_col]`` at most ``max_value``
    rows are drawn without replacement. Classes that have fewer than
    ``max_value`` rows are kept completely (in shuffled order) instead of
    raising an error.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Table containing the column ``class_col``.
    class_col : str
        Name of the column holding the class labels.
    max_value : int
        Upper bound for the number of rows kept per class.
    random_state : int, numpy random generator or None
        Forwarded to ``DataFrame.sample``; pass a fixed value to make the
        selection reproducible. ``None`` keeps the sampling non-deterministic.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all classes, concatenated in sorted class order.
        The original index values are preserved.
    """
    if len(corpus) == 0:
        # pd.concat refuses an empty list, so hand back an empty copy instead.
        return corpus.copy()

    subcorpora = []
    for label in np.unique(corpus[class_col]):
        subcorpus = corpus[corpus[class_col] == label]
        # Never ask for more rows than the class actually has.
        n_keep = min(max_value, len(subcorpus))
        subcorpora.append(subcorpus.sample(n=n_keep, random_state=random_state))

    return pd.concat(subcorpora, axis=0)

# Apply the per-class cap only when the DOWNSAMPLED switch is set;
# otherwise the corpus is kept as it is.
corpus = random_downsampling(corpus) if DOWNSAMPLED else corpus

In [9]:
# TF-IDF settings collected in one place, then fitted on the poem texts.
tfidf_options = {
    "max_df": 0.5,
    "lowercase": LOWERCASE,
    "max_features": MAX_FEATURES,
    "stop_words": STOP_WORDS,
}
vectorizer = TfidfVectorizer(**tfidf_options)
vector = vectorizer.fit_transform(corpus["poem"])

In [10]:
# Convenience views on the corpus: the poem texts, the first two characters
# of every epoch label, and the ids as a plain list.
text = corpus["poem"]
shortened_classes = list(map(lambda epoch_name: epoch_name[:2], corpus["epoch"].values))
ids = list(corpus["id"].values)

- Ward-Algorithmus
- Kosinus-Ähnlichkeit

In [68]:
# Pairwise cosine distance between the TF-IDF vectors of all poems.
dist = 1 - cosine_similarity(vector)

# NOTE(review): `dist` is a square n x n matrix. SciPy's `ward` interprets a
# 2-D input as observation vectors rather than as precomputed distances, so
# the linkage is built on Euclidean distances between the rows of `dist`.
# Left unchanged on purpose: the distance threshold used for the flat
# clustering further down depends on the resulting merge heights -- confirm
# whether this is intended before switching to a condensed distance matrix.
# This assignment also rebinds the name `linkage_matrix`, which the import
# cell binds to a helper from `utils`.
linkage_matrix = ward(dist)

# One leaf label per poem: "<poet>__<year>__<epoch>".
hierarchical_labels = [
    f"{poet}__{year}__{epoch}"
    for epoch, year, poet in zip(corpus.epoch.values,
                                 corpus.year.values,
                                 corpus.poet.values)
]

fig, ax = plt.subplots(figsize=(15, 20))

# Draw onto the axes created above; `ax` stays the Axes object instead of
# being overwritten by the dict that `dendrogram` returns.
dendrogram(linkage_matrix, orientation="right", labels=hierarchical_labels, ax=ax)

# Hide ticks and tick labels on the x axis. Booleans are required here: the
# strings 'off' are truthy and do not switch anything off in current
# matplotlib versions.
ax.tick_params(axis="x",
               which="both",
               bottom=False,
               top=False,
               labelbottom=False)

figure_name = CORPUS_NAME

if DOWNSAMPLED:
    figure_name += "_ds"

fig.savefig(f'../results/hierarchical_clusters/{figure_name}.png', dpi=300)
plt.close(fig)

In [71]:
# Turn the hierarchy into flat cluster labels using a fixed distance threshold.
cut_distance = 5
flat_clusters = fcluster(linkage_matrix, criterion="distance", t=cut_distance)

In [72]:
# Display the flat cluster labels produced by `fcluster` (one entry per observation).
flat_clusters

array([3, 2, 7, 1, 2, 3, 5, 5, 2, 5, 2, 6, 2, 2, 6, 2, 5, 5, 5, 6, 6, 2,
       2, 5, 7, 6, 6, 1, 2, 2, 2, 7, 3, 4, 1, 2, 2, 2, 7, 1, 1, 7, 7, 2,
       1, 1, 7, 2, 2, 6, 2, 5, 2, 1, 2, 5, 7, 2, 2, 6, 2, 2, 2, 2, 2, 1,
       7, 7, 5, 2, 2, 4, 7, 2, 1, 7, 6, 6, 1, 2, 2, 5, 5, 4, 6, 1, 7, 2,
       7, 7, 7, 4, 2, 5, 5, 5, 2, 5, 1, 2, 5, 1, 5, 2, 2, 1, 2, 1, 5, 2,
       1, 2, 2, 3, 2, 6, 4, 2, 2, 7, 7, 2, 2, 1, 1, 1, 2, 4, 7, 7, 7, 2,
       7, 5, 7, 2, 2, 2, 5, 5, 7, 1, 1, 1, 2, 2, 2, 2, 2, 1, 3, 3, 2, 2,
       2, 2, 1, 2, 7, 7, 2, 2, 2, 3, 2, 1, 7, 3, 6, 7, 3, 1, 2, 7, 7, 1,
       5, 2, 7, 7, 2, 5, 2, 2, 2, 6, 5, 3, 2, 1, 2, 2, 5, 1, 5, 7, 2, 6,
       1, 5, 1, 1, 3, 2, 7, 2, 1, 2, 2, 4, 6, 2, 3, 3, 2, 5, 1, 5, 6, 2,
       1, 2, 5, 1, 2, 2, 2, 4, 2, 2, 2, 7, 6, 2, 2, 7, 6, 6, 6, 7],
      dtype=int32)

In [73]:
# One record per poem, keyed by its running position (0, 1, 2, ...). Each
# record pairs the poem's metadata with the flat cluster it was assigned to.
d = {
    position: {"epoch": epoch, "poet": poet, "year": year, "cluster": cluster}
    for position, (epoch, poet, year, cluster) in enumerate(
        zip(corpus.epoch, corpus.poet, corpus.year, flat_clusters)
    )
}

In [77]:
# One row per record of `d`: the outer keys become the index, the inner dict
# keys become the columns. Building the frame row-wise lets pandas infer a
# dtype per column instead of the all-object columns that result from
# transposing a column-wise frame.
cluster_results = pd.DataFrame.from_dict(d, orient="index")

In [78]:
# Keep only the rows labelled with the epoch "Barock".
is_barock = cluster_results["epoch"] == "Barock"
barock = cluster_results.loc[is_barock]

In [79]:
# Number of "Barock" rows that fall into each flat cluster.
barock["cluster"].value_counts()

3    13
6    12
4     8
7     3
5     1
2     1
Name: cluster, dtype: int64

In [84]:
# The "Barock" rows in ascending order of year.
barock.sort_values("year", ascending=True)

Unnamed: 0,epoch,poet,year,cluster
209,Barock,"Spee, Friedrich",1613,4
71,Barock,"Gloger, Georg",1617,4
170,Barock,"Opitz, Martin",1619,3
227,Barock,"Weckherlin, Georg Rodolf",1623,4
116,Barock,"Hoyers, Anna Ovena",1624,4
59,Barock,"Fleming, Paul",1624,6
150,Barock,"Logau, Friedrich von",1630,3
32,Barock,"Czepko von Reigersfeld, Daniel",1632,3
33,Barock,"Dach, Simon",1632,4
91,Barock,"Harsdörffer, Georg Philipp",1632,4
