# Hierarchical Clustering

In [1]:
import json
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
from nltk import word_tokenize
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage, ward, fcluster
from stop_words import get_stop_words
import seaborn as sns

from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

from utils import add_epoch_division, alter_epoch_division, linkage_matrix, plot_dendrogram, remove_noise_poet, text_cleaning

In [6]:
# --- Experiment switches ---------------------------------------------------
# SPLIT_BAROCK: relabel poems by year into "Frühbarock" / "Hochbarock".
# DOWNSAMPLED:  run random_downsampling() on the corpus before vectorizing.
# NOISELESS:    load the "noiseless" variant of the corpus CSV.
# DIM_RED and REDUCE_CORPUS are not referenced in the visible part of the notebook.
SPLIT_BAROCK = False
DIM_RED = False
DOWNSAMPLED = False
NOISELESS = True
REDUCE_CORPUS = False

# --- TF-IDF settings (passed to TfidfVectorizer) ---------------------------
LOWERCASE = True
MAX_FEATURES = 10000
STOP_WORDS = get_stop_words("de")

# --- Corpus location and the name used as prefix for saved figures ---------
PATH = "../corpora/amann_poems_noiseless.csv" if NOISELESS else "../corpora/amann_poems.csv"
CORPUS_NAME = "amann"

In [7]:
# Read the corpus CSV selected by PATH and pass it through the project's
# text_cleaning helper, then drop every poem whose epoch is "Sturm_Drang".
raw_corpus = pd.read_csv(PATH)
corpus = text_cleaning(raw_corpus)
not_sturm_drang = corpus["epoch"] != "Sturm_Drang"
corpus = corpus[not_sturm_drang]

In [8]:
def random_downsampling(corpus, class_col="epoch", max_value=14, random_state=None):
    """Randomly reduce every class to at most ``max_value`` rows.

    Classes that already have ``max_value`` rows or fewer are kept in full
    (their rows are still returned in shuffled order), so small classes no
    longer make the sampling fail.

    Args:
        corpus: DataFrame holding one row per instance.
        class_col: Name of the column with the class label.
        max_value: Upper bound on the number of rows kept per class.
        random_state: Optional seed (or random state object) forwarded to
            ``DataFrame.sample`` for reproducible results. ``None`` keeps
            the previous non-deterministic behaviour.

    Returns:
        A new DataFrame with the sampled rows of all classes concatenated in
        sorted class order; the original index labels are preserved.
    """
    subcorpora = []

    # np.unique returns the labels sorted, which fixes the order of the output.
    for label in np.unique(corpus[class_col]):
        subcorpus = corpus[corpus[class_col] == label]
        # Sampling without replacement cannot draw more rows than exist.
        n_keep = min(max_value, len(subcorpus))
        subcorpora.append(subcorpus.sample(n_keep, random_state=random_state))

    if not subcorpora:
        # pd.concat raises on an empty list; an empty corpus stays empty.
        return corpus.copy()

    return pd.concat(subcorpora, axis=0)

# When DOWNSAMPLED is set, replace the corpus by its randomly downsampled
# version (random_downsampling defaults: class_col="epoch", max_value=14);
# otherwise the corpus is left as it is.
corpus = random_downsampling(corpus) if DOWNSAMPLED else corpus

In [9]:
if SPLIT_BAROCK:
    # Relabel poems by year: before 1650 -> "Frühbarock", 1650-1699 ->
    # "Hochbarock". Poems from 1700 onwards keep their epoch label.
    # Boolean masks replace the former row-by-row iterrows loop; both masks
    # are computed from `year` before any label is written.
    # NOTE(review): the relabelling depends on `year` only, not on the
    # current epoch label - confirm that no non-baroque poem predates 1700.
    early_baroque = corpus.year < 1650
    high_baroque = (corpus.year >= 1650) & (corpus.year < 1700)
    corpus.loc[early_baroque, "epoch"] = "Frühbarock"
    corpus.loc[high_baroque, "epoch"] = "Hochbarock"

In [10]:
# TF-IDF representation of the poems. The settings are collected in one
# mapping so they can be read at a glance; max_df=0.5 is the only value that
# is not taken from the configuration constants.
tfidf_settings = {
    "max_df": 0.5,
    "lowercase": LOWERCASE,
    "max_features": MAX_FEATURES,
    "stop_words": STOP_WORDS,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the "poem" column and transform it in one step.
vector = vectorizer.fit_transform(corpus["poem"])

In [11]:
# Convenience views on the corpus: the poem texts, the first two characters
# of every epoch label, and the poem ids as a plain list.
text = corpus["poem"]
epoch_names = corpus["epoch"].values
shortened_classes = [name[:2] for name in epoch_names]
ids = list(corpus["id"].values)

### Ward-Algorithmus + Kosinus Ähnlichkeit

In [12]:
# Pairwise cosine distance (1 - cosine similarity) between the TF-IDF rows.
dist = 1 - cosine_similarity(vector)

# NOTE(review): `dist` is a square matrix. Per the SciPy documentation, ward()
# expects either a condensed distance matrix or an (observations x features)
# array, so a square matrix is presumably treated as observations. Left
# unchanged here because the distance thresholds used in the flat-clustering
# cell depend on the resulting scale - confirm whether this is intended.
# NOTE: this assignment rebinds `linkage_matrix` and thereby hides the helper
# of the same name imported from utils.
linkage_matrix = ward(dist)

fig, ax = plt.subplots(figsize=(15, 20))  # tall figure: one row per poem

# One leaf label per poem: "<poet>__<year>__<epoch>".
hierarchical_labels = [
    f"{poet}__{year}__{epoch}"
    for epoch, year, poet in zip(corpus.epoch.values,
                                 corpus.year.values,
                                 corpus.poet.values)
]

# Draw explicitly on the axes created above instead of the implicit current axes.
dendrogram(linkage_matrix,
           orientation="right",
           labels=hierarchical_labels,
           ax=ax)

# Hide the ticks and tick labels of the distance axis. Matplotlib expects
# booleans here; the string 'off' is truthy and would leave the ticks visible.
ax.tick_params(axis='x',
               which='both',
               bottom=False,
               top=False,
               labelbottom=False)

# The file name encodes the corpus name plus the active preprocessing switches.
figure_name = CORPUS_NAME

if DOWNSAMPLED:
    figure_name += "_ds"

if NOISELESS:
    figure_name += "_noiseless"

if SPLIT_BAROCK:
    figure_name += "_splitbarock"

fig.savefig(f'../results/hierarchical_clusters/{figure_name}.png', dpi=300)
plt.close(fig)

## Flatten clusters and save plots

A plot is saved only for those distance thresholds at which at least one of the flat clusters contains poems from a single epoch.

In [214]:
# For every distance threshold, cut the dendrogram into flat clusters and plot
# cluster id against epoch. Clusters whose poems all share one epoch are drawn
# as red crosses, all others as blue dots. A figure is written only when at
# least one single-epoch cluster exists at that threshold.
# The spelling "treshold" is kept in the variable and in the output paths so
# that the names of the generated files stay the same.
for treshold in range(2, 18):
    flat_clusters = fcluster(linkage_matrix, t=treshold, criterion='distance')

    # One row per poem with its metadata and the flat cluster it fell into.
    cr = pd.DataFrame({
        "epoch": corpus.epoch.values,
        "poet": corpus.poet.values,
        "year": corpus.year.values,
        "cluster": flat_clusters,
    })

    cluster_ids = np.unique(flat_clusters)
    n_clusters = len(cluster_ids)

    members_by_cluster = [cr[cr.cluster == cluster_id] for cluster_id in cluster_ids]
    is_pure = [len(np.unique(members.epoch)) == 1 for members in members_by_cluster]
    n_pure = sum(is_pure)

    if n_pure == 0:
        # Nothing would be saved for this threshold, so skip the plotting.
        continue

    # A dedicated figure per threshold; closing it below leaves no stray
    # empty figure behind.
    flat_fig, flat_ax = plt.subplots()

    for members, pure in zip(members_by_cluster, is_pure):
        if pure:
            flat_ax.scatter(members.cluster, members.epoch, c="#d62728", s=50, marker="X")
        else:
            flat_ax.scatter(members.cluster, members.epoch, c="#1f77b4")

    # Label only every second cluster id once the axis gets crowded.
    tick_step = 2 if n_clusters > 20 else 1
    flat_ax.set_xticks(list(range(0, n_clusters + 1, tick_step)))
    flat_ax.set_xlabel("Cluster")

    # Report how many of the clusters are single-epoch clusters. The previous
    # code used the leftover loop variable here, which always equalled the
    # total number of clusters.
    flat_ax.set_title(f" treshold: {treshold}\n cluster: {n_pure}/{n_clusters}")
    output_name = f"treshold{treshold}_{n_pure}of{n_clusters}"
    flat_fig.savefig(f"../results/hierarchical_clusters/treshold_img/{output_name}.png",
                     dpi=300, bbox_inches='tight')
    plt.close(flat_fig)

<Figure size 432x288 with 0 Axes>