# Clustering evaluation

In [99]:
from collections import Counter
import json
import numpy as np
import pandas as pd

from utils import generate_wordcloud
from wordcloud import WordCloud

In [100]:
# Sub-directory of ../results/ that holds this experiment's JSON result files.
DIR = "amanns_experiments/"

# Result files of the standard runs, one per clustering algorithm.
RESULTS_PATH_KMEANS = f"../results/{DIR}kmeans_results_amann.json"
RESULTS_PATH_DBSCAN = f"../results/{DIR}dbscan_results_amann.json"
RESULTS_PATH_KMEDOIDS = f"../results/{DIR}kmedoids_results_amann.json"

# Result files carrying the "_rd" suffix (loaded under the "reduced" heading).
RESULTS_PATH_KMEANS_RD = f"../results/{DIR}kmeans_results_amann_rd.json"
RESULTS_PATH_DBSCAN_RD = f"../results/{DIR}dbscan_results_amann_rd.json"

# K-means result file carrying the "_noiseless" suffix.
RESULTS_PATH_KMEANS_NOISELESS = f"../results/{DIR}kmeans_results_amann_noiseless.json"

# NOTE(review): these ARI bounds are not used by any cell shown here --
# presumably axis/colour limits for a plot; confirm before removing.
MAX_ARI = 0.5
MIN_ARI = -0.1

### standard

In [101]:
# Load the k-means results. The encoding is pinned to UTF-8 because the keys
# contain non-ASCII characters (e.g. "Aufklärung") and open() would otherwise
# decode the file with the platform's locale encoding.
with open(RESULTS_PATH_KMEANS, encoding="utf-8") as f:
    kmeans = json.load(f)

# Split every entry into its "scores" record and its "tw" (top words) record,
# keeping the JSON keys (epoch pairs such as "Barock/Naturalismus").
kmeans_scores = {pair: result["scores"] for pair, result in kmeans.items()}
kmeans_top_words = {pair: result["tw"] for pair, result in kmeans.items()}

In [47]:
# Load the DBSCAN results. The encoding is pinned to UTF-8 because the keys
# contain non-ASCII characters (e.g. "Barock/Aufklärung") and open() would
# otherwise decode the file with the platform's locale encoding.
with open(RESULTS_PATH_DBSCAN, encoding="utf-8") as f:
    dbscan = json.load(f)

# Keep only the "scores" record of every entry, keyed by the JSON key (epoch pair).
dbscan_scores = {pair: result["scores"] for pair, result in dbscan.items()}

In [55]:
# Load the k-medoids results. The encoding is pinned to UTF-8 because the keys
# contain non-ASCII characters (e.g. "Aufklärung/Realismus") and open() would
# otherwise decode the file with the platform's locale encoding.
with open(RESULTS_PATH_KMEDOIDS, encoding="utf-8") as f:
    kmedoids = json.load(f)

# Keep only the "scores" record of every entry, keyed by the JSON key (epoch pair).
kmedoids_scores = {pair: result["scores"] for pair, result in kmedoids.items()}

### reduced

In [43]:
# Load the "_rd" k-means results. The encoding is pinned to UTF-8 because keys
# and top words contain non-ASCII characters (e.g. "Aufklärung", "mädchen")
# and open() would otherwise decode the file with the platform's locale encoding.
with open(RESULTS_PATH_KMEANS_RD, encoding="utf-8") as f:
    kmeans_rd = json.load(f)

# Split every entry into its "scores" record and its "tw" (top words) record,
# keeping the JSON keys (epoch pairs such as "Barock/Klassik").
kmeans_rd_scores = {pair: result["scores"] for pair, result in kmeans_rd.items()}
kmeans_rd_top_words = {pair: result["tw"] for pair, result in kmeans_rd.items()}

In [56]:
# Load the "_rd" DBSCAN results. The encoding is pinned to UTF-8 because the
# keys contain non-ASCII characters (e.g. "Aufklärung/Naturalismus") and open()
# would otherwise decode the file with the platform's locale encoding.
with open(RESULTS_PATH_DBSCAN_RD, encoding="utf-8") as f:
    dbscan_rd = json.load(f)

# Keep only the "scores" record of every entry, keyed by the JSON key (epoch pair).
dbscan_scores_rd = {pair: result["scores"] for pair, result in dbscan_rd.items()}

### noiseless

In [44]:
# Load the "_noiseless" k-means results. The encoding is pinned to UTF-8 so
# decoding does not depend on the platform's locale encoding.
# NOTE(review): keys are presumably epoch pairs with umlauts, as in the other
# result files -- confirm against the JSON.
with open(RESULTS_PATH_KMEANS_NOISELESS, encoding="utf-8") as f:
    kmeans_noiseless = json.load(f)

# Split every entry into its "scores" record and its "tw" (top words) record.
kmeans_noiseless_scores = {
    key: result["scores"] for key, result in kmeans_noiseless.items()
}
kmeans_noiseless_top_words = {
    key: result["tw"] for key, result in kmeans_noiseless.items()
}

### Score DataFrames

In [102]:
# One score table per k-means variant: rows are the result keys, columns the
# score fields; every table is ordered by descending ARI.
kmeans_score_df = (
    pd.DataFrame.from_dict(kmeans_scores, orient="index")
    .sort_values(by="ari", ascending=False)
)
kmeans_score_rd_df = (
    pd.DataFrame.from_dict(kmeans_rd_scores, orient="index")
    .sort_values(by="ari", ascending=False)
)
kmeans_noiseless_score_df = (
    pd.DataFrame.from_dict(kmeans_noiseless_scores, orient="index")
    .sort_values(by="ari", ascending=False)
)

In [85]:
# Score tables for the two DBSCAN variants, ordered by descending ARI.
dbscan_score_df = (
    pd.DataFrame.from_dict(dbscan_scores, orient="index")
    .sort_values(by="ari", ascending=False)
)
dbscan_score_rd_df = (
    pd.DataFrame.from_dict(dbscan_scores_rd, orient="index")
    .sort_values(by="ari", ascending=False)
)

In [57]:
# Score table for k-medoids, ordered by descending ARI.
kmedoids_score_df = (
    pd.DataFrame.from_dict(kmedoids_scores, orient="index")
    .sort_values(by="ari", ascending=False)
)

## Best Epochs

In [126]:
# Maps a clustering setup name to the epoch pairs that clear the ARI threshold;
# filled in by the cells below.
best_epochs = dict()
# Minimum ARI a pair must reach to be counted among the best results.
min_score = 0.7

In [127]:
# Epoch pairs whose k-means ARI reaches min_score, in descending-ARI order.
kmeans_best = kmeans_score_df.loc[kmeans_score_df["ari"] >= min_score].index.tolist()
best_epochs["kmeans"] = kmeans_best
# Show the seven highest-ARI pairs for reference.
kmeans_score_df.head(7)

Unnamed: 0,ari,vm
Barock/Naturalismus,0.80549,0.746453
Barock/Realismus,0.773402,0.729663
Aufklärung/Naturalismus,0.745326,0.621077
Barock/Klassik,0.688026,0.651029
Barock/Aufklärung,0.541105,0.53055
Barock/Biedermeier,0.335736,0.423804
Aufklärung/Realismus,0.293688,0.394675


In [128]:
# Epoch pairs whose k-medoids ARI reaches min_score, in descending-ARI order.
kmedoids_best = kmedoids_score_df.loc[kmedoids_score_df["ari"] >= min_score].index.tolist()
best_epochs["kmedoids"] = kmedoids_best
# Show the seven highest-ARI pairs for reference.
kmedoids_score_df.head(7)

Unnamed: 0,ari,vm
Barock/Realismus,0.860581,0.811946
Barock/Klassik,0.836902,0.784668
Barock/Expressionismus,0.683957,0.610906
Barock/Naturalismus,0.632206,0.537986
Aufklärung/Realismus,0.550768,0.559927
Barock/Romantik,0.543027,0.498014
Barock/Aufklärung,0.430008,0.353975


In [129]:
# Epoch pairs whose DBSCAN ARI reaches min_score, in descending-ARI order.
dbscan_best = dbscan_score_df.loc[dbscan_score_df["ari"] >= min_score].index.tolist()
best_epochs["dbscan"] = dbscan_best
# Show the seven highest-ARI pairs for reference.
dbscan_score_df.head(7)

Unnamed: 0,ari,vm
Barock/Realismus,0.51506,0.531504
Barock/Klassik,0.491301,0.499456
Barock/Biedermeier,0.452367,0.513535
Barock/Naturalismus,0.411893,0.444248
Barock/Aufklärung,0.374712,0.380885
Aufklärung/Romantik,0.31345,0.152963
Biedermeier/Expressionismus,0.289992,0.190054


In [130]:
# Epoch pairs whose "_rd" k-means ARI reaches min_score, in descending-ARI order.
kmeans_rd_best = kmeans_score_rd_df.loc[kmeans_score_rd_df["ari"] >= min_score].index.tolist()
best_epochs["kmeans_rd"] = kmeans_rd_best
# Show the seven highest-ARI pairs for reference.
kmeans_score_rd_df.head(7)

Unnamed: 0,ari,vm
Klassik/Expressionismus,0.850551,0.761851
Barock/Biedermeier,0.72353,0.686389
Barock/Realismus,0.690851,0.660663
Barock/Klassik,0.653025,0.622653
Aufklärung/Naturalismus,0.579205,0.545969
Barock/Aufklärung,0.572019,0.509235
Romantik/Expressionismus,0.439114,0.498718


In [131]:
# Epoch pairs whose "_rd" DBSCAN ARI reaches min_score, in descending-ARI order.
dbscan_rd_best = dbscan_score_rd_df.loc[dbscan_score_rd_df["ari"] >= min_score].index.tolist()
best_epochs["dbscan_rd"] = dbscan_rd_best
# Show the seven highest-ARI pairs for reference.
dbscan_score_rd_df.head(7)

Unnamed: 0,ari,vm
Aufklärung/Naturalismus,0.806515,0.729329
Aufklärung/Realismus,0.776133,0.725598
Barock/Realismus,0.756305,0.712015
Barock/Klassik,0.707729,0.635589
Aufklärung/Biedermeier,0.68796,0.603123
Aufklärung/Expressionismus,0.572789,0.473481
Barock/Biedermeier,0.568653,0.551779


In [132]:
# Flatten every selected "EpochA/EpochB" pair of every setup into its two
# epoch names, so an epoch is counted once per pair it appears in.
best_epochs_list = [
    epoch
    for pairs in best_epochs.values()
    for pair in pairs
    for epoch in pair.split("/")
]
# Tally the epochs and keep at most the eight most frequent ones, most frequent first.
best_epochs_counter = dict(Counter(best_epochs_list).most_common(8))

In [133]:
# Epoch -> number of above-threshold pairs it appears in, most frequent first
# (capped at eight entries by most_common(8)).
best_epochs_counter

{'Barock': 7,
 'Realismus': 4,
 'Naturalismus': 3,
 'Aufklärung': 3,
 'Klassik': 3,
 'Expressionismus': 1,
 'Biedermeier': 1}

In [135]:
# Print each epoch's share of all counted epoch mentions as a whole percentage.
sum_counts = sum(best_epochs_counter.values())
for epoch, count in best_epochs_counter.items():
    # int() truncates the fraction, so the printed shares can add up to
    # slightly less than 100%.
    print(f"{epoch}: {int(count / sum_counts * 100)}%")

Barock: 31%
Realismus: 18%
Naturalismus: 13%
Aufklärung: 13%
Klassik: 13%
Expressionismus: 4%
Biedermeier: 4%


## top words

In [20]:
# Tabulate the "_rd" k-means top words: one row per epoch pair, one column per
# cluster, each cell holding that cluster's list of top words.
tw = pd.DataFrame.from_dict(data=kmeans_rd_top_words, orient="index")
tw.columns = ["cluster1", "cluster2"]

In [21]:
# Preview the top-word lists of the first three epoch pairs.
tw.head(3)

Unnamed: 0,cluster1,cluster2
Barock/Aufklärung,"[mädchen, dieß, amor, flur, hain, frei, jhr, w...","[auff, vnd, hertz, diß, wil, jhr, auß, gantz, ..."
Barock/Klassik,"[tränen, tal, mut, liebchen, hain, not, lenz, ...","[auff, vnd, diß, hertz, wil, wol, auß, hertzen..."
Barock/Romantik,"[vnd, auß, hertze, jhr, itzt, deß, hauß, umb, ...","[tränen, not, tun, schwert, tut, duft, frei, l..."


In [22]:
# Narrow the table to the single epoch pair inspected below. The list indexer
# keeps the result a one-row DataFrame instead of collapsing it to a Series.
tw = tw.loc[["Barock/Realismus"], :]

In [23]:
def strjoin(liste):
    """Join a sequence of words into one space-separated string.

    Args:
        liste: Iterable of strings, e.g. one cluster's list of top words.
            A plain string is accepted as well and returned unchanged.

    Returns:
        The words joined by single spaces; "" for an empty iterable.
    """
    # A str is itself an iterable of characters, so joining it would put a
    # space between every character. Returning it as-is keeps the function
    # safe to apply twice, e.g. when the cell that joins the columns is re-run.
    if isinstance(liste, str):
        return liste
    return " ".join(liste)

In [24]:
# Collapse each cluster's word list into one space-separated string.
for column in ("cluster1", "cluster2"):
    tw[column] = tw[column].map(strjoin)

In [25]:
# Preview the joined top-word strings (up to three rows).
tw.head(3)

Unnamed: 0,cluster1,cluster2
Barock/Realismus,mädchen sehnsucht duft not leis ew rasch sanft...,auff vnd sey diß wil hertz wol auß hertzen jhr


In [26]:
# Print the joined top words in full (the table preview truncates long cells):
# every cluster1 string first, then every cluster2 string.
for column in ("cluster1", "cluster2"):
    for words in tw[column].values:
        print(words)

mädchen sehnsucht duft not leis ew rasch sanft dunklen drin
auff vnd sey diß wil hertz wol auß hertzen jhr


In [17]:
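# Disabled: `cluster1_words` is not defined in any of the cells above; build it
# (e.g. from the words in tw.cluster1) before re-enabling this call.
#generate_wordcloud(dict(Counter(cluster1_words)))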