# Agglomerative Clustering

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from stop_words import get_stop_words
import seaborn as sns

from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import LabelEncoder

from utils import add_epoch_division, linkage_matrix, plot_dendrogram, remove_noise_poet, text_cleaning

In [2]:
# Read the poem corpus from CSV and pass it through the project's
# `text_cleaning` helper (imported from `utils`).
corpus = text_cleaning(pd.read_csv("../corpora/german_poems.csv"))

In [3]:
# Settings forwarded to the TfidfVectorizer further down.
# Passed as `lowercase=`.
LOWERCASE = True
# Passed as `max_features=`.
MAX_FEATURES = 10000
# Stop-word list for the language code "de"; passed as `stop_words=`.
STOP_WORDS = get_stop_words("de")

In [4]:
# reduce corpus
epochs = {"Frühaufklärung": {"b": 1700, "e": 1755},
          "Realismus": {"b": 1848, "e": 1900}
         }
corpus = add_epoch_division(corpus, epochs, epoch_exception="")

epoch1 = "Frühaufklärung"
epoch2 = "Realismus"
corpus = corpus[(corpus.epoch == epoch1) | (corpus.epoch == epoch2)]

In [5]:
# Optional: uncomment to run the notebook on a random subsample of the corpus.
#corpus = corpus.sample(10000)

In [5]:
text = corpus["poem"]

# TF-IDF document-term matrix of the poems, configured by the constants above.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    lowercase=LOWERCASE,
    max_features=MAX_FEATURES,
    stop_words=STOP_WORDS,
)
features = vectorizer.fit_transform(text)

# One-character class label per poem: the first character of its epoch name.
classes = [epoch_name[0] for epoch_name in corpus["epoch"].values]
# Poem ids, in corpus row order.
pids = list(corpus["pid"].values)

## Clustering

In [77]:
# Project the TF-IDF features onto their first two principal components.
# `random_state` is fixed so the projection is reproducible after a kernel
# restart: for an input of this size scikit-learn's default solver choice can
# fall on the randomized SVD, whose result otherwise varies between runs, and
# the clustering and noise filtering further down are derived from it.
# NOTE: `.toarray()` materializes the whole feature matrix densely in memory.
pca = PCA(n_components=2, random_state=42)
X_red = pca.fit_transform(features.toarray())

In [78]:
%%time
agcl = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
model = agcl.fit(X_red)

CPU times: user 13.6 s, sys: 1.11 s, total: 14.7 s
Wall time: 16 s


In [79]:
def get_font_size(p):
    """Return the leaf font size to use for truncation level ``p``.

    The size is ``20 - p``, but never smaller than 4.
    """
    return max(20 - p, 4)

In [87]:
p = 4
cluster_labels = pids

# Draw the dendrogram truncated to the top `p` levels.
# `no_plot` has to be False here: with `no_plot=True` scipy only computes the
# layout, so `plt.show()` would have nothing to display and the orientation /
# font-size arguments would be ignored.
fig, ax = plt.subplots(figsize=(8, 10))
l = dendrogram(linkage_matrix(model),
               p=p,
               truncate_mode='level',
               labels=cluster_labels,
               orientation="left",
               leaf_font_size=get_font_size(p),
               no_plot=False,
               show_leaf_counts=False,
               show_contracted=False,
               get_leaves=True,
               ax=ax)
ax.set_title(f"Agglomerative clustering dendrogram (top {p} levels)")
ax.set_xlabel("Distance")
plt.show()

In [93]:
# Report how many leaves the dendrogram has when truncated at each level p.
# The labels and the linkage matrix do not depend on p, so they are built once
# instead of on every iteration (assumes `linkage_matrix` is a pure function
# of the fitted model -- TODO confirm against utils).
cluster_labels = pids
tree_linkage = linkage_matrix(model)

for p in range(1, 20):
    # no_plot=True: only the returned leaf list is needed, nothing is drawn.
    l = dendrogram(tree_linkage,
                   p=p,
                   truncate_mode='level',
                   labels=cluster_labels,
                   orientation="left",
                   leaf_font_size=get_font_size(p),
                   no_plot=True,
                   show_leaf_counts=False,
                   show_contracted=False,
                   get_leaves=True)

    print(f"{p} \t {len(l['ivl'])}")

1 	 4
2 	 8
3 	 16
4 	 32
5 	 64
6 	 126
7 	 248
8 	 464
9 	 834
10 	 1443
11 	 2350
12 	 3640
13 	 5543
14 	 8182
15 	 11484
16 	 14680
17 	 16786
18 	 17599
19 	 17772


In [118]:
# Truncation level whose leaf list feeds the noise filtering below.
p = 17
cluster_labels = pids

# Only the computed leaf list is used (no_plot=True); nothing is drawn.
dendrogram_options = dict(
    p=p,
    truncate_mode='level',
    labels=cluster_labels,
    orientation="left",
    leaf_font_size=get_font_size(p),
    no_plot=True,
    show_leaf_counts=False,
    show_contracted=False,
    get_leaves=True,
)
l = dendrogram(linkage_matrix(model), **dendrogram_options)

print(f"{p} \t {len(l['ivl'])}")

17 	 16786


### Remove noise

In [119]:
# Collect every non-empty leaf label of the truncated dendrogram; leaves with
# an empty label are skipped.
# NOTE(review): despite its name, `noise` holds the pids that are *kept* --
# it is used as the `isin` whitelist further down. Consider renaming.
noise = [label for label in l["ivl"] if label != ""]

In [107]:
# NOTE(review): the next cell rebinds `corpus_noiseless` without using this
# result, and this cell's execution count (107) is older than its neighbours'.
# Confirm whether this call is still needed.
corpus_noiseless = remove_noise_poet(corpus, noise, min_n=20)

In [120]:
# Keep only the poems whose pid is listed in `noise`.
corpus_noiseless = corpus.loc[corpus["pid"].isin(noise)]

In [121]:
# (rows, columns) of the epoch-restricted corpus, before the pid filter.
corpus.shape

(17796, 8)

In [122]:
# (rows, columns) after the pid filter; compare with `corpus.shape` above.
corpus_noiseless.shape

(15973, 8)

In [123]:
# Write the filtered corpus next to the input file; `index=False` leaves the
# DataFrame index out of the CSV.
corpus_noiseless.to_csv("../corpora/german_poems_noiseless.csv", index=False)

In [124]:
# Inspect the merge tree of the fitted model: print its shape, then display it.
merge_children = model.children_
print(merge_children.shape)
merge_children

(17795, 2)


array([[   65,  7353],
       [ 5728,  7522],
       [ 8260, 17797],
       ...,
       [35586, 35587],
       [35580, 35588],
       [35585, 35589]])