## Dendrograms - demo

In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
from scipy.spatial.distance import pdist, squareform

# Default size used by every figure that does not pass its own `figsize`
plt.rcParams.update({'figure.figsize': [9, 5]})

# Switch on seaborn's default theme for all subsequent plots
sns.set_theme()

### Dendrograms - iris subset

In [None]:
# Load the iris dataset bundled with scikit-learn
iris = load_iris()

# Format iris dataset as dataframe
iris_df = pd.DataFrame(
    iris['data'],
    columns=iris['feature_names']
)

# Attach the species name of every observation
iris_df['label'] = iris['target_names'][iris['target']]

# Use only two features so the points can be inspected
# visually on a 2D scatterplot
numer_cols = ['petal length (cm)', 'petal width (cm)']

# Keep only petal length, petal width and the label
iris_df_sub = iris_df.filter(items=numer_cols + ['label'])

# Drop duplicates that have appeared
# due to removing columns
iris_df_sub = iris_df_sub.drop_duplicates().reset_index(drop=True)

# Reduce dataset for easier dendrogram preview: 3 observations per species.
# Sample from the de-duplicated two-feature subset (not from the full
# `iris_df`), otherwise the filtering and de-duplication above are discarded.
iris_df_sub = (
    iris_df_sub
    .groupby('label')
    .sample(3, random_state=44)
    .reset_index(drop=True)
)

In [None]:
# Scatter the two petal measurements, coloured by species
sns.scatterplot(
    data=iris_df_sub,
    x='petal length (cm)',
    y='petal width (cm)',
    hue='label',
)

# Write each point's row index just to the right of its marker so the
# points can be matched against the dendrogram leaves
petal_lengths = iris_df_sub['petal length (cm)']
petal_widths = iris_df_sub['petal width (cm)']
for idx, point_x, point_y in zip(iris_df_sub.index, petal_lengths, petal_widths):
    plt.text(x=point_x + 0.03, y=point_y, s=str(idx), size='small')

plt.title('Iris subset')

In [None]:
# Hierarchical clustering of the two petal features with Ward linkage
subset_features = iris_df_sub[numer_cols]
linkage_matrix = linkage(subset_features, method='ward')

# Show the raw linkage matrix as the cell output
linkage_matrix

In [None]:
# One row per merge step of the hierarchical clustering
n_merges = linkage_matrix.shape[0]
linkage_df = pd.DataFrame(
    linkage_matrix,
    columns=['First group', 'Second group', 'Distance', 'Size'],
)

# The cluster created at merge step i gets the id n_obs + i, where
# n_obs = n_merges + 1 is the number of original observations
linkage_df['New id'] = np.arange(n_merges + 1, 2 * n_merges + 1)

linkage_df

In [None]:
# Draw the tree for the current linkage matrix on a fresh figure;
# links below a height of 4 are coloured per cluster
fig, ax = plt.subplots()
dendrogram(linkage_matrix, color_threshold=4, ax=ax)
ax.set_title('Dendrogram - ward linkage')
plt.show()

In [None]:
# Cophenetic correlation between the tree's merge heights and the
# original pairwise (Euclidean) distances of the subset
condensed_dists = pdist(iris_df_sub[numer_cols].to_numpy())
coph_corr, _ = cophenet(linkage_matrix, condensed_dists)
coph_corr

In [None]:
def cophenetic_corr(linkage_matrix, data_mtx, distance_measure='euclidean'):
    """Print and return the cophenetic correlation of a hierarchical clustering.

    Parameters
    ----------
    linkage_matrix : ndarray
        Linkage matrix, e.g. as produced by ``scipy.cluster.hierarchy.linkage``.
    data_mtx : array-like
        Observation matrix passed to ``pdist`` or, when
        ``distance_measure='precomputed'``, a square pairwise distance
        matrix that is converted to condensed form with ``squareform``.
    distance_measure : str, default 'euclidean'
        Metric name forwarded to ``scipy.spatial.distance.pdist``, or
        ``'precomputed'`` if ``data_mtx`` already holds distances.

    Returns
    -------
    float
        The cophenetic correlation coefficient (the same value is printed).
    """
    # `cophenet` expects the original distances in condensed form
    if distance_measure == 'precomputed':
        condensed_dists = squareform(data_mtx)
    else:
        condensed_dists = pdist(data_mtx, metric=distance_measure)

    # With distances supplied, `cophenet` returns (coefficient, cophenetic
    # distances); only the coefficient is needed here
    cop_corr = cophenet(linkage_matrix, condensed_dists)[0]

    print(f'Cophenetic correlation : {cop_corr}')

    # Return the value as well so callers can compare linkage methods
    return cop_corr
    

In [None]:
# Single linkage on the two petal features of the subset
subset_values = iris_df_sub[numer_cols].to_numpy()
linkage_matrix = linkage(subset_values, method='single')

# Draw the tree on a fresh figure; links below a height of 4 are
# coloured per cluster
fig, ax = plt.subplots()
dendrogram(linkage_matrix, color_threshold=4, ax=ax)
ax.set_title('Dendrogram - single linkage')
plt.show()

In [None]:
# Complete linkage on the two petal features of the subset
subset_values = iris_df_sub[numer_cols].to_numpy()
linkage_matrix = linkage(subset_values, method='complete')

# Draw the tree on a fresh figure; links below a height of 4 are
# coloured per cluster
fig, ax = plt.subplots()
dendrogram(linkage_matrix, color_threshold=4, ax=ax)
ax.set_title('Dendrogram - complete linkage')
plt.show()

In [None]:
# Average linkage on the two petal features of the subset
subset_values = iris_df_sub[numer_cols].to_numpy()
linkage_matrix = linkage(subset_values, method='average')

# Draw the tree on a fresh figure; links below a height of 4 are
# coloured per cluster
fig, ax = plt.subplots()
dendrogram(linkage_matrix, color_threshold=4, ax=ax)
ax.set_title('Dendrogram - average linkage')
plt.show()

### Dendrograms - full iris dataset

In [None]:
# Rebuild the full dataframe from the already loaded `iris` object
iris_df = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])

# Remember the feature columns before the label column is appended
numer_cols = iris_df.columns

# Translate the integer targets into species names
species_names = iris['target_names']
iris_df['label'] = species_names[iris['target']]

In [None]:
# Run PCA so we can show dataset in 2D space
pca = PCA(n_components=2)
pca_data = pca.fit_transform(iris_df[numer_cols].values)

# Build the frame from the numeric projection only and attach the labels
# afterwards: stacking floats and strings into a single array would turn
# PC1/PC2 into object-dtype columns instead of floats
pca_df = pd.DataFrame(pca_data, columns=['PC1', 'PC2'])
pca_df['label'] = iris_df['label'].to_numpy()

# Plot and colour based on reference label
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='label')
plt.title('Scatterplot with true label')

In [None]:
# Single linkage on all numeric iris features
feature_values = iris_df[numer_cols].to_numpy()
linkage_matrix = linkage(feature_values, method='single')

# Report how well the tree preserves the original pairwise distances
cophenetic_corr(linkage_matrix, feature_values)

# Wide figure with the species name on every leaf; links below a
# height of 1 are coloured per cluster
fig, ax = plt.subplots(figsize=(16, 8))
dendrogram(
    linkage_matrix,
    color_threshold=1,
    labels=iris_df['label'].to_numpy(),
    leaf_font_size=8,
    ax=ax,
)
ax.set_title('Dendrogram - single linkage')
plt.show()

In [None]:
# Show the PCA projection again, coloured by the reference label
sns.scatterplot(
    data=pca_df,
    x='PC1',
    y='PC2',
    hue='label',
)
plt.title('Scatterplot with true label')

In [None]:
# Complete linkage on all numeric iris features
feature_values = iris_df[numer_cols].to_numpy()
linkage_matrix = linkage(feature_values, method='complete')

# Report how well the tree preserves the original pairwise distances
cophenetic_corr(linkage_matrix, feature_values)

# Wide figure with the species name on every leaf; links below a
# height of 4 are coloured per cluster
fig, ax = plt.subplots(figsize=(16, 8))
dendrogram(
    linkage_matrix,
    color_threshold=4,
    labels=iris_df['label'].to_numpy(),
    leaf_font_size=8,
    ax=ax,
)
ax.set_title('Dendrogram - complete linkage')
plt.show()

In [None]:
# Average linkage on all numeric iris features
feature_values = iris_df[numer_cols].to_numpy()
linkage_matrix = linkage(feature_values, method='average')

# Report how well the tree preserves the original pairwise distances
cophenetic_corr(linkage_matrix, feature_values)

# Wide figure with the species name on every leaf; links below a
# height of 4 are coloured per cluster
fig, ax = plt.subplots(figsize=(16, 8))
dendrogram(
    linkage_matrix,
    color_threshold=4,
    labels=iris_df['label'].to_numpy(),
    leaf_font_size=8,
    ax=ax,
)
ax.set_title('Dendrogram - average linkage')
plt.show()

In [None]:
# Ward linkage on all numeric iris features
feature_values = iris_df[numer_cols].to_numpy()
linkage_matrix = linkage(feature_values, method='ward')

# Report how well the tree preserves the original pairwise distances
cophenetic_corr(linkage_matrix, feature_values)

# Wide figure with the species name on every leaf; links below a
# height of 10 are coloured per cluster
fig, ax = plt.subplots(figsize=(16, 8))
dendrogram(
    linkage_matrix,
    color_threshold=10,
    labels=iris_df['label'].to_numpy(),
    leaf_font_size=8,
    ax=ax,
)
ax.set_title('Dendrogram - ward linkage')
plt.show()

### Dendrogram truncation

In [None]:
# Ward linkage on the full dataset, shown as a truncated tree
feature_values = iris_df[numer_cols].to_numpy()
linkage_matrix = linkage(feature_values, method='ward')

# truncate_mode="level" with p=2 limits how many levels of the tree
# are drawn, collapsing everything deeper
fig, ax = plt.subplots(figsize=(16, 8))
dendrogram(
    linkage_matrix,
    color_threshold=4,
    truncate_mode="level",
    p=2,
    ax=ax,
)
ax.set_title('Dendrogram - ward linkage')
plt.show()