## HDBSCAN on digits and iris

In [2]:
# !{sys.executable} -m pip install hdbscan
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import hdbscan
from hdbscan.validity import validity_index

import umap
import umap.plot as uplot
from sklearn.datasets import fetch_openml, load_iris
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.decomposition import PCA
from kneed import KneeLocator

# Apply seaborn's default theme to the plots drawn in the cells below.
sns.set_theme()

### Digits data

In [None]:
# Download the 'mnist_784' dataset from OpenML.
mnist = fetch_openml('mnist_784', parser='auto')

# Rescale the pixel table by 255 and keep a handle on the target column.
digits_data = mnist.data / 255
digit_labels = mnist.target

# Keep at most the first 2000 rows of every class so later steps stay cheap.
digit_labels_df = pd.DataFrame(digit_labels)
label_subset_df = digit_labels_df.groupby('class').head(2000)

# Select the same rows from the pixel table; the labels are taken from the
# subset frame in the same row order, so both stay aligned positionally.
kept_rows = label_subset_df.index
digits_data = digits_data.loc[kept_rows, :].reset_index(drop=True)
digit_labels = label_subset_df['class'].to_numpy()

In [None]:
# Fit a PCA with the default number of components and project the digits.
pca = PCA()
pca_transformed_data = pca.fit(digits_data).transform(digits_data)

n_pca_components = pca_transformed_data.shape[1]
print('Number of PCA components {}'.format(n_pca_components))

# Running total of explained variance, expressed in percent.
cumulative_explained_variance = 100 * np.cumsum(pca.explained_variance_ratio_)

# Locate the knee of the cumulative-variance curve and plot it.
component_positions = range(len(cumulative_explained_variance))
kneedle = KneeLocator(
    component_positions,
    cumulative_explained_variance,
    S=1.0,
    curve="concave",
    direction="increasing",
)
kneedle.plot_knee()

In [None]:
# Keep only the first 50 PCA components (columns) for the downstream steps.
# NOTE(review): 50 is hard-coded; presumably read off the knee plot above — confirm.
pca_dim_red = pca_transformed_data[:,:50]

In [None]:
# Default values
# Fit UMAP (constructed with no arguments) on the PCA-reduced digits.
umap_obj = umap.UMAP()
# `embedding` holds the coordinates returned by fit_transform; the plots in
# this notebook pass the fitted `umap_obj` itself rather than this array.
embedding = umap_obj.fit_transform(pca_dim_red)

# Plot the fitted embedding, coloured by the reference digit labels.
uplot.points(umap_obj, labels=digit_labels)

In [None]:
def _silhouette_or_nan(data, labels):
    """Return the silhouette score, or NaN when the score is undefined.

    ``silhouette_score`` raises ``ValueError`` unless the number of distinct
    labels lies between 2 and ``n_samples - 1``; in that case NaN is returned
    so that the remaining statistics can still be printed.
    """
    n_distinct = np.unique(labels).size
    if 2 <= n_distinct < len(labels):
        return silhouette_score(data, labels)
    return float('nan')


def print_clustering_stats(clusterer, clust_data, data_labels):
    """Print agreement and validity metrics for a fitted clusterer.

    Every metric is printed twice where it makes sense: once on all samples
    (noise counted as its own group, label ``-1``) and once on the subset of
    samples that were assigned to a cluster ("sub").

    Parameters
    ----------
    clusterer
        Fitted clustering object exposing ``labels_``; samples labelled
        ``-1`` are treated as noise.
    clust_data
        Array the clusterer was fitted on, one row per sample.
    data_labels
        Reference (ground-truth) label of every sample, in the same order
        as ``clust_data``. Any array-like is accepted.
    """
    cluster_labels = np.asarray(clusterer.labels_)
    data_labels = np.asarray(data_labels)

    # Create reduced version of data (exclude noise)
    is_clustered = cluster_labels != -1
    non_noise_labels = cluster_labels[is_clustered]
    clust_labels_sub = data_labels[is_clustered]
    clust_data_sub = clust_data[is_clustered]
    noise_size = int(np.count_nonzero(~is_clustered))

    print('ARI : {}'.format(adjusted_rand_score(cluster_labels, data_labels)))
    print('ARI sub : {}'.format(adjusted_rand_score(non_noise_labels, clust_labels_sub)))
    print('noise size : {}'.format(noise_size))
    print('Silouethe : {}'.format(_silhouette_or_nan(clust_data, cluster_labels)))
    # The silhouette evaluates the *clustering*, so it must be computed from
    # the cluster assignments of the non-noise samples, not from the
    # reference labels.
    print('Silouethe sub : {}'.format(_silhouette_or_nan(clust_data_sub, non_noise_labels)))
    print('DBCV : {}'.format(validity_index(clust_data, cluster_labels)))
    

In [None]:
# HDBSCAN on the PCA-reduced digits: min_samples=5, min_cluster_size=100.
hdbscan_kwargs = {'min_samples': 5, 'min_cluster_size': 100}
clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)
clusterer.fit(pca_dim_red)

# Report the metrics against the reference digit labels.
print_clustering_stats(clusterer, pca_dim_red, digit_labels)

# Colour the UMAP embedding by the cluster assignment.
uplot.points(umap_obj, labels=clusterer.labels_)

In [None]:
# HDBSCAN on the PCA-reduced digits: min_samples=1, min_cluster_size=100,
# with the metric spelled out explicitly.
hdbscan_kwargs = {
    'min_samples': 1,
    'min_cluster_size': 100,
    'metric': 'euclidean',
}
clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)
clusterer.fit(pca_dim_red)

# Report the metrics against the reference digit labels.
print_clustering_stats(clusterer, pca_dim_red, digit_labels)

# Colour the UMAP embedding by the cluster assignment.
uplot.points(umap_obj, labels=clusterer.labels_)

In [None]:
# Turn the flat pixel table into a stack of 28x28 images. Converting to a
# NumPy array first avoids calling np.reshape on a pandas object, and the
# ndarray method avoids the `newshape` keyword that recent NumPy deprecates.
digits_data_rshp = np.asarray(digits_data).reshape(-1, 28, 28)

nrows = 1
ncols = 6

# One figure per label found by the most recent clusterer (including -1).
for clust_name in np.unique(clusterer.labels_):

    clust_samples = np.flatnonzero(clusterer.labels_ == clust_name)

    fig, axs = plt.subplots(nrows, ncols, figsize=(12, 4*nrows))  # Adjust the size as needed
    axs = np.ravel(axs)

    # Show up to `ncols` example images; zip stops at the shorter sequence.
    for ax, img_idx in zip(axs, clust_samples):
        ax.imshow(digits_data_rshp[img_idx], cmap=plt.cm.gray_r)

    # Hide the axes that received no image when the group is very small.
    for ax in axs[len(clust_samples):]:
        ax.set_axis_off()

    # Title with the actual label rather than its position in the loop, so
    # the figure matches the labels used in the other plots; -1 is noise.
    if clust_name == -1:
        fig.suptitle('Noise')
    else:
        fig.suptitle('Cluster {}'.format(clust_name))

In [None]:
# HDBSCAN on the PCA-reduced digits using 'leaf' cluster selection:
# min_samples=2, min_cluster_size=100.
hdbscan_kwargs = {
    'min_samples': 2,
    'min_cluster_size': 100,
    'metric': 'euclidean',
    'cluster_selection_method': 'leaf',
}
clusterer = hdbscan.HDBSCAN(**hdbscan_kwargs)
clusterer.fit(pca_dim_red)

# Report the metrics against the reference digit labels.
print_clustering_stats(clusterer, pca_dim_red, digit_labels)

# Colour the UMAP embedding by the cluster assignment.
uplot.points(umap_obj, labels=clusterer.labels_)

### Iris data

In [None]:
# Load iris dataset
iris = load_iris()

# Put the feature measurements into a DataFrame
iris_df = pd.DataFrame(
    iris['data'],
    columns=iris['feature_names']
)

# Remember the feature columns before the label column is added
numer_cols = iris_df.columns

# Load iris labels (map the integer targets to their names)
iris_df['label'] = iris['target_names'][iris['target']]

# Run PCA so we can show dataset in 2D space
pca = PCA(n_components=2)
pca_data = pca.fit_transform(iris_df[numer_cols].to_numpy())

# Build the frame from the numeric scores first and attach the labels as a
# separate column afterwards; stacking floats and labels into one array
# would force a common non-float dtype onto PC1/PC2.
pca_df = pd.DataFrame(pca_data, columns=['PC1', 'PC2'])
pca_df['label'] = iris_df['label'].to_numpy()

# Plot and colour based on reference label
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='label')
plt.title('Scatterplot with true label')

In [None]:
# Feature matrix used both for fitting and for the metrics below
iris_features = iris_df[numer_cols].to_numpy()

# HDBSCAN on the raw iris features: min_samples=2, min_cluster_size=20
clusterer = hdbscan.HDBSCAN(
    min_samples=2,
    min_cluster_size=20,
    metric='euclidean'
)
clusterer = clusterer.fit(iris_features)

# Store the assignment in its own column so the reference labels kept in
# pca_df['label'] are not overwritten.
pca_df['cluster'] = clusterer.labels_

print_clustering_stats(
    clusterer=clusterer,
    clust_data=iris_features,
    data_labels=iris_df['label'].to_numpy()
)

# Plot and colour based on the cluster assignment (not the true label)
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='cluster')
plt.title('Scatterplot with HDBSCAN cluster label')

In [None]:
# Plot the single linkage tree of the most recently fitted clusterer.
clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)

In [None]:
# Feature matrix used both for fitting and for the metrics below
iris_features = iris_df[numer_cols].to_numpy()

# HDBSCAN on the raw iris features with 'leaf' cluster selection:
# min_samples=1, min_cluster_size=30
clusterer = hdbscan.HDBSCAN(
    min_samples=1,
    min_cluster_size=30,
    metric='euclidean',
    cluster_selection_method='leaf',
)
clusterer = clusterer.fit(iris_features)

# Store the assignment in its own column so the reference labels kept in
# pca_df['label'] are not overwritten.
pca_df['cluster'] = clusterer.labels_

print_clustering_stats(
    clusterer=clusterer,
    clust_data=iris_features,
    data_labels=iris_df['label'].to_numpy()
)

# Plot and colour based on the cluster assignment (not the true label)
sns.scatterplot(data=pca_df, x='PC1', y='PC2', hue='cluster')
plt.title('Scatterplot with HDBSCAN cluster label')