## HDBSCAN custom implementation

In [None]:
# !{sys.executable} -m pip install hdbscan
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import hdbscan

from sklearn.neighbors import NearestNeighbors

from sklearn.datasets import make_blobs
from scipy.spatial.distance import pdist, squareform

from scipy.cluster.hierarchy import dendrogram
import igraph as ig

sns.set_theme()

HDBSCAN main steps:

1. **Calculate core and reachability distances**

2. **Build the minimum spanning tree**

3. **Build dendrogram from MST**

4. **Condense the dendrogram based on minimum cluster size**

5. **Extract the stable clusters from the condensed tree**


Steps 1-3 are implemented from scratch. The `hdbscan` Python library is used to carry out steps 4 and 5.

### Create the dataset

In [None]:
# Default size for the figures drawn from here on
plt.rcParams["figure.figsize"] = (8, 8)

# Test data: 100 points spread over four tight Gaussian blobs
X, y = make_blobs(
    n_samples=100,
    centers=[(-1, 2), (0, 0), (2, 2.9), (1, 0)],
    cluster_std=0.15,
    random_state=2,
)

# Append two far-away points as outliers and give them the label -1
X = np.concatenate([X, [[2, 6], [3, 6]]], axis=0)
y = np.append(y, [-1, -1])

# Columns of X are the x and y coordinates
plt.scatter(*X.T)


### Set hdbscan parameters

In [None]:
### HDBSCAN parameters
# Neighbour rank used for the core distance: step 1 queries MIN_SAMPLES + 1
# neighbours per point and keeps the distance to the farthest of them
MIN_SAMPLES = 5

### Step 1 - compute mutual reachability

In [None]:
# Square, symmetric matrix of pairwise Euclidean distances between the points
dists = squareform(pdist(X, metric='euclidean'))

# Nearest-neighbour object fitted directly on the precomputed distances
nn_obj = NearestNeighbors(metric='precomputed').fit(dists)

In [None]:
# Core distance of a point = distance to the farthest of the
# MIN_SAMPLES + 1 neighbours returned for it
distances, neigh_ids = nn_obj.kneighbors(dists, n_neighbors=MIN_SAMPLES + 1)
core_dists = distances[:, -1]

# Mutual reachability distance between points a and b:
# max(core distance of a, core distance of b, distance between a and b).
# Broadcasting a row vector against a column vector covers every pair.
reach_dists = np.maximum(
    dists,
    np.maximum(core_dists[np.newaxis, :], core_dists[:, np.newaxis]),
)

### Step 2 - minimum spanning tree

In [None]:
# Build an undirected weighted graph from the mutual reachability matrix
g = ig.Graph.Weighted_Adjacency(reach_dists, mode='undirected')

# Find the minimum spanning tree
mst = g.spanning_tree(weights=g.es["weight"])
# Read the tree back as a weighted adjacency matrix and store it as a numpy
# array; later cells treat a 0 entry as "no edge"
mst_adj = mst.get_adjacency(attribute='weight')
MST = np.array(mst_adj.data)

In [None]:
# MST holds distances, but the graph drawn from plot_MST needs an adjacency
# matrix in which a high value means high connectivity - the opposite of a
# distance. Every existing edge therefore gets the reciprocal of its
# distance, and entries without an edge stay at 0.
has_edge = MST != 0
plot_MST = np.zeros(MST.shape)
plot_MST[has_edge] = 1 / MST[has_edge]

In [None]:
# Draw the MST with igraph, colouring every vertex by its label in y
# (-1 is the label given to the two added outliers)
# This is similar to the second phase of UMAP
property_dict = {
    0: 'blue',
    1: 'red',
    2: 'green',
    3: 'yellow',
    -1: 'purple',
}
node_colors = [property_dict[label] for label in y]

# Graph built from the reciprocal weights, so short MST edges weigh more
plot_g = ig.Graph.Weighted_Adjacency(plot_MST, mode='undirected')
plot_g.vs['color'] = node_colors

layout = plot_g.layout("auto", weights='weight')
ig.plot(plot_g, layout=layout)

### Step 3 - Building cluster hierarchy

In [None]:
# Turn the MST adjacency matrix into a list of (point i, point j, distance)
# tuples. Only the non-zero entries above the diagonal are read, so each
# undirected edge appears once, in row-major order.
upper_i, upper_j = np.nonzero(np.triu(MST, k=1))
edges = [(i, j, MST[i, j]) for i, j in zip(upper_i.tolist(), upper_j.tolist())]

edges[:4]

In [None]:
# Order the edges in place by ascending distance (third tuple element)
edges.sort(key=lambda edge: edge[2])

edges[:4]

In [None]:
# Create dataframe from edges: one row per MST edge, with the two endpoint
# ids in 'p1' / 'p2' and the edge distance in 'dist'
edges_df = pd.DataFrame(edges, columns=['p1', 'p2', 'dist'])
edges_df

In [None]:
# Start with every point in a group of its own, keyed by the point's index
groups = {point: [point] for point in range(len(MST))}

groups

In [None]:
# Build the linkage matrix by merging groups along the MST edges, in the
# order the edges are stored in edges_df (sorted by distance in an earlier
# cell). Each row is
# [group a, group b, merge distance, number of points in the merged group].
linkage_matrix = []

# Id given to the next merged group; ids below MST.shape[0] belong to the
# initial single-point groups
next_id = MST.shape[0]

# Reverse index: point -> id of the group that currently holds it.
# A lookup here replaces scanning every group for every edge, and a point
# missing from all groups raises KeyError instead of silently reusing the
# group found for the previous edge.
group_of = {point: gid for gid, members in groups.items() for point in members}

# Iterate through the edges
for p1, p2, dist in zip(edges_df['p1'], edges_df['p2'], edges_df['dist']):

    cluster_i = group_of[p1]
    cluster_j = group_of[p2]

    # Both endpoints already belong to the same group: nothing to merge
    if cluster_i == cluster_j:
        continue

    # Take both groups out of the dict; they are replaced by the merged one
    members_i = groups.pop(cluster_i)
    members_j = groups.pop(cluster_j)

    # List the smaller group first. This only makes the dendrogram look
    # better (clustering would work even without this step).
    if len(members_i) > len(members_j):
        first, second = cluster_j, cluster_i
    else:
        first, second = cluster_i, cluster_j
    linkage_matrix.append(
        [first, second, dist, len(members_i) + len(members_j)]
    )

    # Create the merged group and point all of its members at the new id
    merged = members_i + members_j
    groups[next_id] = merged
    for point in merged:
        group_of[point] = next_id

    # Increment cluster id
    next_id += 1


linkage_matrix[:10]


In [None]:
# Convert linkage_matrix to the format required by scipy dendrogram
linkage_matrix = np.array(linkage_matrix)

# Set figure size for dendrogram
# NOTE(review): the explicit figsize passed to plt.figure below is what sizes
# this plot; this rcParams default stays in effect for later cells that do
# not pass a figsize - confirm that is intended
plt.rcParams["figure.figsize"] = (16,8)

# Plot the dendrogram
# color_threshold=0.5: presumably links below distance 0.5 are coloured per
# cluster - confirm against the scipy documentation
plt.figure(figsize=(14,8))
dendrogram(linkage_matrix,color_threshold=0.5)
plt.title('MST dendrogram')
plt.show()

## Part 2 - HDBSCAN package

In [None]:
# Fit the library implementation on the same data. min_samples is taken
# from MIN_SAMPLES, the constant the from-scratch steps use, so both parts
# of the notebook always run with the same setting.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=MIN_SAMPLES,
    gen_min_span_tree=True,
    metric='euclidean',
    approx_min_span_tree=False
)
clusterer = clusterer.fit(X)

In [None]:
# Plot the minimum spanning tree stored on the fitted clusterer
# (the clusterer above was created with gen_min_span_tree=True)
clusterer.minimum_spanning_tree_.plot(
    edge_cmap='viridis',
    edge_alpha=0.6,
    node_size=80,
    edge_linewidth=2
)

In [None]:
# Plot the single-linkage dendrogram computed by the library, for comparison
# with the 'MST dendrogram' drawn from the hand-built linkage matrix
plt.figure(figsize=(14,8))
clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)

In [None]:
# Condensed tree plot. The title reads min_cluster_size from the fitted
# clusterer so it always states the value that was actually used.
clusterer.condensed_tree_.plot()
plt.title(f'min_cluster_size = {clusterer.min_cluster_size}')

In [None]:
# Same condensed tree with the selected clusters highlighted. The title
# reads min_cluster_size from the fitted clusterer so it always states the
# value that was actually used.
clusterer.condensed_tree_.plot(select_clusters=True, selection_palette=sns.color_palette())
plt.title(f'Selected clusters (min_cluster_size = {clusterer.min_cluster_size})')

In [None]:
# Show the cluster label assigned to each point by the fitted clusterer
clusterer.labels_

In [None]:
# Final clustering results: the data points, coloured by the label the
# fitted clusterer assigned to each of them
plt.figure(figsize=(6,7))
sns.scatterplot(x=X[:,0], y=X[:,1], hue=clusterer.labels_, palette='tab10')

In [None]:
# Back to the square default figure size
plt.rcParams.update({"figure.figsize": (8, 8)})

# Point opacity follows clusterer.probabilities_
plt.scatter(x=X[:, 0], y=X[:, 1], alpha=clusterer.probabilities_)

In [None]:
# Square default figure size
plt.rcParams.update({"figure.figsize": (8, 8)})

# Point opacity follows clusterer.outlier_scores_
plt.scatter(x=X[:, 0], y=X[:, 1], alpha=clusterer.outlier_scores_)

### Using cluster_selection_epsilon

In [None]:
# Before cluster_selection_epsilon: labels of the clusterer fitted so far,
# drawn as the baseline for the comparison below
plt.figure(figsize=(6,7))
sns.scatterplot(x=X[:,0], y=X[:,1], hue=clusterer.labels_, palette='tab10')

In [None]:
# Refit with cluster_selection_epsilon=1.5 and everything else as in the
# first library fit. min_samples is taken from MIN_SAMPLES, the constant the
# from-scratch steps use, so it cannot drift from the rest of the notebook.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=MIN_SAMPLES,
    gen_min_span_tree=True,
    metric='euclidean',
    approx_min_span_tree=False,
    cluster_selection_epsilon=1.5
)
clusterer = clusterer.fit(X)

In [None]:
# After cluster_selection_epsilon: labels of the clusterer refitted with
# cluster_selection_epsilon=1.5
plt.figure(figsize=(6,7))
sns.scatterplot(x=X[:,0], y=X[:,1], hue=clusterer.labels_, palette='tab10')

### Leaf clustering

In [None]:
# Refit with smaller min_cluster_size / min_samples and the 'leaf' cluster
# selection method
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=5,
    min_samples=3,
    gen_min_span_tree=True,
    metric='euclidean',
    approx_min_span_tree=False,
    cluster_selection_method='leaf'
)
clusterer = clusterer.fit(X)

# Condensed tree with the selected clusters highlighted. The title reads
# min_cluster_size from the fitted clusterer so it always states the value
# that was actually used.
clusterer.condensed_tree_.plot(select_clusters=True, selection_palette=sns.color_palette())
plt.title(f'Leaf clustering (min_cluster_size = {clusterer.min_cluster_size})')

In [None]:
# Data points coloured by the labels from the leaf-clustering fit
plt.figure(figsize=(6,7))
sns.scatterplot(x=X[:,0], y=X[:,1], hue=clusterer.labels_, palette='tab10')

In [None]:
# Leaf clustering once more, this time with cluster_selection_epsilon=0.5;
# fit() hands back the fitted estimator, so it is chained onto the constructor
clusterer = hdbscan.HDBSCAN(
    cluster_selection_method='leaf',
    cluster_selection_epsilon=0.5,
    min_cluster_size=5,
    min_samples=3,
    metric='euclidean',
    gen_min_span_tree=True,
    approx_min_span_tree=False,
).fit(X)

# Data points coloured by the resulting labels
plt.figure(figsize=(6, 7))
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=clusterer.labels_, palette='tab10')