## Clustering digits with DBSCAN

In [None]:
import random

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.datasets import fetch_openml
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import silhouette_score

import umap
import umap.plot as uplot

import hdbscan
import hdbscan.validity

from kneed import KneeLocator

from sklearn.decomposition import PCA

# Seed the global random generators so repeated runs are more comparable.
# Nothing in this notebook draws from Python's `random` module directly;
# scikit-learn estimators left at random_state=None fall back to NumPy's
# global generator (UMAP presumably does the same -- confirm), so NumPy has
# to be seeded as well for the seed to have any effect.
random.seed(2)
np.random.seed(2)

# Apply seaborn's default theme to all matplotlib figures created below
sns.set_theme()

### Load the digit dataset

In [None]:
# Download the 'mnist_784' dataset from OpenML
mnist = fetch_openml('mnist_784', parser='auto')

# Pull out the targets and rescale every feature value by 1/255
digit_labels = mnist.target
digits_data = mnist.data / 255

# Keep at most the first 2000 rows of each class to shrink the dataset
digit_labels_df = pd.DataFrame(digit_labels)
label_subset_df = digit_labels_df.groupby('class').head(2000)

# Select the matching feature rows and renumber them from zero so that the
# positional order lines up with the label array built below
kept_rows = label_subset_df.index
digits_data = digits_data.loc[kept_rows].reset_index(drop=True)
digit_labels = label_subset_df['class'].to_numpy()

In [None]:
# Show the (rows, columns) shape of the subsetted feature table
digits_data.shape

In [None]:
# Fit UMAP with its default hyper-parameters and keep the fitted object,
# because the plots further down are drawn from it
umap_obj = umap.UMAP().fit(digits_data)
embedding = umap_obj.embedding_

# Plot the fitted UMAP object with points labelled by the true digit class
uplot.points(umap_obj, labels=digit_labels)

### DBSCAN - default parameter selection methods

In [None]:
# MinPts heuristic: natural log of the sample count, truncated to an integer
n_samples = len(digits_data)
min_pts = int(np.log(n_samples))

# For every sample, look up its min_pts nearest neighbours in the same data.
# NOTE(review): the query set is the fitted set itself, so each sample
# presumably shows up as its own nearest neighbour -- confirm that this is
# the intended k for the k-distance plot.
knn = NearestNeighbors().fit(digits_data)
distances, _ = knn.kneighbors(digits_data, n_neighbors=min_pts)

# The last column holds the farthest of the returned neighbours; sort those
# distances in ascending order
sort_dist = np.sort(distances[:, -1])

# Plot the sorted distances and look for the knee to pick eps
plt.plot(range(len(sort_dist)), sort_dist)

In [None]:
# Run DBSCAN on the unreduced data with a hand-picked eps of 6 and the
# MinPts value derived above
dbscan_tun = DBSCAN(min_samples=min_pts, eps=6).fit(digits_data)

# List the distinct labels DBSCAN assigned
np.unique(dbscan_tun.labels_)

In [None]:
# Adjusted Rand index between the DBSCAN assignment and the true digit labels
ari_score = adjusted_rand_score(dbscan_tun.labels_, digit_labels)
print('ARI : {}'.format(ari_score))

# Plot the fitted UMAP object with points labelled by the DBSCAN assignment
uplot.points(umap_obj, labels=dbscan_tun.labels_)

### DBSCAN with PCA

In [None]:
# Fit a PCA with default settings, then project the data onto its components
pca = PCA().fit(digits_data)
pca_transformed_data = pca.transform(digits_data)
print('Number of PCA components {}'.format(pca_transformed_data.shape[1]))

# Running total of explained variance, expressed as a percentage
cumulative_explained_variance = 100 * pca.explained_variance_ratio_.cumsum()

# Locate the knee of the increasing, concave cumulative-variance curve
component_index = range(len(cumulative_explained_variance))
kneedle = KneeLocator(
    x=component_index,
    y=cumulative_explained_variance,
    S=1.0,
    curve="concave",
    direction="increasing",
)
kneedle.plot_knee()

In [None]:
# Keep only the first 100 columns of the PCA-transformed data
n_kept_components = 100
pca_reduced_data = pca_transformed_data[:, :n_kept_components]

In [None]:
# MinPts heuristic for the reduced data: number of dimensions plus one
n_dims = pca_reduced_data.shape[1]
min_pts = n_dims + 1

# Ask for one neighbour more than min_pts and query the fitted data itself
knn = NearestNeighbors(n_neighbors=min_pts + 1).fit(pca_reduced_data)
distances, _ = knn.kneighbors(pca_reduced_data)

# The last column holds the farthest of the returned neighbours; sort those
# distances in ascending order
sort_dist = np.sort(distances[:, -1])

# Plot the sorted distances and look for a knee to pick eps
plt.plot(range(len(sort_dist)), sort_dist)

In [None]:
# Run DBSCAN on the PCA-reduced data with a hand-picked eps of 2.5 and the
# MinPts value derived from the reduced dimensionality
dbscan_tun = DBSCAN(min_samples=min_pts, eps=2.5).fit(pca_reduced_data)

In [None]:
# Adjusted Rand index between the DBSCAN assignment and the true digit labels
ari_score = adjusted_rand_score(dbscan_tun.labels_, digit_labels)
print('ARI : {}'.format(ari_score))

# Plot the fitted UMAP object with points labelled by the DBSCAN assignment
uplot.points(umap_obj, labels=dbscan_tun.labels_)

### Brute force search

In [None]:
# Exhaustive DBSCAN search over a grid of (min_samples, eps) values on the
# PCA-reduced data. For every combination that produces more than one
# non-noise cluster, internal (silhouette, DBCV) and external (ARI) quality
# scores are recorded, both on all samples and on the non-noise samples only.

# Candidate parameter grids
n_neighbor_list = [10, 20, 40, 80] + list(range(100, 600, 100))
eps_list = [1, 3, 6, 8, 10, 12]

# One dict of statistics per retained parameter combination
result_list = []

print('Loop has started !')

for iter_i, n_neighbor in enumerate(n_neighbor_list):

    # Progress report: iter_i outer iterations have completed at this point
    print('Done {}/{} for outter loop'.format(iter_i, len(n_neighbor_list)))

    for curr_eps in eps_list:

        curr_dbscan = DBSCAN(eps=curr_eps, min_samples=n_neighbor, n_jobs=-1)
        _ = curr_dbscan.fit(pca_reduced_data)

        np_labels = np.array(curr_dbscan.labels_)
        all_labels = np.unique(np_labels)

        # Real clusters only: DBSCAN marks noise samples with the label -1
        unique_clusters = [clust for clust in all_labels if clust != -1]

        # Scores are only recorded when more than one real cluster was found;
        # skip early so no work is wasted on discarded runs
        if len(unique_clusters) <= 1:
            continue

        # Noise-free views of the labels, ground truth and data
        non_noise_mask = np_labels != -1
        non_noise_labels = np_labels[non_noise_mask]
        digit_labels_sub = digit_labels[non_noise_mask]
        pca_data_sub = pca_reduced_data[non_noise_mask]
        noise_size = np_labels.shape[0] - non_noise_labels.shape[0]

        # DBCV is best-effort: keep the sentinel -1 when it cannot be
        # computed, but report the failure and let KeyboardInterrupt /
        # SystemExit propagate so the long-running loop can be stopped
        try:
            dbcv = hdbscan.validity.validity_index(
                pca_reduced_data, curr_dbscan.labels_, metric='euclidean'
            )
        except Exception as err:
            print('DBCV failed for min_samples={}, eps={}: {!r}'.format(
                n_neighbor, curr_eps, err
            ))
            dbcv = -1

        result_list.append({
            'n_neigh': n_neighbor,
            'eps': curr_eps,
            # NOTE: this count includes the noise label (-1) when present
            'num_clust': len(all_labels),
            'silouethe': silhouette_score(pca_reduced_data, curr_dbscan.labels_, metric='euclidean'),
            'silouethe_sub': silhouette_score(pca_data_sub, non_noise_labels, metric='euclidean'),
            'dbcv': dbcv,
            'ari': adjusted_rand_score(digit_labels, curr_dbscan.labels_),
            'ari_sub': adjusted_rand_score(non_noise_labels, digit_labels_sub),
            'noise_size': noise_size
        })

# Collect the per-combination statistics in a table, rounded to 3 decimals
result_df = pd.DataFrame(result_list).round(3)

In [None]:
# Ten parameter combinations with the highest DBCV, best first
best_dbcv = (
    result_df
    .sort_values('dbcv', ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)
best_dbcv

In [None]:
# Refit DBSCAN with the top-ranked (eps, min_samples) pair by DBCV
top_eps = best_dbcv.at[0, 'eps']
top_min_samples = best_dbcv.at[0, 'n_neigh']
dbscan_tun = DBSCAN(eps=top_eps, min_samples=top_min_samples).fit(pca_reduced_data)

# Plot the fitted UMAP object with points labelled by the DBSCAN assignment
uplot.points(umap_obj, labels=dbscan_tun.labels_)

In [None]:
# Ten parameter combinations with the highest silhouette score computed on
# all samples, best first
best_silouethe = (
    result_df
    .sort_values('silouethe', ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)
best_silouethe

In [None]:
# Refit DBSCAN with the top-ranked (eps, min_samples) pair by silhouette score
top_eps = best_silouethe.at[0, 'eps']
top_min_samples = best_silouethe.at[0, 'n_neigh']
dbscan_tun = DBSCAN(eps=top_eps, min_samples=top_min_samples).fit(pca_reduced_data)

# Plot the fitted UMAP object with points labelled by the DBSCAN assignment
uplot.points(umap_obj, labels=dbscan_tun.labels_)

In [None]:
# Ten parameter combinations with the highest silhouette score computed on
# the non-noise samples only, best first
best_silouethe_sub = (
    result_df
    .sort_values('silouethe_sub', ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)
best_silouethe_sub

In [None]:
# Refit DBSCAN with the top-ranked (eps, min_samples) pair by the non-noise
# silhouette score
top_eps = best_silouethe_sub.at[0, 'eps']
top_min_samples = best_silouethe_sub.at[0, 'n_neigh']
dbscan_tun = DBSCAN(eps=top_eps, min_samples=top_min_samples).fit(pca_reduced_data)

# Plot the fitted UMAP object with points labelled by the DBSCAN assignment
uplot.points(umap_obj, labels=dbscan_tun.labels_)

In [None]:
# Ten parameter combinations with the largest recorded cluster count,
# largest first
high_clust_num = (
    result_df
    .sort_values('num_clust', ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)
high_clust_num

In [None]:
# Refit DBSCAN with the (eps, min_samples) pair that gave the most clusters
top_eps = high_clust_num.at[0, 'eps']
top_min_samples = high_clust_num.at[0, 'n_neigh']
dbscan_tun = DBSCAN(eps=top_eps, min_samples=top_min_samples).fit(pca_reduced_data)

# Plot the fitted UMAP object with points labelled by the DBSCAN assignment
uplot.points(umap_obj, labels=dbscan_tun.labels_)

In [None]:
# Ten parameter combinations with the highest ARI computed on all samples,
# best first
best_ari = (
    result_df
    .sort_values('ari', ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)
best_ari

In [None]:
# Refit DBSCAN with the top-ranked (eps, min_samples) pair by ARI
top_eps = best_ari.at[0, 'eps']
top_min_samples = best_ari.at[0, 'n_neigh']
dbscan_tun = DBSCAN(eps=top_eps, min_samples=top_min_samples).fit(pca_reduced_data)

# Plot the fitted UMAP object with points labelled by the DBSCAN assignment
uplot.points(umap_obj, labels=dbscan_tun.labels_)

In [None]:
# Ten parameter combinations with the highest ARI computed on the non-noise
# samples only, best first
best_sub_ari = (
    result_df
    .sort_values('ari_sub', ascending=False)
    .iloc[:10]
    .reset_index(drop=True)
)
best_sub_ari

In [None]:
# Refit DBSCAN with the top-ranked (eps, min_samples) pair by non-noise ARI
top_eps = best_sub_ari.at[0, 'eps']
top_min_samples = best_sub_ari.at[0, 'n_neigh']
dbscan_tun = DBSCAN(eps=top_eps, min_samples=top_min_samples).fit(pca_reduced_data)

# Plot the fitted UMAP object with points labelled by the DBSCAN assignment
uplot.points(umap_obj, labels=dbscan_tun.labels_)