## DBSCAN sklearn demo

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors

from sklearn.datasets import make_moons

from kneed import KneeLocator

# Apply seaborn's default theme first, then override matplotlib's default
# colormap so that label-coloured scatter plots use the "tab10" palette.
sns.set_theme()
plt.rcParams.update({"image.cmap": "tab10"})

### Create the dataset

In [None]:
# Generate the two-moons toy dataset: 2000 shuffled points with noise level
# 0.03; the fixed random_state keeps the sample reproducible between runs.
X, y = make_moons(n_samples=2000, noise=0.03, shuffle=True, random_state=12)

# Scatter the raw, unlabelled points (X.T unpacks into the two coordinates)
plt.figure()
plt.scatter(*X.T, s=4)
plt.title('Dataset')
plt.show()

In [None]:
# Baseline: fit a seeded 2-cluster k-means model on the moons data
kmeans = KMeans(n_clusters=2, n_init='auto', random_state=0).fit(X)

# Draw the points coloured by their k-means label, then overlay the two
# fitted cluster centres in red
plt.figure()
plt.scatter(*X.T, c=kmeans.labels_, s=4)
plt.scatter(*kmeans.cluster_centers_.T, c='red', s=4)
plt.title('Clustered dataset - K means')
plt.show()

## Cluster with raw DBSCAN

In [None]:
# Fit DBSCAN with its default hyper-parameters (no tuning yet)
dbscan = DBSCAN().fit(X)

# Show the points coloured by the label DBSCAN assigned to each of them
plt.figure()
plt.scatter(*X.T, c=dbscan.labels_, s=4)
plt.title('Clustered dataset - DBSCAN')
plt.show()

### Tuning DBSCAN parameters

In [None]:
# Rule of thumb used here: MinPts = 2 * number of features
min_pts = 2 * X.shape[1]

# Distances from every point to its min_pts nearest neighbours.
# The query set is the training set itself, so (barring duplicate points)
# the first column is each point's zero distance to itself and the last
# column is the distance to its (min_pts - 1)-th *other* neighbour.
nbrs = NearestNeighbors()
nbrs.fit(X)
distances, _ = nbrs.kneighbors(X, n_neighbors=min_pts)
distances

In [None]:
# Build the k-distance curve: the last column holds the distance to the
# farthest of the requested neighbours; sort it in ascending order
sort_dist = np.sort(distances[:, -1])

# Plot the curve against the point rank and look for its knee
plt.plot(np.arange(sort_dist.size), sort_dist)
plt.ylabel(f'distance to {min_pts - 1}-rd neighbor')
plt.show()

In [None]:
# Locate the knee of the sorted k-distance curve automatically; the curve
# is described to kneed as convex and increasing.
kneedle = KneeLocator(
    x=range(len(sort_dist)),
    y=sort_dist,
    S=1.0,
    curve="convex",
    direction="increasing",
)
# Draw the curve with the detected knee marked
kneedle.plot_knee()

In [None]:
# Re-run DBSCAN with tuned hyper-parameters.
# NOTE(review): eps is hard-coded; presumably it was read off the knee of
# the k-distance curve -- re-check it if the dataset or min_pts changes.
# min_samples reuses min_pts (4 for this 2-D dataset) so the clustering
# always matches the neighbourhood size the k-distance curve was built for.
dbscan_tun = DBSCAN(eps=0.038, min_samples=min_pts)
_ = dbscan_tun.fit(X)

# Show the points coloured by their tuned-DBSCAN label
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=dbscan_tun.labels_, s=4)
plt.title('Clustered dataset - DBSCAN after parameter tuning')
plt.show()

In [None]:
# Silhouette coefficient of the k-means partition, displayed as cell output
silhouette_score(X, labels=kmeans.labels_)

In [None]:
# Silhouette coefficient of the tuned DBSCAN partition.
# DBSCAN labels noise points as -1; left in, silhouette_score would treat
# them as one more cluster and distort the score, so they are dropped first.
# silhouette_score still needs at least two real clusters to remain.
is_clustered = dbscan_tun.labels_ != -1
silhouette_score(X[is_clustered], dbscan_tun.labels_[is_clustered])