<a href="https://colab.research.google.com/github/samipn/clustering_demos/blob/main/dbscan_pycaret.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment (d): DBSCAN Clustering using PyCaret

This notebook uses the PyCaret library to perform DBSCAN clustering on a non-convex dataset and evaluates clustering quality.


In [1]:
!pip install pycaret


Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting numpy<1.27,>=1.21 (from pycaret)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas<2.2.0 (from pycaret)
  Downloading pandas-2.1.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret)
  Downloading scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-2.0.5-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━

In [21]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons
from pycaret.clustering import setup, create_model, plot_model, assign_model, pull

# Create non-convex dataset (two moons)
X, _ = make_moons(n_samples=600, noise=0.07, random_state=42)
df = pd.DataFrame(X, columns=["x1", "x2"])
df.head()


Unnamed: 0,x1,x2
0,0.465443,0.94983
1,0.688884,-0.444873
2,1.836104,0.123468
3,0.752228,0.707265
4,-0.321856,0.863399


In [22]:
# PyCaret setup for clustering
exp = setup(
    data=df,
    normalize=True,
    session_id=42,
    verbose=True
)


Unnamed: 0,Description,Value
0,Session id,42
1,Original data shape,"(600, 2)"
2,Transformed data shape,"(600, 2)"
3,Numeric features,2
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,Normalize,True
9,Normalize method,zscore


In [28]:
# Create DBSCAN model & inspect results
dbscan = create_model('dbscan', eps=0.2, min_samples=10)  # Adjusted parameters for better clustering
results = pull()  # get results table
results

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2283,266.1625,2.1312,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2283,266.1625,2.1312,0,0,0


In [29]:
plot_model(dbscan, plot='cluster')  # scatter plot of clusters

# Assign cluster labels to original data
dbscan_assigned = assign_model(dbscan)
dbscan_assigned.head()

Unnamed: 0,x1,x2,Cluster
0,0.465443,0.94983,Cluster 0
1,0.688884,-0.444873,Cluster 1
2,1.836104,0.123468,Cluster 1
3,0.752228,0.707265,Cluster 0
4,-0.321856,0.863398,Cluster 0


In [30]:
from sklearn.metrics import silhouette_score

labels = dbscan_assigned['Cluster'].values
X_cluster = dbscan_assigned[['x1', 'x2']].values

unique_labels = np.unique(labels)
if len(unique_labels) > 1:
    mask = labels != -1  # exclude noise
    if len(np.unique(labels[mask])) > 1:
        sil = silhouette_score(X_cluster[mask], labels[mask])
        print("Silhouette score (excluding noise):", sil)
    else:
        print("Too few non-noise clusters for silhouette.")
else:
    print("Silhouette score cannot be computed (only one cluster).")

Silhouette score (excluding noise): 0.1887532
