In [None]:
from google.colab import drive
import os
import numpy as np
import cupy as cp
import pandas as pd
import cudf
from cuml import DBSCAN
from cuml.metrics.cluster.silhouette_score import cython_silhouette_score
from sklearn.metrics import davies_bouldin_score


In [None]:
import json

def _to_host_array(values):
    """Return ``values`` as a NumPy array, copying it off the GPU when needed."""
    if isinstance(values, np.ndarray):
        return values
    if hasattr(values, "to_numpy"):  # cuDF / pandas containers
        return values.to_numpy()
    if hasattr(values, "get"):  # CuPy arrays copy to host via .get()
        return values.get()
    return np.asarray(values)


def run_dbscan_on_all(pca_data, output_dir, min_samples=5, eps_list=None):
    """Run DBSCAN for every dataset / eps combination and persist the results.

    For each entry of ``pca_data`` and each value in ``eps_list`` a DBSCAN
    model is fitted, the label vector is saved into ``output_dir`` as a
    ``.npy`` file, and summary metrics are collected. Once every run has
    finished, all metrics are written to ``dbscan_results.json`` in
    ``output_dir``.

    Parameters
    ----------
    pca_data : dict
        Maps a dataset name (used in log lines, label file names and as a JSON
        key) to the samples passed to ``DBSCAN.fit_predict``.
    output_dir : str
        Directory that receives the label files and the JSON summary. It is
        created if it does not exist.
    min_samples : int, default 5
        Forwarded to ``DBSCAN``.
    eps_list : list of float, optional
        Neighbourhood radii to try. Defaults to ``[0.3, 0.5, 0.7, 1.0, 1.3]``.

    Returns
    -------
    None
        Results are written to disk and progress is printed.

    Notes
    -----
    Silhouette and Davies-Bouldin scores are recorded only when at least two
    clusters were found, and they are computed on the clustered points only:
    samples labelled ``-1`` (noise) are excluded. Metric computation is
    best-effort; on failure the reason is printed and the run is still saved
    without those two keys.
    """
    if eps_list is None:
        eps_list = [0.3, 0.5, 0.7, 1.0, 1.3]

    os.makedirs(output_dir, exist_ok=True)
    results = {}

    for name, X in pca_data.items():
        print(f"\n Running DBSCAN on: {name}")
        results[name] = {}
        X_host = None  # host copy of X, created lazily the first time metrics need it

        for eps in eps_list:
            print(f"  • eps = {eps}")
            model = DBSCAN(eps=eps, min_samples=min_samples)
            # Normalise to a host NumPy array once; everything below (counting,
            # scoring, np.save) works on this single representation.
            labels = _to_host_array(model.fit_predict(X))

            is_noise = labels == -1
            n_noise = int(np.count_nonzero(is_noise))
            n_clusters = int(np.unique(labels[~is_noise]).size)

            metrics = {
                "n_clusters": n_clusters,
                "n_noise": n_noise,
                "eps": float(eps)
            }

            # Two or more clusters implies that not every point is noise.
            if n_clusters > 1:
                try:
                    if X_host is None:
                        X_host = _to_host_array(X)
                    # Score clustered points only. NOTE(review): the GPU
                    # silhouette kernel presumably expects labels in
                    # [0, n_clusters); dropping -1 keeps that true — confirm
                    # against the cuML documentation.
                    clustered_X = X_host[~is_noise]
                    clustered_labels = labels[~is_noise]
                    sil = float(cython_silhouette_score(
                        clustered_X, clustered_labels, metric='euclidean'))
                    dbi = float(davies_bouldin_score(clustered_X, clustered_labels))
                    # Assign together so a run never records just one of the two.
                    metrics["silhouette"] = sil
                    metrics["dbi"] = dbi
                except Exception as e:
                    print(f"    (skipped metrics: {e})")

            # Save cluster labels. The decimal point is stripped from eps
            # (0.3 -> "03", 1.0 -> "10"), so e.g. 1.0 and 10 would map to the
            # same file; the scheme is kept so existing files stay loadable.
            label_filename = f"labels_dbscan_{name}_eps{str(eps).replace('.', '')}.npy"
            np.save(os.path.join(output_dir, label_filename), labels)

            results[name][str(eps)] = metrics

    # Save JSON
    with open(os.path.join(output_dir, "dbscan_results.json"), "w") as f:
        json.dump(results, f, indent=4)

    print("\n All DBSCAN runs complete.")


In [None]:
from google.colab import drive
import os
import cupy as cp
import cudf

# Locations on Google Drive: PCA projections are read from `input_dir`,
# DBSCAN artefacts are written to `output_dir`.
input_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/PCA_Arrays"
output_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/Clustering/DBSCAN"

# Drive has to be mounted before anything under /content/drive is touched.
drive.mount('/content/drive')
os.makedirs(output_dir, exist_ok=True)

# Load the 2- and 3-component PCA projections straight into CuPy arrays
# (per the original note, the "95%" projection is intentionally not loaded).
X_all_pca_2, X_all_pca_3, X_key_pca_2, X_key_pca_3 = (
    cp.load(os.path.join(input_dir, filename))
    for filename in (
        "X_all_pca_2.npy",
        "X_all_pca_3.npy",
        "X_key_pca_2.npy",
        "X_key_pca_3.npy",
    )
)

print(" Selected PCA arrays loaded for DBSCAN.")

# Convert CuPy arrays to cuDF DataFrames
def to_cudf(cp_array):
    """Wrap ``cp_array`` in a ``cudf.DataFrame`` and return it."""
    frame = cudf.DataFrame(cp_array)
    return frame

# Named DBSCAN inputs: each loaded PCA projection becomes a cuDF DataFrame,
# keyed by the label that ends up in log lines, file names and the JSON summary.
dbscan_inputs = {
    label: to_cudf(projection)
    for label, projection in (
        ("Full (2C)", X_all_pca_2),
        ("Full (3C)", X_all_pca_3),
        ("Key (2C)", X_key_pca_2),
        ("Key (3C)", X_key_pca_3),
    )
}

# Cluster every configuration with the default eps grid.
run_dbscan_on_all(pca_data=dbscan_inputs, output_dir=output_dir, min_samples=5)
