In [1]:
from google.colab import drive
import os
import numpy as np
import cupy as cp
import pandas as pd
import cudf
from cuml import DBSCAN
from cuml.metrics.cluster.silhouette_score import cython_silhouette_score
from sklearn.metrics import davies_bouldin_score


In [2]:
import json

def run_dbscan_on_all(pca_data, output_dir, min_samples=5, eps_list=None,
                      max_metric_clusters=50, max_metric_samples=300000):
    """Run DBSCAN for every (dataset, eps) pair and persist labels and metrics.

    For each dataset in ``pca_data`` and each value in ``eps_list`` this fits a
    ``DBSCAN`` model, saves the label vector as a ``.npy`` file in
    ``output_dir`` and records summary metrics in
    ``<output_dir>/dbscan_results.json``. The JSON file is rewritten after
    every single run so that an interrupted session keeps the runs finished so
    far; results already present in the file for other datasets / eps values
    are preserved.

    Args:
        pca_data: Mapping of dataset name -> data. Each value is passed to
            ``DBSCAN.fit_predict`` and must expose ``to_numpy()`` (the caller
            in this notebook passes cuDF DataFrames).
        output_dir: Directory for the label files and the results JSON;
            created if missing.
        min_samples: ``min_samples`` forwarded to ``DBSCAN``.
        eps_list: Iterable of ``eps`` values to try. Defaults to
            ``[0.3, 0.5, 0.7, 1.0, 1.3]``.
        max_metric_clusters: Silhouette / Davies-Bouldin are only computed
            when ``1 < n_clusters < max_metric_clusters``.
        max_metric_samples: ... and when the number of samples is strictly
            below this value.

    Returns:
        None. All results are written to ``output_dir``.

    Every metrics entry always carries the keys ``n_clusters``, ``n_noise``,
    ``eps``, ``silhouette`` and ``dbi``; the last two are ``None`` whenever
    the score was skipped or could not be computed.
    """
    if eps_list is None:
        eps_list = [0.3, 0.5, 0.7, 1.0, 1.3]

    os.makedirs(output_dir, exist_ok=True)
    results_path = os.path.join(output_dir, "dbscan_results.json")

    # Start from previously saved results so earlier runs are not lost.
    if os.path.exists(results_path):
        with open(results_path, "r") as f:
            results = json.load(f)
    else:
        results = {}

    for name, X in pca_data.items():
        print(f"\n Running DBSCAN on: {name}")
        dataset_results = results.setdefault(name, {})
        for eps in eps_list:
            print(f"  • eps = {eps}")
            model = DBSCAN(eps=eps, min_samples=min_samples)
            labels = model.fit_predict(X).to_numpy()

            # Label -1 marks noise and is not counted as a cluster.
            # np.unique is vectorised, unlike building a Python set of labels.
            unique_labels = np.unique(labels)
            n_clusters = int(np.count_nonzero(unique_labels != -1))
            n_noise = int(np.count_nonzero(labels == -1))

            # The score keys are always present so every JSON entry has the
            # same schema regardless of which branch below is taken.
            metrics = {
                "n_clusters": n_clusters,
                "n_noise": n_noise,
                "eps": float(eps),
                "silhouette": None,
                "dbi": None,
            }

            # Auto-skip silhouette/DBI if dataset is large or cluster structure is invalid
            if 1 < n_clusters < max_metric_clusters and len(labels) < max_metric_samples:
                # NOTE(review): noise points (-1) are passed to both scores as
                # if they formed one more cluster — confirm this is intended.
                try:
                    # Stored one at a time so a failure in the second score
                    # does not discard the first.
                    metrics["silhouette"] = float(
                        cython_silhouette_score(X, labels, metric='euclidean')
                    )
                    metrics["dbi"] = float(davies_bouldin_score(X.to_numpy(), labels))
                except Exception as e:
                    # Best effort: a failed score must not abort the sweep.
                    print(f"     Skipped metrics (runtime error): {e}")
            else:
                print(f"     Skipped metrics due to size or structure: n={len(labels)}, clusters={n_clusters}")

            # Save cluster labels.
            # NOTE(review): dropping the '.' makes e.g. eps=1.0 and eps=10 map
            # to the same file name; kept as-is because existing files and any
            # downstream readers rely on this naming scheme.
            label_filename = f"labels_dbscan_{name}_eps{str(eps).replace('.', '')}.npy"
            np.save(os.path.join(output_dir, label_filename), labels)

            dataset_results[str(eps)] = metrics

            # Save JSON after every run (checkpoint).
            with open(results_path, "w") as f:
                json.dump(results, f, indent=4)

    print("\n All DBSCAN runs complete.")


In [5]:
from google.colab import drive
import os
import cupy as cp
import cudf

# Mount Google Drive so the Drive-hosted data folders below are reachable.
drive.mount('/content/drive')

# Paths: PCA arrays are read from input_dir; output_dir is created here and
# later handed to run_dbscan_on_all as the destination for its outputs.
input_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/PCA_Arrays"
output_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/Clustering/DBSCAN"
os.makedirs(output_dir, exist_ok=True)

# Load the PCA projections with CuPy. Only X_key_pca_3 is loaded in this run;
# the other three loads are commented out.
# (Original note said "95% skipped" — presumably a 95%-variance projection is
# not used here; TODO confirm.)
#X_all_pca_2 = cp.load(os.path.join(input_dir, "X_all_pca_2.npy"))
#X_all_pca_3 = cp.load(os.path.join(input_dir, "X_all_pca_3.npy"))
#X_key_pca_2 = cp.load(os.path.join(input_dir, "X_key_pca_2.npy"))
X_key_pca_3 = cp.load(os.path.join(input_dir, "X_key_pca_3.npy"))

print(" Selected PCA arrays loaded for DBSCAN.")

# Convert CuPy arrays to cuDF DataFrames
def to_cudf(cp_array):
    """Wrap ``cp_array`` in a new ``cudf.DataFrame`` and return it."""
    frame = cudf.DataFrame(cp_array)
    return frame

# Disabled alternative: all four projections in one dictionary for DBSCAN.
#dbscan_inputs = {
#    "Full (2C)": to_cudf(X_all_pca_2),
#    "Full (3C)": to_cudf(X_all_pca_3),
#    "Key (2C)": to_cudf(X_key_pca_2),
#    "Key (3C)": to_cudf(X_key_pca_3),
#}

# Disabled alternative: the two "Full" projections on their own.
#full_configs = {
#    "Full (2C)": to_cudf(X_all_pca_2),
#    "Full (3C)": to_cudf(X_all_pca_3)
#}

#run_dbscan_on_all(full_configs, output_dir, min_samples=5)

# Active run: only the "Key (3C)" projection is clustered. eps_list is not
# passed, so run_dbscan_on_all falls back to its default eps values.
key_configs = {
 #   "Key (2C)": to_cudf(X_key_pca_2)
    "Key (3C)": to_cudf(X_key_pca_3)
}
run_dbscan_on_all(key_configs, output_dir, min_samples=5)



# Disabled alternative: run DBSCAN clustering on all configs at once.
#run_dbscan_on_all(dbscan_inputs, output_dir, min_samples=5)





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Selected PCA arrays loaded for DBSCAN.

 Running DBSCAN on: Key (3C)
  • eps = 0.3
     Skipped metrics due to size or structure: n=968524, clusters=5491
  • eps = 0.5
     Skipped metrics due to size or structure: n=968524, clusters=5074
  • eps = 0.7
     Skipped metrics due to size or structure: n=968524, clusters=4933
  • eps = 1.0
     Skipped metrics due to size or structure: n=968524, clusters=1072
  • eps = 1.3
     Skipped metrics due to size or structure: n=968524, clusters=416

 All DBSCAN runs complete.
