In [None]:
from google.colab import drive
import os
import numpy as np
import cupy as cp
import pandas as pd
import cudf
from cuml import DBSCAN
from cuml.metrics.cluster.silhouette_score import cython_silhouette_score
from sklearn.metrics import davies_bouldin_score


In [None]:
import json

def run_dbscan_on_all(pca_data, output_dir, min_samples=5, eps_list=None,
                      max_metric_clusters=50, max_metric_samples=300000):
    """Run DBSCAN for every dataset / eps pair and persist labels and metrics.

    For each ``(name, X)`` in ``pca_data`` and each ``eps`` in ``eps_list`` a
    ``DBSCAN(eps=eps, min_samples=min_samples)`` model is fitted, the labels
    are saved as ``labels_dbscan_<name>_eps<tag>.npy`` in ``output_dir`` and
    the metrics are merged into ``<output_dir>/dbscan_results.json`` (existing
    entries for other names / eps values are kept).

    Parameters
    ----------
    pca_data : dict
        Maps a display name to the data to cluster. Each value must be
        accepted by ``DBSCAN.fit_predict`` and expose ``.to_numpy()``.
    output_dir : str
        Directory for the label files and the JSON summary; created if absent.
    min_samples : int, default 5
        Forwarded to ``DBSCAN``.
    eps_list : list of float, optional
        Neighbourhood radii to try. Defaults to ``[0.3, 0.5, 0.7, 1.0, 1.3]``.
    max_metric_clusters : int, default 50
        Silhouette / Davies-Bouldin are only computed when
        ``1 < n_clusters < max_metric_clusters``.
    max_metric_samples : int, default 300000
        ... and when the number of labelled rows is below this value.

    Returns
    -------
    None
        Results are written to disk only.

    Notes
    -----
    * Every metrics entry always carries ``"silhouette"`` and ``"dbi"`` keys;
      they are ``None`` when the scores were skipped or failed.
    * The eps tag in the file name is ``str(eps)`` with the dot removed, so
      ``1.0`` and ``10`` would both map to ``eps10``. Readers of these files
      must derive the tag the same way.
    * When scores are computed, the noise label ``-1`` is passed to the
      scoring functions like any other label (NOTE(review): confirm this is
      the intended treatment of noise points).
    """
    if eps_list is None:
        eps_list = [0.3, 0.5, 0.7, 1.0, 1.3]

    os.makedirs(output_dir, exist_ok=True)
    results_path = os.path.join(output_dir, "dbscan_results.json")
    tmp_results_path = results_path + ".tmp"

    # Load existing results to avoid overwriting
    if os.path.exists(results_path):
        with open(results_path, "r") as f:
            results = json.load(f)
    else:
        results = {}

    for name, X in pca_data.items():
        print(f"\n Running DBSCAN on: {name}")
        config_results = results.setdefault(name, {})
        for eps in eps_list:
            print(f"  • eps = {eps}")
            model = DBSCAN(eps=eps, min_samples=min_samples)
            labels = model.fit_predict(X).to_numpy()

            # np.unique avoids building a Python set of one object per row.
            distinct_labels = np.unique(labels)
            n_clusters = int(np.count_nonzero(distinct_labels != -1))
            n_noise = int(np.count_nonzero(labels == -1))

            # Score keys are always present so every entry has the same schema.
            metrics = {
                "n_clusters": n_clusters,
                "n_noise": n_noise,
                "eps": float(eps),
                "silhouette": None,
                "dbi": None,
            }

            # Auto-skip silhouette/DBI if dataset is large or cluster structure is invalid
            if 1 < n_clusters < max_metric_clusters and len(labels) < max_metric_samples:
                try:
                    sil = float(cython_silhouette_score(X, labels, metric='euclidean'))
                    dbi = float(davies_bouldin_score(X.to_numpy(), labels))
                    metrics["silhouette"] = sil
                    metrics["dbi"] = dbi
                except Exception as e:
                    # Best effort: a scoring failure must not discard the clustering itself.
                    print(f"     Skipped metrics (runtime error): {e}")
            else:
                print(f"     Skipped metrics due to size or structure: n={len(labels)}, clusters={n_clusters}")

            # Save cluster labels
            label_filename = f"labels_dbscan_{name}_eps{str(eps).replace('.', '')}.npy"
            np.save(os.path.join(output_dir, label_filename), labels)

            config_results[str(eps)] = metrics

            # Save JSON after every run so partial progress survives an interruption.
            # Writing to a temp file and swapping it in keeps the previous file
            # intact if the process dies mid-write.
            with open(tmp_results_path, "w") as f:
                json.dump(results, f, indent=4)
            os.replace(tmp_results_path, results_path)

    print("\n All DBSCAN runs complete.")


In [None]:
from google.colab import drive
import os
import cupy as cp
import cudf

# Attach Google Drive so the project folders below are reachable from Colab.
drive.mount('/content/drive')

# Source folder for the PCA projections and destination for DBSCAN artefacts.
input_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/PCA_Arrays"
output_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/Clustering/DBSCAN"
os.makedirs(output_dir, exist_ok=True)


def to_cudf(cp_array):
    """Wrap a CuPy array in a cuDF DataFrame."""
    return cudf.DataFrame(cp_array)


# Only 2C and 3C PCA projections are considered here (the original note says
# the 95% one is skipped). Of those, only X_key_pca_3 is loaded for this run;
# the other loads are kept disabled so they can be switched back on.
#X_all_pca_2 = cp.load(os.path.join(input_dir, "X_all_pca_2.npy"))
#X_all_pca_3 = cp.load(os.path.join(input_dir, "X_all_pca_3.npy"))
#X_key_pca_2 = cp.load(os.path.join(input_dir, "X_key_pca_2.npy"))
X_key_pca_3 = cp.load(os.path.join(input_dir, "X_key_pca_3.npy"))

print(" Selected PCA arrays loaded for DBSCAN.")

# Disabled alternative: all four configurations in one call.
#dbscan_inputs = {
#    "Full (2C)": to_cudf(X_all_pca_2),
#    "Full (3C)": to_cudf(X_all_pca_3),
#    "Key (2C)": to_cudf(X_key_pca_2),
#    "Key (3C)": to_cudf(X_key_pca_3),
#}
#run_dbscan_on_all(dbscan_inputs, output_dir, min_samples=5)

# Disabled alternative: the two "Full" configurations only.
#full_configs = {
#    "Full (2C)": to_cudf(X_all_pca_2),
#    "Full (3C)": to_cudf(X_all_pca_3)
#}
#run_dbscan_on_all(full_configs, output_dir, min_samples=5)

# Active run: the "Key (3C)" configuration only.
# (disabled entry: "Key (2C)": to_cudf(X_key_pca_2))
key_configs = {}
key_configs["Key (3C)"] = to_cudf(X_key_pca_3)
run_dbscan_on_all(key_configs, output_dir, min_samples=5)





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Selected PCA arrays loaded for DBSCAN.

 Running DBSCAN on: Key (3C)
  • eps = 0.3
     Skipped metrics due to size or structure: n=968524, clusters=5491
  • eps = 0.5
     Skipped metrics due to size or structure: n=968524, clusters=5074
  • eps = 0.7
     Skipped metrics due to size or structure: n=968524, clusters=4933
  • eps = 1.0
     Skipped metrics due to size or structure: n=968524, clusters=1072
  • eps = 1.3
     Skipped metrics due to size or structure: n=968524, clusters=416

 All DBSCAN runs complete.


In [None]:
import os
import json
import numpy as np
import pandas as pd

def load_dbscan_visualization_data(base_dir, include_metrics=True):
    """
    Collect everything the DBSCAN plots need into a single dictionary.

    Files read, all relative to ``base_dir``:
    - y_labels.pkl              fraud labels (pandas object indexed by row id)
    - index_all_scaled.csv      first column = row ids of the "all" set
    - index_key_scaled.csv      first column = row ids of the "key" set
    - PCA/*.npy                 PCA projections
    - DBSCAN/labels_dbscan*.npy cluster labels
    - DBSCAN/dbscan_results.json (optional, only when ``include_metrics``)

    Returned keys:
    - "y_all", "y_key"          label values selected by the matching index
    - "index_all", "index_key"  the indexes themselves
    - one key per loaded .npy file: the file name with ".npy" removed
    - "dbscan_results"          parsed JSON; absent if the file does not
                                exist or ``include_metrics`` is False
    """
    pca_dir = os.path.join(base_dir, "PCA")
    db_dir = os.path.join(base_dir, "DBSCAN")

    # NOTE: unpickling executes arbitrary code; only point this at trusted files.
    fraud_labels = pd.read_pickle(os.path.join(base_dir, "y_labels.pkl"))

    idx_all = pd.read_csv(os.path.join(base_dir, "index_all_scaled.csv"), index_col=0).index
    idx_key = pd.read_csv(os.path.join(base_dir, "index_key_scaled.csv"), index_col=0).index

    bundle = {
        "y_all": fraud_labels.loc[idx_all].values,
        "y_key": fraud_labels.loc[idx_key].values,
        "index_all": idx_all,
        "index_key": idx_key,
    }

    # PCA projections: every .npy file in the PCA folder.
    bundle.update(
        (fname.replace(".npy", ""), np.load(os.path.join(pca_dir, fname)))
        for fname in os.listdir(pca_dir)
        if fname.endswith(".npy")
    )

    # Cluster labels: only .npy files whose name starts with "labels_dbscan".
    bundle.update(
        (fname.replace(".npy", ""), np.load(os.path.join(db_dir, fname)))
        for fname in os.listdir(db_dir)
        if fname.startswith("labels_dbscan") and fname.endswith(".npy")
    )

    if not include_metrics:
        return bundle

    # Metrics are optional: the key is simply left out when the file is absent.
    metrics_path = os.path.join(db_dir, "dbscan_results.json")
    if os.path.exists(metrics_path):
        with open(metrics_path, "r") as fh:
            bundle["dbscan_results"] = json.load(fh)

    return bundle


In [None]:
import matplotlib.pyplot as plt

def plot_dbscan_comparison(x, y_true, label_dict, label_display, eps_mapping, dbscan_results, save_path=None):
    """Draw one scatter panel per eps value plus a legend-only panel.

    Parameters
    ----------
    x : array
        Indexed as ``x[:, 0]`` / ``x[:, 1]`` for the two plotted axes and with
        boolean masks, so it needs at least two columns.
    y_true : array
        Compared element-wise with ``1`` to pick the points drawn as "Fraud".
    label_dict : dict
        Maps an eps tag to the cluster-label array for that run; one panel is
        drawn per entry, in insertion order. Label ``-1`` is drawn as "Noise".
    label_display : str
        Configuration name used in the figure title and as the first-level
        key into ``dbscan_results``.
    eps_mapping : dict
        Maps every eps tag in ``label_dict`` to its numeric eps value.
    dbscan_results : dict or None
        ``{label_display: {str(eps): {"n_clusters": ...}}}``; missing entries
        (or ``None``) make the panel title show ``k = ?``.
    save_path : str, optional
        When given, the figure is also written there at 300 dpi.

    The grid has three columns and at least two rows (16x8 inches for up to
    five panels, as before). It grows by whole rows when more than five label
    sets are supplied, so the last axis is always free for the legend. Axes
    that receive no scatter are hidden. The figure is closed after being
    shown so figures do not accumulate when this is called in a loop.
    """
    n_cols = 3
    n_plots = len(label_dict)
    # +1 reserves the final slot for the legend; -(-a // b) is ceil(a / b).
    n_rows = max(2, -(-(n_plots + 1) // n_cols))

    fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, 4 * n_rows), constrained_layout=True)
    axs = axs.flatten()

    config_metrics = (dbscan_results or {}).get(label_display, {})
    fraud_mask = y_true == 1  # identical for every panel, so computed once

    for ax, (eps_str, labels) in zip(axs, label_dict.items()):
        epsilon_val = eps_mapping[eps_str]
        k_val = config_metrics.get(str(epsilon_val), {}).get("n_clusters", "?")
        noise_mask = labels == -1

        ax.scatter(x[:, 0], x[:, 1], c=labels, cmap="cividis", alpha=0.6, s=10)
        ax.scatter(x[fraud_mask][:, 0], x[fraud_mask][:, 1], c="red", marker="x", s=10, label="Fraud")
        ax.scatter(x[noise_mask][:, 0], x[noise_mask][:, 1], c="black", marker="v", s=10, label="Noise")
        ax.set_title(f"ε = {epsilon_val:.2f}, k = {k_val}")
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")

    # Hide every axis without a scatter; the last of them hosts the legend.
    for spare_ax in axs[n_plots:]:
        spare_ax.axis("off")

    handles, legend_labels = axs[0].get_legend_handles_labels()
    if handles:
        axs[-1].legend(handles, legend_labels, loc="center", fontsize="medium")

    fig.suptitle(f"DBSCAN Results for {label_display}", fontsize=16)

    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.show()
    plt.close(fig)


In [None]:
# Plot configurations, keyed by display name. Each value is a tuple of
# (key of the PCA array, key of the fraud-label array, eps values to visualize).
# Every entry gets its own copy of the eps list.
# NOTE(review): the PCA keys are lower-case ("x_..."), while the DBSCAN run
# cell reads files named "X_..._pca_N.npy" from PCA_Arrays; confirm the
# files in the PCA folder used for plotting carry the lower-case names.
dbscan_config_map = {
    display_name: (pca_key, fraud_key, [0.3, 0.5, 0.7, 1.0, 1.3])
    for display_name, pca_key, fraud_key in (
        ("Full (2C)", "x_all_pca_2", "y_all"),
        ("Full (3C)", "x_all_pca_3", "y_all"),
        ("Key (2C)", "x_key_pca_2", "y_key"),
        ("Key (3C)", "x_key_pca_3", "y_key"),
    )
}


In [None]:
# Set base path and output dir
# NOTE: `output_dir` is rebound here to the figures folder; the DBSCAN run
# cell assigns the same name to the clustering output folder.
base_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/Clustering"
output_dir = "/content/drive/MyDrive/NCU/Dissertation/Figures/DBSCANPlots"
os.makedirs(output_dir, exist_ok=True)

# Load all data
data = load_dbscan_visualization_data(base_dir)

# The loader only adds "dbscan_results" when the JSON file exists, so fall
# back to an empty mapping (panel titles then show "k = ?") instead of
# failing with a KeyError.
dbscan_results = data.get("dbscan_results", {})

# Generate plots
for label, (x_key, y_key, eps_list) in dbscan_config_map.items():
    if x_key not in data:
        # Skip this configuration rather than aborting the remaining ones.
        print(f"⚠️  Missing: {x_key}")
        continue

    x = data[x_key]
    y = data[y_key]

    # Build the eps tag exactly as the label files were named when saved
    # (str(eps) with the dot removed). The previous int(eps * 10) form
    # truncates, so e.g. 0.35 produced "03" and could never match "035".
    eps_mapping = {str(eps).replace('.', ''): eps for eps in eps_list}
    label_dict = {}

    for eps_str in eps_mapping:
        file_key = f"labels_dbscan_{label}_eps{eps_str}"
        if file_key in data:
            label_dict[eps_str] = data[file_key]
        else:
            print(f"⚠️  Missing: {file_key}")

    if label_dict:
        save_path = os.path.join(
            output_dir,
            f"dbscan_comparison_{label.lower().replace(' ', '_').replace('(', '').replace(')', '')}.png"
        )
        print(f"✅ Generating: {label} with ε = {list(eps_mapping.values())}")
        plot_dbscan_comparison(
            x=x,
            y_true=y,
            label_dict=label_dict,
            label_display=label,
            eps_mapping=eps_mapping,
            dbscan_results=dbscan_results,
            save_path=save_path
        )
