<a href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Working/Modeling/04_Hier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

In [None]:
# Cell 1: Import packages
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from cuml.metrics import silhouette_score
from cuml import AgglomerativeClustering

from sklearn.metrics import davies_bouldin_score
from scipy.cluster.hierarchy import dendrogram, linkage

import cupy as cp
import cudf

plt.style.use("/content/drive/MyDrive/NCU/Dissertation/apa.mplstyle")  # Optional


In [None]:
# Cell 2: Load PCA arrays + labels
def load_hierarchical_data(base_dir):
    """Load the label vectors and PCA projections used for clustering.

    Everything is read from the ``PCA`` sub-folder of *base_dir*.

    Args:
        base_dir: Directory that contains the ``PCA`` folder.

    Returns:
        dict with the keys ``y_all`` / ``y_key`` (label values selected by
        the row index stored in the matching ``index_*_scaled.csv`` file)
        followed by ``x_all_pca_2``, ``x_all_pca_3``, ``x_key_pca_2`` and
        ``x_key_pca_3`` (arrays loaded from the corresponding ``.npy`` files).
    """
    pca_dir = os.path.join(base_dir, "PCA")

    def _row_index(csv_name):
        # Only the index column of the CSV is needed; it selects label rows.
        return pd.read_csv(os.path.join(pca_dir, csv_name), index_col=0).index

    all_labels = pd.read_pickle(os.path.join(pca_dir, "y_labels.pkl"))
    rows_all = _row_index("index_all_scaled.csv")
    rows_key = _row_index("index_key_scaled.csv")

    loaded = {
        "y_all": all_labels.loc[rows_all].values,
        "y_key": all_labels.loc[rows_key].values,
    }

    # (result key, file name) pairs for the PCA inputs.
    pca_files = (
        ("x_all_pca_2", "X_all_pca_2.npy"),
        ("x_all_pca_3", "X_all_pca_3.npy"),
        ("x_key_pca_2", "X_key_pca_2.npy"),
        ("x_key_pca_3", "X_key_pca_3.npy"),
    )
    loaded.update(
        (result_key, np.load(os.path.join(pca_dir, file_name)))
        for result_key, file_name in pca_files
    )
    return loaded


In [None]:
def run_hierarchical_clustering_all(input_dict, output_dir, result_path, n_clusters=2):
    """Run agglomerative clustering on every entry of *input_dict*.

    For each ``name -> X`` pair the cluster labels are written to
    ``labels_hier_<slug>.npy`` inside *output_dir*, and the silhouette score
    and Davies-Bouldin index are stored under ``name`` in the JSON file at
    *result_path*. The JSON file is rewritten after every configuration so
    partial progress survives an interrupted run.

    Args:
        input_dict: Mapping of configuration name to a feature matrix
            (anything ``cudf.DataFrame`` accepts, or a ``cudf.DataFrame``).
        output_dir: Directory for the saved label arrays; created if missing.
        result_path: JSON file holding the metrics. Existing content is
            loaded first and entries with the same name are overwritten.
        n_clusters: Number of clusters requested from the model.

    Returns:
        None. Results are persisted to disk.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load or initialize results
    if os.path.exists(result_path):
        with open(result_path, "r", encoding="utf-8") as f:
            results = json.load(f)
    else:
        results = {}

    for name, X in input_dict.items():
        print(f"\n Running Agglomerative Clustering for: {name} | Shape: {X.shape}")

        # The model is fed a cuDF frame; the caller's object is kept untouched
        # so a host copy can be taken from it for the CPU-side metric below.
        X_gpu = X if isinstance(X, cudf.DataFrame) else cudf.DataFrame(X)

        # NOTE(review): cuML's AgglomerativeClustering has historically only
        # accepted linkage="single" -- confirm "complete" works with the
        # installed cuML version.
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage="complete")
        labels = model.fit_predict(X_gpu)

        # np.save and scikit-learn need host memory; device containers such as
        # cuDF Series do not convert implicitly, so copy explicitly.
        labels_host = _to_host_array(labels)

        # Save labels to file
        label_file = f"labels_hier_{name.lower().replace(' ', '_').replace('(', '').replace(')', '')}.npy"
        np.save(os.path.join(output_dir, label_file), labels_host)

        # Compute metrics. `silhouette_score` is the cuML implementation
        # imported at the top of the file (the previous call used a name that
        # was never imported, so the metrics were always reported as "-").
        try:
            sil = float(silhouette_score(X_gpu, labels, metric='euclidean'))
            dbi = float(davies_bouldin_score(_to_host_array(X), labels_host))
        except (ValueError, TypeError, RuntimeError, MemoryError) as e:
            # Best effort: e.g. a degenerate single-cluster result makes the
            # metrics undefined. Programming errors are no longer swallowed.
            print(f" Metric error on {name}: {e}")
            sil, dbi = "-", "-"

        results[name] = {
            "n_clusters": int(n_clusters),
            "silhouette": sil,
            "db_index": dbi
        }

        # Save JSON after each config
        with open(result_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)

        print(f" Saved: {label_file} | Silhouette={sil}, DBI={dbi}")

    print("\n Hierarchical clustering complete for all configurations.")


def _to_host_array(values):
    """Return *values* as a host-side NumPy array, copying off the GPU if needed."""
    if hasattr(values, "to_numpy"):  # cuDF / pandas containers
        return values.to_numpy()
    if hasattr(values, "get"):  # CuPy ndarray
        return values.get()
    return np.asarray(values)



In [None]:
# Cell 4: Dendrogram using scipy (CPU, small sample size)
def plot_dendrogram(x, label, save_path=None, truncate_n=200):
    """Draw a complete-linkage dendrogram for the first rows of *x*.

    Args:
        x: Feature matrix; only ``x[:truncate_n]`` is used for the linkage.
        label: Text appended to the plot title.
        save_path: If truthy, the figure is also written there at 300 dpi.
        truncate_n: Number of leading rows fed to scipy's ``linkage``.
    """
    # Leading slice only (not a random sample).
    head_rows = x[:truncate_n]
    merge_tree = linkage(head_rows, method="complete", metric="euclidean")

    plt.figure(figsize=(10, 4))
    # Show only the last 25 merged clusters so the plot stays readable.
    dendrogram(
        merge_tree,
        truncate_mode="lastp",
        p=25,
        leaf_rotation=90,
    )

    plt.ylabel("Distance")
    plt.xlabel("Sample Index")
    plt.title(f"Dendrogram: {label}")

    if save_path:
        plt.savefig(save_path, dpi=300)

    plt.show()


In [None]:
# Cell 5: 2D cluster projection (first two PCs, also for 3-component inputs) with fraud overlay
def plot_cluster_projection(x, y, labels, title, save_path=None):
    """Scatter the first two columns of *x* coloured by cluster label.

    Rows where ``y == 1`` are drawn again as red crosses with the legend
    entry "Fraud".

    Args:
        x: 2-D array; column 0 is plotted as PC1 and column 1 as PC2.
        y: Array aligned with the rows of *x*; value 1 selects overlay rows.
        labels: Per-row cluster labels used as the scatter colours.
        title: Plot title.
        save_path: If truthy, the figure is also written there at 300 dpi.
    """
    plt.figure(figsize=(5, 5))

    # Base layer: every point, coloured by its cluster assignment.
    pc1, pc2 = x[:, 0], x[:, 1]
    plt.scatter(pc1, pc2, c=labels, cmap="cividis", alpha=0.6, s=10)

    # Overlay: rows flagged with y == 1.
    flagged = x[y == 1]
    plt.scatter(
        flagged[:, 0],
        flagged[:, 1],
        c="red",
        marker="x",
        s=10,
        label="Fraud",
    )

    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title(title)
    plt.legend(loc="upper right", fontsize="small")

    if save_path:
        plt.savefig(save_path, dpi=300)

    plt.show()


In [None]:
# Cell 6: Run all
def run_hierarchical_clustering(x, n_clusters=2):
    """Cluster a single feature matrix and return host-side labels.

    The driver loop below calls this name, but no such function was defined
    anywhere in the notebook, so the cell failed with a NameError. It uses
    the same model settings as ``run_hierarchical_clustering_all``.

    Args:
        x: Feature matrix accepted by ``AgglomerativeClustering.fit_predict``.
        n_clusters: Number of clusters requested from the model.

    Returns:
        NumPy array of cluster labels, one per row of *x*.
    """
    # NOTE(review): cuML's AgglomerativeClustering has historically only
    # accepted linkage="single" -- confirm "complete" works with the
    # installed cuML version.
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage="complete")
    labels = model.fit_predict(x)

    # The plotting helpers need host memory; fit_predict may hand back a
    # device-side container depending on the input type.
    if hasattr(labels, "to_numpy"):  # cuDF / pandas containers
        return labels.to_numpy()
    if hasattr(labels, "get"):  # CuPy ndarray
        return labels.get()
    return np.asarray(labels)


def _label_slug(label):
    """Turn a configuration label such as "Full (2C)" into "full_2c"."""
    return label.lower().replace(' ', '_').replace('(', '').replace(')', '')


base_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/Clustering"
data = load_hierarchical_data(base_dir)

output_dir = "/content/drive/MyDrive/NCU/Dissertation/Figures/HierarchicalPlots"
os.makedirs(output_dir, exist_ok=True)

# Configurations to run: label -> (feature key, label key, number of clusters).
# The "(2C)" / "(3C)" suffix matches the pca_2 / pca_3 input; k is 2 throughout.
configurations = {
    "Full (2C)": ("x_all_pca_2", "y_all", 2),
    "Full (3C)": ("x_all_pca_3", "y_all", 2),
    "Key (2C)": ("x_key_pca_2", "y_key", 2),
    "Key (3C)": ("x_key_pca_3", "y_key", 2),
}

for label, (x_key, y_key, n_clusters) in configurations.items():
    x = data[x_key]
    y = data[y_key]
    slug = _label_slug(label)

    print(f"\n {label}: Running Agglomerative Clustering with k={n_clusters}")
    labels = run_hierarchical_clustering(x, n_clusters=n_clusters)

    # Save projection plot
    proj_path = os.path.join(output_dir, f"hierarchical_projection_{slug}.png")
    plot_cluster_projection(x, y, labels, title=f"{label}: k = {n_clusters}", save_path=proj_path)

    # Save dendrogram
    dendro_path = os.path.join(output_dir, f"hierarchical_dendrogram_{slug}.png")
    plot_dendrogram(x, label=label, save_path=dendro_path)
