<a href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Working/Modeling/06_Cluster_merge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:

# === CONFIGURATION === #
# Root folder on the mounted Drive; every other path in this cell is derived from it.
base_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/Clustering"

# Three input sub-folders (one per clustering method) and the output sub-folder,
# each joined onto base_dir.
kmeans_dir, hier_dir, dbscan_dir, output_dir = (
    os.path.join(base_dir, subfolder)
    for subfolder in ("KMeans", "Hierarchical", "DBSCAN", "OptionC_Merged")
)

# exist_ok=True keeps this cell safe to re-run when the folder is already there.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Feature inclusion config
# One boolean switch per clustering-derived feature; each merge cell below
# reads its own key and is skipped entirely when the value is False.
config = dict(
    use_kmeans=True,
    use_hier=True,
    use_dbscan=True,
)

In [None]:
# === Load Base Features and Labels === #
# Feature matrix: the first CSV column is used as the row index.
features_path = os.path.join(base_dir, "X_all_scaled.csv")
X_scaled = pd.read_csv(features_path, index_col=0)

# Labels are stored as a pickle. NOTE: unpickling can execute arbitrary code,
# so this file must come from a trusted source.
labels_path = os.path.join(base_dir, "y_labels.pkl")
fraud_labels = pd.read_pickle(labels_path)

# Only the index of this file is used; it defines which rows to keep and in what order.
index_path = os.path.join(base_dir, "index_all_scaled.csv")
index_frame = pd.read_csv(index_path, index_col=0)
indices = index_frame.index

# Select features and labels by the same index labels so both share one row order.
X_scaled = X_scaled.loc[indices]
fraud_labels = fraud_labels.loc[indices]

In [None]:
# === Initialize final DataFrame === #
# assign() returns a new frame (X_scaled itself is left untouched) with the
# labels attached as the "fraud_label" column.
df = X_scaled.assign(fraud_label=fraud_labels)

In [None]:
# === Merge K-Means Clusters === #
# Best-effort step: any failure is printed and the notebook carries on without
# the "kmeans_cluster" column.
if config["use_kmeans"]:
    try:
        kmeans_path = os.path.join(kmeans_dir, "labels_kmeans_Full_2C.npy")
        kmeans_labels = np.load(kmeans_path)
        # The array is attached by position, so it needs exactly one entry per row of df.
        if len(kmeans_labels) == len(df):
            df["kmeans_cluster"] = kmeans_labels
            print(" K-Means cluster labels added.")
        else:
            raise ValueError("Length mismatch in K-Means labels.")
    except Exception as load_error:
        print(f" Error loading K-Means labels: {load_error}")

In [None]:
# === Merge Hierarchical Clusters === #
# Best-effort step: any failure is printed and the notebook carries on without
# the "hier_cluster" column.
if config["use_hier"]:
    try:
        hier_path = os.path.join(hier_dir, "labels_hier_Full_2C.npy")
        hier_labels = np.load(hier_path)
        # The array is attached by position, so it needs exactly one entry per row of df.
        if len(hier_labels) == len(df):
            df["hier_cluster"] = hier_labels
            print(" Hierarchical cluster labels added.")
        else:
            raise ValueError("Length mismatch in Hierarchical labels.")
    except Exception as load_error:
        print(f" Error loading Hierarchical labels: {load_error}")

In [None]:
# === Merge DBSCAN Noise Flags === #
# Adds a 0/1 "dbscan_noise" column: 1 for rows whose index label is listed in the
# noise-indices file, 0 otherwise. Best-effort: failures are printed, and in that
# case df is left exactly as it was (no partially-built column).
if config["use_dbscan"]:
    try:
        eps_val = 1.0  # You can change this for other variants
        # The file suffix is eps * 10 as two digits (1.0 -> "10"). round() is used
        # because int() truncates, and an eps produced by float arithmetic can sit
        # just below the intended tenth, which would select the wrong file.
        eps_str = f"{int(round(eps_val * 10)):02d}"
        noise_path = os.path.join(dbscan_dir, "NoiseIndices", f"noise_indices_Full_2C_eps{eps_str}.csv")
        # Headerless CSV; only its first column is read.
        noise_ids = pd.read_csv(noise_path, header=None)[0]

        # Validate BEFORE touching df. Previously the column was created as all
        # zeros first, so a failed lookup left a constant, meaningless feature
        # in the dataset that was then exported.
        unknown_ids = noise_ids[~noise_ids.isin(df.index)]
        if not unknown_ids.empty:
            raise ValueError(
                f"{len(unknown_ids)} noise indices are not present in the dataset index "
                f"(first: {unknown_ids.iloc[0]!r})."
            )

        # Build the whole flag column in one step so it is either complete or absent.
        df["dbscan_noise"] = df.index.isin(noise_ids).astype("int64")
        print(f" DBSCAN noise flags added (ε = {eps_val})")
    except Exception as e:
        print(f" Error loading DBSCAN noise indices: {e}")

In [None]:

# === Export final dataset === #
# The merge steps catch and print their own errors, so a failed merge does not
# stop the notebook. Check here that every feature enabled in `config` actually
# made it into df, and say so loudly if not; otherwise an incomplete dataset
# would be written under the "augmented" name without any hint at export time.
expected_features = {
    "use_kmeans": "kmeans_cluster",
    "use_hier": "hier_cluster",
    "use_dbscan": "dbscan_noise",
}
missing_features = [
    column
    for flag, column in expected_features.items()
    if config.get(flag) and column not in df.columns
]
if missing_features:
    print(f" WARNING: enabled cluster features missing from dataset: {missing_features}")

# Same data written twice: CSV and pickle.
out_path_csv = os.path.join(output_dir, "X_all_augmented_optionC.csv")
out_path_pkl = os.path.join(output_dir, "X_all_augmented_optionC.pkl")
df.to_csv(out_path_csv)
df.to_pickle(out_path_pkl)

print(f"\n Final merged dataset saved to:\n - {out_path_csv}\n - {out_path_pkl}")
print(f" Shape: {df.shape}")

In [None]:
\begin{longtable}{p{3.2cm} p{5.5cm} p{3.5cm} p{5.5cm}}
\caption{Selected Clustering Outputs Integrated into the Supervised Learning Dataset} \\
\toprule
\textbf{Clustering Method} & \textbf{Selected Output File} & \textbf{Feature Name} & \textbf{Rationale} \\
\midrule
\endfirsthead
\multicolumn{4}{l}{\textit{(continued from previous page)}} \\
\toprule
\textbf{Clustering Method} & \textbf{Selected Output File} & \textbf{Feature Name} & \textbf{Rationale} \\
\midrule
\endhead
\midrule
\multicolumn{4}{r}{\textit{(continued on next page)}} \\
\endfoot
\bottomrule
\endlastfoot

K-Means & \texttt{labels\_kmeans\_Full\_2C.npy} & \texttt{kmeans\_cluster} & Highest silhouette score among K-Means configurations; interpretable two-cluster structure. \\

Hierarchical & \texttt{labels\_hier\_Full\_2C.npy} & \texttt{hier\_cluster} & Strongest overall clustering metrics (Silhouette = 0.92, DBI = 0.056); observable fraud density in cluster groupings. \\

DBSCAN & \texttt{noise\_indices\_Full\_2C\_eps10.csv} & \texttt{dbscan\_noise} & High fraud concentration in noise cluster; $\varepsilon = 1.0$ offered optimal balance of separation and anomaly coverage. \\

\end{longtable}
