<a href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Working/Modeling/06_Cluster_merge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the later cells read and write
# their data under /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import numpy as np
import pandas as pd

In [None]:
\begin{longtable}{p{3.2cm} p{5.5cm} p{3.5cm} p{5.5cm}}
\caption{Selected Clustering Outputs Integrated into the Supervised Learning Dataset} \\
\toprule
\textbf{Clustering Method} & \textbf{Selected Output File} & \textbf{Feature Name} & \textbf{Rationale} \\
\midrule
\endfirsthead
\multicolumn{4}{l}{\textit{(continued from previous page)}} \\
\toprule
\textbf{Clustering Method} & \textbf{Selected Output File} & \textbf{Feature Name} & \textbf{Rationale} \\
\midrule
\endhead
\midrule
\multicolumn{4}{r}{\textit{(continued on next page)}} \\
\endfoot
\bottomrule
\endlastfoot

K-Means & \texttt{labels\_kmeans\_Full\_2C.npy} & \texttt{kmeans\_cluster} & Highest silhouette score among K-Means configurations; interpretable two-cluster structure. \\

Hierarchical & \texttt{labels\_hier\_Full\_2C.npy} & \texttt{hier\_cluster} & Strongest overall clustering metrics (Silhouette = 0.92, DBI = 0.056); observable fraud density in cluster groupings. \\

DBSCAN & \texttt{noise\_indices\_Full\_3C\_eps10.csv} & \texttt{dbscan\_noise} & High fraud concentration in noise cluster; $\varepsilon = 1.0$ offered the best balance of separation and anomaly coverage. \\

\end{longtable}


In [None]:
import os
import numpy as np
import pandas as pd

# === CONFIGURATION === #
# Every input and output of this cell lives under the mounted Drive folder.
base_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed"
cluster_dir = os.path.join(base_dir, "Clustering")
output_dir = os.path.join(base_dir, "cluster_features")
os.makedirs(output_dir, exist_ok=True)

# Index reference: the row labels that every exported feature is aligned to.
index_all = pd.read_csv(os.path.join(cluster_dir, "index_all_scaled.csv"), index_col=0).index

# === CLUSTER FEATURE DEFINITIONS === #
# type "ordinal": a .npy label array with one entry per row of index_all
#                 (attached positionally, so order must match index_all).
# type "binary":  a CSV read without a header row; its first column holds the
#                 row labels to flag with 1 (all other rows get 0).
cluster_configs = [
    {
        "source_file": os.path.join(cluster_dir, "KMeans", "labels_Full_(2C)_k2.npy"),
        "column_name": "kmeans_full2c_ordinal",
        "type": "ordinal"
    },
    {
        "source_file": os.path.join(cluster_dir, "Hierarchical", "labels_hier_full_2c.npy"),
        "column_name": "hier_full2c_ordinal",
        "type": "ordinal"
    },
    {
        "source_file": os.path.join(cluster_dir, "DBSCAN", "NoiseIndices", "noise_indices_Key_3C_eps10.csv"),
        "column_name": "dbscan_key3c_e100_noise",
        "type": "binary"
    }
]

# === EXECUTION === #
# Each config yields one single-column DataFrame pickled to output_dir/<column_name>.pkl.
# Problems with an individual source are reported and that feature is skipped.
for config in cluster_configs:
    path = config["source_file"]
    col = config["column_name"]
    col_type = config["type"]

    if not os.path.exists(path):
        print(f" Missing: {path}")
        continue

    if col_type == "ordinal":
        labels = np.load(path)
        # Labels carry no index of their own, so a length mismatch means they
        # cannot be safely attached to index_all.
        if len(labels) != len(index_all):
            print(f"  Length mismatch in {col}: {len(labels)} vs {len(index_all)}")
            continue
        feature_series = pd.Series(labels, index=index_all, name=col)

    elif col_type == "binary":
        # Compare ids as strings on BOTH sides. index_all comes straight from
        # read_csv, so if it was parsed as a numeric index, testing it against
        # string ids would match nothing and silently yield an all-zero feature.
        noise_ids = pd.read_csv(path, header=None)[0].astype(str).str.strip()
        index_as_str = index_all.astype(str)
        is_noise = index_as_str.isin(set(noise_ids))
        feature_series = pd.Series(is_noise.astype("int64"), index=index_all, name=col)

        # Surface ids that did not map to any row (e.g. a stray header line or
        # a different id format) instead of dropping them without a trace.
        unmatched = noise_ids.nunique() - index_as_str[is_noise].nunique()
        if unmatched:
            print(f"  Warning: {unmatched} noise id(s) from {path} not found in index for {col}")

    else:
        print(f"  Unknown type for {col}")
        continue

    # Save as DataFrame
    df_out = feature_series.to_frame()
    out_path = os.path.join(output_dir, f"{col}.pkl")
    df_out.to_pickle(out_path)
    print(f" Saved: {out_path}")


In [None]:
import pandas as pd
import os

# Destination for the feature-only matrix, and the preprocessed CSV it is built from.
# NOTE(review): this writes to the same X_all_scaled.pkl path as the pickle-based
# cell that follows; whichever of the two cells runs last determines the file.
output_path = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/X_all_scaled.pkl"
input_csv = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/ppp_loans_preprocessed.csv"

# Read the table with its first column as the row index, then remove the label column
df = pd.read_csv(filepath_or_buffer=input_csv, index_col=0)
X = df.drop(labels="is_fraudulent", axis="columns")

# Persist everything except the label
X.to_pickle(path=output_path)

print(f" Saved: {output_path}")


In [12]:
import pandas as pd
import os

# Destination for the feature-only matrix, and the cleaned pickle it is built from.
# NOTE(review): this overwrites the same X_all_scaled.pkl path that the CSV-based
# cell above writes; whichever of the two cells runs last determines the file.
output_path = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/X_all_scaled.pkl"
input_pkl = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/ppp_loans_preprocessed_cleaned.pkl"

# Read the pickled table, then remove the label column
df = pd.read_pickle(filepath_or_buffer=input_pkl)
X = df.drop(labels="is_fraudulent", axis="columns")

# Persist everything except the label
X.to_pickle(path=output_path)

print(f" Saved: {output_path}")

 Saved: /content/drive/MyDrive/NCU/Dissertation/Data/Processed/X_all_scaled.pkl


In [5]:
import os
import pandas as pd

# Define paths
base_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed"
cluster_dir = os.path.join(base_dir, "Clustering")
feature_dir = os.path.join(base_dir, "cluster_features")
os.makedirs(feature_dir, exist_ok=True)

# Filenames
index_file = os.path.join(cluster_dir, "index_all_scaled.csv")
X_scaled_file = os.path.join(base_dir, "X_all_scaled.pkl")

# Output
output_file = os.path.join(base_dir, "X_all_augmented.pkl")

# Load index and master scaled dataset, then restrict/reorder the rows to the
# reference index (.loc raises KeyError if any reference label is absent).
index_all = pd.read_csv(index_file, index_col=0).index
X_all_scaled = pd.read_pickle(X_scaled_file)
X_all_scaled = X_all_scaled.loc[index_all]

# Load cluster-based features: target column name -> pickled feature file
feature_files = {
    "kmeans_full2c_ordinal": os.path.join(feature_dir, "kmeans_full2c_ordinal.pkl"),
    "hier_full2c_ordinal": os.path.join(feature_dir, "hier_full2c_ordinal.pkl"),
    "dbscan_key3c_e100_noise": os.path.join(feature_dir, "dbscan_key3c_e100_noise.pkl")
}

# Merge each cluster feature into X
for label, path in feature_files.items():
    if not os.path.exists(path):
        # Best effort: a missing feature file is reported and skipped.
        print(f" Missing: {label} — Skipped.")
        continue

    cluster_obj = pd.read_pickle(path)

    # The feature pickles hold a one-column DataFrame; pull that column out as
    # a Series explicitly instead of relying on DataFrame-to-column assignment
    # (setting `.name` on a DataFrame does not rename anything).
    if isinstance(cluster_obj, pd.DataFrame):
        if cluster_obj.shape[1] != 1:
            raise ValueError(
                f"{path}: expected exactly one column, found {cluster_obj.shape[1]}"
            )
        cluster_col = cluster_obj.iloc[:, 0]
    elif isinstance(cluster_obj, pd.Series):
        cluster_col = cluster_obj
    else:
        raise TypeError(
            f"{path}: expected a pandas DataFrame or Series, got {type(cluster_obj).__name__}"
        )
    cluster_col = cluster_col.rename(label)

    # Column assignment aligns on the index; rows with no matching label would
    # silently become NaN, so fail loudly rather than save a corrupted feature.
    missing_rows = X_all_scaled.index.difference(cluster_col.index, sort=False)
    if len(missing_rows) > 0:
        raise ValueError(
            f"{label}: {len(missing_rows)} row(s) of the dataset have no value in {path}"
        )

    X_all_scaled[label] = cluster_col
    print(f" Added: {label}")

# Save augmented dataset
X_all_scaled.to_pickle(output_file)
print(f"\n Saved: {output_file}")


 Added: kmeans_full2c_ordinal
 Added: hier_full2c_ordinal
 Added: dbscan_key3c_e100_noise

 Saved: /content/drive/MyDrive/NCU/Dissertation/Data/Processed/X_all_augmented.pkl
