<a href="https://colab.research.google.com/github/sappw1/Dissertation/blob/main/Notebooks/Notebooks-Working/Modeling/01a_Clustering_Methodology_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Report the attached NVIDIA GPU, its driver/CUDA versions and current memory usage.
!nvidia-smi

Tue May 13 15:16:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   59C    P0             29W /   72W |     373MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cudf
import cupy as cp
import json, time, os
from datetime import datetime
from cuml.cluster import KMeans as cuKMeans
from cuml.metrics.cluster.silhouette_score import cython_silhouette_score
from sklearn.metrics import davies_bouldin_score
# Plot styling: matplotlib's built-in "default" style is what gets applied here.
# NOTE(review): no APA-specific style is loaded by this line; a custom
# apa.mplstyle file would have to be uploaded and passed to plt.style.use.
plt.style.use("default")  # You can upload apa.mplstyle if needed

In [1]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings
# from all libraries are hidden too; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Make Google Drive available inside the Colab runtime
drive.mount('/content/drive')

# Folder on Drive that holds the saved PCA projections
input_dir = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/PCA_Arrays"

# Read the six .npy files in a fixed order and bind each one to its own name
_pca_file_stems = (
    "X_all_pca", "X_all_pca_2", "X_all_pca_3",
    "X_key_pca", "X_key_pca_2", "X_key_pca_3",
)
(
    X_all_pca, X_all_pca_2, X_all_pca_3,
    X_key_pca, X_key_pca_2, X_key_pca_3,
) = [cp.load(os.path.join(input_dir, stem + ".npy")) for stem in _pca_file_stems]

print("All PCA arrays loaded from Google Drive.")

# Helper: wrap an array in a cuDF DataFrame
def to_cudf(cp_array):
    """Return a ``cudf.DataFrame`` built from ``cp_array``."""
    gpu_frame = cudf.DataFrame(cp_array)
    return gpu_frame

# Map each display label to its PCA array, converted with to_cudf.
# The pair order below fixes the iteration order of the resulting dict.
pca_inputs = {
    label: to_cudf(array)
    for label, array in (
        ("Full (95%)", X_all_pca),
        ("Full (2C)", X_all_pca_2),
        ("Full (3C)", X_all_pca_3),
        ("Key (95%)", X_key_pca),
        ("Key (2C)", X_key_pca_2),
        ("Key (3C)", X_key_pca_3),
    )
}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All PCA arrays loaded from Google Drive.


In [5]:
# Storage
kmeans_results = {}
checkpoint_path = "/content/drive/MyDrive/NCU/Dissertation/Data/Processed/kmeans_gpu_progress.json"  # Save to Drive if mounted

# Sweep configuration (loop-invariant, so defined once up front)
K_range = range(2, 10)   # candidate cluster counts: 2..9
KMEANS_SEED = 42         # shared random_state for every fit

os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)


def _to_host(gpu_obj):
    """Copy a GPU-resident object to a host NumPy array.

    scikit-learn metrics need host memory, so the device-to-host copy is done
    explicitly instead of relying on an implicit conversion.

    Parameters
    ----------
    gpu_obj : object
        Anything exposing ``to_numpy()`` or ``get()``; other inputs are passed
        to ``np.asarray``.

    Returns
    -------
    numpy.ndarray
    """
    if hasattr(gpu_obj, "to_numpy"):
        return gpu_obj.to_numpy()
    if hasattr(gpu_obj, "get"):
        return gpu_obj.get()
    return np.asarray(gpu_obj)


# Loop: for every PCA input, sweep k, pick the k with the best silhouette,
# then report metrics for that same fitted model and checkpoint the results.
for name, X in pca_inputs.items():
    print(f"\n [{datetime.now().strftime('%H:%M:%S')}] Starting GPU K-Means for: {name} | Shape: {X.shape}")
    start_time = time.time()

    distortions = []
    silhouette_scores = []
    fitted = []  # (model, labels) per k, so the winner is reused rather than refit

    for k in K_range:
        t0 = time.time()
        km = cuKMeans(n_clusters=k, random_state=KMEANS_SEED, n_init="auto")
        labels = km.fit_predict(X)  # single pass; no separate predict on the training data
        inertia = float(km.inertia_)
        sil = float(cython_silhouette_score(X, labels, metric='euclidean'))
        distortions.append(inertia)
        silhouette_scores.append(sil)
        fitted.append((km, labels))
        print(f"     k={k} | Silhouette={sil:.4f} | Time: {time.time() - t0:.2f}s")

    best_idx = int(np.argmax(silhouette_scores))
    optimal_k = K_range[best_idx]
    print(f" Optimal k: {optimal_k}")

    # Final model: reuse the exact model/labels that won the sweep. Refitting
    # with different settings could yield a different clustering than the one
    # whose silhouette selected optimal_k, and would repeat the expensive
    # fit + silhouette computation.
    final_km, final_labels = fitted[best_idx]
    final_silhouette = silhouette_scores[best_idx]
    # Davies-Bouldin comes from scikit-learn, so both inputs are copied to host;
    # cast to a built-in float so the JSON checkpoint never sees a NumPy scalar.
    db_index = float(davies_bouldin_score(_to_host(X), _to_host(final_labels)))
    del fitted  # drop references to the non-winning models/labels

    print(f"Final Metrics | Silhouette={final_silhouette:.4f} | Time: {time.time() - start_time:.2f}s")

    # Store results
    kmeans_results[name] = {
        "optimal_k": int(optimal_k),
        "silhouette": final_silhouette,
        "db_index": db_index,
        "k_values": list(K_range),
        "silhouette_scores": silhouette_scores,
        "distortions": distortions,
    }

    # Save checkpoint after every dataset so a crash keeps the finished ones
    with open(checkpoint_path, "w") as f:
        json.dump(kmeans_results, f, indent=4)

    print(f" Results saved to {checkpoint_path}")

print("\n All GPU KMeans runs complete.")



 [15:04:31] Starting GPU K-Means for: Full (95%) | Shape: (940481, 1)


AttributeError: 'float' object has no attribute 'get'