In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import joblib
import os

# ---------------------------------------------------------------------------
# Data loading and feature scaling
# ---------------------------------------------------------------------------
# Read the cleaned, still-unscaled clustering dataset from disk.
df = pd.read_csv(
    "C:/Swinburne/2025Sem1/COS40007-Artificial Intelligence for Engineering/Group Assignment/5G Zone Prediction System/ProcessedData/clean_data_clst.csv"
)

# Columns handed to the clustering model. The same list is stored next to the
# scaler and the model when they are saved, so the column order matters.
feature_cols = [
    'latitude',
    'longitude',
    'average_latency',
    'total_throughput',
    'total_bandwidth',
]

# Work on a copy so the original frame's feature columns stay untouched.
X_raw = df[feature_cols].copy()

# Fit the min-max scaler on the raw features, then transform those same rows.
# The fitted scaler is kept because it is persisted later for reuse.
scaler = MinMaxScaler().fit(X_raw)
X_scaled = scaler.transform(X_raw)

# Determine best K with the silhouette score.
#
# For every candidate k a temporary KMeans model is fitted on all rows, but the
# silhouette score is computed on a subsample holding at most
# `samples_per_cluster` rows from each cluster.
#
# NOTE: candidates start at k=3, so the initial best_k of 2 is never evaluated;
# it only survives if no candidate scores above the initial best_score.
best_k = 2
best_score = -1
samples_per_cluster = 3000

# Seeded generator: the original drew the subsample from NumPy's unseeded
# global state, so the scores (and possibly the selected k) changed from run
# to run even though KMeans itself uses a fixed random_state.
rng = np.random.default_rng(42)

for k in range(3, 11):
    kmeans_tmp = KMeans(n_clusters=k, random_state=42, n_init='auto')
    labels_tmp = kmeans_tmp.fit_predict(X_scaled)

    # Draw the per-cluster subsample without replacement; clusters smaller
    # than samples_per_cluster contribute all of their rows.
    per_cluster_samples = []
    for cluster_id in np.unique(labels_tmp):
        cluster_rows = np.flatnonzero(labels_tmp == cluster_id)
        n_take = min(samples_per_cluster, cluster_rows.size)
        per_cluster_samples.append(rng.choice(cluster_rows, size=n_take, replace=False))
    sampled_indices = np.concatenate(per_cluster_samples)

    X_sample = X_scaled[sampled_indices]
    labels_sample = labels_tmp[sampled_indices]
    score = silhouette_score(X_sample, labels_sample)

    # Strict comparison: on equal scores the smaller k seen first is kept.
    if score > best_score:
        best_k = k
        best_score = score

# Train final KMeans with best_k and attach the cluster id to every row.
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init='auto')
df['cluster'] = kmeans.fit_predict(X_scaled)

# Create cluster summary: per-cluster mean latency and throughput, scored as
# throughput divided by latency, so a larger score ranks better.
cluster_summary = df.groupby('cluster')[['average_latency', 'total_throughput']].mean()
cluster_summary['score'] = cluster_summary['total_throughput'] / cluster_summary['average_latency']

# Zero-based rank, 0 = highest score. method='first' guarantees distinct
# integer ranks; the default 'average' method returns fractional ranks for
# tied scores, which astype(int) would truncate into duplicated/skipped ranks.
ranking_map = cluster_summary['score'].rank(ascending=False, method='first').astype(int) - 1
label_map = {0: 'High Performance', 1: 'Moderate', 2: 'Low Performance'}
cluster_summary['quality_rank'] = ranking_map

# best_k is chosen from 3..10, but only three labels exist. Mapping the raw
# rank through label_map left every cluster ranked 4th or lower with a NaN
# label. Bucket the ranks into as many tiers as there are labels instead;
# with exactly three clusters this gives the same result as the direct map.
tier = (ranking_map * len(label_map)) // len(cluster_summary)
cluster_summary['performance_label'] = tier.map(label_map)

# Merge performance labels back onto the row-level data via the cluster id.
df = df.merge(cluster_summary[['performance_label']], left_on='cluster', right_index=True)

# ---------------------------------------------------------------------------
# Persist results: labelled rows, fitted scaler, fitted model, zone lookup
# ---------------------------------------------------------------------------
output_dir = "C:/Swinburne/2025Sem1/COS40007-Artificial Intelligence for Engineering/Group Assignment/5G Zone Prediction System/TrainedModel/Clustering/"
os.makedirs(output_dir, exist_ok=True)

# Full row-level data including the cluster id and performance label.
clustered_csv_path = os.path.join(output_dir, "clustered_output.csv")
df.to_csv(clustered_csv_path, index=False)

# The scaler and the model are each stored together with the feature list
# they were fitted on.
scaler_bundle = {'scaler': scaler, 'features': feature_cols}
joblib.dump(scaler_bundle, os.path.join(output_dir, "cluster_label_scaler.pkl"))

kmeans_bundle = {'kmeans': kmeans, 'features': feature_cols}
joblib.dump(kmeans_bundle, os.path.join(output_dir, "cluster_label_kmeans.pkl"))

# One row per distinct (latitude, longitude) pair; drop_duplicates keeps the
# first occurrence, so that row's cluster and label represent the location.
location_cols = ['latitude', 'longitude']
unique_locations = df.drop_duplicates(subset=location_cols)
zone_map = unique_locations[location_cols + ['cluster', 'performance_label']]
zone_map.to_csv(os.path.join(output_dir, "zone_cluster_map.csv"), index=False)

print("✔ Training complete. Files saved.")


✔ Training complete. Files saved.
