In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [3]:
# -------------------------------
# Load METR-LA traffic data
# -------------------------------
df = pd.read_hdf("METR-LA.h5", key="df")

# Fill missing values (standard practice)
df = df.interpolate().bfill()

print("Data shape (time, sensors):", df.shape)

Data shape (time, sensors): (34272, 207)


In [4]:
# -------------------------------
# Sensors are samples â†’ transpose
# -------------------------------
X_time = df.values.T     # (207 sensors, time_steps)

print("Sensor matrix shape:", X_time.shape)

Sensor matrix shape: (207, 34272)


In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_time)

joblib.dump(scaler, "scaler.joblib")

['scaler.joblib']

In [6]:
pca = PCA(n_components=20, random_state=42)
X_pca = pca.fit_transform(X_scaled)

print("PCA output shape:", X_pca.shape)

joblib.dump(pca, "pca.joblib")

PCA output shape: (207, 20)


['pca.joblib']

In [7]:
k = 4  # traffic regimes

kmeans = KMeans(
    n_clusters=k,
    random_state=42,
    n_init=10
)

temporal_labels = kmeans.fit_predict(X_pca)

joblib.dump(kmeans, "kmeans_temporal.joblib")

['kmeans_temporal.joblib']

In [8]:
sil = silhouette_score(X_pca, temporal_labels)
db  = davies_bouldin_score(X_pca, temporal_labels)

print("Silhouette Score:", sil)
print("Davies-Bouldin Index:", db)

Silhouette Score: 0.2196041867420056
Davies-Bouldin Index: 1.539541313425608


In [9]:
import pickle

with open("adj_METR-LA.pkl", "rb") as f:
    sensor_ids, node_ids, adj_matrix = pickle.load(f, encoding="latin1")

print("Adjacency matrix shape:", adj_matrix.shape)

Adjacency matrix shape: (207, 207)


In [10]:
spectral = SpectralClustering(
    n_clusters=k,
    affinity="precomputed",
    random_state=42
)

spatial_labels = spectral.fit_predict(adj_matrix)

  adjacency = check_symmetric(adjacency)


In [11]:
sensor_cluster_df = pd.DataFrame({
    "sensor_id": df.columns,
    "temporal_cluster": temporal_labels,
    "spatial_cluster": spatial_labels
})

sensor_cluster_df.to_csv(
    "sensor_clusters_final.csv",
    index=False
)

print(sensor_cluster_df.head())

  sensor_id  temporal_cluster  spatial_cluster
0    773869                 0                2
1    767541                 0                2
2    767542                 0                2
3    717447                 3                3
4    717446                 1                3


In [13]:
# ================================
# 7. Evaluation Metrics
# ================================

sil_score = silhouette_score(X_scaled, cluster_labels)
db_score = davies_bouldin_score(X_scaled, cluster_labels)

print("Silhouette Score:", sil_score)
print("Davies-Bouldin Index:", db_score)

NameError: name 'cluster_labels' is not defined