In [2]:
# Urban Traffic Flow Clustering using METR-LA Dataset
# --------------------------------------------------
# This notebook loads METR-LA traffic speed data, extracts features,
# performs clustering, evaluates the model, and saves it for reuse.

# ================================
# 1. Imports
# ================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
import joblib

# ================================
# ================================
# 2. Load Dataset (.h5 support)
# ================================
# METR-LA.h5: rows = timestamps, columns = sensors, values = speed

data_path = "METR-LA.h5"  # change path if needed

df = pd.read_hdf(data_path)

print("Raw data shape:", df.shape)

# Handle missing values
# METR-LA commonly contains gaps
# METR-LA commonly contains gaps
df = df.interpolate().bfill()

# ================================
# 3. Feature Extraction
# ================================

def extract_features(data, window_size=3):
    """
    Extract traffic-state features from raw speed time-series
    window_size=3 -> 15 minutes (3 x 5-min intervals)
    """
    rows = []

    for sensor in data.columns:
        series = data[sensor]

        for i in range(window_size, len(series)):
            window = series[i - window_size:i]

            avg_speed = window.mean()
            speed_std = window.std()
            speed_change = window.iloc[-1] - window.iloc[0]

            ts = window.index[-1]
            time_of_day = ts.hour + ts.minute / 60.0
            day_of_week = ts.weekday()

            rows.append([
                sensor,
                avg_speed,
                speed_std,
                speed_change,
                time_of_day,
                day_of_week
            ])

    columns = [
        "sensor_id",
        "avg_speed",
        "speed_std",
        "speed_change",
        "time_of_day",
        "day_of_week"
    ]

    return pd.DataFrame(rows, columns=columns)

features_df = extract_features(df, window_size=3)
print("Extracted features shape:", features_df.shape)

# ================================
# 4. Feature Scaling
# ================================

X = features_df.drop(columns=["sensor_id"])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save scaler
joblib.dump(scaler, "scaler.joblib")

# ================================
# 5. Dimensionality Reduction (Optional but Recommended)
# ================================

pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# ================================
# 6. Clustering (K-Means)
# ================================

k = 4  # number of traffic states
kmeans = KMeans(n_clusters=k, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

features_df["cluster"] = cluster_labels

# Save model
joblib.dump(kmeans, "kmeans_traffic_model.joblib")

# ================================
# 7. Evaluation Metrics
# ================================

sil_score = silhouette_score(X_scaled, cluster_labels)
db_score = davies_bouldin_score(X_scaled, cluster_labels)

print("Silhouette Score:", sil_score)
print("Davies-Bouldin Index:", db_score)

# ================================
# 8. Visualization
# ================================

plt.figure()
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels)
plt.title("Traffic Flow Clusters (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()

# ================================
# 9. Spatial Clustering using Adjacency Matrix (Spectral)
# ================================
# Uses adj_METR-LA.pkl for spatially-aware clustering

import pickle

with open("adj_METR-LA.pkl", "rb") as f:
    adj_matrix = pickle.load(f)

print("Adjacency matrix shape:", adj_matrix.shape)

spectral_spatial = SpectralClustering(
    n_clusters=k,
    affinity='precomputed',
    random_state=42
)

spatial_labels = spectral_spatial.fit_predict(adj_matrix)

# Map spatial cluster labels to sensors
spatial_cluster_map = dict(zip(df.columns, spatial_labels))

features_df["spatial_cluster"] = features_df["sensor_id"].map(spatial_cluster_map)

# ================================
# 10. Save Final Feature Data
# ================================

features_df.to_csv("traffic_features_with_clusters.csv", index=False)

print("Training complete. Models and features saved.")

Raw data shape: (34272, 207)
Extracted features shape: (7093683, 6)


: 