In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Perform spatial and SHAP-based clustering on metro station data.
#
# Pipeline: load the per-record SHAP table, collapse it to one row per station,
# run two independent K-Means clusterings (mean flow + coordinates, and mean
# SHAP values only), and write both label columns to a CSV file.

CANDIDATE_KS = (3, 4, 5)   # cluster counts to compare
RANDOM_STATE = 42          # fixed seed so repeated runs give the same labels
N_INIT = 10                # K-Means restarts per candidate K
GROUP_KEY = "stationID"


def _fit_best_kmeans(X_scaled, candidate_ks=CANDIDATE_KS):
    """Fit K-Means for each candidate K and keep the best-separated model.

    The winner is the K with the highest silhouette score. Inertia is recorded
    for inspection but is not used for selection: it shrinks as K grows, so
    minimising it simply picks the largest candidate every time.

    Args:
        X_scaled: 2-D array of standardised features, one row per station.
        candidate_ks: Cluster counts to try, in order of preference on ties.

    Returns:
        A tuple ``(best_k, model, inertia_by_k, silhouette_by_k)`` where
        ``model`` is the fitted ``KMeans`` instance for ``best_k``.

    Raises:
        ValueError: If no candidate K can be scored (too few rows, or every
            fit collapsed into a single cluster).
    """
    n_samples = X_scaled.shape[0]
    inertia_by_k = {}
    silhouette_by_k = {}
    models = {}

    for k in candidate_ks:
        # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1.
        if not 2 <= k < n_samples:
            continue
        model = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=N_INIT)
        labels = model.fit_predict(X_scaled)
        inertia_by_k[k] = model.inertia_
        # Duplicate rows can make K-Means return fewer distinct clusters than
        # requested; a single cluster cannot be scored.
        if len(set(labels.tolist())) < 2:
            continue
        silhouette_by_k[k] = silhouette_score(X_scaled, labels)
        models[k] = model

    if not silhouette_by_k:
        raise ValueError(
            f"Cannot evaluate any K in {tuple(candidate_ks)} "
            f"with {n_samples} sample(s)"
        )

    # max() returns the first maximum, so ties go to the earlier candidate.
    best_k = max(silhouette_by_k, key=silhouette_by_k.get)
    return best_k, models[best_k], inertia_by_k, silhouette_by_k


# === 1) Load data ===
df = pd.read_csv("TJ_2023-2025SHAP.csv")

# === 2) Aggregate: mean of real_flow and SHAP values; first value for coordinates/name ===
shap_cols = [c for c in df.columns if c.startswith("SHAP_")]
features_spatial = ["real_flow", "longitude", "latitude"]

missing_cols = [c for c in [GROUP_KEY] + features_spatial if c not in df.columns]
if missing_cols:
    raise KeyError(f"Input file is missing required column(s): {missing_cols}")
if not shap_cols:
    raise ValueError("Input file has no 'SHAP_' columns to cluster on")

agg_dict = {col: "mean" for col in shap_cols + ["real_flow"]}
for col in [GROUP_KEY, "name", "longitude", "latitude"]:
    if col in df.columns:
        agg_dict[col] = "first"

# as_index=False already returns the group key as a column, so aggregating it
# again is redundant (and, as far as I know, rejected by some older pandas
# releases). Aggregate everything else, then restore the original column order
# so the output schema is unchanged.
agg_without_key = {col: how for col, how in agg_dict.items() if col != GROUP_KEY}
gdf = df.groupby(GROUP_KEY, as_index=False).agg(agg_without_key)
gdf = gdf[list(agg_dict)]

# === 3) Spatial clustering (real_flow mean + coordinates) ===
# NOTE(review): fillna(0) places stations with missing coordinates at (0, 0)
# before scaling; confirm that is intended rather than dropping those rows.
X_spatial = gdf[features_spatial].fillna(0)

scaler_spatial = StandardScaler()
X_spatial_scaled = scaler_spatial.fit_transform(X_spatial)

(
    best_k_spatial,
    kmeans_spatial,
    inertia_spatial,
    silhouette_spatial,
) = _fit_best_kmeans(X_spatial_scaled)
print(f"[Spatial clustering] Best K: {best_k_spatial}")

gdf["Cluster_flow_coord"] = kmeans_spatial.labels_

# === 4) SHAP clustering (only SHAP means) ===
X_shap = gdf[shap_cols].fillna(0)

scaler_shap = StandardScaler()
X_shap_scaled = scaler_shap.fit_transform(X_shap)

(
    best_k_shap,
    kmeans_shap,
    inertia_shap,
    silhouette_shap,
) = _fit_best_kmeans(X_shap_scaled)
print(f"[SHAP clustering] Best K: {best_k_shap}")

gdf["Cluster_SHAP"] = kmeans_shap.labels_

# === 5) Save results ===
out_path = "TJ_2023-2025SHAP_clusters_no167.csv"
gdf.to_csv(out_path, index=False)
print(f"Saved: {out_path}")




[空间聚类] 最佳 K: 5
[SHAP聚类] 最佳 K: 5
已保存: TJ_2023-2025SHAP_clusters_no167.csv
