In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

# Each city is declared once as (display name, predictions CSV path) so the
# pairing between the two derived lists below is explicit. `city_files` and
# `city_names` are index-aligned: entry i of one describes entry i of the other.
_city_sources = [
    ('Baltimore', 'predictions/baltimore_predictions.csv'),
    ('Chicago', 'predictions/chicago_predictions.csv'),
    ('El Paso', 'predictions/elpaso_predictions.csv'),
    ('Houston', 'predictions/houston_predictions.csv'),
    ('Los Angeles', 'predictions/losangeles_predictions.csv'),
    ('NYC', 'predictions/nyc_predictions.csv'),
    ('Omaha', 'predictions/omaha_predictions.csv'),
    ('Philadelphia', 'predictions/philadelphia_predictions.csv'),
    ('Portland', 'predictions/portland_predictions.csv'),
    ('San Diego', 'predictions/san_diego_predictions.csv'),
    ('San Antonio', 'predictions/sanantonio_predictions.csv'),
    ('Seattle', 'predictions/seattle_predictions.csv'),
    ('Tallahassee', 'predictions/tallahassee_predictions.csv'),
    ('Tampa', 'predictions/tampa_predictions.csv'),
    # NOTE(review): the path is spelled 'tuscon' while the display name is
    # 'Tucson' -- confirm the path matches the file name on disk.
    ('Tucson', 'predictions/tuscon_predictions.csv'),
]

# Input CSV paths, in the order the cities are processed.
city_files = [csv_path for _, csv_path in _city_sources]

# Human-readable city labels, in the same order as `city_files`.
city_names = [display_name for display_name, _ in _city_sources]

# Build one feature row per city from its predictions CSV:
#   mean_actual -- average over dates of the per-date mean of 'Actual'
#   std_actual  -- average over dates of the per-date std of 'Actual'
# Rows are appended in `city_files` order, so row i belongs to city_files[i].
feature_columns = ['mean_actual', 'std_actual']
city_features = []

for file in city_files:
    df = pd.read_csv(file)

    # Unparseable dates are coerced to NaT and those rows are discarded.
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df = df.dropna(subset=['Date'])

    # Collapse to one row per date with the mean and std of 'Actual'.
    df_grouped = df.groupby('Date').agg({'Actual': ['mean', 'std']}).reset_index()
    df_grouped.columns = ['Date', 'mean_actual', 'std_actual']

    # Fill missing per-date statistics with that column's own mean.
    # The imputation is restricted to the two numeric feature columns so the
    # datetime 'Date' column is never averaged; how DataFrame.mean() treats
    # datetime columns differs across pandas versions.
    daily_stats = df_grouped[feature_columns]
    df_grouped[feature_columns] = daily_stats.fillna(daily_stats.mean())

    city_features.append(df_grouped[feature_columns].mean().values)

features_df = pd.DataFrame(city_features, columns=feature_columns)

# A city whose feature is still NaN (e.g. no usable rows) gets the
# cross-city mean for that feature so every row can be scaled and clustered.
features_df = features_df.fillna(features_df.mean())

# Standardize the two city features, cluster the cities with K-Means, and
# group the city names by the cluster label each one received.

# Labels are matched to names by position, and zip() would silently drop the
# surplus entries, so fail loudly if the two sequences disagree in length.
if len(city_names) != len(features_df):
    raise ValueError(
        f"Expected one feature row per city, got {len(features_df)} rows "
        f"for {len(city_names)} city names"
    )

# Select the feature columns explicitly so that only the two numeric features
# are scaled, even if `features_df` already carries a 'cluster' column.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_df[['mean_actual', 'std_actual']])

# n_init is pinned because its default changed between scikit-learn releases;
# leaving it implicit makes the number of restarts (and potentially the
# resulting clusters) depend on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(scaled_features)
features_df['cluster'] = kmeans.labels_

# Map every cluster id to the list of cities assigned to it; clusters are
# pre-seeded so each id is present as a key.
clustered_cities = {i: [] for i in range(kmeans.n_clusters)}
for city, cluster in zip(city_names, kmeans.labels_):
    # Convert the NumPy integer label to a plain int dictionary key.
    clustered_cities[int(cluster)].append(city)

# Text report: one header line per cluster followed by its member cities.
for label, members in clustered_cities.items():
    report_lines = [f"Cluster {label}:"]
    report_lines.extend(f"  - {member}" for member in members)
    print("\n".join(report_lines))

# Scatter plot of the unscaled city features, coloured by cluster label.
fig, ax = plt.subplots()
points = ax.scatter(
    features_df['mean_actual'],
    features_df['std_actual'],
    c=kmeans.labels_,
    cmap='viridis',
)
ax.set_xlabel('Mean Actual Energy Demand')
ax.set_ylabel('Standard Deviation of Actual Energy Demand')
ax.set_title('K-Means Clustering of Cities based on Actual Energy Demand')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

Cluster 0:
  - Baltimore
  - NYC
  - Philadelphia
  - San Antonio
Cluster 1:
  - Chicago
  - Los Angeles
Cluster 2:
  - El Paso
  - Omaha
  - Portland
  - San Diego
  - Seattle
  - Tallahassee
  - Tampa
  - Tucson
Cluster 3:
  - Houston
