In [3]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Load data file
df = pd.read_csv('comed_201910.csv')

# Filter out only residential accounts. `na=False` makes rows with a missing
# service name count as non-matching instead of putting NaN into the mask.
df_residential = df[df['service_name'].str.contains('residential', case=False, na=False)]

# Filter for one day, ex. 2019-10-01.
# - regex=False: the date is matched as a literal substring, not a pattern.
# - na=False: rows with a missing timestamp are excluded rather than making
#   the boolean mask invalid (same convention as the service_name filter).
# - .copy(): gives an independent frame so the column assignments below do
#   not act on a slice of `df` (avoids SettingWithCopyWarning).
desired_date = '2019-10-01'
on_desired_date = df_residential['date_time'].str.contains(desired_date, regex=False, na=False)
df_residential = df_residential[on_desired_date].copy()

# Convert the timestamp to seconds elapsed since the beginning of the day.
# Only the hour and minute components are used.
df_residential['date_time'] = pd.to_datetime(df_residential['date_time'])
df_residential['seconds_past'] = (
    df_residential['date_time'].dt.hour * 3600
    + df_residential['date_time'].dt.minute * 60
)

# For each zip code, find the row with the maximum energy value
# (idxmax returns the index label of that row) and keep its energy and time.
df_grouped = df_residential.groupby('zip5')['energy'].idxmax()
peak_times = df_residential.loc[df_grouped, ['zip5', 'energy', 'seconds_past']].rename(
    columns={
        'zip5': 'zip_code',
        'energy': 'peak_energy_value',
        'seconds_past': 'peak_energy_time',
    }
)

# Load the geojson file
gdf = gpd.read_file('Chicago_ZC.geojson')

# Convert the 'GEOID20' column to integer for join operation
gdf['GEOID20'] = gdf['GEOID20'].astype(int)

# Merge the peak_times DataFrame with the gdf GeoDataFrame based on 'zip_code'.
# A left join keeps every polygon; polygons without a matching zip code get
# NaN in the peak_* columns and are removed by the dropna() at the end.
gdf = gdf.merge(peak_times, left_on='GEOID20', right_on='zip_code', how='left')

# Compute each polygon's centroid in a projected CRS (EPSG:3857), then
# reproject the centroid points themselves to geographic coordinates
# (EPSG:4269). GeoDataFrame.to_crs() only transforms the active geometry
# column, so a centroid stored as an extra column would stay in EPSG:3857
# and its x/y would be projected metres rather than longitude/latitude.
centroids = gdf['geometry'].to_crs('EPSG:3857').centroid.to_crs('EPSG:4269')

# Extract latitude (y) and longitude (x) from the reprojected centroids
gdf['Lat_centroid'] = centroids.y
gdf['Long_centroid'] = centroids.x

# Keep only the desired columns. .copy() makes the result an independent
# frame so the assignment below does not operate on a slice.
gdf = gdf[['zip_code', 'peak_energy_value', 'peak_energy_time', 'Lat_centroid', 'Long_centroid']].copy()

# Convert 'zip_code' to the nullable 'Int64' dtype, which can hold the missing
# values introduced by the left join (a plain int cannot)
gdf['zip_code'] = gdf['zip_code'].astype('Int64')

# Drop rows with any missing value (zip codes with no peak energy data)
gdf = gdf.dropna()

# Assemble the modelling table: the two centroid coordinate columns are the
# features and the peak energy time is the regression target.
feature_columns = ['Lat_centroid', 'Long_centroid']
target_column = 'peak_energy_time'
data = gdf[feature_columns + [target_column]]

# Separate predictors (X) from the target (Y)
X = data[feature_columns]
Y = data[target_column]

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# *** THIS CODE USES GRID SEARCH AND IS COMPUTATIONALLY EXPENSIVE *** 

# Candidate hyperparameter values for XGBoost. The grid search evaluates every
# combination: 3 * 3 * 3 * 3 * 3 * 4 = 972 candidates, each fitted on 5 folds.
xgb_param_grid = dict(
    n_estimators=[150, 200, 250],
    max_depth=[7, 8, 9],
    learning_rate=[0.03, 0.05, 0.07],
    gamma=[0.4, 0.5, 0.6],
    subsample=[0.7, 0.8, 0.9],
    # all values must lie within the range 0 to 1
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
)

xgb_model = xgb.XGBRegressor()

# Exhaustive 5-fold cross-validated search over the grid, using all available
# cores (n_jobs=-1). No `scoring` is given, so the estimator's own score
# method is used to rank candidates.
xgb_grid_search = GridSearchCV(
    xgb_model,
    xgb_param_grid,
    cv=5,
    n_jobs=-1,
)
xgb_grid_search.fit(X_train, Y_train)

# Report the winning combination and its mean cross-validated score
print(f"Best XGBoost Parameters: {xgb_grid_search.best_params_}")
print(f"Best XGBoost Score: {xgb_grid_search.best_score_}")

# Candidate hyperparameter values for RandomForest. The grid search evaluates
# every combination: 3 * 3 * 3 * 3 * 2 = 162 candidates, each fitted on 5 folds.
rf_param_grid = dict(
    n_estimators=[200, 250, 300],
    max_depth=[15, 20, 25],
    min_samples_split=[7, 8, 9],
    min_samples_leaf=[1, 2, 3],
    bootstrap=[True, False],
)

# Fixed random_state so the forest construction is reproducible
rf_model = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold cross-validated search over the grid, using all available
# cores (n_jobs=-1). No `scoring` is given, so the estimator's own score
# method is used to rank candidates.
rf_grid_search = GridSearchCV(
    rf_model,
    rf_param_grid,
    cv=5,
    n_jobs=-1,
)
rf_grid_search.fit(X_train, Y_train)

# Report the winning combination and its mean cross-validated score
print(f"Best RandomForest Parameters: {rf_grid_search.best_params_}")
print(f"Best RandomForest Score: {rf_grid_search.best_score_}")

# *** THIS CODE USES RANDOMIZED SEARCH AND IS MUCH LESS COMPUTATIONALLY EXPENSIVE THAN GRID SEARCH *** 

# # Hyperparameters for XGBoost
# xgb_param_grid = {
#     'n_estimators': [150, 200, 250],  # More focused around 200
#     'max_depth': [8, 10, 12],  # More focused around 10
#     'learning_rate': [0.025, 0.05, 0.075],  # More focused around 0.05
#     'gamma': [0.4, 0.5, 0.6],  # More focused around 0.5
#     'colsample_bytree': [0.8, 0.9, 1.0],  # More focused around 1.0
#     'subsample': [0.6, 0.7, 0.8]  # More focused around 0.7
# }

# xgb_model = xgb.XGBRegressor()

# xgb_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_param_grid, n_iter=20, cv=5, random_state=42)
# xgb_search.fit(X_train, Y_train)

# print("Best XGBoost Parameters:", xgb_search.best_params_)
# print("Best XGBoost Score:", xgb_search.best_score_)

# # Hyperparameters for RandomForest
# rf_param_grid = {
#     'n_estimators': [150, 200, 250],  # More focused around 200
#     'max_depth': [15, 20, 25],  # More focused around 20
#     'min_samples_split': [8, 10, 12],  # More focused around 10
#     'min_samples_leaf': [1],  # The best value from the last search
#     'bootstrap': [True]  # The best value from the last search
# }

# rf_model = RandomForestRegressor(random_state=42)

# rf_search = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_param_grid, n_iter=20, cv=5, random_state=42)
# rf_search.fit(X_train, Y_train)

# print("Best RandomForest Parameters:", rf_search.best_params_)
# print("Best RandomForest Score:", rf_search.best_score_)

# # Linear regression model for comparison
# lin_reg_model = LinearRegression()
# lin_reg_scores = cross_val_score(lin_reg_model, X_train, Y_train, cv=5)
# lin_reg_mean_score = lin_reg_scores.mean()

# print("Linear Regression Score:", lin_reg_mean_score)



Best XGBoost Parameters: {'colsample_bytree': 1.0, 'gamma': 0.4, 'learning_rate': 0.03, 'max_depth': 8, 'n_estimators': 200, 'subsample': 0.7}
Best XGBoost Score: 0.34427834965421944
Best RandomForest Parameters: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 200}
Best RandomForest Score: 0.39553098550348237
