In [33]:
import numpy as np
import pandas as pd 
import os
from pandas.tseries.holiday import USFederalHolidayCalendar

import utils
# from utils import load_data, get_train_val_split, get_stratified_splitter
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedGroupKFold, train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, make_scorer

from lightgbm import LGBMRegressor
from scipy.stats import kstest, kruskal, mannwhitneyu
from itertools import combinations
from collections import defaultdict
from tqdm import tqdm

import optuna

In [2]:
# Re-import the project-local `utils` module so that edits made to utils.py
# are picked up without restarting the kernel. The bare call is left as the
# last expression so the reloaded module's repr is displayed.
import importlib
importlib.reload(utils)

<module 'utils' from 'C:\\Users\\johns\\Desktop\\probstats2\\EnergyPrediction-ASHRAE\\code\\utils.py'>

In [3]:
# Load the 'ashrae-energy-prediction' data through the project helper.
# The result is dict-like; its keys are listed by the `data_dict.keys()` cell
# further down (weather_test, X_train, X_test, y_train).
data_dict = utils.load_data('ashrae-energy-prediction')

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.88%


  weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp'], infer_datetime_format = True, utc = True).astype('datetime64[ns]')


Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 3.07 MB
Decreased by 68.05%
Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 5.13 MB
Decreased by 73.04%


  train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format = True, utc = True).astype('datetime64[ns]')


Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 289.19 MB
Decreased by 53.12%
Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.53 MB
Decreased by 71.82%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[(train['site_id'] == 0) & (train['meter'] == 0)]['meter_reading'] = 0.2931 * train[(train['site_id'] == 0) & (train['meter'] == 0)]['meter_reading']


In [4]:
# Add weather features:
#   1. compute the mean of every weather column per (hour, month, site_id),
#   2. use that mean to fill missing weather readings,
#   3. fall back to the column median for anything still missing,
#   4. add "<feature>_diff_hourly_from_mean" = filled reading - group mean.
weather_features = ['cloud_coverage', 'dew_temperature', 'air_temperature',
                    'sea_level_pressure', 'wind_direction', 'wind_speed', 'precip_depth_1_hr',]

hourly_by_site = (
    data_dict["X_train"]
    .groupby(['hour', 'month', 'site_id'])[weather_features]
    .mean()
    .reset_index()
)

# Left-merge so every training row gets its group mean in a
# "<feature>_hourly_by_site" column (original columns keep their names).
data_dict["X_train"] = data_dict["X_train"].merge(
    hourly_by_site,
    on=['hour', 'month', 'site_id'],
    how='left',
    suffixes=(None, '_hourly_by_site')
)

del hourly_by_site

for feature in weather_features:
    hourly_col = feature + "_hourly_by_site"

    # Fill in NA values from weather with the hourly-by-site mean.
    # The result is assigned back to the frame instead of calling
    # fillna(inplace=True) on a column selection: that chained in-place form
    # is deprecated in pandas 2.x and does not modify the frame when
    # Copy-on-Write is enabled.
    filled = data_dict["X_train"][feature].fillna(data_dict["X_train"][hourly_col])

    # Fill in the rest with the median (computed after the first fill, as before).
    filled = filled.fillna(filled.median())

    data_dict["X_train"][feature] = filled
    data_dict["X_train"][feature + "_diff_hourly_from_mean"] = filled - data_dict["X_train"][hourly_col]

# The helper columns are only needed to build the features above.
data_dict["X_train"] = data_dict["X_train"].drop(columns = [feat + "_hourly_by_site" for feat in weather_features])

In [5]:
# Fill in NA with median values for floor count and year_built.
for feature in ['year_built', 'floor_count']:
    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on `frame[col]` is chained in-place mutation, which
    # is deprecated in pandas 2.x and leaves the frame unchanged when
    # Copy-on-Write is enabled.
    data_dict["X_train"][feature] = data_dict["X_train"][feature].fillna(
        data_dict["X_train"][feature].median()
    )

In [6]:
# Show which entries the loader returned.
data_dict.keys()

dict_keys(['weather_test', 'X_train', 'X_test', 'y_train'])

## Examine Differences (Non-Parametric)
Using Bonferroni's correction

### Milestone 2. 
- Show difference in sites across meter readings 
- get average meter reading per day per site 
- conduct a non-parametric ANOVA-style test (KS) or pairwise Mann-Whitney tests to show that the sites are different 
- train a model per site id (with rudimentary hyperparameter tuning) 
- John sites 0-7, Sharad sites 8-15 

### Milestone 3. Determine, per site, which primary uses are similar (if they have only a few buildings) and which are different
- for a given primary use, if diff, identify "clusters" of buildings that are similar 

In [7]:
# Model input columns, in a fixed order:
#   building metadata, raw weather readings, then for every weather reading
#   its mean/max/min/std "lag7" aggregates, and finally size, calendar and
#   encoded primary-use columns.
_WEATHER_COLUMNS = [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]
_LAG7_STATS = ['mean', 'max', 'min', 'std']

features = (
    ['year_built', 'floor_count']
    + _WEATHER_COLUMNS
    + [f'{column}_{stat}_lag7' for column in _WEATHER_COLUMNS for stat in _LAG7_STATS]
    + ['log_square_feet', 'weekday', 'hour', 'day', 'weekend', 'month', 'primary_use_enc']
)

In [36]:
def run_optuna_search_cv(
    site: int,
    meter: int,
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    features: list,
    n_trials: int = 50,
):
    """
    Tune and fit an LGBMRegressor for one (site, meter) pair with Optuna.

    Parameters
    ----------
    site : int
        Value of ``X_train["site_id"]`` whose rows are used.
    meter : int
        Value of ``X_train["meter"]`` whose rows are used.
    X_train : pd.DataFrame
        Training frame; must contain ``site_id``, ``meter`` and every column
        named in ``features``.
    y_train : pd.DataFrame
        Targets; indexed with the boolean mask built from ``X_train``, so it
        must be row-aligned with ``X_train``.
    features : list
        Column names passed to the regressor.
    n_trials : int, default 50
        Number of Optuna trials.

    Returns
    -------
    The fitted ``OptunaSearchCV`` object (``refit=True``), or ``None`` when no
    row matches the requested site and meter.
    """
    # Build the row mask once and reuse it for the features, the targets and
    # the CV splitter.
    subset_mask = (X_train["site_id"] == site) & (X_train["meter"] == meter)
    X_subset = X_train.loc[subset_mask]
    y = y_train[subset_mask]

    X = X_subset[features]
    if X.shape[0] == 0:
        return None

    # The splitter receives the full-column subset (not just `features`).
    splitter_gen = utils.get_stratified_splitter(X_subset, y)

    regressor = LGBMRegressor()

    param_distributions = {
        # Upper bound is derived from the frame passed in, not from the
        # notebook-global `data_dict`, so the function does not depend on
        # hidden kernel state.
        "max_depth": optuna.distributions.IntDistribution(-1, len(X_train.columns)),
        "num_leaves": optuna.distributions.IntDistribution(5, 50),
        "learning_rate": optuna.distributions.FloatDistribution(1e-7, 1, log=True),
        "n_estimators": optuna.distributions.IntDistribution(1, 300),
        "reg_alpha": optuna.distributions.FloatDistribution(1e-7, 1e7, log=True),
        "reg_lambda": optuna.distributions.FloatDistribution(1e-7, 1e7, log=True),
    }

    # Parameters not searched over (LGBMRegressor defaults):
    #   subsample_for_bin: int = 200000,
    #   min_split_gain: float = 0.0,
    #   min_child_weight: float = 0.001,
    #   min_child_samples: int = 20,
    #   subsample: float = 1.0,
    #   subsample_freq: int = 0,
    #   colsample_bytree: float = 1.0,
    #   random_state: Union[int, numpy.random.mtrand.RandomState, NoneType] = None,
    #   n_jobs: int = -1,

    def rmse(estimator, X_test, y_test):
        """Negated RMSE, so that a larger score means a better model."""
        y_pred = estimator.predict(X_test)
        # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
        # `squared` keyword is deprecated in recent scikit-learn releases.
        return -1 * np.sqrt(mean_squared_error(y_test, y_pred))

    # NOTE(review): random_state is fixed, but trials run with n_jobs=4;
    # parallel trials may still make the search order timing-dependent -
    # confirm if exact reproducibility is required.
    optuna_search = optuna.integration.OptunaSearchCV(
        regressor,
        param_distributions,
        n_trials=n_trials,
        cv = splitter_gen,
        random_state=0, # IMPORTANT,
        refit=True,
        n_jobs=4,
        scoring = rmse,
        verbose=0
    )

    optuna_search.fit(X, y)

    return optuna_search

In [None]:
# This fits sites 8-15 (range(8, 16)) for every meter type 0-3.
# One tuned model is stored per (site, meter) under the key "<site>_<meter>";
# the stored value is None when that site has no rows for the meter.
N_TRIALS = 50
SITE_IDS = range(8, 16)
METER_IDS = range(4)
models = defaultdict(dict)

for site in tqdm(SITE_IDS):
    for meter in METER_IDS:
        models[f"{site}_{meter}"] = run_optuna_search_cv(
            site,
            meter,
            data_dict["X_train"],
            data_dict["y_train"],
            features,
            N_TRIALS,
        )

  optuna_search = optuna.integration.OptunaSearchCV(
[32m[I 2023-04-08 21:30:29,167][0m A new study created in memory with name: no-name-45ba9912-9235-4160-9952-0c3502d36374[0m
[32m[I 2023-04-08 21:30:32,702][0m Trial 1 finished with value: -1.6440997451935304 and parameters: {'max_depth': 7, 'num_leaves': 12, 'learning_rate': 1.072279428083805e-05, 'n_estimators': 8, 'reg_alpha': 1.0420196495538015e-07, 'reg_lambda': 0.3159849934970688}. Best is trial 1 with value: -1.6440997451935304.[0m
[32m[I 2023-04-08 21:30:38,560][0m Trial 0 finished with value: -1.6441514307483276 and parameters: {'max_depth': 22, 'num_leaves': 10, 'learning_rate': 7.632402473104205e-07, 'n_estimators': 196, 'reg_alpha': 778194.4896699354, 'reg_lambda': 1.9035019966313074}. Best is trial 1 with value: -1.6440997451935304.[0m
[32m[I 2023-04-08 21:30:43,365][0m Trial 3 finished with value: -1.554511943409628 and parameters: {'max_depth': 41, 'num_leaves': 12, 'learning_rate': 0.0009086467503470516, 'n_