In [1]:
import numpy as np
import pandas as pd 
from pandas.tseries.holiday import USFederalHolidayCalendar

from utils import load_data, get_train_val_split
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedGroupKFold, train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor
from scipy.stats import kstest, kruskal, mannwhitneyu
from itertools import combinations
from collections import defaultdict

import optuna

In [2]:
# Load the dataset through the project helper; later cells read data_dict["X_train"].
DATASET_NAME = 'ashrae-energy-prediction'
data_dict = load_data(DATASET_NAME)

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.88%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 3.07 MB
Decreased by 68.05%


  weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp'], infer_datetime_format = True, utc = True).astype('datetime64[ns]')


Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 5.13 MB
Decreased by 73.04%


  train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format = True, utc = True).astype('datetime64[ns]')


Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 289.19 MB
Decreased by 53.12%
Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.53 MB
Decreased by 71.82%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[(train['site_id'] == 0) & (train['meter'] == 0)]['meter_reading'] = 0.2931 * train[(train['site_id'] == 0) & (train['meter'] == 0)]['meter_reading']


In [3]:
# Add weather features
# For every weather column: impute missing readings, then add a
# "<feature>_diff_hourly_from_mean" column holding the deviation of the
# (imputed) reading from the mean of its (hour, month, site_id) group.
weather_features = ['cloud_coverage', 'dew_temperature', 'air_temperature', 
                    'sea_level_pressure', 'wind_direction', 'wind_speed', 'precip_depth_1_hr',]

group_keys = ['hour', 'month', 'site_id']

# Mean of each weather feature within every (hour, month, site_id) group.
hourly_by_site = data_dict["X_train"].groupby(group_keys)[weather_features].mean().reset_index()

# Left merge keeps every training row; the group means arrive as
# "<feature>_hourly_by_site" columns next to the raw readings.
data_dict["X_train"] = data_dict["X_train"].merge(
    hourly_by_site, 
    on=group_keys, 
    how='left', 
    suffixes=(None, '_hourly_by_site')
)

del hourly_by_site

for feature in weather_features:
    hourly_col = feature + "_hourly_by_site"

    # Fill in NA values from weather with hourly by site columns.
    # The result is assigned back instead of calling fillna(inplace=True) on
    # data_dict["X_train"][feature]: that chained in-place call does not update
    # the parent frame under pandas Copy-on-Write, so the NaNs would survive.
    filled = data_dict["X_train"][feature].fillna(data_dict["X_train"][hourly_col])

    # Fill in the rest with the median (groups whose mean is itself NaN)
    filled = filled.fillna(filled.median())
    data_dict["X_train"][feature] = filled

    # Deviation from the group mean; stays NaN where the group mean is NaN.
    data_dict["X_train"][feature + "_diff_hourly_from_mean"] = filled - data_dict["X_train"][hourly_col]

# The helper mean columns are only needed for imputation and the diff features.
data_dict["X_train"] = data_dict["X_train"].drop(columns = [feat + "_hourly_by_site" for feat in weather_features])

In [4]:
# Fill in NA with median values for floor count and year_built
for feature in ['year_built', 'floor_count']:
    column = data_dict["X_train"][feature]
    # Assign the filled column back: fillna(inplace=True) on a column selection
    # is chained assignment and leaves the frame untouched under pandas
    # Copy-on-Write.
    data_dict["X_train"][feature] = column.fillna(column.median())

## Examine Differences (Non-Parametric)
Using the Bonferroni correction

### Milestone 2. 
- Show difference in sites across meter readings 
- get average meter reading per day per site 
- conduct a non-parametric ANOVA (KS) or pairwise tests (Mann-Whitney) to show that the sites are different 
- train a model per site id (with rudimentary hyperparameter tuning) 
- John sites 0-7, Sharad sites 8-15 

### Milestone 3. Determine, per site, which primary uses are similar (if they have only a few buildings) and which are different
- for a given primary use, if different, identify "clusters" of buildings that are similar 

In [None]:
# This fits sites 0-7
# One Optuna-tuned LGBMRegressor is fitted per (site, meter) pair and stored in
# models[site][meter].
N_TRIALS = 100
SEED = 42  # makes the hyperparameter search repeatable across runs
models = defaultdict(dict)

# The search space is the same for every (site, meter) pair, so build it once.
param_distributions = {
    "max_depth": optuna.distributions.IntDistribution(-1, len(data_dict['X_train'].columns)),
    # LightGBM requires num_leaves > 1, so the lower bound is 2 (was 1).
    "num_leaves": optuna.distributions.IntDistribution(2, 50),
    "learning_rate": optuna.distributions.FloatDistribution(1e-7, 1, log=True),
    "n_estimators": optuna.distributions.IntDistribution(1, 200),
    "reg_alpha": optuna.distributions.FloatDistribution(1e-7, 1e7, log=True),
    "reg_lambda": optuna.distributions.FloatDistribution(1e-7, 1e7, log=True),
}

# Parameters not searched over:
#   subsample_for_bin: int = 200000,
#   min_split_gain: float = 0.0,
#   min_child_weight: float = 0.001,
#   min_child_samples: int = 20,
#   subsample: float = 1.0,
#   subsample_freq: int = 0,
#   colsample_bytree: float = 1.0,
#   random_state: Union[int, numpy.random.mtrand.RandomState, NoneType] = None,
#   n_jobs: int = -1,

for site in range(8):
    for meter in range(4):
        # Select the rows belonging to this (site, meter) pair. Previously X and
        # y were never defined, so the loop could not fit per-pair models.
        # NOTE(review): assumes X_train has a 'meter' column and that
        # data_dict["y_train"] is row-aligned with data_dict["X_train"] --
        # confirm against utils.load_data.
        pair_mask = (
            (data_dict["X_train"]["site_id"] == site)
            & (data_dict["X_train"]["meter"] == meter)
        ).to_numpy()

        # Skip (site, meter) pairs that have no rows.
        if not pair_mask.any():
            continue

        # Positional boolean mask, so index labels do not need to match.
        X = data_dict["X_train"][pair_mask]
        y = data_dict["y_train"][pair_mask]

        optuna_search = optuna.integration.OptunaSearchCV(
            LGBMRegressor(),
            param_distributions,
            n_trials=N_TRIALS,
            random_state=SEED,
        )
        optuna_search.fit(X, y)
        y_pred = optuna_search.predict(X)  # in-sample predictions of the refit best model

        # Keep the fitted search (best params + refit estimator) for later use.
        models[site][meter] = optuna_search