In [9]:
import numpy as np
import pandas as pd 
from pandas.tseries.holiday import USFederalHolidayCalendar
from collections import defaultdict

from utils import load_data, get_train_val_split
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedGroupKFold, train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor

In [4]:
# Load the ASHRAE energy-prediction data through the project helper in utils.
# Later cells read data_dict["X_train"] (feature frame) and data_dict["y_train"] (fit target).
# NOTE(review): the recorded output of this cell contains a SettingWithCopyWarning for a
# chained assignment on the site 0 / meter 0 `meter_reading` rescaling (factor 0.2931), so
# that conversion may be silently skipped -- presumably inside utils.load_data; confirm there.
data_dict = load_data('ashrae-energy-prediction')

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.88%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 3.07 MB
Decreased by 68.05%


  weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp'], infer_datetime_format = True, utc = True).astype('datetime64[ns]')


Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 5.13 MB
Decreased by 73.04%


  train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format = True, utc = True).astype('datetime64[ns]')


Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 289.19 MB
Decreased by 53.12%
Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.53 MB
Decreased by 71.82%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[(train['site_id'] == 0) & (train['meter'] == 0)]['meter_reading'] = 0.2931 * train[(train['site_id'] == 0) & (train['meter'] == 0)]['meter_reading']


In [5]:
# Add weather features
#
# For every weather column:
#   1. fill gaps with the mean of that column for the same (hour, month, site_id),
#   2. fill whatever is still missing with the column median,
#   3. add "<feature>_diff_hourly_from_mean": the deviation from that group mean.
weather_features = ['cloud_coverage', 'dew_temperature', 'air_temperature',
                    'sea_level_pressure', 'wind_direction', 'wind_speed', 'precip_depth_1_hr',]
group_keys = ['hour', 'month', 'site_id']

hourly_by_site = data_dict["X_train"].groupby(group_keys)[weather_features].mean().reset_index()

# Left join so every training row receives its group mean in "<feature>_hourly_by_site".
data_dict["X_train"] = data_dict["X_train"].merge(
    hourly_by_site,
    on=group_keys,
    how='left',
    suffixes=(None, '_hourly_by_site')
)

# Free the aggregate as soon as it has been joined in.
del hourly_by_site

for feature in weather_features:
    mean_col = feature + "_hourly_by_site"

    # Assign the filled column back explicitly. Calling fillna(inplace=True) on a column
    # selection is a chained in-place update: it is deprecated in pandas 2.x and has no
    # effect at all once Copy-on-Write is enabled, which would leave the NaNs in place.
    data_dict["X_train"][feature] = data_dict["X_train"][feature].fillna(
        data_dict["X_train"][mean_col]
    )

    # Fill in the rest with the median (computed after the group-mean fill above)
    data_dict["X_train"][feature] = data_dict["X_train"][feature].fillna(
        data_dict["X_train"][feature].median()
    )

    # Stays NaN where the whole (hour, month, site_id) group had no observation.
    data_dict["X_train"][feature + "_diff_hourly_from_mean"] = (
        data_dict["X_train"][feature] - data_dict["X_train"][mean_col]
    )

# The helper columns are only needed for the imputation above.
data_dict["X_train"] = data_dict["X_train"].drop(columns=[feat + "_hourly_by_site" for feat in weather_features])

In [6]:
# Fill in NA with median values for floor count and year_built
for feature in ['year_built', 'floor_count']:
    # Assign the result back rather than using fillna(inplace=True) on a column selection:
    # the chained in-place form is deprecated in pandas 2.x and does nothing under
    # Copy-on-Write, which would leave the missing values untouched.
    data_dict["X_train"][feature] = data_dict["X_train"][feature].fillna(
        data_dict["X_train"][feature].median()
    )

In [7]:
# Create binary attribute for holidays
# NOTE(review): assumes `timestamp` is a timezone-naive datetime64 column -- confirm in load_data.
calendar = USFederalHolidayCalendar()
holidays = calendar.holidays(
    # Normalise the bounds to midnight so a holiday on the first day of the data is kept
    # even when the earliest reading is later than 00:00.
    start=data_dict["X_train"]['timestamp'].min().normalize(),
    end=data_dict["X_train"]['timestamp'].max().normalize()
)
# The calendar yields midnight timestamps. Comparing raw reading timestamps against them
# would flag only the 00:00 reading of each holiday, so compare on the calendar date.
data_dict["X_train"]["is_holiday"] = data_dict["X_train"]['timestamp'].dt.normalize().isin(holidays)

## Generate Splits

In [10]:
# Train per site, per meter
#
# For every (site_id, meter) pair, cross-validate a small LightGBM regressor with a stratified
# group K-fold: the target is binned into 30 equal-width buckets for stratification, and rows
# are grouped by "<building_id>_<meter>" so one building never lands in both train and validation.
#
# NOTE(review): the last recorded run of this cell ended in `KeyError: 'meter'`, which suggests
# data_dict['X_train'] had no 'meter' column at that point -- confirm that load_data keeps it.
# NOTE(review): `classifiers` is created but never filled; decide which fitted model(s) to keep.

# TODO: Use Optuna
N_SPLITS = 4
classifiers = defaultdict(dict)
feature_cols = ['building_id', 'site_id', 'square_feet', 'is_holiday',
       'year_built', 'floor_count', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'log_square_feet', 'weekday', 'hour',
       'day', 'weekend', 'month', 'primary_use_enc']

for site_id in data_dict['X_train']['site_id'].unique():
    for meter in data_dict['X_train']['meter'].unique():
        # Build the row mask once and reuse it for features and target.
        subset_mask = (data_dict['X_train']['site_id'] == site_id) & (data_dict['X_train']['meter'] == meter)
        X = data_dict['X_train'][subset_mask]
        y = data_dict['y_train'][subset_mask]

        groups = X["building_id"].astype(str) + "_" + X["meter"].astype(str)

        # Not every site has every meter, and an N-fold group split needs at least N distinct
        # groups (otherwise some folds would have no rows), so skip subsets that are too small.
        if groups.nunique() < N_SPLITS:
            print(f"Skipping site {site_id} meter {meter}: fewer than {N_SPLITS} building groups")
            continue

        discretized_target = np.digitize(
            y,
            bins=np.linspace(0, y.max(), 30)
        )

        sgkf = StratifiedGroupKFold(n_splits=N_SPLITS)
        sgkf_gen = sgkf.split(X, discretized_target, groups)

        X = X[feature_cols]

        for train_idx, val_idx in sgkf_gen:

            classifier = LGBMRegressor(
                n_estimators=50,
                max_depth=5,
                n_jobs=2
            )

            # split() yields positional indices, while X and y keep the index labels of the
            # full frame after filtering -- so select with .iloc, not .loc.
            classifier.fit(X.iloc[train_idx], y.iloc[train_idx])
            y_hat = classifier.predict(X.iloc[val_idx])

            # RMSE = sqrt(MSE); avoids `squared=False`, which recent scikit-learn no longer accepts.
            rmse = np.sqrt(mean_squared_error(y.iloc[val_idx], y_hat))
            print(f"RMSE on site {site_id} meter {meter}: {rmse}")
            
        

KeyError: 'meter'

In [None]:
# Fit and score one LightGBM regressor per fold yielded by `sgkf_gen`, on the full training frame.
# NOTE(review): `sgkf_gen` is not created in this cell. It is only assigned in the
# per-site/per-meter training cell, which also iterates it, so by the time this cell runs the
# generator is exhausted (the loop body never executes) and its indices refer to that cell's
# last subset rather than to all of data_dict["X_train"]. Rebuild the splitter here first.
for train_idx, val_idx in sgkf_gen:

    classifier = LGBMRegressor(
        n_estimators=50,
        max_depth=5,
        n_jobs=2
    )
#     classifier = DecisionTreeRegressor()

    classifier.fit(data_dict["X_train"].loc[train_idx, :], data_dict["y_train"].loc[train_idx])
    y_hat = classifier.predict(data_dict["X_train"].loc[val_idx, :])

    # RMSE = sqrt(MSE); avoids `squared=False`, which recent scikit-learn no longer accepts.
    print(f"RMSE: {np.sqrt(mean_squared_error(data_dict['y_train'].loc[val_idx], y_hat))}")

In [None]:
# Disabled experiment: the same per-fold fit/score loop, with a RandomForestRegressor
# (50 trees, depth 5, 30% row subsample) in place of LightGBM.
# NOTE(review): like the cell above, it relies on `sgkf_gen` from an earlier cell; delete
# this cell or turn it into a parameterised helper rather than keeping it commented out.
# for train_idx, val_idx in sgkf_gen:
#     classifier = RandomForestRegressor(
#         n_estimators=50,
#         max_depth=5,
#         max_samples=0.3,
#     )
# #     classifier = DecisionTreeRegressor()
    
#     classifier.fit(data_dict["X_train"].loc[train_idx, :], data_dict["y_train"].loc[train_idx])
#     y_hat = classifier.predict(data_dict["X_train"].loc[val_idx, :])
    
#     print(f"RMSE: {mean_squared_error(data_dict['y_train'].loc[val_idx], y_hat, squared=False)}")
    