In [1]:
import numpy as np
import pandas as pd 

from utils import load_data, get_train_val_split
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedGroupKFold, train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor

In [2]:
# Load the 'ashrae-energy-prediction' data through the project helper in utils.
# Later cells read the "X_train" and "y_train" entries of the returned object.
# NOTE(review): the captured output of this call contains a SettingWithCopyWarning
# for the site 0 / meter 0 `meter_reading` rescaling (factor 0.2931). That statement
# assigns through chained indexing, so the rescaling is presumably applied to a
# temporary copy and lost — confirm and fix inside utils.
data_dict = load_data('ashrae-energy-prediction')

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.88%


  weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp'], infer_datetime_format = True, utc = True).astype('datetime64[ns]')


Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 3.07 MB
Decreased by 68.05%
Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 5.13 MB
Decreased by 73.04%


  train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format = True, utc = True).astype('datetime64[ns]')


Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 289.19 MB
Decreased by 53.12%
Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.53 MB
Decreased by 71.82%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[(train['site_id'] == 0) & (train['meter'] == 0)]['meter_reading'] = 0.2931 * train[(train['site_id'] == 0) & (train['meter'] == 0)]['meter_reading']


In [3]:
# Add weather features.
#
# For every weather column this cell:
#   1. fills missing readings with the mean of that column for the same
#      (hour, month, site_id) group,
#   2. fills whatever is still missing with the column-wide median,
#   3. adds "<feature>_diff_hourly_from_mean": the filled reading minus the
#      group mean.
#
# Filled values are assigned back to the frame explicitly. The previous form,
# data_dict["X_train"][feature].fillna(..., inplace=True), mutates an
# intermediate Series; pandas does not guarantee that this reaches the frame
# (it never does under Copy-on-Write) and warns about it from 2.2 onwards.
weather_features = ['cloud_coverage', 'dew_temperature', 'air_temperature',
                    'sea_level_pressure', 'wind_direction', 'wind_speed', 'precip_depth_1_hr',]
weather_group_keys = ['hour', 'month', 'site_id']
hourly_suffix = '_hourly_by_site'

hourly_by_site = (
    data_dict["X_train"]
    .groupby(weather_group_keys)[weather_features]
    .mean()
    .reset_index()
)

# Left merge keeps every training row; the group means arrive as
# "<feature>_hourly_by_site" columns next to the raw readings.
X_weather = data_dict["X_train"].merge(
    hourly_by_site,
    on=weather_group_keys,
    how='left',
    suffixes=(None, hourly_suffix)
)

del hourly_by_site

for feature in weather_features:
    hourly_mean = X_weather[feature + hourly_suffix]

    # Fill in NA values from weather with hourly by site columns
    filled = X_weather[feature].fillna(hourly_mean)

    # Fill in the rest with the median (computed after the first fill)
    filled = filled.fillna(filled.median())

    X_weather[feature] = filled

    # NOTE: when a whole (hour, month, site_id) group has no reading, its mean
    # is NaN and this difference stays NaN even though the reading was filled.
    X_weather[feature + "_diff_hourly_from_mean"] = filled - hourly_mean

# The helper mean columns are only needed for the steps above.
data_dict["X_train"] = X_weather.drop(
    columns=[feat + hourly_suffix for feat in weather_features]
)
del X_weather

In [4]:
# Fill in NA with median values for floor count and year_built.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on data_dict["X_train"][feature] mutates an
# intermediate Series, which pandas does not guarantee to propagate to the
# frame (it never does under Copy-on-Write) and warns about from 2.2 onwards.
for feature in ['year_built', 'floor_count']:
    column = data_dict["X_train"][feature]
    data_dict["X_train"][feature] = column.fillna(column.median())

In [5]:
# https://towardsdatascience.com/holiday-calendars-with-pandas-9c01f1ee5fee

## Generate Splits

In [6]:
# Grouped, stratified 4-fold cross-validation of a small random forest.
#
# * Stratification: the continuous target is bucketed with 30 equally spaced
#   bin edges between 0 and its maximum, and the bucket ids act as the strata.
# * Groups: one group per (building_id, meter) pair, so all rows of a given
#   meter land in the same fold.

SEED = 42  # fixes the forest's bootstrap/feature sampling so the RMSEs are reproducible

discretized_target = np.digitize(
    data_dict["y_train"],
    bins = np.linspace(0, data_dict["y_train"].max(), 30)
)

# Built before the feature selection below, because 'meter' is not a model feature.
cv_groups = (
    data_dict["X_train"]["building_id"].astype(str)
    + "_"
    + data_dict["X_train"]["meter"].astype(str)
)

sgkf = StratifiedGroupKFold(n_splits=4)
sgkf_gen = sgkf.split(
    data_dict["X_train"],
    discretized_target,
    cv_groups
)

feature_cols = ['building_id', 'site_id', 'square_feet',
       'year_built', 'floor_count', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'log_square_feet', 'weekday', 'hour',
       'day', 'weekend', 'month', 'primary_use_enc']

# NOTE(review): this overwrites the stored frame and drops 'meter', so the cell
# cannot be re-run without re-running the cells above it. It is kept because
# later cells may rely on data_dict["X_train"] holding only the model features.
data_dict["X_train"] = data_dict["X_train"][feature_cols]

for train_idx, val_idx in sgkf_gen:
    # Despite the variable name this is a regressor; the name is kept in case
    # later cells refer to it.
    classifier = RandomForestRegressor(
        50,
        max_depth=5,
        max_samples=0.3,
        random_state=SEED,
    )

    # The splitter yields positional row numbers, so rows are selected with
    # .iloc. Label-based .loc only gives the same rows when the index happens
    # to be exactly 0..n-1.
    classifier.fit(
        data_dict["X_train"].iloc[train_idx],
        data_dict["y_train"].iloc[train_idx]
    )
    y_hat = classifier.predict(data_dict["X_train"].iloc[val_idx])

    # mean_squared_error(squared=False) is deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root explicitly works on every version.
    rmse = np.sqrt(mean_squared_error(data_dict["y_train"].iloc[val_idx], y_hat))
    print(f"RMSE: {rmse}")
    



RMSE: 1.833983894195993
RMSE: 1.8208702086612767
RMSE: 1.8424372489579612
RMSE: 1.8188909459292395
