## Random Forest & GBM Development for Energy Consumption
### John Matune
### 2/5/19

In [None]:
# imports

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [89]:
# Master consumption table (the "with_SF" export from the processed folder).
df = pd.read_csv('../processed/consumption_master_with_SF.csv')

# Weather observations for the consumption city. They live in a separate raw
# file, so they are loaded here and joined onto the consumption data below.
c_weather = pd.read_csv('../raw/powercity_weather_consumption.csv')
c_weather.head().transpose()

Unnamed: 0,0,1,2,3,4
City,Power City,Power City,Power City,Power City,Power City
Year,Consumption,Consumption,Consumption,Consumption,Consumption
Month,1,1,1,1,1
Day,1,1,1,1,1
Hour,1,2,3,4,5
Cloud_Cover_Fraction,1,1,1,1,1
Dew_Point,-8.9,-9.4,-10.6,-10.6,-11.7
Humidity_Fraction,0.81,0.84,0.81,0.81,0.77
Precipitable_Water,6,6,6,6,6
Temperature,-6.1,-7.2,-7.8,-7.8,-8.3


In [90]:
# Build a timestamp column so the weather rows can be joined to the consumption table.
#
# Hour is shifted down by one and zero-padded to two digits before being used
# as the clock hour (Hour 1 -> "00", Hour 2 -> "01", ...). The year is fixed
# to 1900, matching the timestamps used by the consumption data.
#
# NOTE(review): the consumption table's first timestamp is 01:00, so after this
# shift the 01:00 consumption row is matched with the weather row for Hour 2.
# Confirm that the one-hour shift is intended.
c_weather['Hour_str'] = (c_weather['Hour'] - 1).astype(str).str.zfill(2)
c_weather['datetime_str'] = (
    '1900-'
    + c_weather['Month'].astype(str)
    + '-'
    + c_weather['Day'].astype(str)
    + ' '
    + c_weather['Hour_str']
    + ':00:00'
)
# An explicit format replaces the deprecated `infer_datetime_format` flag and
# makes the parse independent of whatever pandas would guess from the first row.
c_weather['datetime'] = pd.to_datetime(c_weather['datetime_str'], format='%Y-%m-%d %H:%M:%S')

c_weather.head()

Unnamed: 0,City,Year,Month,Day,Hour,Cloud_Cover_Fraction,Dew_Point,Humidity_Fraction,Precipitable_Water,Temperature,Visibility,Hour_str,datetime_str,datetime
0,Power City,Consumption,1,1,1,1.0,-8.9,0.81,6,-6.1,4.0,0,1900-1-1 00:00:00,1900-01-01 00:00:00
1,Power City,Consumption,1,1,2,1.0,-9.4,0.84,6,-7.2,4.0,1,1900-1-1 01:00:00,1900-01-01 01:00:00
2,Power City,Consumption,1,1,3,1.0,-10.6,0.81,6,-7.8,4.0,2,1900-1-1 02:00:00,1900-01-01 02:00:00
3,Power City,Consumption,1,1,4,1.0,-10.6,0.81,6,-7.8,4.0,3,1900-1-1 03:00:00,1900-01-01 03:00:00
4,Power City,Consumption,1,1,5,1.0,-11.7,0.77,6,-8.3,4.0,4,1900-1-1 04:00:00,1900-01-01 04:00:00


In [91]:
# Inspect the column types of the weather frame, including the new `datetime` column.
c_weather.dtypes

City                            object
Year                            object
Month                            int64
Day                              int64
Hour                             int64
Cloud_Cover_Fraction           float64
Dew_Point                      float64
Humidity_Fraction              float64
Precipitable_Water               int64
Temperature                    float64
Visibility                     float64
Hour_str                        object
datetime_str                    object
datetime                datetime64[ns]
dtype: object

In [92]:
# Index the weather rows by their timestamp so they can be merged on the index.
# The `datetime` column itself stays in the frame here; it is removed in the next cell.
c_weather = c_weather.set_index(pd.DatetimeIndex(c_weather['datetime']))

In [93]:
# The timestamp now lives in the index, so the columns used to build it are
# no longer needed; the City and Year label columns are removed as well.
c_weather.drop(
    columns=['datetime', 'datetime_str', 'Hour_str', 'Year', 'City'],
    inplace=True,
)
c_weather.head()

Unnamed: 0_level_0,Month,Day,Hour,Cloud_Cover_Fraction,Dew_Point,Humidity_Fraction,Precipitable_Water,Temperature,Visibility
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1900-01-01 00:00:00,1,1,1,1.0,-8.9,0.81,6,-6.1,4.0
1900-01-01 01:00:00,1,1,2,1.0,-9.4,0.84,6,-7.2,4.0
1900-01-01 02:00:00,1,1,3,1.0,-10.6,0.81,6,-7.8,4.0
1900-01-01 03:00:00,1,1,4,1.0,-10.6,0.81,6,-7.8,4.0
1900-01-01 04:00:00,1,1,5,1.0,-11.7,0.77,6,-8.3,4.0


In [94]:
# Store the calendar fields as strings so they are handled as labels
# (and later one-hot encoded) instead of being read as numeric magnitudes.
c_weather = c_weather.astype({'Month': str, 'Day': str, 'Hour': str})

c_weather.dtypes

Month                    object
Day                      object
Hour                     object
Cloud_Cover_Fraction    float64
Dew_Point               float64
Humidity_Fraction       float64
Precipitable_Water        int64
Temperature             float64
Visibility              float64
dtype: object

In [95]:
# join to main consumption data
df = df.set_index(pd.DatetimeIndex(df['Time']))
df.drop('Time', inplace=True, axis=1)
df.dtypes
df.head(25)


Unnamed: 0_level_0,FOOD_SERVICE,GROCERY,HEALTH_CARE,K12_SCHOOLS,LODGING,OFFICE,RESIDENTIAL,STAND_ALONE_RETAIL,Weekdays,HolidayName,...,RESIDENTIAL_SF,SA_RTL_SF,FOOD_SVC_TOTAL,GROCERY_TOTAL,HEALTH_CARE_TOTAL,K12_TOTAL,LODGING_TOTAL,OFFICE_TOTAL,RESIDENTIAL_TOTAL,SA_RTL_TOTAL
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1900-01-01 01:00:00,0.005416,0.00203,0.00239,0.000818,0.001209,0.001911,0.000541,0.000491,Sun,New Year's Day,...,84823410.0,971961.735665,5240.358908,1096.297148,2307.216574,1984.35569,1125.486086,19108.976855,45875.739073,477.001885
1900-01-01 02:00:00,0.003789,0.002032,0.002443,0.000815,0.001215,0.001664,0.000523,0.000499,Sun,New Year's Day,...,84823410.0,971961.735665,3666.285758,1097.042106,2358.377936,1977.208735,1131.096791,16635.496277,44355.771468,485.313908
1900-01-01 03:00:00,0.003797,0.001977,0.002413,0.000824,0.001149,0.002005,0.000517,0.000505,Sun,New Year's Day,...,84823410.0,971961.735665,3673.960332,1067.347195,2329.567788,1998.867496,1069.825145,20052.79651,43877.825496,490.808116
1900-01-01 04:00:00,0.003802,0.002068,0.002406,0.000817,0.001164,0.001697,0.000514,0.000507,Sun,New Year's Day,...,84823410.0,971961.735665,3678.382266,1116.774474,2322.396505,1982.546991,1083.44356,16970.89627,43589.612521,492.574948
1900-01-01 05:00:00,0.003861,0.002176,0.002428,0.000825,0.001211,0.002016,0.000575,0.00051,Sun,New Year's Day,...,84823410.0,971961.735665,3736.219406,1175.213337,2344.384257,2001.978486,1127.795007,20161.026603,48802.994869,495.828201
1900-01-01 06:00:00,0.005475,0.002133,0.002428,0.000819,0.001317,0.001707,0.000708,0.000512,Sun,New Year's Day,...,84823410.0,971961.735665,5297.753197,1151.933455,2344.087307,1988.04491,1226.523277,17068.217656,60068.578241,497.266218
1900-01-01 07:00:00,0.011265,0.002751,0.002672,0.000825,0.001738,0.002039,0.000863,0.000515,Sun,New Year's Day,...,84823410.0,971961.735665,10899.968385,1485.322602,2579.123223,2002.441701,1618.414467,20389.164477,73170.902234,500.855187
1900-01-01 08:00:00,0.011246,0.002952,0.002654,0.000811,0.002038,0.001738,0.000944,0.000517,Sun,New Year's Day,...,84823410.0,971961.735665,10880.914457,1594.136249,2562.145474,1968.781085,1897.651049,17378.428326,80086.079642,502.612688
1900-01-01 09:00:00,0.008233,0.003439,0.002884,0.000731,0.002322,0.00186,0.000753,0.000943,Sun,New Year's Day,...,84823410.0,971961.735665,7966.313362,1857.060615,2784.451095,1774.689619,2162.337995,18595.182652,63836.561241,916.235282
1900-01-01 10:00:00,0.008237,0.002828,0.002837,0.000692,0.002067,0.001434,0.000643,0.000866,Sun,New Year's Day,...,84823410.0,971961.735665,7969.918967,1526.80163,2738.780685,1679.07102,1924.445127,14339.313937,54558.97559,842.01327


In [96]:
# Combine consumption and weather on their timestamp indexes. The inner join
# keeps only the timestamps that are present in both tables.
model_df = pd.merge(
    df,
    c_weather,
    how='inner',
    left_index=True,
    right_index=True,
)
model_df.head().transpose()

Unnamed: 0,1900-01-01 01:00:00,1900-01-01 02:00:00,1900-01-01 03:00:00,1900-01-01 04:00:00,1900-01-01 05:00:00
FOOD_SERVICE,0.00541607,0.00378922,0.00379715,0.00380172,0.00386149
GROCERY,0.00203025,0.00203163,0.00197664,0.00206817,0.0021764
HEALTH_CARE,0.00238997,0.00244297,0.00241312,0.0024057,0.00242847
K12_SCHOOLS,0.000817792,0.000814846,0.000823773,0.000817046,0.000825055
LODGING,0.00120873,0.00121476,0.00114895,0.00116358,0.00121121
OFFICE,0.00191095,0.00166359,0.00200533,0.00169714,0.00201616
RESIDENTIAL,0.000540838,0.000522919,0.000517284,0.000513887,0.000575348
STAND_ALONE_RETAIL,0.000490762,0.000499314,0.000504966,0.000506784,0.000510131
Weekdays,Sun,Sun,Sun,Sun,Sun
HolidayName,New Year's Day,New Year's Day,New Year's Day,New Year's Day,New Year's Day


In [97]:
# Target variable: total consumption per timestamp, i.e. the sum of the eight
# per-sector *_TOTAL columns.
#
# The columns are read from model_df itself (they were carried through the
# merge) rather than from df, so the result does not depend on pandas aligning
# two different frames by index. skipna=False keeps the behaviour of a chain
# of `+`: a missing sector total yields a missing overall total.
sector_total_columns = [
    'FOOD_SVC_TOTAL',
    'GROCERY_TOTAL',
    'HEALTH_CARE_TOTAL',
    'K12_TOTAL',
    'LODGING_TOTAL',
    'OFFICE_TOTAL',
    'RESIDENTIAL_TOTAL',
    'SA_RTL_TOTAL',
]
model_df['Total_Consumption'] = model_df[sector_total_columns].sum(axis=1, skipna=False)

In [98]:
# Preview the merged frame to confirm the new Total_Consumption column is present.
model_df.head()

Unnamed: 0,FOOD_SERVICE,GROCERY,HEALTH_CARE,K12_SCHOOLS,LODGING,OFFICE,RESIDENTIAL,STAND_ALONE_RETAIL,Weekdays,HolidayName,...,Month,Day,Hour,Cloud_Cover_Fraction,Dew_Point,Humidity_Fraction,Precipitable_Water,Temperature,Visibility,Total_Consumption
1900-01-01 01:00:00,0.005416,0.00203,0.00239,0.000818,0.001209,0.001911,0.000541,0.000491,Sun,New Year's Day,...,1,1,2,1.0,-9.4,0.84,6,-7.2,4.0,77215.43222
1900-01-01 02:00:00,0.003789,0.002032,0.002443,0.000815,0.001215,0.001664,0.000523,0.000499,Sun,New Year's Day,...,1,1,3,1.0,-10.6,0.81,6,-7.8,4.0,71706.592979
1900-01-01 03:00:00,0.003797,0.001977,0.002413,0.000824,0.001149,0.002005,0.000517,0.000505,Sun,New Year's Day,...,1,1,4,1.0,-10.6,0.81,6,-7.8,4.0,74560.998077
1900-01-01 04:00:00,0.003802,0.002068,0.002406,0.000817,0.001164,0.001697,0.000514,0.000507,Sun,New Year's Day,...,1,1,5,1.0,-11.7,0.77,6,-8.3,4.0,71236.627536
1900-01-01 05:00:00,0.003861,0.002176,0.002428,0.000825,0.001211,0.002016,0.000575,0.00051,Sun,New Year's Day,...,1,1,6,1.0,-12.2,0.74,5,-8.3,4.0,79845.440165


In [99]:
# Columns removed before modelling. The *_TOTAL columns are the components
# that were summed into Total_Consumption, so they must not remain as features.
columns_to_drop = [
    # per-sector columns
    'FOOD_SERVICE', 'GROCERY', 'HEALTH_CARE', 'K12_SCHOOLS',
    'LODGING', 'OFFICE', 'RESIDENTIAL', 'STAND_ALONE_RETAIL',
    # per-sector totals
    'FOOD_SVC_TOTAL', 'GROCERY_TOTAL', 'HEALTH_CARE_TOTAL', 'K12_TOTAL',
    'LODGING_TOTAL', 'OFFICE_TOTAL', 'RESIDENTIAL_TOTAL', 'SA_RTL_TOTAL',
    # other
    'ELECTRIC_CAR',
]

model_df.drop(columns=columns_to_drop, inplace=True)

In [100]:
# Store the School_Day flag as a string so get_dummies treats it as a category.
# The values are numeric, so the resulting labels are '0.0' and '1.0'.
model_df['School_Day'] = model_df['School_Day'].astype(str)
model_df.dtypes

Weekdays                 object
HolidayName              object
School_Day               object
Workday                  object
FOOD_SVC_SF             float64
GROCERY_SF              float64
HEALTH_CARE_SF          float64
K12_SF                  float64
LODGING_SF              float64
OFFICE_SF               float64
RESIDENTIAL_SF          float64
SA_RTL_SF               float64
Month                    object
Day                      object
Hour                     object
Cloud_Cover_Fraction    float64
Dew_Point               float64
Humidity_Fraction       float64
Precipitable_Water        int64
Temperature             float64
Visibility              float64
Total_Consumption       float64
dtype: object

In [101]:
# One-hot encode the string-typed columns; numeric columns pass through unchanged.
model_df_dummies = pd.get_dummies(data=model_df)
model_df_dummies.head().transpose()

Unnamed: 0,1900-01-01 01:00:00,1900-01-01 02:00:00,1900-01-01 03:00:00,1900-01-01 04:00:00,1900-01-01 05:00:00
FOOD_SVC_SF,9.675581e+05,9.675581e+05,9.675581e+05,9.675581e+05,9.675581e+05
GROCERY_SF,5.399811e+05,5.399811e+05,5.399811e+05,5.399811e+05,5.399811e+05
HEALTH_CARE_SF,9.653743e+05,9.653743e+05,9.653743e+05,9.653743e+05,9.653743e+05
K12_SF,2.426480e+06,2.426480e+06,2.426480e+06,2.426480e+06,2.426480e+06
LODGING_SF,9.311291e+05,9.311291e+05,9.311291e+05,9.311291e+05,9.311291e+05
OFFICE_SF,9.999731e+06,9.999731e+06,9.999731e+06,9.999731e+06,9.999731e+06
RESIDENTIAL_SF,8.482341e+07,8.482341e+07,8.482341e+07,8.482341e+07,8.482341e+07
SA_RTL_SF,9.719617e+05,9.719617e+05,9.719617e+05,9.719617e+05,9.719617e+05
Cloud_Cover_Fraction,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00
Dew_Point,-9.400000e+00,-1.060000e+01,-1.060000e+01,-1.170000e+01,-1.220000e+01


In [103]:
# Full list of column names after one-hot encoding.
model_df_dummies.columns.tolist()

['FOOD_SVC_SF',
 'GROCERY_SF',
 'HEALTH_CARE_SF',
 'K12_SF',
 'LODGING_SF',
 'OFFICE_SF',
 'RESIDENTIAL_SF',
 'SA_RTL_SF',
 'Cloud_Cover_Fraction',
 'Dew_Point',
 'Humidity_Fraction',
 'Precipitable_Water',
 'Temperature',
 'Visibility',
 'Total_Consumption',
 'Weekdays_Fri',
 'Weekdays_Mon',
 'Weekdays_Sat',
 'Weekdays_Sun',
 'Weekdays_Thu',
 'Weekdays_Tue',
 'Weekdays_Wed',
 'HolidayName_Christmas',
 'HolidayName_Columbus Day',
 'HolidayName_Easter',
 'HolidayName_FALSE',
 'HolidayName_Independence Day',
 'HolidayName_Labor Day',
 'HolidayName_Martin Luther King Day',
 'HolidayName_Memorial Day',
 "HolidayName_New Year's Day",
 "HolidayName_Presidents' Day",
 'HolidayName_Thanksgiving Day',
 'HolidayName_Veterans Day',
 'School_Day_0.0',
 'School_Day_1.0',
 'Workday_False',
 'Workday_True',
 'Month_1',
 'Month_10',
 'Month_11',
 'Month_12',
 'Month_2',
 'Month_3',
 'Month_4',
 'Month_5',
 'Month_6',
 'Month_7',
 'Month_8',
 'Month_9',
 'Day_1',
 'Day_10',
 'Day_11',
 'Day_12',
 'Day_

In [105]:
# Target is the total consumption; every other column is a feature.
y = model_df_dummies['Total_Consumption']
X = model_df_dummies.drop(columns='Total_Consumption')

# Shapes of the feature matrix and target vector, one per line.
print(X.shape, y.shape, sep='\n')

(8759, 104)
(8759,)


In [108]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [119]:
# Random forest regressor for total consumption.
#
# `criterion` is left at its default, which is mean squared error. The literal
# 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and is rejected from
# 1.2 onward, so omitting it keeps the cell runnable on both old and new versions.
#
# `random_state` is fixed (same seed as the train/test split) so the fitted
# forest and the scores below are reproducible on a fresh run.
rf = RandomForestRegressor(
    n_estimators=150,
    max_depth=10,
    max_features=30,
    bootstrap=True,
    random_state=10,
)

# fit() returns the estimator itself, so `model` and `rf` are the same object.
model = rf.fit(X_train, y_train)

In [120]:
# Score on the training split (for a regressor, `score` reports R^2).
rf.score(X_train, y_train)

0.8519485890450424

In [121]:
# Score on the held-out split (R^2); compare with the training score above it to gauge overfitting.
rf.score(X_test, y_test)

0.8223348830376203