In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error

In [30]:
national_demand = pd.read_csv('../data/demand_lower_48')

In [31]:
def drop_utc(string):
    return string[:-6]

In [32]:
def clean_energy_demand_data(df):
    df['Time'] = df['Time'].apply(lambda x: drop_utc(x))
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
    df = df.loc[::-1]
    df = df[1:]
    df['Year'] = df['Time'].dt.year
    df['Month'] = df['Time'].dt.month
    df['Hour'] = df['Time'].dt.hour
    df['Day_of_week'] = df['Time'].dt.dayofweek
    df['Day_of_month'] = df['Time'].dt.day
    df['Day_of_year'] = df['Time'].dt.dayofyear
    df['Week_of_year'] = df['Time'].dt.isocalendar().week
    df.set_index('Time', inplace=True, drop=True)
    df = df.astype('int')
    return df

In [33]:
national_demand = clean_energy_demand_data(national_demand)

In [34]:
national_demand.head()

Unnamed: 0_level_0,Megawatthours,Year,Month,Hour,Day_of_week,Day_of_month,Day_of_year,Week_of_year
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-07-01 02:00:00,335153,2015,7,2,2,1,182,27
2015-07-01 03:00:00,333837,2015,7,3,2,1,182,27
2015-07-01 04:00:00,398386,2015,7,4,2,1,182,27
2015-07-01 05:00:00,388954,2015,7,5,2,1,182,27
2015-07-01 06:00:00,392487,2015,7,6,2,1,182,27


In [35]:
def split_data(df, target='Megawatthours', split_date=None):
    X = df
    y = df.pop(target)
    if split_date:
        X_train = X.loc[:split_date]
        X_test = X.loc[split_date:]
        y_train = y.loc[:split_date]
        y_test = y.loc[split_date:]
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y)
    return X_train, X_test, y_train, y_test


    

In [36]:
X_train, X_test, y_train, y_test = split_data(national_demand, split_date='2020-07-01 00:00:00')

In [37]:
xgbr = XGBRegressor()

In [54]:
def grid_search_model(X, y, model, grid, cv_splits=5, metric='neg_root_mean_squared_error'):
    grid_search = GridSearchCV(model, 
                            grid, 
                            cv=TimeSeriesSplit(n_splits=cv_splits),
                            n_jobs=-1,
                            verbose=True,
                            scoring=metric)
    grid_search.fit(X,y)
    return grid_search.best_params_

In [57]:
grid = {'n_estimators': [1000, 2000],
           'learning_rate': [.1, .2],
           'max_depth': [2, 4]}

In [None]:
best_params = grid_search_model(X_train, y_train, xgbr, grid)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
best_model

In [49]:
best_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [50]:
predictions = best_model.predict(X_test)

In [52]:
rmse = np.sqrt(mean_squared_error(y_test, predictions))
rmse

32523.824184638128

In [None]:
from xgboost import plot_importance, plot_tree