### Import Libraries

In [18]:
import pandas as pd

### Load the dataset

In [19]:
df = pd.read_csv('./datasets/household_power_consumption.txt', delimiter=';', low_memory=False)

### Grab time series of electricity usage

In [20]:
df['date_time'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
df['electricity_usage'] = pd.to_numeric(df['Global_active_power'], errors='coerce')
df = df[['date_time', 'electricity_usage']].copy()

### Set the index as and sort by date and time

In [21]:
df = df.set_index('date_time').sort_index()

### Resample to hourly frequency

In [22]:
df_hour = df.resample('1H').mean()

### Create shifted time series

In [23]:
df_hour['electricity_usage'] = df_hour['electricity_usage'].fillna(method='ffill')
df_hour['electricity_usage_1hr_lag'] = df_hour['electricity_usage'].shift()
df_hour['electricity_usage_2hr_lag'] = df_hour['electricity_usage'].shift(2)
df_hour['electricity_usage_3hr_lag'] = df_hour['electricity_usage'].shift(3)
df_hour['electricity_usage_4hr_lag'] = df_hour['electricity_usage'].shift(4)
df_hour['electricity_usage_5hr_lag'] = df_hour['electricity_usage'].shift(5)
df_hour['electricity_usage_6hr_lag'] = df_hour['electricity_usage'].shift(6)
df_hour['electricity_usage_7hr_lag'] = df_hour['electricity_usage'].shift(7)
df_hour['electricity_usage_8hr_lag'] = df_hour['electricity_usage'].shift(8)

### Create month/seasonality as a feature

In [24]:
df_hour['month'] = df_hour.index.month

### Drop missing data

In [25]:
df_hour = df_hour.dropna()

### Split train test sets

In [26]:
sample_size = df_hour.shape[0] - 200
df_train = df_hour.iloc[:sample_size].copy()
df_test = df_hour.iloc[sample_size:].copy()

In [27]:
df_train.head()

Unnamed: 0_level_0,electricity_usage,electricity_usage_1hr_lag,electricity_usage_2hr_lag,electricity_usage_3hr_lag,electricity_usage_4hr_lag,electricity_usage_5hr_lag,electricity_usage_6hr_lag,electricity_usage_7hr_lag,electricity_usage_8hr_lag,month
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-17 01:00:00,3.3494,1.882467,2.0616,2.200133,3.056467,3.268567,3.400233,3.6322,4.222889,12
2006-12-17 02:00:00,1.587267,3.3494,1.882467,2.0616,2.200133,3.056467,3.268567,3.400233,3.6322,12
2006-12-17 03:00:00,1.6622,1.587267,3.3494,1.882467,2.0616,2.200133,3.056467,3.268567,3.400233,12
2006-12-17 04:00:00,2.215767,1.6622,1.587267,3.3494,1.882467,2.0616,2.200133,3.056467,3.268567,12
2006-12-17 05:00:00,1.996733,2.215767,1.6622,1.587267,3.3494,1.882467,2.0616,2.200133,3.056467,12


<hr>

### Install PyCaret

In [28]:
!pip install pycaret



In [29]:
from pycaret.regression import setup, compare_models, predict_model
pycaret_automl = setup(data=df_train, target = 'electricity_usage', session_id=666)

Unnamed: 0,Description,Value
0,Session id,666
1,Target,electricity_usage
2,Target type,Regression
3,Original data shape,"(34381, 10)"
4,Transformed data shape,"(34381, 10)"
5,Transformed train set shape,"(24066, 10)"
6,Transformed test set shape,"(10315, 10)"
7,Numeric features,9
8,Preprocess,True
9,Imputation type,simple


### Evaluate models performance

In [30]:
pycaret_models = compare_models(sort='MSE', budget_time=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.4119,0.3543,0.5951,0.5536,0.2485,0.5526,9.251
rf,Random Forest Regressor,0.4133,0.3586,0.5987,0.5481,0.2503,0.5576,19.011
et,Extra Trees Regressor,0.4143,0.3588,0.5989,0.5479,0.25,0.5573,5.762
lr,Linear Regression,0.4413,0.3831,0.6188,0.5173,0.2593,0.6295,0.033
ridge,Ridge Regression,0.4413,0.3831,0.6188,0.5173,0.2593,0.6296,0.116
br,Bayesian Ridge,0.4413,0.3831,0.6188,0.5173,0.2593,0.6297,0.029
lar,Least Angle Regression,0.4415,0.3836,0.6192,0.5167,0.2594,0.6289,0.056
omp,Orthogonal Matching Pursuit,0.4447,0.3885,0.6232,0.5104,0.2621,0.6346,0.029
huber,Huber Regressor,0.4189,0.3915,0.6255,0.5067,0.2568,0.4997,0.186
knn,K Neighbors Regressor,0.4557,0.4281,0.6542,0.4605,0.2772,0.6093,0.196


In [31]:
print(pycaret_models)

GradientBoostingRegressor(random_state=666)


<hr>

In [36]:
print(predictions.columns)

Index(['electricity_usage_1hr_lag', 'electricity_usage_2hr_lag',
       'electricity_usage_3hr_lag', 'electricity_usage_4hr_lag',
       'electricity_usage_5hr_lag', 'electricity_usage_6hr_lag',
       'electricity_usage_7hr_lag', 'electricity_usage_8hr_lag', 'month',
       'electricity_usage', 'prediction_label'],
      dtype='object')


<hr>

In [37]:
from sklearn.metrics import mean_squared_error

predictions = predict_model(pycaret_models, data=df_test)
mean_squared_error(predictions['electricity_usage'], predictions['prediction_label'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,0.4174,0.3793,0.6159,0.4438,0.2476,0.54


0.3792958655534004

In [43]:
x_columns = df.drop(df.columns[-1], axis=1, inplace=True)

In [47]:
# predictions.plot(x= x_columns, y=['electricity_usage', 'prediction_label'])
# plt.show()

In [48]:
# predictions[['electricity_usage', 'prediction_label']].plot()