### Import Libraries

In [23]:
import pandas as pd

### Load the dataset

In [24]:
df = pd.read_csv('./datasets/household_power_consumption.txt', delimiter=';', low_memory=False)

### Grab time series of electricity usage

In [25]:
df['date_time'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
df['electricity_usage'] = pd.to_numeric(df['Global_active_power'], errors='coerce')
df = df[['date_time', 'electricity_usage']].copy()

### Set the index as and sort by date and time

In [26]:
df = df.set_index('date_time').sort_index()

### Resample to hourly frequency

In [27]:
df_hour = df.resample('1H').mean()

### Create shifted time series

In [28]:
df_hour['electricity_usage'] = df_hour['electricity_usage'].fillna(method='ffill')
df_hour['electricity_usage_1hr_lag'] = df_hour['electricity_usage'].shift()
df_hour['electricity_usage_2hr_lag'] = df_hour['electricity_usage'].shift(2)
df_hour['electricity_usage_3hr_lag'] = df_hour['electricity_usage'].shift(3)
df_hour['electricity_usage_4hr_lag'] = df_hour['electricity_usage'].shift(4)
df_hour['electricity_usage_5hr_lag'] = df_hour['electricity_usage'].shift(5)
df_hour['electricity_usage_6hr_lag'] = df_hour['electricity_usage'].shift(6)
df_hour['electricity_usage_7hr_lag'] = df_hour['electricity_usage'].shift(7)
df_hour['electricity_usage_8hr_lag'] = df_hour['electricity_usage'].shift(8)

### Create month/seasonality as a feature

In [29]:
df_hour['month'] = df_hour.index.month

### Drop missing data

In [30]:
df_hour = df_hour.dropna()

### Split train test sets

In [31]:
sample_size = df_hour.shape[0] - 200
df_train = df_hour.iloc[:sample_size].copy()
df_test = df_hour.iloc[sample_size:].copy()

In [32]:
df_train.head()

Unnamed: 0_level_0,electricity_usage,electricity_usage_1hr_lag,electricity_usage_2hr_lag,electricity_usage_3hr_lag,electricity_usage_4hr_lag,electricity_usage_5hr_lag,electricity_usage_6hr_lag,electricity_usage_7hr_lag,electricity_usage_8hr_lag,month
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-12-17 01:00:00,3.3494,1.882467,2.0616,2.200133,3.056467,3.268567,3.400233,3.6322,4.222889,12
2006-12-17 02:00:00,1.587267,3.3494,1.882467,2.0616,2.200133,3.056467,3.268567,3.400233,3.6322,12
2006-12-17 03:00:00,1.6622,1.587267,3.3494,1.882467,2.0616,2.200133,3.056467,3.268567,3.400233,12
2006-12-17 04:00:00,2.215767,1.6622,1.587267,3.3494,1.882467,2.0616,2.200133,3.056467,3.268567,12
2006-12-17 05:00:00,1.996733,2.215767,1.6622,1.587267,3.3494,1.882467,2.0616,2.200133,3.056467,12


<hr>

### Install PyCaret

In [33]:
!pip install pycaret

In [34]:
from pycaret.regression import setup, compare_models, predict_model
pycaret_automl = setup(data=df_train, target = 'electricity_usage', session_id=666)

Unnamed: 0,Description,Value
0,Session id,666
1,Target,electricity_usage
2,Target type,Regression
3,Original data shape,"(34381, 10)"
4,Transformed data shape,"(34381, 10)"
5,Transformed train set shape,"(24066, 10)"
6,Transformed test set shape,"(10315, 10)"
7,Numeric features,9
8,Preprocess,True
9,Imputation type,simple


### Evaluate models performance

In [35]:
pycaret_models = compare_models(sort='MSE', budget_time=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.4062,0.35,0.5915,0.5589,0.2457,0.5362,45.989
gbr,Gradient Boosting Regressor,0.4119,0.3543,0.5951,0.5536,0.2485,0.5526,2.984
rf,Random Forest Regressor,0.4133,0.3586,0.5987,0.5481,0.2503,0.5576,8.765
et,Extra Trees Regressor,0.4143,0.3588,0.5989,0.5479,0.25,0.5573,2.603
lr,Linear Regression,0.4413,0.3831,0.6188,0.5173,0.2593,0.6295,0.348
ridge,Ridge Regression,0.4413,0.3831,0.6188,0.5173,0.2593,0.6296,0.017
br,Bayesian Ridge,0.4413,0.3831,0.6188,0.5173,0.2593,0.6297,0.02
lar,Least Angle Regression,0.4415,0.3836,0.6192,0.5167,0.2594,0.6289,0.019
omp,Orthogonal Matching Pursuit,0.4447,0.3885,0.6232,0.5104,0.2621,0.6346,0.017
huber,Huber Regressor,0.4189,0.3915,0.6255,0.5067,0.2568,0.4997,0.097


In [36]:
print(pycaret_models)

LGBMRegressor(n_jobs=-1, random_state=666)


<hr>

In [37]:
predictions = predict_model(pycaret_models, data=df_test)
print(predictions.columns)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.3964,0.3575,0.5979,0.4758,0.2369,0.4906


Index(['electricity_usage_1hr_lag', 'electricity_usage_2hr_lag',
       'electricity_usage_3hr_lag', 'electricity_usage_4hr_lag',
       'electricity_usage_5hr_lag', 'electricity_usage_6hr_lag',
       'electricity_usage_7hr_lag', 'electricity_usage_8hr_lag', 'month',
       'electricity_usage', 'prediction_label'],
      dtype='object')


<hr>

In [38]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions['electricity_usage'], predictions['prediction_label'])

0.3574659342307327

In [39]:
x_columns = df.drop(df.columns[-1], axis=1, inplace=True)

In [None]:
# predictions.plot(x= x_columns, y=['electricity_usage', 'prediction_label'])
# plt.show()

In [None]:
# predictions[['electricity_usage', 'prediction_label']].plot()

In [40]:
import warnings
warnings.filterwarnings('ignore')

In [42]:
import matplotlib.pyplot as plt

# Assuming 'datetime_column' is the name of your datetime column
predictions['datetime_column'] = pd.to_datetime(predictions['datetime_column'])

# Assuming 'electricity_usage' and 'prediction_label' are numerical columns
predictions[['electricity_usage', 'prediction_label']] = predictions[['electricity_usage', 'prediction_label']].astype(float)

# Now, you can use the plot method
predictions.plot(x='datetime_column', y=['electricity_usage', 'prediction_label'])
plt.show()


KeyError: 'datetime_column'

In [41]:
import matplotlib.pyplot as plt

predictions.plot(x= x_columns, y=['electricity_usage', 'prediction_label'])
plt.show()

AttributeError: 'DataFrame' object has no attribute '_convert'