# Imports

In [43]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Loading the Preprocessed Data

In [44]:
# Define the Paths for loading the data
interim_folder = "../data/interim"

# Load the CSV with the data for all years
cleaned_df = pd.read_csv(os.path.join(interim_folder, "cleaned_df.csv"), sep=';', decimal=',')

# Take a quick look at the data
cleaned_df.head(3)

Unnamed: 0,timestamp,load_MWh
0,2021-01-01 00:00:00,44569.25
1,2021-01-01 01:00:00,42806.0
2,2021-01-01 02:00:00,41049.75


In [45]:
# Convert 'timestamp' column to datetime
cleaned_df["timestamp"] = pd.to_datetime(cleaned_df["timestamp"])
cleaned_df.head(3)

Unnamed: 0,timestamp,load_MWh
0,2021-01-01 00:00:00,44569.25
1,2021-01-01 01:00:00,42806.0
2,2021-01-01 02:00:00,41049.75


In [46]:
cleaned_df.dtypes

timestamp    datetime64[ns]
load_MWh            float64
dtype: object

# Feature Engineering

## Temporal Features

To capture temporal dependencies and recurring consumption patterns, a set of lagged features is generated.  
Each lag represents the electricity load from a specific number of hours before the current timestamp,  
allowing the model to learn from short-term, medium-term, and long-term relationships in the data.

- **Short-term lags (1, 2, 3, 6 hours):** capture immediate temporal dependencies and short fluctuations.  
- **Daily lag (24 hours):** accounts for daily consumption cycles.  
- **Weekly and multi-week lags (168, 336, 672 hours):** capture weekly and monthly seasonality effects.  
- **Yearly lag (8760 hours):** represents the load at the same hour one year earlier, enabling the model to learn annual seasonality.

These engineered features provide the model with historical context and improve its ability to recognize  
seasonal and cyclical load patterns over different time horizons.

In [47]:
# Create lagged features (short-, mid-, and long-term)
LAGS = [1, 2, 3, 6, 24, 168, 336, 672, 8760]
for lag in LAGS:
    cleaned_df[f"lag_{lag}"] = cleaned_df["load_MWh"].shift(lag)

# Take a look at the changes
cleaned_df.head(2)

Unnamed: 0,timestamp,load_MWh,lag_1,lag_2,lag_3,lag_6,lag_24,lag_168,lag_336,lag_672,lag_8760
0,2021-01-01 00:00:00,44569.25,,,,,,,,,
1,2021-01-01 01:00:00,42806.0,44569.25,,,,,,,,


## Adding Basic Calendar Features

To account for regular consumption patterns driven by time-based behavior,  
a set of simple calendar-related features is added to the dataset:

- **Hour:** represents the hour of the day (0–23) and captures daily load fluctuations such as morning and evening peaks.  
- **Weekday:** encodes the day of the week (0 = Monday, 6 = Sunday), allowing the model to differentiate between workdays and weekends.  
- **Is_weekend:** a binary indicator (1 = Saturday/Sunday, 0 = weekday) used to distinguish lower weekend demand from higher weekday consumption.

These features help the model learn typical daily and weekly cycles,  
which are essential for accurately forecasting electricity load patterns.


In [48]:
# Add basic calendar features
cleaned_df["hour"] = cleaned_df["timestamp"].dt.hour
cleaned_df["weekday"] = cleaned_df["timestamp"].dt.weekday
cleaned_df["is_weekend"] = (cleaned_df["weekday"] >= 5).astype(int)

# Take a look at the changes
cleaned_df.head(2)

Unnamed: 0,timestamp,load_MWh,lag_1,lag_2,lag_3,lag_6,lag_24,lag_168,lag_336,lag_672,lag_8760,hour,weekday,is_weekend
0,2021-01-01 00:00:00,44569.25,,,,,,,,,,0,4,0
1,2021-01-01 01:00:00,42806.0,44569.25,,,,,,,,,1,4,0


### Removing Missing Values from Lagged Features

Lagged feature creation introduces missing values at the beginning of the dataset,  
as earlier timestamps do not have sufficient historical data to compute all lags  
— especially for long-term features such as the one-year lag (8760 hours).  

To ensure data consistency and prevent issues during model training,  
rows containing any missing values are removed.  
This operation effectively discards the initial portion of the dataset (approximately the first year),  
leaving only complete observations with a full set of lagged and calendar features.

The resulting dataset now contains clean, fully populated records  
that can be reliably used for model training, validation, and forecasting.

In [49]:
# Remove rows with missing lag values (mainly the first year)
cleaned_df = cleaned_df.dropna().reset_index(drop=True)

print("Data shape after feature engineering:", cleaned_df.shape)
print("Columns:", cleaned_df.columns.tolist())
cleaned_df.head(3)

Data shape after feature engineering: (26300, 14)
Columns: ['timestamp', 'load_MWh', 'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_24', 'lag_168', 'lag_336', 'lag_672', 'lag_8760', 'hour', 'weekday', 'is_weekend']


Unnamed: 0,timestamp,load_MWh,lag_1,lag_2,lag_3,lag_6,lag_24,lag_168,lag_336,lag_672,lag_8760,hour,weekday,is_weekend
0,2022-01-01 01:00:00,41535.75,43915.5,45616.75,47461.5,55785.75,43679.5,41509.0,49242.25,54252.25,44569.25,1,5,1
1,2022-01-01 02:00:00,40480.75,41535.75,43915.5,45616.75,51849.0,42277.75,40378.0,48546.75,52329.5,42806.0,2,5,1
2,2022-01-01 03:00:00,39564.0,40480.75,41535.75,43915.5,48751.75,41903.5,40437.75,48432.0,51662.25,41049.75,3,5,1


## Train–Test Split

To prepare the dataset for model development and evaluation, the full time series is chronologically divided into a **training/validation set** and a **test set**.  
This ensures that future information is not used for model training, maintaining the temporal integrity of the forecasting task.

- **Training/Validation set:** Includes all data from January 2022 to December 2023.  
  These observations are used to train the forecasting models and tune their hyperparameters.  

- **Test set:** Contains data from January to December 2024.  
  This portion of the dataset is completely unseen during training and is used for the final model evaluation and recursive forecasting.


In [53]:
# sort and ensure timestamp is datetime
cleaned_df = cleaned_df.sort_values("timestamp")
cleaned_df["timestamp"] = pd.to_datetime(cleaned_df["timestamp"])

# define cutoff for test set
test_start = "2024-01-01"

# split data
train_val_df = cleaned_df[cleaned_df["timestamp"] < test_start].copy()
test_df = cleaned_df[cleaned_df["timestamp"] >= test_start].copy()

# show info
print(f"Train/Validation period: {train_val_df['timestamp'].min()} → {train_val_df['timestamp'].max()}")
print(f"Test period: {test_df['timestamp'].min()} → {test_df['timestamp'].max()}")
print(f"Train/Validation shape: {train_val_df.shape}")
print(f"Test shape: {test_df.shape}")

# set folder path manually (relative to project structure)
processed_folder = "../data/processed"  # adjust if needed
os.makedirs(processed_folder, exist_ok=True)

# save csv files
train_val_path = os.path.join(processed_folder, "train_val_df.csv")
test_path = os.path.join(processed_folder, "test_df.csv")

train_val_df.to_csv(train_val_path, sep=';', decimal=',', index=False)
test_df.to_csv(test_path, sep=';', decimal=',', index=False)

print(f" Saved train_val_df to: {train_val_path}")
print(f" Saved test_df to:      {test_path}")

Train/Validation period: 2022-01-01 01:00:00 → 2023-12-31 23:00:00
Test period: 2024-01-01 00:00:00 → 2024-12-31 23:00:00
Train/Validation shape: (17517, 14)
Test shape: (8783, 14)
 Saved train_val_df to: ../data/processed/train_val_df.csv
 Saved test_df to:      ../data/processed/test_df.csv
