# Imports

In [21]:
import pandas as pd
import numpy as np
import os

## Load the Preprocessed Data

In [22]:
# Folder layout: the interim folder holds the input table; the processed
# folder is created up front (no-op when it already exists)
interim_folder = "../data/interim"
processed_folder = "../data/processed"
os.makedirs(processed_folder, exist_ok=True)

# Read the preprocessed base table (semicolon-separated, comma as decimal mark)
cleaned_df = pd.read_csv(
    os.path.join(interim_folder, "cleaned_df.csv"), sep=";", decimal=","
)

# Parse the timestamp column, then order the rows chronologically and
# renumber the index 0..n-1
cleaned_df = (
    cleaned_df
    .assign(timestamp=pd.to_datetime(cleaned_df["timestamp"]))
    .sort_values("timestamp")
    .reset_index(drop=True)
)

# Feature Engineering
## Tuned Lag Feature Engineering
To capture temporal dependencies and recurring consumption patterns, an extended set of lagged features is generated.  
Each lag represents the electricity load observed a specific number of hours before the current timestamp, enabling the model to learn intraday, daily, and seasonal dynamics.

- **Intraday lags (1–24 hours, excluding the 5-hour lag):** capture short-term dependencies and within-day load fluctuations.  
- **Multi-day lags (48–144 hours):** represent load values from two to six days earlier and capture short-term recurring patterns across days.  
- **Weekly and multi-week lags (168, 336, 672 hours):** account for weekly and longer periodic consumption cycles.  
- **Yearly lag (8760 hours):** captures annual seasonality by referencing the same hour of day 365 days earlier.

This tuned feature set provides richer temporal context while preserving full comparability with the baseline preprocessing pipeline.


In [23]:
# Intraday lags: hours 1..24, with lag 5 explicitly skipped
intraday_lags = [*range(1, 5), *range(6, 25)]

# Whole-day lags for 2..6 days back: 48, 72, 96, 120, 144 hours
# (1 day = 24 is part of intraday_lags, 7 days = 168 is part of seasonal_lags)
weekly_daily_lags = list(range(48, 145, 24))

# Weekly, two-week, four-week and 8760-hour lags
seasonal_lags = [168, 336, 672, 8760]

# Merge the three groups into one de-duplicated, ascending list
extended_lags = sorted({*intraday_lags, *weekly_daily_lags, *seasonal_lags})

# Add one lag_<n> column per entry, leaving any column that already exists untouched.
# NOTE(review): shift() offsets by row position, so lag_<n> equals "n hours
# earlier" only if the series has exactly one row per hour with no gaps or
# duplicates - confirm against the upstream preprocessing.
load_series = cleaned_df["load_MWh"]
for lag in extended_lags:
    col = f"lag_{lag}"
    if col in cleaned_df.columns:
        continue
    cleaned_df[col] = load_series.shift(lag)

## Adding Basic Calendar Features

To account for regular consumption patterns driven by time-based behavior,  
a set of simple calendar-related features is added to the dataset:

- **Hour:** represents the hour of the day (0–23) and captures daily load fluctuations such as morning and evening peaks.  
- **Weekday:** encodes the day of the week (0 = Monday, 6 = Sunday), allowing the model to differentiate between workdays and weekends.  
- **Is_weekend:** a binary indicator (1 = Saturday/Sunday, 0 = weekday) used to distinguish lower weekend demand from higher weekday consumption.

These features help the model learn typical daily and weekly cycles,  
which are essential for accurately forecasting electricity load patterns.

In [24]:
# Calendar features derived from the timestamp column
timestamp_parts = cleaned_df["timestamp"].dt
cleaned_df["hour"] = timestamp_parts.hour
cleaned_df["weekday"] = timestamp_parts.weekday  # 0 = Monday ... 6 = Sunday
# Weekend flag as 0/1: weekday codes 5 and 6 are Saturday and Sunday
weekend_mask = cleaned_df["weekday"] >= 5
cleaned_df["is_weekend"] = weekend_mask.astype(int)

### Removing Missing Values from Lagged Features

Lagged feature creation introduces missing values at the beginning of the dataset,  
as earlier timestamps do not have sufficient historical data to compute all lags  
— especially for long-term features such as the one-year lag (8760 hours).  

To ensure data consistency and prevent issues during model training,  
rows containing any missing values are removed.  
This operation effectively discards the initial portion of the dataset (approximately the first year),  
leaving only complete observations with a full set of lagged and calendar features.

The resulting dataset now contains clean, fully populated records  
that can be reliably used for model training, validation, and forecasting.

In [25]:
# Shifting leaves NaNs in the first rows of every lag column (as many rows as
# the lag itself); keep only rows without a missing value in any column and
# renumber the index from 0
complete_rows = cleaned_df.dropna(axis="index", how="any")
cleaned_df_extended = complete_rows.reset_index(drop=True)

print("Extended dataset shape:", cleaned_df_extended.shape)
cleaned_df_extended.head(3)

Extended dataset shape: (26300, 37)


Unnamed: 0,timestamp,load_MWh,lag_1,lag_2,lag_3,lag_4,lag_6,lag_7,lag_8,lag_9,...,lag_96,lag_120,lag_144,lag_168,lag_336,lag_672,lag_8760,hour,weekday,is_weekend
0,2022-01-01 01:00:00,41535.75,43915.5,45616.75,47461.5,48751.75,55785.75,58528.5,58442.5,55700.75,...,45682.0,44758.0,44051.5,41509.0,49242.25,54252.25,44569.25,1,5,1
1,2022-01-01 02:00:00,40480.75,41535.75,43915.5,45616.75,47461.5,51849.0,55785.75,58528.5,58442.5,...,45020.5,43949.25,43100.5,40378.0,48546.75,52329.5,42806.0,2,5,1
2,2022-01-01 03:00:00,39564.0,40480.75,41535.75,43915.5,45616.75,48751.75,51849.0,55785.75,58528.5,...,45397.0,44202.25,42820.5,40437.75,48432.0,51662.25,41049.75,3,5,1


## Train–Test Split

To prepare the dataset for model development and evaluation, the full time series is chronologically divided into a **training/validation set** and a **test set**.  
This ensures that future information is not used for model training, maintaining the temporal integrity of the forecasting task.

- **Training/Validation set:** Includes all data from January 2022 to December 2023.  
  These observations are used to train the forecasting models and tune their hyperparameters.  

- **Test set:** Contains data from January to December 2024.  
  This portion of the dataset is completely unseen during training and is used for the final model evaluation and recursive forecasting.


In [26]:
# Re-assert chronological order and datetime dtype so this cell does not rely
# on the state left behind by earlier cells
cleaned_df_extended = cleaned_df_extended.sort_values("timestamp").reset_index(drop=True)
cleaned_df_extended["timestamp"] = pd.to_datetime(cleaned_df_extended["timestamp"])

# First timestamp of the test set; everything strictly earlier is train/validation
test_start = "2024-01-01"

timestamps = cleaned_df_extended["timestamp"]
train_val_df_extended = cleaned_df_extended.loc[timestamps < test_start].copy()
test_df_extended = cleaned_df_extended.loc[timestamps >= test_start].copy()

# Report the covered period and size of each partition
train_val_times = train_val_df_extended["timestamp"]
test_times = test_df_extended["timestamp"]
print(f"Extended Train/Validation period: {train_val_times.min()} → {train_val_times.max()}")
print(f"Extended Test period:            {test_times.min()} → {test_times.max()}")
print(f"Extended Train/Validation shape: {train_val_df_extended.shape}")
print(f"Extended Test shape:             {test_df_extended.shape}")

# Write both partitions to the processed folder (created if missing)
processed_folder = "../data/processed"
os.makedirs(processed_folder, exist_ok=True)

train_val_path = os.path.join(processed_folder, "train_val_df_extended.csv")
test_path = os.path.join(processed_folder, "test_df_extended.csv")

# Same CSV dialect as the input file: semicolon separator, comma decimal mark
csv_format = {"sep": ";", "decimal": ",", "index": False}
train_val_df_extended.to_csv(train_val_path, **csv_format)
test_df_extended.to_csv(test_path, **csv_format)

print(f"Saved extended_train_val_df to: {train_val_path}")
print(f"Saved extended_test_df to:{test_path}")

Extended Train/Validation period: 2022-01-01 01:00:00 → 2023-12-31 23:00:00
Extended Test period:            2024-01-01 00:00:00 → 2024-12-31 23:00:00
Extended Train/Validation shape: (17517, 37)
Extended Test shape:             (8783, 37)
Saved extended_train_val_df to: ../data/processed/train_val_df_extended.csv
Saved extended_test_df to:../data/processed/test_df_extended.csv
