In [22]:
import sys
from pathlib import Path

ROOT = Path.cwd().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.append(str(SRC))

import numpy as np
import pandas as pd

from energy_forecast.evaluate import root_mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from energy_forecast.split import walk_forward_splits
from energy_forecast.features import add_lag_features, add_rolling_features


In [16]:
df = pd.read_csv(ROOT / "data" / "Energy Production Dataset.csv")  
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

TARGET = "Production"
TIME_COL = "Date"

df.shape, df.head()


((51864, 9),
         Date  Start_Hour  End_Hour Source  Day_of_Year   Day_Name Month_Name  \
 0 2020-01-01          13        14  Solar            1  Wednesday    January   
 1 2020-01-01          21        22   Wind            1  Wednesday    January   
 2 2020-01-01          20        21   Wind            1  Wednesday    January   
 3 2020-01-01           6         7   Wind            1  Wednesday    January   
 4 2020-01-01          19        20   Wind            1  Wednesday    January   
 
    Season  Production  
 0  Winter        2179  
 1  Winter        1228  
 2  Winter        1268  
 3  Winter        2293  
 4  Winter        1181  )

In [4]:
# --- Load dataset ---
df = pd.read_csv(ROOT / "data" / "Energy Production Dataset.csv")

df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

TARGET = "Production"

print("Shape:", df.shape)
df.head()


Shape: (51864, 9)


Unnamed: 0,Date,Start_Hour,End_Hour,Source,Day_of_Year,Day_Name,Month_Name,Season,Production
0,2020-01-01,13,14,Solar,1,Wednesday,January,Winter,2179
1,2020-01-01,21,22,Wind,1,Wednesday,January,Winter,1228
2,2020-01-01,20,21,Wind,1,Wednesday,January,Winter,1268
3,2020-01-01,6,7,Wind,1,Wednesday,January,Winter,2293
4,2020-01-01,19,20,Wind,1,Wednesday,January,Winter,1181


In [17]:
df_feat = add_lag_features(df.copy(), target_col=TARGET, time_col=TIME_COL, lags=(1, 24))
df_feat = add_rolling_features(df_feat, target_col=TARGET, time_col=TIME_COL, windows=(24,))

# Lag/rolling create NaNs at the beginning; we must drop them
df_feat = df_feat.dropna().reset_index(drop=True)

print("Original:", df.shape)
print("After features + dropna:", df_feat.shape)

# sanity check: these columns should exist
cols_check = [f"{TARGET}_lag_1", f"{TARGET}_lag_24", f"{TARGET}_rollmean_24"]
print(df_feat[cols_check].head())


Original: (51864, 9)
After features + dropna: (51840, 12)
   Production_lag_1  Production_lag_24  Production_rollmean_24
0            1775.0              950.0             1967.583333
1            1267.0             2725.0             1929.583333
2            4298.0             2308.0             2060.208333
3            6857.0             1955.0             2294.750000
4            3497.0             2617.0             2387.625000


In [18]:
folds = walk_forward_splits(
    df_feat,
    time_col=TIME_COL,
    initial_train_frac=0.50,
    val_frac=0.10,
    n_folds=4
)

[(f.fold, len(f.train), len(f.val)) for f in folds]


[(1, 25920, 5184), (2, 31104, 5184), (3, 36288, 5184), (4, 41472, 5184)]

In [23]:
def make_hgb_pipeline(X: pd.DataFrame):
    cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
    num_cols = [c for c in X.columns if c not in cat_cols]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", "passthrough", num_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
        ],
        remainder="drop"
    )

    model = HistGradientBoostingRegressor(
        random_state=42,
        learning_rate=0.05,
        max_depth=8,
        max_iter=500
    )

    return Pipeline(steps=[
        ("prep", preprocessor),
        ("model", model)
    ])


In [25]:
results = []

for f in folds:
    train_df = f.train
    val_df = f.val

    X_train = train_df.drop(columns=[TARGET, TIME_COL])
    y_train = train_df[TARGET]

    X_val = val_df.drop(columns=[TARGET, TIME_COL])
    y_val = val_df[TARGET]

    pipe = make_hgb_pipeline(X_train)
    pipe.fit(X_train, y_train)

    preds = pipe.predict(X_val)
    rmse = root_mean_squared_error(y_val, preds)

    results.append({
        "fold": f.fold,
        "train_rows": len(train_df),
        "val_rows": len(val_df),
        "rmse": rmse
    })

results_df = pd.DataFrame(results)
results_df
print("Mean RMSE:", results_df["rmse"].mean())
print("Std RMSE:", results_df["rmse"].std())


Mean RMSE: 2313.870373780608
Std RMSE: 175.62713952142477


In [26]:
print("Mean RMSE:", results_df["rmse"].mean())
print("Std RMSE:", results_df["rmse"].std())
print("Best fold RMSE:", results_df["rmse"].min())
print("Worst fold RMSE:", results_df["rmse"].max())


Mean RMSE: 2313.870373780608
Std RMSE: 175.62713952142477
Best fold RMSE: 2113.378633215558
Worst fold RMSE: 2489.758458441334
