### imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from dataclasses import dataclass
from typing import Iterator, Optional
import random
from datetime import datetime
import lightgbm as lgb

### load

In [None]:
# Load the raw dataset from the working directory. Later cells read the
# "time", "location", "consumption" and "temperature" columns from it.
df = pd.read_csv('consumption_temp.csv')
# Notebook preview of the first five rows.
df.head(5)

# Prepare the data

### Preprocessing

In [None]:
def preprocess_data(df, city, starting_date):
    """Return the rows of one city as a time-indexed, chronologically sorted frame.

    The "time" column is parsed into a "datetime" column, rows are kept when
    their "location" matches `city` (comparison ignores case and surrounding
    whitespace), "datetime" becomes the index, the frame is sorted by it and
    everything before `starting_date` is cut off. The input frame is not
    modified.
    """
    wanted_city = city.strip().lower()

    # Parse timestamps on the whole frame first; `assign` returns a new frame.
    stamped = df.assign(datetime=pd.to_datetime(df["time"]))

    normalized_location = stamped["location"].astype(str).str.strip().str.lower()
    city_rows = stamped.loc[normalized_location == wanted_city]

    by_time = city_rows.set_index("datetime").sort_index()

    # Label slicing on the sorted index keeps starting_date itself and later.
    return by_time.loc[pd.Timestamp(starting_date):]


In [None]:
# Keep only the Bergen rows from 2022-08-01 onward, indexed and sorted by timestamp.
location_df = preprocess_data(df, city='bergen', starting_date='2022-08-01') 
# Notebook preview of the first five rows.
location_df.head(5)

### Feature engineering

In [None]:
def add_time_features(
    df: pd.DataFrame,
    *,
    target_col: str = "consumption",
    temp_col: str | None = "temperature",
    delay_days: int = 5,
    drop_cols: list[str] | None = None,
    add_cyclical: bool = True,
    add_consumption_lags: bool = True,
    add_consumption_rollings: bool = True,
    cons_lags: tuple[int, ...] = (120, 168),
    cons_rolling_windows: tuple[int, ...] = (168, 336),
    temp_lags: tuple[int, ...] = (24, 168),
    temp_rolling_windows: tuple[int, ...] = (24, 168),
) -> pd.DataFrame:
    """
    Feature engineering for hourly day-ahead. 5 day data delay on consumption.

    Returns a NEW dataframe.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("DataFrame index must be a DatetimeIndex.")

    out = df.copy()
    delay_h = int(delay_days * 24)

    # Basic calendar features (used for cyclical encodings)
    hour = out.index.hour
    dow = out.index.dayofweek
    month = out.index.month
    woy = out.index.isocalendar().week.astype(int)

    out["is_weekend"] = (dow >= 5).astype(int)
    out["hour"] = hour
    out["day_of_week"] = dow
    out["month"] = month
    out["week_of_year"] = woy

    # Cyclical encodings
    if add_cyclical:
        out["hour_sin"] = np.sin(2 * np.pi * hour / 24.0)
        out["hour_cos"] = np.cos(2 * np.pi * hour / 24.0)

        out["dow_sin"] = np.sin(2 * np.pi * dow / 7.0)
        out["dow_cos"] = np.cos(2 * np.pi * dow / 7.0)

        out["month_sin"] = np.sin(2 * np.pi * (month - 1) / 12.0)
        out["month_cos"] = np.cos(2 * np.pi * (month - 1) / 12.0)

        out["woy_sin"] = np.sin(2 * np.pi * (woy - 1) / 52.0)
        out["woy_cos"] = np.cos(2 * np.pi * (woy - 1) / 52.0)

    # Temperature features 
    out[temp_col] = out[temp_col].astype(float)

    for k in temp_lags:
        out[f"temp_lag_{k}"] = out[temp_col].shift(k)

    shifted_temp = out[temp_col].shift(1)
    for w in temp_rolling_windows:
        out[f"temp_roll_mean_{w}"] = shifted_temp.rolling(window=w, min_periods=w).mean()
        out[f"temp_roll_std_{w}"]  = shifted_temp.rolling(window=w, min_periods=w).std()

    out["heating_degree_18"] = np.maximum(0.0, 18.0 - out[temp_col])

    # Consumption lag features, available with 5 day delay
    if add_consumption_lags:
        for k in cons_lags:
            out[f"lag_{k}"] = out[target_col].shift(k)

    # Consumption rolling stats
    if add_consumption_rollings:
        avail = out[target_col].shift(delay_h)
        for w in cons_rolling_windows:
            out[f"avail_roll_mean_{w}"] = avail.rolling(window=w, min_periods=w).mean()
            out[f"avail_roll_std_{w}"]  = avail.rolling(window=w, min_periods=w).std()

    # Drop redundant raw time columns
    out = out.drop(columns=[c for c in ["hour", "day_of_week", "month", "week_of_year"] if c in out.columns])

    # Optional drops
    if drop_cols:
        out = out.drop(columns=[c for c in drop_cols if c in out.columns])

    return out

In [None]:
# Feature engineering cell: build the model table for the selected city and
# keep only rows where every lag/rolling feature is defined.
data = add_time_features(
    location_df.copy(),
    target_col="consumption",
    temp_col="temperature",
    delay_days=5,
    # names that are not columns of the frame are ignored by add_time_features
    drop_cols=["time", "location", "datetime"],
    # both lags are at least 120 rows, i.e. safe under the 5-day data delay
    cons_lags=(120, 168),
    cons_rolling_windows=(168, 336),
    temp_lags=(24, 168),
    temp_rolling_windows=(24, 168),
).dropna()  # lag/rolling features are NaN until their windows are filled

data.head(5)

### Split the data

In [None]:
@dataclass
class SlidingSplit:
    """Train/validation/test data and window boundaries for one forecast day."""

    # Day being forecast (equal to test_start).
    forecast_date: pd.Timestamp
    # Window boundaries (useful for logging/debug); every *_end is exclusive.
    train_start: pd.Timestamp
    train_end: pd.Timestamp
    val_start: pd.Timestamp
    val_end: pd.Timestamp
    test_start: pd.Timestamp
    test_end: pd.Timestamp
    # Data: feature frames and target series for each window.
    X_train: pd.DataFrame
    y_train: pd.Series
    X_val: pd.DataFrame
    y_val: pd.Series
    X_test: pd.DataFrame
    y_test: pd.Series


def iter_sliding_day_ahead_splits(
    df: pd.DataFrame,
    target_col: str,
    feature_cols: list[str],
    *,
    last_incomplete_day: str = "2023-04-02",
    last_complete_day: str = "2023-04-01",
    test_days: int = 14,
    history_days: int = 180, # train+val horizon (sliding)
    val_days: int = 14, # inside the history window
) -> Iterator[SlidingSplit]:
    """Yield sliding train/validation/test splits for day-ahead forecasting.

    - Rows from `last_incomplete_day` onward, and rows after the end of
      `last_complete_day`, are discarded.
    - The test period is the last `test_days` days ending on
      `last_complete_day` (inclusive).
    - For each test day D:
        * history window = [D - history_days, D), clipped to the first day
          present in the data
        * validation = last `val_days` of the history window
        * training = the part of the history window before validation
        * test = [D, D + 1 day)
    - Every window is half-open, so all rows with start <= timestamp < end
      are included regardless of the sampling frequency.
    - Days for which any of the three windows has no rows are skipped.

    Args:
        df: Frame with a `DatetimeIndex`; it is sorted internally.
        target_col: Name of the target column.
        feature_cols: Names of the feature columns.
        last_incomplete_day: First day to exclude entirely.
        last_complete_day: Last day that may be used as a test day.
        test_days: Number of test days.
        history_days: Length of the train+validation window in days.
        val_days: Length of the validation window in days.

    Yields:
        One `SlidingSplit` per test day, in chronological order.

    Raises:
        TypeError: If ``df.index`` is not a `DatetimeIndex` (raised when the
            generator is first advanced).
    """

    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError("df.index must be a DatetimeIndex")

    df = df.sort_index()

    tz = df.index.tz

    def _ts(s: str) -> pd.Timestamp:
        """Parse `s` as midnight of that day in the timezone of the index."""
        t = pd.Timestamp(s)
        if tz is not None:
            t = t.tz_localize(tz) if t.tzinfo is None else t.tz_convert(tz)
        return t.normalize()

    last_incomplete = _ts(last_incomplete_day)
    last_complete = _ts(last_complete_day)
    one_day = pd.Timedelta(days=1)

    # 1) Exclude the incomplete last day and 2) anything after the last
    #    complete day; the earlier of the two cut-offs wins.
    cutoff = min(last_incomplete, last_complete + one_day)
    df = df.loc[df.index < cutoff]

    if df.empty:
        return

    index = df.index
    first_available_day = index.min().normalize()

    def _window(start: pd.Timestamp, end: pd.Timestamp) -> pd.DataFrame:
        """Rows with start <= timestamp < end."""
        return df.loc[(index >= start) & (index < end)]

    # 3) Define test window days (inclusive)
    test_start_day = last_complete - pd.Timedelta(days=test_days - 1)
    test_days_range = pd.date_range(test_start_day, last_complete, freq="D", tz=tz)

    # NOTE(review): windows are built by adding fixed 24h Timedeltas; on a
    # tz-aware index that crosses a DST change the boundaries may drift from
    # local midnight by an hour — confirm whether that matters for this data.
    for D in test_days_range:
        test_start = D
        test_end = D + one_day

        # history window immediately before D, clipped to available data
        hist_end = test_start
        hist_start = max(hist_end - pd.Timedelta(days=history_days), first_available_day)

        # validation = last val_days inside history
        val_end = hist_end
        val_start = max(val_end - pd.Timedelta(days=val_days), hist_start)

        train_start = hist_start
        train_end = val_start  # exclusive

        train_slice = _window(train_start, train_end)
        val_slice = _window(val_start, val_end)
        test_slice = _window(test_start, test_end)

        # Safety: skip if missing coverage
        if train_slice.empty or val_slice.empty or test_slice.empty:
            continue

        yield SlidingSplit(
            forecast_date=D,
            train_start=train_start, train_end=train_end,
            val_start=val_start, val_end=val_end,
            test_start=test_start, test_end=test_end,
            X_train=train_slice[feature_cols],
            y_train=train_slice[target_col],
            X_val=val_slice[feature_cols],
            y_val=val_slice[target_col],
            X_test=test_slice[feature_cols],
            y_test=test_slice[target_col],
        )


# Every column except the target is used as a model feature.
target_col = "consumption"
feature_cols = [c for c in data.columns if c != target_col]

# Materialise all splits so they can be inspected and reused by later cells.
splits = list(iter_sliding_day_ahead_splits(
    data,
    target_col=target_col,
    feature_cols=feature_cols,
    last_incomplete_day="2023-04-02",
    last_complete_day="2023-04-01",
    test_days=14,
    history_days=180,
    val_days=14,
))

print("Num test days:", len(splits))

# The generator skips days without data, so the list can be empty; fail with
# an explanation instead of an IndexError on splits[0].
if not splits:
    raise RuntimeError(
        "No sliding splits were produced; check that `data` covers the test "
        "window and the history before it."
    )

first_split, last_split = splits[0], splits[-1]
print("Test window:", first_split.forecast_date.date(), "->", last_split.forecast_date.date())
# train_end and val_end are exclusive boundaries, printed as-is.
print("First split train/val/test:",
      first_split.train_start.date(), "->", first_split.train_end.date(),
      "|", first_split.val_start.date(), "->", first_split.val_end.date(),
      "| test day", first_split.forecast_date.date())
