# Cell 0 — (optional) installs

In [None]:
# If using Colab, uncomment as needed. Keep them separate to avoid version conflicts.
!pip -q install xgboost==1.7.6
!pip -q install pmdarima
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip -q install pytorch-lightning==2.2.5
!pip -q install pytorch-forecasting==1.0.0  # (optional TFT; may be heavy)


[31mERROR: Ignored the following versions that require a different python version: 0.10.2 Requires-Python >=3.8,<3.11; 0.10.3 Requires-Python >=3.8,<3.11; 1.0.0 Requires-Python >=3.8,<3.11[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement pytorch-forecasting==1.0.0 (from versions: 0.1.0, 0.1.1, 0.1.2, 0.2.0, 0.2.1, 0.2.2, 0.2.3, 0.2.4, 0.3.0, 0.3.1, 0.4.0, 0.4.1, 0.5.0, 0.5.1, 0.5.2, 0.5.3, 0.6.0, 0.6.1, 0.7.0, 0.7.1, 0.8.0, 0.8.1, 0.8.2, 0.8.3, 0.8.4, 0.8.5, 0.9.0, 0.9.1, 0.9.2, 0.10.0, 0.10.1, 1.1.0, 1.1.1, 1.2.0, 1.3.0, 1.4.0)[0m[31m
[0m[31mERROR: No matching distribution found for pytorch-forecasting==1.0.0[0m[31m
[0m

# Cell 1 — Imports, config, seeding

In [None]:
import os, importlib, math, gc, warnings
import numpy as np, pandas as pd
from dataclasses import dataclass
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# Reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
pyrandom = importlib.import_module("random"); pyrandom.seed(RANDOM_STATE)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it here is
# expected to affect only child processes — confirm if hash-order determinism matters.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)

# Mount Google Drive when running on Colab. Only the missing-module case is ignored, so a real
# mount failure is reported here instead of surfacing later as a missing CSV file.
try:
    from google.colab import drive
except ImportError:
    pass  # not Colab
else:
    drive.mount('/content/drive')

# Paths & columns
CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/renewables_outputs/merged_by_continent/refined_df.csv"
TS_COL   = "timestamp"
SITE_COL = "site"

# Detect targets by name (adjust if yours differ)
SOLAR_TARGET = None
WIND_TARGET  = None

# Time features to auto-build
CAL_FEATURES = ["hour","dow","month","is_weekend"]
# Candidate flag columns; the data-loading cell narrows this to the columns that actually exist.
FLAG_FEATURES = ["is_daylight","solar_struct_zero","wind_near_zero"]

# RQ knobs
HORIZONS_MINUTES = [15, 60, 360, 1440]   # 15-min, 1h, 6h, 24h
N_SPLITS = 5
GAP_HOURS = 24

# XGB (traditional)
XGB_GRID = [
    dict(n_estimators=200, max_depth=5, learning_rate=0.12),
    dict(n_estimators=350, max_depth=6, learning_rate=0.10),
]
XGB_COMMON = dict(subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
                  tree_method="hist", n_jobs=-1, random_state=RANDOM_STATE, verbosity=0)

# Deep model sizes (keep tiny; you can scale later)
IN_STEPS = 48          # history window length (adjust per freq)
BATCH_SZ = 512
EPOCHS   = 8
LR       = 1e-3

warnings.filterwarnings("ignore")


Mounted at /content/drive


# Cell 2 — Load, parse, feature basics, frequency check

In [None]:
# Cell 2 — Load + robust detection of timestamp/site/targets + calendar flags

import pandas as pd
import numpy as np

df = pd.read_csv(CSV_PATH)

# ---------- 1) Detect timestamp column (case-insensitive, common aliases)
TS_COL = None
name_map = {c.lower(): c for c in df.columns}
ts_candidates = [
    "timestamp","datetime","date_time","utc_timestamp","time_utc","time","date",
    "period_start","period_end","start_time","dt","ts"
]
for k in ts_candidates:
    if k in name_map:
        TS_COL = name_map[k]
        break

# Fallback: try columns that can be parsed as datetime.
# Numeric/boolean columns are skipped on purpose: pd.to_datetime accepts plain numbers and
# interprets them as epoch offsets, so a column such as a latitude would "parse" successfully
# and collapse the whole time axis to a few nanoseconds around 1970.
if TS_COL is None:
    dt_like = []
    for c in df.columns:
        col = df[c]
        if pd.api.types.is_datetime64_any_dtype(col):
            dt_like.append(c)
            continue
        if pd.api.types.is_numeric_dtype(col) or pd.api.types.is_bool_dtype(col):
            continue
        sample = col.dropna().head(100)
        if sample.empty:
            continue
        try:
            pd.to_datetime(sample, utc=True, errors="raise")
        except Exception:
            continue  # probe only: any parse failure means "not a datetime column"
        dt_like.append(c)
    if dt_like:
        # Prefer a column whose name hints at time over an arbitrary parseable one.
        named = [c for c in dt_like if any(tok in str(c).lower() for tok in ("time", "date"))]
        TS_COL = (named or dt_like)[0]

if TS_COL is None:
    raise ValueError(
        "Could not find a timestamp column. Please rename your time column to 'timestamp' "
        "or one of: " + ", ".join(ts_candidates)
    )

# Parse timestamp, drop unparseable, sort
df[TS_COL] = pd.to_datetime(df[TS_COL], utc=True, errors="coerce")
df = df.dropna(subset=[TS_COL]).sort_values(TS_COL).reset_index(drop=True)
if df.empty:
    raise ValueError(f"Column '{TS_COL}' contains no parseable timestamps.")

# ---------- 2) Detect site column
SITE_COL = None
site_candidates = [
    "site","site_id","plant","plant_id","station","park","farm","region","location",
    "continent_site","area","country_site"
]
for k in site_candidates:
    if k in name_map:
        SITE_COL = name_map[k]
        break

# Heuristic fallback: pick an object/categorical column with reasonable cardinality
if SITE_COL is None:
    obj_cols = [c for c in df.columns if df[c].dtype == "object"]
    best = None; best_n = 0
    for c in obj_cols:
        n = df[c].nunique(dropna=True)
        if 2 <= n <= 5000 and n > best_n:
            best, best_n = c, n
    SITE_COL = best if best is not None else "_site_fallback"
    if SITE_COL == "_site_fallback":
        # Create a single global site if nothing usable exists
        df[SITE_COL] = "global"

print(f"Detected TS_COL='{TS_COL}', SITE_COL='{SITE_COL}'")

# ---------- 3) Calendar features
df["hour"]  = df[TS_COL].dt.hour
df["dow"]   = df[TS_COL].dt.dayofweek
df["month"] = df[TS_COL].dt.month
df["is_weekend"] = (df["dow"] >= 5).astype(int)

# ---------- 4) Detect targets (solar/wind), preferring MW columns
SOLAR_TARGET = None
WIND_TARGET  = None

def _pick_target(patterns):
    """Return the best-matching numeric target column of the global ``df``, or None.

    A column is a candidate when its lower-cased name contains any of ``patterns``
    and its dtype is numeric (not boolean). Text and flag-like boolean columns are
    skipped because they cannot be used as a regression target. Candidates are
    scored: +3 if the name contains 'mw', +2 if it contains 'power' or 'gen'.
    Ties are broken by column name in reverse lexical order.
    """
    ranked = []
    for c in df.columns:
        lc = str(c).lower()
        if not any(p in lc for p in patterns):
            continue
        col = df[c]
        if not pd.api.types.is_numeric_dtype(col) or pd.api.types.is_bool_dtype(col):
            continue
        score = 0
        score += 3 if "mw"   in lc else 0
        score += 2 if "power" in lc or "gen" in lc else 0
        ranked.append((score, c))
    if not ranked:
        return None
    ranked.sort(reverse=True)
    return ranked[0][1]

# Resolve both forecasting targets by name pattern (solar first, then wind) and report them.
SOLAR_TARGET, WIND_TARGET = (
    _pick_target(name_patterns) for name_patterns in (["solar","pv"], ["wind"])
)

print(f"Detected targets: SOLAR_TARGET='{SOLAR_TARGET}', WIND_TARGET='{WIND_TARGET}'")

# ---------- 5) Frequency detection (in minutes)
def _freq_minutes(series):
    """Estimate the sampling interval of a datetime Series, in whole minutes.

    The interval is the median gap between consecutive *distinct* timestamps.
    Duplicates are removed first: when several sites share the same timestamps,
    most consecutive gaps in the pooled series are zero and the median would
    otherwise collapse to the 1-minute floor.

    Returns 60 when fewer than two distinct timestamps exist; never returns less than 1.
    """
    stamps = series.dropna().drop_duplicates().sort_values()
    d = stamps.diff().dropna().dt.total_seconds()
    if d.empty:
        return 60  # fallback
    # use median to be robust to occasional gaps
    return int(max(1, round(np.median(d) / 60)))

BASE_MINUTES = _freq_minutes(df[TS_COL])
print(f"Detected base sampling interval ≈ {BASE_MINUTES} minutes.")

def horizon_to_steps(mins: int) -> int | None:
    """Convert a horizon in minutes to a number of base steps.

    Returns None when ``mins`` is not an exact multiple of BASE_MINUTES.
    """
    steps, leftover = divmod(mins, BASE_MINUTES)
    if leftover != 0:
        return None
    return steps

# Requested horizon (minutes) -> steps at the detected frequency (None = unsupported)
HORIZON_STEPS = dict(zip(HORIZONS_MINUTES, map(horizon_to_steps, HORIZONS_MINUTES)))
print("Horizon steps (None = not supported at this frequency):", HORIZON_STEPS)

# ---------- 6) Keep only the flag columns that are present in the loaded data
FLAG_FEATURES = list(filter(
    lambda flag: flag in df.columns,
    ["is_daylight","solar_struct_zero","wind_near_zero"],
))

# (Leave CAL_FEATURES from Cell 1 as-is: ["hour","dow","month","is_weekend"])


Detected TS_COL='lat', SITE_COL='site_id'
Detected targets: SOLAR_TARGET='target_solar_mw', WIND_TARGET='target_wind_mw'
Detected base sampling interval ≈ 1 minutes.
Horizon steps (None = not supported at this frequency): {15: 15, 60: 60, 360: 360, 1440: 1440}


# Cell 3 — Utilities: blocked splits with gap, metrics, baselines

In [None]:
def blocked_splits_by_time(df_in: pd.DataFrame, n_splits:int, gap_hours:int=24):
    """Yield expanding-window (train_times, test_times) pairs over the unique timestamps.

    The sorted unique values of ``df_in[TS_COL]`` are cut into ``n_splits`` consecutive
    blocks. For each block after the first, the test set is that block and the train
    set is every earlier timestamp that lies at least ``gap_hours`` before the first
    test timestamp. A pair is yielded even when its train side ends up empty.
    """
    ordered = np.array(sorted(df_in[TS_COL].unique()))
    chunks = np.array_split(ordered, n_splits)
    for k in range(1, n_splits):
        test_times = chunks[k]
        train_times = np.concatenate(chunks[:k])
        if len(train_times) > 0 and len(test_times) > 0:
            # enforce the embargo between the end of train and the start of test
            latest_allowed = test_times.min() - pd.Timedelta(hours=gap_hours)
            train_times = train_times[train_times <= latest_allowed]
        yield train_times, test_times

def rmse(y, yhat):
    """Root-mean-squared error between ``y`` and ``yhat`` as a Python float."""
    mse = mean_squared_error(y, yhat)
    return float(np.sqrt(mse))

def mae(y, yhat):
    """Mean absolute error between ``y`` and ``yhat`` as a Python float."""
    abs_err = mean_absolute_error(y, yhat)
    return float(abs_err)

def eval_metrics(y_true, y_pred):
    """Compute RMSE and MAE over the positions where neither input is NaN.

    Returns ``{"RMSE": ..., "MAE": ...}``; both values are NaN when no valid pair remains.
    """
    actual = np.asarray(y_true, float)
    forecast = np.asarray(y_pred, float)
    usable = ~(np.isnan(actual) | np.isnan(forecast))
    if not usable.any():
        return dict(RMSE=np.nan, MAE=np.nan)
    actual, forecast = actual[usable], forecast[usable]
    return dict(RMSE=rmse(actual, forecast), MAE=mae(actual, forecast))

# Aligned baselines for target y_{t+H}. `series` holds the observed value at the forecast
# origin t (a Series, or a per-site SeriesGroupBy); the result at row t is the forecast of t+H.
def seasonal_naive(series, H_steps, season_steps):
    """Seasonal-naive forecast of y_{t+H}: the value one or more whole seasons before t+H.

    The lag is the smallest multiple of ``season_steps`` that is >= ``H_steps``, so
    the forecast only uses values observed at or before t. For
    ``season_steps >= H_steps`` this equals ``series.shift(season_steps - H_steps)``.
    A non-positive ``season_steps`` yields an all-NaN result (no valid baseline).
    """
    if season_steps <= 0:
        return series.shift(0) * np.nan
    cycles = max(1, -(-H_steps // season_steps))  # ceil(H / season), at least one season
    return series.shift(cycles * season_steps - H_steps)

def persistence(series):
    """Persistence forecast of y_{t+H}: the value observed at the origin t."""
    return series.shift(0)


# Cell 4 — Supervised builder (trees) and sequence builder (DL)

In [None]:
def make_supervised_tree(df_task, target_col, H_steps, lags=(1,2,3,6,12,24,48,72)):
    """Build the tabular supervised frame used by the tree models.

    Per site, adds ``lag_<L>`` columns (target shifted back by L rows) and ``y``
    (target shifted forward by ``H_steps`` rows), then drops rows where any feature
    or ``y`` is missing. Rows are ordered by (site, time) before shifting, so the
    result does not depend on the caller's row order.
    NOTE(review): shifts are by row count, which equals a time offset only if each
    site is sampled regularly without gaps — confirm for your data.

    Returns ``(frame, feature_cols, "y", numeric_cols, categorical_cols)``. ``frame``
    is sorted by time, keeps the input index labels and contains each column exactly
    once: TS_COL, SITE_COL, the remaining features, then ``y``.
    """
    g = df_task.sort_values([SITE_COL, TS_COL], kind="stable").copy()
    per_site = g[target_col].groupby(g[SITE_COL])
    # lag features
    for L in lags:
        g[f"lag_{L}"] = per_site.shift(L)
    # calendar + flags already in df
    # target at horizon
    g["y"] = per_site.shift(-H_steps)
    feat_cols = ([c for c in g.columns if c.startswith("lag_")]
                 + CAL_FEATURES
                 + [c for c in FLAG_FEATURES if c in g.columns]
                 + [SITE_COL])
    g = g.dropna(subset=feat_cols + ["y"])
    num_cols = [c for c in feat_cols if c != SITE_COL]
    cat_cols = [SITE_COL]
    Xc, yc = feat_cols, "y"
    # SITE_COL is already part of feat_cols; list it once so the frame has unique column labels.
    keep = [TS_COL, SITE_COL] + num_cols + [yc]
    return g[keep].sort_values(TS_COL), Xc, yc, num_cols, cat_cols

# -------- DL sequence dataset (CNN-LSTM)
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

class SeqDataset(Dataset):
    """Sliding-window dataset over per-site, time-ordered rows.

    For a site with rows ordered by TS_COL and a window end position ``end``, sample i is
    ``(features[end-in_steps:end], site_id, target[end + H_steps])``.
    Returned tensors: float32 ``[in_steps, len(num_cols)]``, long ``[1]``, float32 ``[1]``.

    Feature and target arrays are extracted once per site in ``__init__`` so that
    ``__getitem__`` is a cheap slice instead of a filter + sort of the whole frame.
    ``idx`` holds ``(site, end_position)`` pairs, where the position refers to the
    site's time-sorted rows.
    """
    def __init__(self, df_task, target_col, H_steps, in_steps=48, num_cols=None, cat_site=True):
        self.df = df_task
        self.target_col = target_col
        self.H = H_steps
        self.in_steps = in_steps
        self.num_cols = num_cols or []
        self.cat_site = cat_site

        # site encoding (order of first appearance in df_task)
        self.sites = {s:i for i,s in enumerate(df_task[SITE_COL].unique())}
        self.idx = []    # (site, end_pos)
        self._feat = {}  # site -> float32 array [n_rows, n_features]
        self._tgt = {}   # site -> float32 array [n_rows]
        cols = list(self.num_cols)
        for s, g in df_task.groupby(SITE_COL):
            g = g.sort_values(TS_COL)
            self._feat[s] = g[cols].to_numpy(dtype=np.float32)
            self._tgt[s] = g[target_col].to_numpy(dtype=np.float32)
            for end in range(in_steps, len(g)-H_steps):
                self.idx.append((s, end))

    def __len__(self): return len(self.idx)

    def __getitem__(self, i):
        s, end = self.idx[i]
        seq = self._feat[s][end-self.in_steps:end]
        y   = self._tgt[s][end + self.H]
        site_id = self.sites[s]
        return torch.from_numpy(seq), torch.tensor([site_id], dtype=torch.long), torch.tensor([y], dtype=torch.float32)

class CNNLSTM(nn.Module):
    """Conv1d -> LSTM encoder joined with a site embedding, followed by an MLP head.

    forward(x_seq, x_site): ``x_seq`` is [B, T, F] and ``x_site`` is [B, 1] (long);
    the output is [B, out].
    """
    def __init__(self, n_features, n_sites, hid=64, lstm=64, out=1):
        super().__init__()
        site_dim = 16
        self.conv = nn.Conv1d(in_channels=n_features, out_channels=hid, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(input_size=hid, hidden_size=lstm, batch_first=True)
        self.site_emb = nn.Embedding(n_sites, site_dim)
        self.head = nn.Sequential(
            nn.Linear(lstm + site_dim, 64),
            nn.ReLU(),
            nn.Linear(64, out),
        )

    def forward(self, x_seq, x_site):
        # Conv1d works on [B, F, T]; the LSTM (batch_first) wants [B, T, hid] back
        conv_out = self.conv(x_seq.permute(0, 2, 1))
        steps = self.relu(conv_out).permute(0, 2, 1)
        _, (hidden, _) = self.lstm(steps)
        last_hidden = hidden[-1]                          # [B, lstm]
        site_vec = self.site_emb(x_site.squeeze(1))       # [B, 16]
        return self.head(torch.cat((last_hidden, site_vec), dim=1))


# Cell 5 — Traditional track: XGB (blocked CV, pooled with weights)

In [None]:
def build_preprocessor(num_cols, cat_cols):
    """Build the ColumnTransformer used ahead of XGBoost.

    Numeric columns: median imputation, then scaling without centering.
    Categorical columns: most-frequent imputation, then sparse one-hot encoding that
    ignores unseen categories. All other columns are dropped; output stays sparse.
    """
    encoder_opts = dict(handle_unknown="ignore")
    try:
        encoder = OneHotEncoder(sparse_output=True, **encoder_opts)
    except TypeError:
        # this OneHotEncoder does not accept `sparse_output`; use the `sparse` keyword instead
        encoder = OneHotEncoder(sparse=True, **encoder_opts)

    numeric_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc",  StandardScaler(with_mean=False)),
    ])
    categorical_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", encoder),
    ])
    transformers = [
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
    return ColumnTransformer(transformers, remainder="drop", sparse_threshold=1.0)

def run_xgb(df_task, target_col, H_steps):
    """Blocked time-series CV of a pooled XGBoost model against naive baselines.

    Per fold (from ``blocked_splits_by_time``):
      * hyper-parameters from XGB_GRID are chosen on the last ~20% of the *training*
        timestamps (never on the test block), then the winner is refit on all training rows;
      * rows are weighted inversely to their site's row count so every site contributes equally;
      * baselines (persistence, daily and weekly seasonal-naive) are built from the observed
        target at the forecast origin t, not from the label ``y`` itself, so they cannot
        trivially reproduce the label.

    Returns a DataFrame with one row per evaluated fold:
    fold, horizon_steps, XGB_RMSE, XGB_MAE, BestBase_RMSE, BestBase_MAE.
    Folds with fewer than 500 train rows or 200 test rows are skipped.
    """
    # Canonical per-site chronological order and a unique index, so the supervised frame
    # and the baselines can be aligned by index label.
    df_task = df_task.sort_values([SITE_COL, TS_COL], kind="stable").reset_index(drop=True)

    g, Xc, yc, num_cols, cat_cols = make_supervised_tree(df_task, target_col, H_steps)
    # Defensive: keep a single copy of every column label (e.g. a repeated site column).
    g = g.loc[:, ~g.columns.duplicated()]
    Xc = list(dict.fromkeys(Xc))

    # Baseline forecasts of y_{t+H}, computed per site from the raw target at origin t.
    steps_per_day = (24 * 60) // BASE_MINUTES
    origin = df_task.groupby(SITE_COL)[target_col]

    def _seasonal(season_steps):
        # A season shorter than the horizon (or zero) would need future values: no baseline.
        if season_steps <= 0 or season_steps < H_steps:
            return np.nan
        return seasonal_naive(origin, H_steps, season_steps=season_steps)

    baselines = pd.DataFrame(
        {
            "pers": persistence(origin),
            "seas_dy": _seasonal(steps_per_day),
            "seas_wk": _seasonal(7 * steps_per_day),
        },
        index=df_task.index,
    )

    rows = []
    for fold, (tr_times, te_times) in enumerate(blocked_splits_by_time(g, N_SPLITS, GAP_HOURS), 1):
        tr = g[g[TS_COL].isin(tr_times)]
        te = g[g[TS_COL].isin(te_times)]
        if len(tr) < 500 or len(te) < 200:
            continue

        pre = build_preprocessor(num_cols, cat_cols)
        Xt_tr = pre.fit_transform(tr[Xc]); Xt_te = pre.transform(te[Xc])
        y_tr  = tr[yc].to_numpy();       y_te  = te[yc].to_numpy()

        # per-site sample weights for fairness
        counts = tr[SITE_COL].value_counts()
        w = (1.0 / tr[SITE_COL].map(counts)).to_numpy(dtype=float)
        w = w / w.mean()

        # tiny grid; choose by RMSE on the chronological tail of the training block
        stamps = tr[TS_COL].drop_duplicates().sort_values()
        cut = stamps.iloc[int(0.8 * len(stamps))]
        is_val = (tr[TS_COL] >= cut).to_numpy()
        best_params = XGB_GRID[0]
        if len(XGB_GRID) > 1 and is_val.any() and (~is_val).any():
            best_score = np.inf
            for p in XGB_GRID:
                mdl = XGBRegressor(**XGB_COMMON, **p, objective="reg:squarederror")
                mdl.fit(Xt_tr[~is_val], y_tr[~is_val], sample_weight=w[~is_val])
                score = rmse(y_tr[is_val], mdl.predict(Xt_tr[is_val]))
                if score < best_score:
                    best_score, best_params = score, p

        best_mdl = XGBRegressor(**XGB_COMMON, **best_params, objective="reg:squarederror")
        best_mdl.fit(Xt_tr, y_tr, sample_weight=w)
        yhat = best_mdl.predict(Xt_te)

        # baselines aligned to the test rows by index label
        base_te = baselines.loc[te.index]

        m_xgb = eval_metrics(y_te, yhat)
        m_p   = eval_metrics(y_te, base_te["pers"].to_numpy())
        m_d   = eval_metrics(y_te, base_te["seas_dy"].to_numpy())
        m_w   = eval_metrics(y_te, base_te["seas_wk"].to_numpy())

        rows.append({
            "fold": fold, "horizon_steps": H_steps,
            "XGB_RMSE": m_xgb["RMSE"], "XGB_MAE": m_xgb["MAE"],
            "BestBase_RMSE": np.nanmin([m_p["RMSE"], m_d["RMSE"], m_w["RMSE"]]),
            "BestBase_MAE":  np.nanmin([m_p["MAE"],  m_d["MAE"],  m_w["MAE"] ]),
        })
    return pd.DataFrame(rows)


# Cell 6 — Traditional track: ARIMA (per site, sampled; safe fallback)

In [None]:
def run_arima(df_task, target_col, H_steps, max_sites=8):
    """Per-site auto-ARIMA on the ``max_sites`` sites with the most rows, using blocked splits.

    For every fold and site, one model is fitted on the training rows and forecast once from
    the end of training. Evaluated points are the test rows from position ``H_steps`` onward;
    each is compared with the forecast step matching its time distance from the last training
    row (in BASE_MINUTES units), so the train/test gap is accounted for.
    NOTE(review): this is a single-origin multi-step forecast, not a rolling H-step-ahead
    evaluation — confirm that this is the intended comparison.

    Returns a DataFrame with columns fold, site, horizon_steps, ARIMA_RMSE, ARIMA_MAE;
    it is empty when pmdarima cannot be imported or no fit succeeds.
    """
    try:
        import pmdarima as pm
    except Exception as e:
        # Broad on purpose: a broken binary install can fail with something other than ImportError.
        print(f"[ARIMA] pmdarima not available → ARIMA skipped. ({type(e).__name__}: {e})")
        return pd.DataFrame()

    rows=[]
    # Sample sites by most data
    site_sizes = df_task[SITE_COL].value_counts().sort_values(ascending=False).index.tolist()[:max_sites]
    step = pd.Timedelta(minutes=BASE_MINUTES)
    season = (24 * 60) // BASE_MINUTES       # samples per day
    use_season = season > 1                  # a period of 0 or 1 means "no seasonality"
    for fold, (tr_times, te_times) in enumerate(blocked_splits_by_time(df_task, N_SPLITS, GAP_HOURS), 1):
        tr_all = df_task[df_task[TS_COL].isin(tr_times)]
        te_all = df_task[df_task[TS_COL].isin(te_times)]
        for sid in site_sizes:
            tr = tr_all[tr_all[SITE_COL]==sid].sort_values(TS_COL)
            te = te_all[te_all[SITE_COL]==sid].sort_values(TS_COL)
            if len(tr) < 200 or len(te) < 60: continue

            te_eval = te.iloc[H_steps:]
            if te_eval.empty: continue

            y_tr = tr[target_col].to_numpy()
            y_te = te_eval[target_col].to_numpy()
            # lead[i] = number of base steps between the last training row and test row i
            lead = np.rint(((te_eval[TS_COL] - tr[TS_COL].iloc[-1]) / step).to_numpy()).astype(int)
            lead = np.clip(lead, 1, None)
            try:
                model = pm.auto_arima(y_tr, seasonal=use_season, m=season if use_season else 1,
                                      suppress_warnings=True, error_action="ignore")
                fc = np.asarray(model.predict(n_periods=int(lead.max())), dtype=float)
                yhat = fc[lead - 1]
                m = eval_metrics(y_te, yhat)   # NaNs in y_te are masked, positions stay aligned
                rows.append({"fold":fold,"site":sid,"horizon_steps":H_steps,
                             "ARIMA_RMSE":m["RMSE"],"ARIMA_MAE":m["MAE"]})
            except Exception as e:
                print(f"[ARIMA] fold {fold} site {sid}: fit/predict failed ({type(e).__name__}: {e})")
                continue
    return pd.DataFrame(rows)


# Cell 7 — Deep track: training loop (CNN-LSTM)

In [None]:
# Cell 7 — Robust CNN-LSTM trainer (fold fallback; time-range split)

import torch
from torch import nn
from torch.utils.data import DataLoader

def train_cnnlstm(df_task, target_col, H_steps, num_cols, in_steps=IN_STEPS,
                  min_train_seqs=16, min_test_seqs=16, max_trials=4):
    """Train and evaluate the CNN-LSTM on one blocked time split.

    The last fold from ``blocked_splits_by_time`` is tried first, then up to two earlier
    ones, until both the train range (everything up to GAP_HOURS before the test block)
    and the test range are non-empty. The window length starts at
    ``min(in_steps, max(12, 4*H_steps))`` and is halved (floor 12) for up to
    ``max_trials`` attempts until both datasets have enough sequences.

    Features are standardised with training-set statistics and missing values are set
    to 0 after scaling; sequences whose target is not finite are ignored in the loss
    and in the reported metrics. Site ids come from one mapping built on ``df_task``,
    so train and test encode the same site identically.

    Returns ``dict(model, y_true, y_pred, skipped=False)`` on success, otherwise
    ``dict(skipped=True, reason=..., [tried=...])``.
    """
    num_cols = list(num_cols) if num_cols is not None else []
    if not num_cols:
        return dict(skipped=True, reason="no_features")

    folds = list(blocked_splits_by_time(df_task, N_SPLITS, GAP_HOURS))
    if not folds:
        return dict(skipped=True, reason="no_splits")

    # Try last fold, then earlier ones if needed
    tr_df = te_df = None
    ts = df_task[TS_COL]
    for off in (-1, -2, -3):
        if len(folds) < abs(off):
            continue
        _, te_times = folds[off]
        if len(te_times) == 0:
            continue  # empty test block: .min()/.max() would raise
        te_start, te_end = te_times.min(), te_times.max()
        tr_end = te_start - pd.Timedelta(hours=GAP_HOURS)
        cand_tr = df_task[ts <= tr_end]
        cand_te = df_task[(ts >= te_start) & (ts <= te_end)]
        if not cand_tr.empty and not cand_te.empty:
            tr_df = cand_tr.sort_values([SITE_COL, TS_COL]).copy()
            te_df = cand_te.sort_values([SITE_COL, TS_COL]).copy()
            break
    if tr_df is None:
        return dict(skipped=True, reason="empty_split")

    # Standardise features with train statistics; constant/empty columns get unit scale.
    mu = tr_df[num_cols].mean()
    sd = tr_df[num_cols].std().replace(0, 1.0).fillna(1.0)
    for frame in (tr_df, te_df):
        frame[num_cols] = ((frame[num_cols] - mu) / sd).fillna(0.0).astype(np.float32)

    # One site encoding shared by both datasets.
    site_map = {s: i for i, s in enumerate(pd.unique(df_task[SITE_COL].dropna()))}

    def make_ds(frame, steps):
        ds = SeqDataset(frame, target_col, H_steps, in_steps=steps, num_cols=num_cols)
        ds.sites = site_map
        return ds

    tried = []
    steps_try = min(in_steps, max(12, 4*H_steps))
    ds_tr = ds_te = None
    for _ in range(max_trials):
        ds_tr = make_ds(tr_df, steps_try); ntr = len(ds_tr)
        ds_te = make_ds(te_df, steps_try); nte = len(ds_te)
        tried.append((steps_try, ntr, nte))
        if ntr >= min_train_seqs and nte >= min_test_seqs:
            break
        steps_try = max(12, steps_try // 2)
        ds_tr = ds_te = None

    if ds_tr is None or len(ds_tr) == 0 or len(ds_te) == 0:
        print(f"[DL] skip (insufficient sequences) tried={tried}")
        return dict(skipped=True, reason="insufficient_sequences", tried=tried)

    bs = max(16, min(BATCH_SZ, len(ds_tr)))
    tr_loader = DataLoader(ds_tr, batch_size=bs, shuffle=True,  drop_last=False)
    te_loader = DataLoader(ds_te, batch_size=bs, shuffle=False, drop_last=False)

    torch.manual_seed(RANDOM_STATE)  # reproducible initialisation
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CNNLSTM(len(num_cols), len(site_map), hid=64, lstm=64).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = nn.MSELoss()

    def _evaluate():
        """Predict the test loader; return finite (y_true, y_pred) pairs as 1-D arrays."""
        model.eval()
        trues, preds = [], []
        with torch.no_grad():
            for x_seq, x_site, y in te_loader:
                p = model(x_seq.to(device), x_site.to(device))
                trues.append(y.numpy()); preds.append(p.cpu().numpy())
        yt = np.vstack(trues).ravel(); yp = np.vstack(preds).ravel()
        keep = np.isfinite(yt) & np.isfinite(yp)
        return yt[keep], yp[keep]

    y_true = y_pred = None
    for epoch in range(EPOCHS):
        model.train()
        for x_seq, x_site, y in tr_loader:
            ok = torch.isfinite(y).squeeze(1)   # drop sequences with a missing target
            if not ok.any():
                continue
            x_seq, x_site, y = x_seq[ok].to(device), x_site[ok].to(device), y[ok].to(device)
            opt.zero_grad()
            pred = model(x_seq, x_site)
            loss = loss_fn(pred, y)
            loss.backward()
            opt.step()

        y_true, y_pred = _evaluate()
        m = eval_metrics(y_true, y_pred)
        print(f"[DL] epoch {epoch+1}/{EPOCHS} RMSE={m['RMSE']:.4f} MAE={m['MAE']:.4f}")

    if y_true is None:          # EPOCHS <= 0: still report the untrained model
        y_true, y_pred = _evaluate()
    if len(y_true) == 0:
        return dict(skipped=True, reason="no_finite_targets", tried=tried)

    return dict(model=model, y_true=y_true, y_pred=y_pred, skipped=False)


# Cell 8 — Runner for tasks × horizons (Traditional + DL) and collection

In [None]:
# Cell 8 — Runner for tasks × horizons (Traditional + DL) — STABLE

import gc
import numpy as np
import pandas as pd

def _safe_concat(frames):
    """Stack the non-empty DataFrames in ``frames`` with a fresh index.

    Entries that are not DataFrames or are empty are ignored; an empty DataFrame is
    returned when nothing usable remains.
    """
    usable = []
    for frame in frames:
        if isinstance(frame, pd.DataFrame) and not frame.empty:
            usable.append(frame)
    if not usable:
        return pd.DataFrame()
    return pd.concat(usable, ignore_index=True)

def run_task(df, task: str):
    """Run XGB, ARIMA and the CNN-LSTM for every supported horizon of one task.

    ``task`` equal to "solar" selects SOLAR_TARGET; any other value selects WIND_TARGET.
    Returns three DataFrames ``(xgb_all, arima_all, dl_all)``; all three are empty when
    the target is unknown. XGB/ARIMA tables get ``task`` and ``horizon_min`` columns;
    the DL table has one row per horizon, with NaN metrics and a note when skipped.
    """
    target = WIND_TARGET if task != "solar" else SOLAR_TARGET
    if target is None:
        print(f"[{task}] target not found, skipping.")
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # DL inputs: every numeric column except the target itself and lag_* columns
    numeric_names = df.select_dtypes(include=[np.number]).columns
    num_cols = [c for c in numeric_names if not (c == target or c.startswith("lag_"))]

    xgb_frames, arima_frames, dl_rows = [], [], []

    def _tagged(tbl, mins):
        """Copy ``tbl`` and label it with the current task and horizon."""
        labelled = tbl.copy()
        labelled["task"] = task
        labelled["horizon_min"] = mins
        return labelled

    def _has_rows(tbl):
        return isinstance(tbl, pd.DataFrame) and not tbl.empty

    for mins, Hs in HORIZON_STEPS.items():
        if Hs is None:
            print(f"[{task}] Horizon {mins} min not supported at base frequency → skip.")
            continue

        print(f"\n=== {task.upper()} — Horizon {mins} min (steps={Hs}) ===")

        # ----- Traditional: XGB (blocked CV, pooled with weights)
        xgb_tbl = run_xgb(df, target, Hs)
        if not _has_rows(xgb_tbl):
            print(f"[XGB] {task} {mins}m: no rows (skipped).")
        else:
            xgb_frames.append(_tagged(xgb_tbl, mins))

        # ----- Traditional: ARIMA (sampled sites)
        ar_tbl = run_arima(df, target, Hs, max_sites=8)
        if not _has_rows(ar_tbl):
            print(f"[ARIMA] {task} {mins}m: skipped or empty.")
        else:
            arima_frames.append(_tagged(ar_tbl, mins))

        # ----- Deep: CNN-LSTM (single split, quick)
        dl_out = train_cnnlstm(
            df, target, Hs,
            num_cols=num_cols,
            in_steps=min(IN_STEPS, 4 * Hs),
            min_train_seqs=16,
            min_test_seqs=16
        )
        was_skipped = isinstance(dl_out, dict) and dl_out.get("skipped", False)
        if not was_skipped:
            record = {"task": task, "horizon_min": mins}
            record["DL_RMSE"] = rmse(dl_out["y_true"], dl_out["y_pred"])
            record["DL_MAE"] = mae(dl_out["y_true"], dl_out["y_pred"])
            dl_rows.append(record)
        else:
            note  = f"DL_skipped:{dl_out.get('reason','')}"
            tried = dl_out.get("tried", "")
            print(f"[DL] {task} {mins}m: {note} {tried if tried else ''}")
            dl_rows.append({
                "task": task, "horizon_min": mins,
                "DL_RMSE": np.nan, "DL_MAE": np.nan,
                "note": note, "tried": str(tried)
            })

        gc.collect()

    # ---- always three DataFrames
    dl_all = pd.DataFrame(dl_rows) if dl_rows else pd.DataFrame()
    return _safe_concat(xgb_frames), _safe_concat(arima_frames), dl_all

# ---- run both tasks, then pool the per-model tables
xgb_solar, arima_solar, dl_solar = run_task(df, "solar")
xgb_wind,  arima_wind,  dl_wind  = run_task(df, "wind")

xgb_all   = _safe_concat([xgb_solar, xgb_wind])
arima_all = _safe_concat([arima_solar, arima_wind])
if isinstance(dl_solar, pd.DataFrame):
    dl_all = _safe_concat([dl_solar, dl_wind])
else:
    dl_all = pd.DataFrame()

# Show the first rows of each table, or a short placeholder when a table is empty
_previews = [
    (xgb_all, "XGB: no rows"),
    (arima_all, "ARIMA skipped"),
    (dl_all, "DL skipped"),
]
display(*[(tbl.head(10) if not tbl.empty else placeholder) for tbl, placeholder in _previews])



=== SOLAR — Horizon 15 min (steps=15) ===
[XGB] solar 15m: no rows (skipped).
[ARIMA] pmdarima not available → ARIMA skipped.
[ARIMA] solar 15m: skipped or empty.
[DL] solar 15m: DL_skipped:empty_split 

=== SOLAR — Horizon 60 min (steps=60) ===
[XGB] solar 60m: no rows (skipped).
[ARIMA] pmdarima not available → ARIMA skipped.
[ARIMA] solar 60m: skipped or empty.
[DL] solar 60m: DL_skipped:empty_split 

=== SOLAR — Horizon 360 min (steps=360) ===
[XGB] solar 360m: no rows (skipped).
[ARIMA] pmdarima not available → ARIMA skipped.
[ARIMA] solar 360m: skipped or empty.
[DL] solar 360m: DL_skipped:empty_split 

=== SOLAR — Horizon 1440 min (steps=1440) ===
[XGB] solar 1440m: no rows (skipped).
[ARIMA] pmdarima not available → ARIMA skipped.
[ARIMA] solar 1440m: skipped or empty.
[DL] solar 1440m: DL_skipped:empty_split 

=== WIND — Horizon 15 min (steps=15) ===
[XGB] wind 15m: no rows (skipped).
[ARIMA] pmdarima not available → ARIMA skipped.
[ARIMA] wind 15m: skipped or empty.
[DL] win

'XGB: no rows'

'ARIMA skipped'

Unnamed: 0,task,horizon_min,DL_RMSE,DL_MAE,note,tried
0,solar,15,,,DL_skipped:empty_split,
1,solar,60,,,DL_skipped:empty_split,
2,solar,360,,,DL_skipped:empty_split,
3,solar,1440,,,DL_skipped:empty_split,
4,wind,15,,,DL_skipped:empty_split,
5,wind,60,,,DL_skipped:empty_split,
6,wind,360,,,DL_skipped:empty_split,
7,wind,1440,,,DL_skipped:empty_split,


# Cell 8B — TCN (pure PyTorch) runner

In [None]:
# Cell 8B — TCN (pure PyTorch) as an alternative hybrid DL

import torch
from torch import nn
from torch.utils.data import DataLoader

class TemporalBlock(nn.Module):
    """Residual block of two dilated causal 1-D convolutions.

    Input and output are ``[B, C, T]`` with the same length T. Each convolution is
    padded by ``(k-1)*d`` on both sides and the trailing ``(k-1)*d`` outputs are cut
    off again, so the output at step t only depends on inputs at steps <= t and the
    length stays T. Without this trim the output is longer than the input and the
    residual addition fails on shape.

    Args:
        in_ch, out_ch: input / output channels (a 1x1 conv adapts the skip path if they differ).
        k: kernel size.  d: dilation.  p: dropout probability.
    """
    def __init__(self, in_ch, out_ch, k=3, d=1, p=0.1):
        super().__init__()
        pad = (k - 1) * d
        self.pad = pad
        self.net = nn.Sequential(
            nn.Conv1d(in_ch, out_ch, kernel_size=k, padding=pad, dilation=d),
            nn.ReLU(),
            nn.Dropout(p),
            nn.Conv1d(out_ch, out_ch, kernel_size=k, padding=pad, dilation=d),
            nn.ReLU(),
            nn.Dropout(p),
        )
        self.downsample = nn.Conv1d(in_ch, out_ch, 1) if in_ch != out_ch else nn.Identity()
    def forward(self, x):
        out = x
        for layer in self.net:
            out = layer(out)
            if self.pad and isinstance(layer, nn.Conv1d):
                out = out[..., :-self.pad]   # drop the look-ahead part -> causal, length T
        return out + self.downsample(x)

class TCNHead(nn.Module):
    """Stack of TemporalBlocks followed by global average pooling and an MLP regressor.

    One block is created per entry of ``dilations``; every block outputs ``channels[0]``
    channels (the remaining entries of ``channels`` are not used).
    forward(x_seq): ``[B, T, F]`` -> ``[B, 1]``.
    """
    def __init__(self, n_features, channels=(32,64,64), dilations=(1,2,4,8), p=0.1):
        super().__init__()
        width = channels[0]
        prev_ch = n_features
        blocks = []
        for dilation in dilations:
            blocks.append(TemporalBlock(prev_ch, width, d=dilation, p=p))
            prev_ch = width
        self.tcn = nn.Sequential(*blocks)
        self.fc = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(prev_ch, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )
        self.site_emb = None  # kept simple; you can add site embeddings if desired

    def forward(self, x_seq):
        # Conv1d layers expect channels first: [B, T, F] -> [B, F, T]
        encoded = self.tcn(x_seq.transpose(1,2))
        return self.fc(encoded)

def train_tcn(df_task, target_col, H_steps, num_cols, in_steps=IN_STEPS,
              min_train_seqs=16, min_test_seqs=16, max_trials=4):
    """Train and evaluate the TCN on one blocked time split.

    Split and window selection: the last fold is tried first, then up to two earlier
    ones, until both the train range (up to GAP_HOURS before the test block) and the
    test range are non-empty. The window starts at ``min(in_steps, max(12, 4*H_steps))``
    and is halved (floor 12) for up to ``max_trials`` attempts.

    Features are standardised with training-set statistics and missing values are set
    to 0 after scaling; sequences whose target is not finite are ignored in the loss
    and in the returned arrays.

    Returns ``dict(model, y_true, y_pred, skipped=False)`` on success, otherwise
    ``dict(skipped=True, reason=..., [tried=...])``.
    """
    num_cols = list(num_cols) if num_cols is not None else []
    if not num_cols:
        return dict(skipped=True, reason="no_features")

    folds = list(blocked_splits_by_time(df_task, N_SPLITS, GAP_HOURS))
    if not folds:
        return dict(skipped=True, reason="no_splits")

    # try last then earlier folds
    tr_df = te_df = None
    ts = df_task[TS_COL]
    for off in (-1, -2, -3):
        if len(folds) < abs(off):
            continue
        _, te_times = folds[off]
        if len(te_times) == 0:
            continue  # empty test block: .min()/.max() would raise
        te_start, te_end = te_times.min(), te_times.max()
        tr_end = te_start - pd.Timedelta(hours=GAP_HOURS)
        cand_tr = df_task[ts <= tr_end]
        cand_te = df_task[(ts >= te_start) & (ts <= te_end)]
        if not cand_tr.empty and not cand_te.empty:
            tr_df = cand_tr.sort_values([SITE_COL, TS_COL]).copy()
            te_df = cand_te.sort_values([SITE_COL, TS_COL]).copy()
            break
    if tr_df is None:
        return dict(skipped=True, reason="empty_split")

    # Standardise features with train statistics; constant/empty columns get unit scale.
    mu = tr_df[num_cols].mean()
    sd = tr_df[num_cols].std().replace(0, 1.0).fillna(1.0)
    for frame in (tr_df, te_df):
        frame[num_cols] = ((frame[num_cols] - mu) / sd).fillna(0.0).astype(np.float32)

    tried=[]; steps_try = min(in_steps, max(12, 4*H_steps))
    ds_tr = ds_te = None
    for _ in range(max_trials):
        ds_tr = SeqDataset(tr_df, target_col, H_steps, in_steps=steps_try, num_cols=num_cols)
        ds_te = SeqDataset(te_df, target_col, H_steps, in_steps=steps_try, num_cols=num_cols)
        ntr, nte = len(ds_tr), len(ds_te); tried.append((steps_try, ntr, nte))
        if ntr >= min_train_seqs and nte >= min_test_seqs: break
        steps_try = max(12, steps_try // 2)
        ds_tr = ds_te = None
    if ds_tr is None or len(ds_tr)==0 or len(ds_te)==0:
        print(f"[TCN] skip (insufficient sequences) tried={tried}")
        return dict(skipped=True, reason="insufficient_sequences", tried=tried)

    bs = max(16, min(BATCH_SZ, len(ds_tr)))
    tr_loader = DataLoader(ds_tr, batch_size=bs, shuffle=True,  drop_last=False)
    te_loader = DataLoader(ds_te, batch_size=bs, shuffle=False, drop_last=False)

    torch.manual_seed(RANDOM_STATE)  # reproducible initialisation
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TCNHead(n_features=len(num_cols)).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = nn.MSELoss()

    for epoch in range(EPOCHS):
        model.train()
        for x_seq, x_site, y in tr_loader:
            ok = torch.isfinite(y).squeeze(1)   # drop sequences with a missing target
            if not ok.any():
                continue
            x_seq, y = x_seq[ok].to(device), y[ok].to(device)
            opt.zero_grad()
            loss = loss_fn(model(x_seq), y)
            loss.backward()
            opt.step()

    # quick val
    model.eval()
    with torch.no_grad():
        y_true, y_pred = [], []
        for x_seq, x_site, y in te_loader:
            x_seq = x_seq.to(device)
            p = model(x_seq)
            y_true.append(y.numpy()); y_pred.append(p.cpu().numpy())
    y_true = np.vstack(y_true).ravel(); y_pred = np.vstack(y_pred).ravel()
    keep = np.isfinite(y_true) & np.isfinite(y_pred)
    if not keep.any():
        return dict(skipped=True, reason="no_finite_targets", tried=tried)
    return dict(model=model, y_true=y_true[keep], y_pred=y_pred[keep], skipped=False)

def run_tcn_all(df, task:str):
    """Train the TCN for every supported horizon of ``task``; one result row per horizon.

    ``task`` equal to "solar" selects SOLAR_TARGET; any other value selects WIND_TARGET.
    Returns an empty DataFrame when the target is unknown. Skipped runs get NaN metrics
    and ``note="TCN_skipped"``.
    """
    target = WIND_TARGET if task != "solar" else SOLAR_TARGET
    if target is None:
        return pd.DataFrame()

    # every numeric column except the target itself and lag_* columns
    numeric_names = df.select_dtypes(include=[np.number]).columns
    num_cols = [c for c in numeric_names if not (c == target or c.startswith("lag_"))]

    rows = []
    for mins, Hs in HORIZON_STEPS.items():
        if Hs is None:
            continue
        out = train_tcn(df, target, Hs, num_cols=num_cols, in_steps=min(IN_STEPS, 4*Hs))
        if not out.get("skipped", False):
            tcn_rmse = rmse(out["y_true"], out["y_pred"])
            tcn_mae = mae(out["y_true"], out["y_pred"])
            rows.append(dict(task=task, horizon_min=mins, TCN_RMSE=tcn_rmse, TCN_MAE=tcn_mae))
        else:
            rows.append(dict(task=task, horizon_min=mins, TCN_RMSE=np.nan, TCN_MAE=np.nan, note="TCN_skipped"))
    return pd.DataFrame(rows)

# Run the TCN for both tasks and pool the rows (empty frame when neither produced any)
tcn_solar = run_tcn_all(df, "solar")
tcn_wind  = run_tcn_all(df, "wind")
if tcn_solar.empty and tcn_wind.empty:
    tcn_all = pd.DataFrame()
else:
    tcn_all = pd.concat([tcn_solar, tcn_wind], ignore_index=True)
display("TCN skipped" if tcn_all.empty else tcn_all.head(10))


Unnamed: 0,task,horizon_min,TCN_RMSE,TCN_MAE,note
0,solar,15,,,TCN_skipped
1,solar,60,,,TCN_skipped
2,solar,360,,,TCN_skipped
3,solar,1440,,,TCN_skipped
4,wind,15,,,TCN_skipped
5,wind,60,,,TCN_skipped
6,wind,360,,,TCN_skipped
7,wind,1440,,,TCN_skipped


# Cell 9 — Summaries per horizon & task + paired tests (RQ1 & RQ2)

In [None]:
from scipy.stats import wilcoxon, binomtest

def best_baseline_cols(rows):
    """Placeholder hook: returns ``rows`` unchanged.

    The XGB table already carries BestBase_* columns; a per-fold recomputation for the
    DL results would go here if needed.
    """
    unchanged = rows
    return unchanged

# RQ1: Accuracy vs horizon (using XGB as stable comparator)
# mean_delta_RMSE = mean baseline RMSE - mean XGB RMSE (positive => XGB beats the best baseline).
# It is derived after aggregation: named-aggregation tuples cannot be subtracted inside .agg().
if xgb_all.empty:
    # Every fold was skipped upstream: keep the expected schema instead of failing in groupby.
    rq1 = pd.DataFrame(columns=["task","horizon_min","mean_XGB_RMSE",
                                "mean_BestBase_RMSE","mean_delta_RMSE"])
    print("RQ1 — no XGB results available (all folds were skipped).")
else:
    rq1 = (xgb_all.groupby(["task","horizon_min"], as_index=False)
           .agg(mean_XGB_RMSE=("XGB_RMSE","mean"),
                mean_BestBase_RMSE=("BestBase_RMSE","mean"))
           .assign(mean_delta_RMSE=lambda t: t["mean_BestBase_RMSE"] - t["mean_XGB_RMSE"]))
    print("RQ1 — XGB accuracy vs horizon (lower RMSE is better):")
display(rq1.sort_values(["task","horizon_min"]))

# RQ2: Hybrid (DL) vs Traditional (XGB) — paired test on the last fold block
def rq2_compare(dl_tbl, xgb_tbl):
    """Compare the DL result with the fold-averaged XGB result per (task, horizon).

    For every (task, horizon_min) present in both tables the delta is
    ``mean(XGB metric over folds) - DL metric of the first matching DL row``,
    so a positive value means DL is better. Note that the column labels read
    "(DL-XGB)" although the value is XGB minus DL.

    Always returns a DataFrame with the columns
    task, horizon_min, delta_RMSE(DL-XGB), delta_MAE(DL-XGB); it is empty when either
    input is empty or lacks the required columns.
    """
    out_cols = ["task", "horizon_min", "delta_RMSE(DL-XGB)", "delta_MAE(DL-XGB)"]
    dl_needed = {"task", "horizon_min", "DL_RMSE", "DL_MAE"}
    xgb_needed = {"task", "horizon_min", "XGB_RMSE", "XGB_MAE"}
    if not dl_needed.issubset(dl_tbl.columns) or not xgb_needed.issubset(xgb_tbl.columns):
        return pd.DataFrame(columns=out_cols)

    rows=[]
    for task, mins in dl_tbl[["task","horizon_min"]].drop_duplicates().itertuples(index=False):
        dls = dl_tbl[(dl_tbl["task"]==task)&(dl_tbl["horizon_min"]==mins)]
        xgs = xgb_tbl[(xgb_tbl["task"]==task)&(xgb_tbl["horizon_min"]==mins)]
        if dls.empty or xgs.empty: continue
        # Use DL (one value per (task, horizon) from the quick single-split run) vs XGB mean across folds
        d_rmse = float(xgs["XGB_RMSE"].mean() - dls["DL_RMSE"].iloc[0])  # + => DL better
        d_mae  = float(xgs["XGB_MAE"].mean()  - dls["DL_MAE"].iloc[0])
        rows.append({"task":task, "horizon_min":mins, "delta_RMSE(DL-XGB)":d_rmse, "delta_MAE(DL-XGB)":d_mae})
    return pd.DataFrame(rows, columns=out_cols)

# The comparison needs rows from both tracks; when either table is empty (e.g. every fold
# was skipped upstream) report that instead of failing on missing columns.
if dl_all.empty or xgb_all.empty:
    rq2_quick = pd.DataFrame()
else:
    rq2_quick = rq2_compare(dl_all, xgb_all)
print("RQ2 — Quick DL vs XGB deltas (positive = DL better):")
display(rq2_quick.sort_values(["task","horizon_min"]) if not rq2_quick.empty
        else "RQ2: no comparable DL/XGB results")


KeyError: 'task'