# Gradient Boosting Model (Standalone)

Trains Gradient Boosting regressors on the prepared splits while preserving the existing data locations.

In [None]:
import warnings
import logging
from pathlib import Path

import numpy as np
import pandas as pd

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Only WARNING and above are emitted by the "cmdstanpy" and "prophet" loggers.
logging.getLogger("cmdstanpy").setLevel(logging.WARNING)
logging.getLogger("prophet").setLevel(logging.WARNING)

# Raw 15-minute workbook (default input of load_and_clean_15min).
RAW_DATA_PATH = Path("15_minute_timeseries_data_cleaned.xlsx")
# Directory holding the cleaned data and the train/test CSVs; created if missing.
DATA_DIR = Path("content/data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Name of the column used as the regression target.
TARGET_COL = "baseline.out.site_energy.total.energy_consumption.kwh"


In [None]:
def load_and_clean_15min(path: Path = RAW_DATA_PATH) -> pd.DataFrame:
    """Load the raw 15-minute workbook, clean it, add calendar features and save it.

    Steps, in order: drop all-empty columns and duplicate rows, index the frame
    by its timestamp column, coerce every remaining column to numeric, fill
    gaps by time interpolation, blank out negative values and interpolate
    again, then append calendar features derived from the index. The result is
    written to ``DATA_DIR / "15_minute_timeseries_data_cleaned_ready.csv"``.

    Args:
        path: Location of the Excel workbook to read.

    Returns:
        The cleaned frame, indexed by timestamp, including the added
        ``hour``, ``day_of_week``, ``month``, ``day_of_year`` and
        ``is_weekend`` columns.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(f"Expected to find {path} for the 15-minute dataset.")
    df = pd.read_excel(path)
    print("Loaded raw data:", df.shape)

    df = df.dropna(axis=1, how="all")
    df = df.drop_duplicates()

    # The timestamp column is the first one whose label mentions "time" or
    # "date"; otherwise the first column. str() keeps non-string labels
    # (e.g. integer headers) from raising AttributeError on .lower().
    time_cols = [
        c for c in df.columns
        if "time" in str(c).lower() or "date" in str(c).lower()
    ]
    time_col = time_cols[0] if time_cols else df.columns[0]
    df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
    df = df.set_index(time_col).sort_index()
    # Rows whose timestamp could not be parsed carry a NaT index; drop them.
    df = df[~df.index.isna()]

    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.interpolate(method="time", limit_direction="both")

    # Negative values are treated as invalid: blank them and re-interpolate.
    # Series.mask returns a new (float) column, so NaN is never written in
    # place into an integer column.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col] = df[col].mask(df[col] < 0)
    # limit_direction="both" matches the first pass; without it a blanked value
    # at the very start of the series would remain NaN.
    df = df.interpolate(method="time", limit_direction="both")

    df["hour"] = df.index.hour
    df["day_of_week"] = df.index.dayofweek
    df["month"] = df.index.month
    df["day_of_year"] = df.index.dayofyear
    df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)

    ready_path = DATA_DIR / "15_minute_timeseries_data_cleaned_ready.csv"
    df.to_csv(ready_path)
    print(f"Saved cleaned 15-minute data to {ready_path}")

    return df


def aggregate_hourly(df: pd.DataFrame) -> pd.DataFrame:
    """Resample every column of ``df`` to hourly resolution and save the result.

    The aggregation is chosen from the lower-cased column name:

    * names containing "kwh", "energy" or "consumption" are summed per hour
      (an hour with no valid value stays NaN because of ``min_count=1``);
    * names containing "kw" or "power" are averaged;
    * any other column is forward-filled when its dtype is ``object`` and
      averaged otherwise.

    Remaining gaps are filled by time interpolation in both directions and the
    frame is written to ``DATA_DIR / "hourly_from_15min.csv"``.

    Args:
        df: Frame to resample.

    Returns:
        The hourly frame.
    """
    summed_terms = ("kwh", "energy", "consumption")
    averaged_terms = ("kw", "power")

    hourly = pd.DataFrame()
    for name in df.columns:
        lowered = name.lower()
        series = df[name]
        buckets = series.resample("H")
        # The summed terms are tested first, so "kwh" columns never reach the
        # "kw" check below.
        if any(term in lowered for term in summed_terms):
            rolled = buckets.sum(min_count=1)
        elif any(term in lowered for term in averaged_terms):
            rolled = buckets.mean()
        elif series.dtype == "object":
            rolled = buckets.ffill()
        else:
            rolled = buckets.mean()
        hourly[name] = rolled

    hourly = hourly.interpolate(method="time", limit_direction="both")

    hourly_path = DATA_DIR / "hourly_from_15min.csv"
    hourly.to_csv(hourly_path)
    print(f"Saved hourly data to {hourly_path}")
    return hourly


def chronological_split(data: pd.DataFrame, train_frac: float = 0.7):
    """Split ``data`` by row position into a leading and a trailing part.

    Args:
        data: Frame to split; rows are taken in their existing order.
        train_frac: Fraction of the rows assigned to the first part. The row
            count is truncated to an integer and is never less than one.

    Returns:
        A ``(train, test)`` tuple: the first ``n_train`` rows and the rest.
    """
    cutoff = int(len(data) * train_frac)
    if cutoff < 1:
        cutoff = 1
    leading = data.iloc[:cutoff]
    trailing = data.iloc[cutoff:]
    return leading, trailing


def ensure_splits() -> None:
    """Make sure the four train/test CSV files exist under ``DATA_DIR``.

    When all four files are already present nothing is recomputed. Otherwise
    the raw data is loaded and cleaned, aggregated to hourly resolution, and
    both the 15-minute and the hourly frames are split 70/30 in row order and
    written out.
    """
    file_names = (
        "train_15min.csv",
        "test_15min.csv",
        "train_hourly.csv",
        "test_hourly.csv",
    )
    targets = [DATA_DIR / file_name for file_name in file_names]

    if not any(not target.exists() for target in targets):
        print("Using existing train/test splits in content/data")
        return

    print("Creating train/test splits under content/data ...")
    cleaned = load_and_clean_15min(RAW_DATA_PATH)
    hourly = aggregate_hourly(cleaned)
    # Frames are ordered to line up with ``targets``: 15-minute train/test
    # first, then hourly train/test.
    frames = (
        *chronological_split(cleaned, 0.7),
        *chronological_split(hourly, 0.7),
    )
    for target, frame in zip(targets, frames):
        frame.to_csv(target)
    print("Saved new train/test splits.")


In [None]:
# Create the train/test CSVs under DATA_DIR unless all four already exist.
ensure_splits()


def _read_split_csv(path: Path) -> pd.DataFrame:
    """Read a split CSV and return it indexed by its parsed first column.

    The first column is parsed as datetimes (unparseable entries become NaT),
    used as the index and removed from the columns. A timezone-aware index is
    made naive, the index is named "timestamp", and rows are sorted by it.

    Args:
        path: CSV file to read.

    Returns:
        The frame sorted by its "timestamp" index.
    """
    raw = pd.read_csv(path)
    stamp_col = raw.columns[0]
    stamps = pd.to_datetime(raw[stamp_col], errors="coerce")

    frame = raw.set_index(stamps)
    frame = frame.drop(columns=[stamp_col])

    tz = getattr(frame.index, "tz", None)
    if tz is not None:
        # Drop the timezone while keeping the wall-clock values.
        frame.index = frame.index.tz_localize(None)

    frame.index.name = "timestamp"
    return frame.sort_index()


# Load the four splits from DATA_DIR, each indexed by its parsed timestamp column.
train_15min = _read_split_csv(DATA_DIR / "train_15min.csv")
test_15min = _read_split_csv(DATA_DIR / "test_15min.csv")
train_hourly = _read_split_csv(DATA_DIR / "train_hourly.csv")
test_hourly = _read_split_csv(DATA_DIR / "test_hourly.csv")

# Report (rows, columns) of each split as a quick sanity check.
print("hourly train/test:", train_hourly.shape, test_hourly.shape)
print("15min train/test:", train_15min.shape, test_15min.shape)


def evaluate_model(y_true, y_pred):
    """Compute MAE, RMSE and MAPE between observed and predicted values.

    Args:
        y_true: Observed values; any array-like convertible to float.
        y_pred: Predicted values; any array-like convertible to float.

    Returns:
        A dict with the keys "MAE", "RMSE" and "MAPE" (MAPE in percent),
        each a plain float.
    """
    # Flatten both inputs so an (n,) target and an (n, 1) prediction are
    # compared element by element instead of broadcasting to an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    errors = y_true - y_pred
    abs_errors = np.abs(errors)

    mae = float(np.mean(abs_errors))
    rmse = float(np.sqrt(np.mean(errors ** 2)))

    # Floor the denominator's magnitude rather than adding an offset: an
    # additive 1e-8 is exactly zero for y_true == -1e-8 and shrinks the
    # denominator for negative targets.
    denominator = np.maximum(np.abs(y_true), 1e-8)
    mape = float(np.mean(abs_errors / denominator) * 100)

    return {"MAE": mae, "RMSE": rmse, "MAPE": mape}


In [None]:
import joblib
from sklearn.ensemble import GradientBoostingRegressor

# Root directory for the saved models; created if missing.
# NOTE(review): "/models" is an absolute path at the filesystem root, whereas
# DATA_DIR is relative to the working directory — confirm this is intended.
MODEL_DIR = Path("/models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)


def _time_features(idx: pd.DatetimeIndex, gran: str) -> pd.DataFrame:
    """Build calendar features from a datetime index.

    Args:
        idx: Timestamps to derive the features from; also used as the index
            of the returned frame.
        gran: "hourly" yields ``hour``, ``day_of_week`` and ``month``. Any
            other value additionally yields ``minute`` and ``quarter`` (the
            minute divided by 15, rounded down), placed after ``hour``.

    Returns:
        A frame with one feature column per entry above, indexed by ``idx``.
    """
    features = {"hour": idx.hour}
    if gran != "hourly":
        # Sub-hourly position is only added for non-hourly granularities.
        minute_of_hour = idx.minute
        features["minute"] = minute_of_hour
        features["quarter"] = minute_of_hour // 15
    features["day_of_week"] = idx.dayofweek
    features["month"] = idx.month
    return pd.DataFrame(features, index=idx)


# Train, save and evaluate one Gradient Boosting model per granularity, using
# only calendar features derived from each split's timestamp index.
results = []
for gran, train_df, test_df in [
    ("hourly", train_hourly, test_hourly),
    ("15min", train_15min, test_15min),
]:
    out_dir = MODEL_DIR / gran
    out_dir.mkdir(parents=True, exist_ok=True)

    y_train = train_df[TARGET_COL].astype(float)
    y_test = test_df[TARGET_COL].astype(float)
    X_train = _time_features(train_df.index, gran)
    X_test = _time_features(test_df.index, gran)

    # The leading blank line is written as a "\n" escape: a raw line break
    # inside a single-quoted f-string is an unterminated literal.
    print(f"\n[{gran}] training Gradient Boosting on {X_train.shape[0]} samples")
    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Store the feature order with the model so the same columns can be
    # rebuilt when the artifact is loaded.
    model_path = out_dir / "gbr.joblib"
    joblib.dump({"model": model, "features": X_train.columns.tolist()}, model_path)
    print(f"[{gran}] saved model to {model_path.as_posix()}")

    yhat = model.predict(X_test)
    metrics = evaluate_model(y_test.values, yhat)
    metrics.update({"Model": "GradientBoosting", "Granularity": gran})
    results.append(metrics)

# One row of metrics per (granularity, model); the bare name displays the table.
results_df = pd.DataFrame(results).set_index(["Granularity", "Model"])
results_df
