In [1]:
import sys
import os
import pandas as pd
from typing import Iterable, List, Tuple, Union, Generator
import torch

sys.path.append(os.path.abspath(".."))

from pipeline import train_and_evaluate_model
from data_prep.data_prep import prepare_all_data
from model import burglary_model
from training.training import prepare_model_data, grid_search

# Run the project's data preparation on the merged parquet file and unpack
# its three return values.  `occupation_mappings` is indexed with [1] later in
# this notebook when the cross-validation is launched.
# NOTE(review): the meaning of the "lsoa" argument and the structure of the
# returned objects are defined in data_prep.data_prep — confirm there.
(
    model_tuple,
    occupation_mappings,
    ward_idx_map,
) = prepare_all_data("../merged_data.parquet", "lsoa")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the merged data set as a plain DataFrame; this is the frame that is
# handed to the rolling-origin cross-validation further down.
data = pd.read_parquet(path="../merged_data.parquet")

# Bare last expression so the notebook renders the column index.
data.columns

Index(['LSOA code (2021)', 'date',
       'Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)',
       'Income Rank (where 1 is most deprived)',
       'Employment Rank (where 1 is most deprived)',
       'Education, Skills and Training Rank (where 1 is most deprived)',
       'Health Deprivation and Disability Rank (where 1 is most deprived)',
       'Crime Rank (where 1 is most deprived)',
       'Barriers to Housing and Services Rank (where 1 is most deprived)',
       'Living Environment Rank (where 1 is most deprived)',
       'Burglaries amount', 'Education locations', 'Emergency locations',
       'Entertainment locations', 'Food locations', 'Leisure locations',
       'Parking locations', 'Shopping locations', 'Public transport locations',
       'Dwelling type|Flat, maisonette or apartment (%)',
       'Ethnic Group|Asian/Asian British (%)', 'Ethnic Group|BAME (%)',
       'Ethnic Group|Black/African/Caribbean/Black British (%)',
       'Ethnic Group|Mixed/mul

In [4]:
import numpy as np
import pandas as pd
from calendar import month_name
from typing import Any, Dict, List, Sequence, Tuple   # ← 2️⃣ add this

def df_to_tensor_dict(
    df: pd.DataFrame,
    *,
    lsoa_col: str = "LSOA code (2021)",
    occ_col:  str | None = None,
    static_cols: Sequence[str] | None = None,
    dynamic_cols: Sequence[str] | None = None,
    date_col: str = "date",
    target_col: str = "Burglaries amount",
    drop_cols: tuple[str, ...] = ("geometry",)
) -> dict[str, np.ndarray]:

    n = len(df)
    if n == 0:
        raise ValueError("Empty slice passed to df_to_tensor_dict")

    # ── 1.  categorical indices ───────────────────────────────────────
    ward_idx = df[lsoa_col].astype("category").cat.codes.to_numpy(np.int64)
    occupation_idx = (
        np.zeros(n, dtype=np.int64) if occ_col is None or occ_col not in df
        else df[occ_col].astype("category").cat.codes.to_numpy(np.int64)
    )

    # ── 2.  choose numeric columns ────────────────────────────────────
    numeric_cols = [c for c in df.select_dtypes("number").columns if c not in drop_cols]

    if static_cols is None or dynamic_cols is None:
        varying = df[numeric_cols].nunique(dropna=False) > 1
        inferred_dyn  = [c for c, v in varying.items() if v]
        inferred_stat = [c for c in numeric_cols if c not in inferred_dyn]
        dynamic_cols  = inferred_dyn  if dynamic_cols is None else list(dynamic_cols)
        static_cols   = inferred_stat if static_cols  is None else list(static_cols)

    X_static  = df[static_cols ].to_numpy(np.float32) if static_cols  else np.zeros((n, 0), np.float32)
    X_dynamic = df[dynamic_cols].to_numpy(np.float32) if dynamic_cols else np.zeros((n, 0), np.float32)

    # ── 3.  temporal features ─────────────────────────────────────────
    months       = pd.to_datetime(df[date_col]).dt.month.to_numpy()         # 1-12
    month_angle  = 2 * np.pi * (months - 1) / 12
    month_sin    = np.sin(month_angle).reshape(-1, 1)                       # (n, 1)
    month_cos    = np.cos(month_angle).reshape(-1, 1)                       # (n, 1)

    # one-hot: shape (n, 12)
    month_1h = np.eye(12, dtype=np.float32)[months - 1]

    time_trend = np.arange(n, dtype=np.float32).reshape(-1, 1)
    time_norm  = (time_trend - time_trend.mean()) / time_trend.std()

    X_temporal = np.concatenate([month_sin, month_cos, time_norm], axis=1)  # (n, 3)

    # ── 4.  map to model key names ─────────────────────────────────────
    return {
        "occupation_idx": torch.as_tensor(occupation_idx, dtype=torch.long),
        "ward_idx":       torch.as_tensor(ward_idx,       dtype=torch.long),
        "X_static":       torch.as_tensor(X_static,       dtype=torch.float32),
        "X_dynamic":      torch.as_tensor(X_dynamic,      dtype=torch.float32),
        "X_seasonal":     torch.as_tensor(month_1h,       dtype=torch.float32),  # ← fixed
        "X_time_trend":   torch.as_tensor(time_trend,     dtype=torch.float32),  # raw trend
        "X_temporal":     torch.as_tensor(X_temporal,     dtype=torch.float32),  # compact 3-col
        "X_spatial":      torch.as_tensor(X_static,       dtype=torch.float32),  # alias for model
        "y":              torch.as_tensor(df[target_col].to_numpy(),
                                          dtype=torch.float32),
        # dummy scalers so train_model never raises KeyError
        "means": torch.zeros(1, dtype=torch.float32),
        "stds":  torch.ones(1,  dtype=torch.float32),
    }

# --------------------------------------------------------------------
# rolling-origin CV for a DataFrame
# --------------------------------------------------------------------
def rolling_origin_cv(
        
        df: pd.DataFrame,
        model_function,
        inx_to_occupation_map: Dict[int, str],
        *,
        # ───────── parameters that control the rolling window ──────────
        date_col: str = "date",
        occ_col: str | None = None,          # ← NEW
        static_cols: Sequence[str] | None = None,  # ← NEW (optional)
        dynamic_cols: Sequence[str] | None = None, # ← NEW (optional)
        initial_train_size: int | float | str | None = None,
        horizon: int = 1,
        step_size: int = 1,
        expanding_window: bool = True,
        max_splits: int | None = None,
        # ───────── parameters forwarded to the learner ─────────────────
        guide_type: str = "diag",
        guide_rank: int = 10,
        lr: float = 1e-3,
        elbo_type: str = "trace",
        renyi_alpha: float = 0.5,
        num_particles: int = 1,
        training_steps: int = 500,
        testing_steps: int = 1000
) -> Tuple[Dict[str, List[float]],        # aggregated metrics
           List[Tuple[Dict[str, float],   # per-fold artefacts
                     Any, Any, Any]]]:

    if static_cols is None or dynamic_cols is None:
        # use the *entire* data set to decide what varies
        numeric_cols = df.select_dtypes(include="number").columns
        varying = df[numeric_cols].nunique(dropna=False) > 1
        dynamic_cols = [c for c, v in varying.items() if v] \
                       if dynamic_cols is None else list(dynamic_cols)
        static_cols  = [c for c in numeric_cols if c not in dynamic_cols] \
                       if static_cols  is None else list(static_cols)

    # 1 ── make sure the frame is in chronological order
    df = df.sort_values(date_col).reset_index(drop=True)
    n_obs = len(df)
    if n_obs < 3:
        raise ValueError("Need at least three rows for cross-validation.")

    # 2 ── resolve initial_train_size
    if initial_train_size is None:
        initial_train_size = int(np.floor(0.7 * n_obs))
    elif isinstance(initial_train_size, str) and initial_train_size.endswith('%'):
        pct = float(initial_train_size.rstrip('%')) / 100.0
        initial_train_size = int(np.floor(pct * n_obs))
    elif isinstance(initial_train_size, float) and 0 < initial_train_size < 1:
        initial_train_size = int(np.floor(initial_train_size * n_obs))

    if initial_train_size < 1 or initial_train_size >= n_obs:
        raise ValueError("initial_train_size must be between 1 and len(df)-1")

    # 3 ── bookkeeping containers
    train_start, train_end = 0, initial_train_size
    split_idx = 0
    per_fold_vals: Dict[str, List[float]] = {"rmse": [], "mae": [], "crps": []}
    fold_results: List[Tuple[Dict[str, float], Any, Any, Any]] = []

    # 4 ── main walk-forward loop
    while train_end + horizon <= n_obs and (max_splits is None or split_idx < max_splits):

        # slice DataFrame with iloc, then convert to dict of arrays
        train_df = df.iloc[train_start:train_end].reset_index(drop=True)
        test_df  = df.iloc[train_end:train_end + horizon].reset_index(drop=True)

        train_slice = df_to_tensor_dict(
    train_df,
    occ_col      = occ_col,
    static_cols  = static_cols,
    dynamic_cols = dynamic_cols,
    date_col     = date_col

)
        test_slice  = df_to_tensor_dict(
    test_df,
    occ_col      = occ_col,
    static_cols  = static_cols,
    dynamic_cols = dynamic_cols,
    date_col     = date_col

)


        metrics, svi, guide, tester = train_and_evaluate_model(
            training_data          = train_slice,
            testing_data           = test_slice,
            model_function         = model_function,
            inx_to_occupation_map  = inx_to_occupation_map,
            guide_type             = guide_type,
            guide_rank             = guide_rank,
            lr                     = lr,
            elbo_type              = elbo_type,
            renyi_alpha            = renyi_alpha,
            num_particles          = num_particles,
            training_steps         = training_steps,
            testing_steps          = testing_steps
        )

        for key in per_fold_vals:
            per_fold_vals[key].append(metrics[key])

        fold_results.append((metrics, svi, guide, tester))

        # shift origin
        split_idx += 1
        if expanding_window:
            train_end += step_size
        else:
            train_start += step_size
            train_end   += step_size

    # 5 ── sanity check: at least one split produced?
    if split_idx == 0:
        raise ValueError(
            f"No CV folds were created. "
            f"Check initial_train_size={initial_train_size}, horizon={horizon}, "
            f"step_size={step_size}, len(df)={n_obs}."
        )

    # 6 ── aggregate statistics (append mean as last element)
    for key, vals in per_fold_vals.items():
        per_fold_vals[key].append(float(np.mean(vals)))

    return per_fold_vals, fold_results


In [5]:
# Walk-forward evaluation of the burglary model on the merged data set.
# The first training window is 60 % of the rows; every fold tests on the next
# 7 rows and the origin then advances by 10 rows (expanding window).
# No max_splits is given, so folds continue until the rows run out, and each
# fold runs a full training pass.
cv_metrics, folds = rolling_origin_cv(
    data,
    burglary_model,
    occupation_mappings[1],
    occ_col=None,  # or "your_occupation_column"
    initial_train_size="60%",
    horizon=7,
    step_size=10,
    expanding_window=True,
)

# The last element of each metric list is the mean over folds.
print("Per-fold RMSE:", cv_metrics["rmse"][:-1])
print("Overall  RMSE:", cv_metrics["rmse"][-1])


Training SVI: 100%|██████████| 500/500 [00:16<00:00, 29.85it/s]
Training SVI: 100%|██████████| 500/500 [00:17<00:00, 28.12it/s]
Training SVI: 100%|██████████| 500/500 [00:18<00:00, 26.75it/s]
Training SVI:   5%|▌         | 25/500 [00:02<00:39, 12.02it/s]


KeyboardInterrupt: 