In [None]:
# Python 3.13
# If needed in a fresh env:
#   pip install "pymc>=5.21" "arviz>=0.17" "numpy>=2" "pandas>=2.2"
from pytensor.tensor.variable import TensorVariable
import numpy as np
import pandas as pd
import pymc as pm
import pytensor.tensor as pt
import arviz as az

# ---- 1) Get synthetic data from your helpers ----
from bayes_tools.helpers.synthetic_data_helpers import (
    make_hierarchical_ou_dataset,
    aggregate_to_parent,
)

# Arguments handed to the synthetic OU-level monthly panel generator.
OU_PANEL_KWARGS = {
    "n_regions": 3,
    "n_sites_per_region": 3,
    "n_ous_per_site": 4,
    "n_years": 3,
    "wave_months": (6, 12),
    "wave_missing_prob": 0.0,
    "seed": 7,
}
df_ou = make_hierarchical_ou_dataset(**OU_PANEL_KWARGS)

# Optional: roll the panel up to a parent level (e.g. "site" or "region").
# df_site = aggregate_to_parent(df_ou, level="site")

# Modelling is done at the OU level, on a copy so df_ou itself is left untouched.
df = df_ou.copy()

In [None]:
# (rows, columns) of the OU-level modelling frame.
df.shape

In [None]:
# Count of missing values in each column of the modelling frame.
df.isna().sum()

In [None]:

# ---- 2) Basic feature prep ----
# Sort by OU, then by date. Every array built below is positionally aligned
# with this row order.
df = df.sort_values(["ou_code", "date"]).reset_index(drop=True)

# OU index: one integer code per row taken from the categorical encoding.
ou_codes = df["ou_code"].astype("category")
ou_idx = ou_codes.cat.codes.to_numpy().astype("int32")
G = ou_codes.cat.categories.size

# Time index (month panel).
# np.unique returns the sorted distinct dates together with, for every row,
# the position of its date in that sorted array. This replaces a per-row
# Python dict lookup whose keys were numpy datetime64 scalars while the
# lookups were done with the Timestamp objects produced by iterating the
# Series, which only works when both types happen to hash identically.
dates = pd.to_datetime(df["date"])
unique_months, time_inverse = np.unique(dates.to_numpy(), return_inverse=True)
time_idx = time_inverse.astype("int32")
# Retained so any later cell that looks a month up by value keeps working.
month_to_idx = {m: i for i, m in enumerate(unique_months)}
T = len(unique_months)

# Outcome.
y_raw = df["productivity"].to_numpy().astype("float64")
# The log below is only defined for strictly positive values; fail loudly
# instead of letting -inf / NaN flow into the standardization and the model.
# Missing (NaN) outcomes are passed through unchanged.
if not np.all(y_raw[~np.isnan(y_raw)] > 0):
    raise ValueError("productivity must be strictly positive to be log-transformed")
# log-transform to make noise closer to Gaussian / stabilize variance
y = np.log(y_raw)

# Predictors.
x_obs = df["survey_score"].to_numpy().astype("float64")  # may contain NaN
# Coerce to float64 with NaN for missing entries so np.isnan also works when
# the column uses a nullable or object dtype.
n_resp = df["n_respondents"].to_numpy(dtype="float64", na_value=np.nan)  # NaN where no survey
n_resp_filled = np.where(np.isnan(n_resp), 0.0, n_resp).astype("float64")
has_survey = ~np.isnan(x_obs)

# standardization helpers (kept simple; you can swap for robust scaling)
def zscore(a: np.ndarray):
    """Standardize ``a`` with NaN-ignoring statistics.

    The centre is ``np.nanmean(a)`` and the scale is ``np.nanstd(a)``; NaN
    entries of ``a`` stay NaN in the result.

    Returns:
        A tuple ``(standardized, mean, sd)``. When the computed standard
        deviation is not strictly positive (e.g. constant input), ``sd`` is
        replaced by 1.0 so the division is a no-op rather than a divide-by-zero.
    """
    center = np.nanmean(a)
    spread = np.nanstd(a)
    if not spread > 0:
        # Covers both a zero spread and a NaN spread.
        spread = 1.0
    standardized = (a - center) / spread
    return standardized, center, spread

# Standardize the target (log productivity).
y_z, y_mean, y_sd = zscore(y)

# Standardize the survey score. If no observed survey value is finite, fall
# back to identity scaling (mean 0, sd 1) so x_mean / x_sd are still defined.
if not np.isfinite(x_obs[has_survey]).any():
    # fallback if synthetic happens to miss every survey (unlikely)
    x_mean, x_sd = 0.0, 1.0
    x_z = (x_obs - x_mean) / x_sd
else:
    x_z, x_mean, x_sd = zscore(x_obs)

# ---- 3) PyMC model ----
# Joint model with two likelihood terms that share a latent survey signal:
#   * "survey_obs": the z-scored survey score, only on rows where has_survey
#     is True, measured around x_latent with noise that shrinks with sqrt(n).
#   * "y_like": z-scored log productivity on every row, regressed on x_latent
#     with an OU-specific intercept and slope plus a centered time effect.
# Sampling is run inside the same model context at the end of the block.
with pm.Model() as model:
    # --- Global means for OU intercept/slope ---
    # The outcome passed to the likelihood is y_z (z-scored log productivity),
    # so these parameters live on that standardized scale.
    mu_alpha = pm.Normal("mu_alpha", 0.0, 1.0)
    mu_beta  = pm.Normal("mu_beta",  0.0, 0.5)

    # --- Correlated OU effects: [alpha, beta] via LKJ-Cholesky for robustness ---
    # With compute_corr=True the call is unpacked into three values: the
    # Cholesky factor, the 2x2 correlation matrix and the standard deviations.
    L_ou, corr_ab, sds_ab = pm.LKJCholeskyCov(
        "L_ou",
        n=2,
        eta=2.0,
        sd_dist=pm.HalfNormal.dist(1.0),
        compute_corr=True,
        store_in_trace=True,
    ) # type: ignore
    # Re-annotation plus assert exist to narrow the types for static checkers.
    # Note the assert joins the three bool checks with `&` (no short-circuit).
    L_ou: TensorVariable = L_ou
    assert isinstance(L_ou, TensorVariable) & isinstance(corr_ab, TensorVariable) & isinstance(sds_ab, TensorVariable)
    # Non-centered construction: standard-normal draws z are multiplied by the
    # Cholesky factor to obtain correlated per-OU offsets, which are then
    # shifted by the global means.
    z = pm.Normal("z", 0.0, 1.0, size=(G, 2))          # Gx2
    ab = pt.dot(z, L_ou.T)                              # Gx2
    alpha_ou = pm.Deterministic("alpha_ou", mu_alpha + ab[:, 0])
    beta_ou  = pm.Deterministic("beta_ou",  mu_beta  + ab[:, 1])

    # --- Time effects for the outcome ---
    # One Normal(0, 1) effect per month; subtracting the mean makes the
    # effects sum to zero across the T months.
    time_raw = pm.Normal("time_raw", 0.0, 1.0, shape=T)
    time_eff = time_raw - pt.mean(time_raw)

    # --- Latent survey process x* (z-scored scale) ---
    # OU-level mean for x*, partially pooled towards mu_x.
    mu_x = pm.Normal("mu_x", 0.0, 1.0)
    sigma_mu_x = pm.HalfNormal("sigma_mu_x", 1.0)
    mu_x_ou = pm.Normal("mu_x_ou", mu_x, sigma_mu_x, shape=G)

    # Time effect for x* (optional; helps interpolate sparse waves),
    # centered the same way as time_eff.
    time_x_raw = pm.Normal("time_x_raw", 0.0, 1.0, shape=T)
    time_x_eff = time_x_raw - pt.mean(time_x_raw)

    # Latent x* per row (one value for every element of y_z) with residual
    # variance. The fixed 0.2 multiplier damps the time effect's contribution.
    sigma_x = pm.HalfNormal("sigma_x", 1.0)
    x_latent = pm.Normal(
        "x_latent",
        mu_x_ou[ou_idx] + 0.2 * time_x_eff[time_idx],  # small/shrunk time wiggle
        sigma_x,
        shape=y_z.shape[0],
    )

    # --- Measurement model where survey is observed ---
    # Despite its name, n_eff holds sqrt(n): the respondent count is floored
    # at 1 (guards rows where n_resp_filled is 0), square-rooted, and clipped
    # to [1, 1000]. Dividing by it gives a per-row noise sd that decreases
    # with sqrt(n): larger n -> lower noise.
    n_eff = pt.clip(pt.sqrt(pt.maximum(pt.as_tensor_variable(n_resp_filled), 1.0)), 1.0, 1000.0) # type: ignore
    sigma_meas_base = pm.HalfNormal("sigma_meas_base", 1.0)
    sigma_meas = sigma_meas_base / n_eff

    # Only impose the likelihood on rows selected by has_survey; the same
    # mask is applied to the mean, the noise sd and the observed values.
    pm.Normal(
        "survey_obs",
        mu=x_latent[has_survey],
        sigma=sigma_meas[has_survey],
        observed=(x_z[has_survey]),
    )

    # --- Outcome model: log productivity (z-scored) ---
    # Residual noise
    sigma_y = pm.HalfNormal("sigma_y", 0.5)

    # Row-wise mean: OU intercept + month effect + OU slope * latent survey.
    mu_y = alpha_ou[ou_idx] + time_eff[time_idx] + beta_ou[ou_idx] * x_latent
    y_like = pm.Normal("y_like", mu=mu_y, sigma=sigma_y, observed=y_z)

    # --- Convenience deterministics ---
    # corr_alpha_beta is the off-diagonal entry of the intercept/slope
    # correlation matrix; beta_global is an alias of mu_beta. Neither is
    # transformed back to the original data scale.
    pm.Deterministic("corr_alpha_beta", corr_ab[0, 1])
    pm.Deterministic("beta_global", mu_beta)

    # ---- 4) Sample ----
    # 4 chains on 4 cores, 1000 tuning + 1000 retained draws each, fixed seed.
    idata = pm.sample(
        draws=1000,
        tune=1000,
        chains=4,
        cores=4,
        target_accept=0.9,
        random_seed=7,
        progressbar=True,
    )

In [None]:

# ---- 5) Quick checks ----
# Summary table for the main scalar parameters of the model.
# NOTE(review): kind="stats" should limit the table to summary statistics;
# confirm against the ArviZ docs whether r_hat / ESS need a separate call.
summary_vars = [
    "mu_alpha",
    "mu_beta",
    "sigma_y",
    "sigma_x",
    "sigma_mu_x",
    "sigma_meas_base",
    "corr_alpha_beta",
]
az.summary(idata, var_names=summary_vars, kind="stats")