In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Bayesian & diagnostics
import pymc as pm
import arviz as az

# For train/test splits etc.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [4]:
# Read the cleaned modelling dataset from disk; the first CSV column
# (displayed as `uid` in the preview) is used as the DataFrame index.
clean_df = pd.read_csv(
    "models/clean_model.csv",
    index_col=0,
)

# Sanity check: print (rows, columns), then preview the first few rows
print(clean_df.shape)
clean_df.head()

(4965, 19)


Unnamed: 0_level_0,elisa,gada_trunc,ia2,m_iaa,znt8_c_arg,znt8_c_tryp,age_at_sample,any_fdr,label_early_stage,elisa_missing,gada_trunc_missing,ia2_missing,m_iaa_missing,znt8_c_arg_missing,znt8_c_tryp_missing,age_at_sample_missing,any_fdr_missing,label_single,era_uid
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
00032F0C-5916-4671-BB7B-A48314D14AF6,132.32,8.5,0.1,1.0,0.1,0.1,3.78,1.0,0,0,0,0,0,0,0,0,1,0,pre-2016
0033DB11-56A2-4E3D-9B8F-0556819CF005,1.335415,0.1,0.1,14.7,0.1,0.1,3.1,1.0,0,0,0,0,0,0,0,0,1,1,2019-2021
005B2E43-F96E-406F-AB19-BECB1692A4C6,26.92,0.1,0.1,0.7,0.1,0.1,3.92,1.0,0,0,0,0,0,0,0,0,1,0,pre-2016
00612B9E-AB0E-4E25-9078-219711028F73,54.409,3.8,0.1,0.6,0.1,0.1,2.16,1.0,0,0,0,0,0,0,0,0,1,0,2016-2018
00731613-E23A-4DA4-849F-9D1762C9E3D7,130.64,0.1,0.1,1.0,0.1,0.1,5.25,1.0,0,0,0,0,1,0,0,0,1,0,2022+


### 1. Bayesian Logistic Regression

In [5]:
# Predictor columns fed to the Bayesian logistic regression
feature_cols = [
    "gada_trunc",
    "ia2",
    "m_iaa",
    "znt8_c_arg",
    "znt8_c_tryp",
    "age_at_sample",
    "any_fdr",
]

# Outcome column modelled with a Bernoulli likelihood
target_col = "label_early_stage"

# Keep only rows with a valid label
bayes_df = clean_df.dropna(subset=[target_col]).copy()
y = bayes_df[target_col].astype(int).values

# The Bernoulli likelihood only accepts 0/1 outcomes; fail here with a clear
# message instead of deep inside the sampler.
assert np.isin(y, (0, 1)).all(), f"{target_col} must contain only 0/1 labels"

# Simple imputation: fill missing values in each feature with that column's median
X_raw = bayes_df[feature_cols].copy()
X_raw = X_raw.fillna(X_raw.median())

# Standardize every feature column (helps sampling). Note that this scales all
# of feature_cols, not just the continuous ones.
scaler = StandardScaler()
X_std = scaler.fit_transform(X_raw)

# A column with no observed values has a NaN median, so the imputation above
# would leave NaNs in place; catch that (and any inf) before the matrix
# reaches the model.
assert np.isfinite(X_std).all(), "X_std contains NaN/inf after imputation and scaling"

print("X_std shape:", X_std.shape)
print("Positive rate:", y.mean())

X_std shape: (4965, 7)
Positive rate: 0.13474320241691842


In [6]:
# Named coordinates: one entry per observation (row of X_std) and one per feature,
# so `beta` is labelled by feature name in the resulting trace.
coords = {
    "obs_id": np.arange(X_std.shape[0]),
    "feature": feature_cols,
}

# NOTE(review): the last recorded run of this cell ended with
# "ValueError: Not enough samples to build a trace." right after NUTS started.
# Presumably no chain had produced any post-tuning draws (e.g. the run was
# interrupted) -- confirm by re-running; if multiprocess sampling stalls in
# this environment, passing cores=1 to pm.sample is worth trying.
with pm.Model(coords=coords) as bayes_logit:

    # Data containers
    X_data = pm.Data("X", X_std, dims=("obs_id", "feature"))
    y_data = pm.Data("y", y, dims="obs_id")

    # Priors on coefficients (weakly informative)
    beta = pm.Normal("beta", mu=0, sigma=2, dims="feature")
    intercept = pm.Normal("intercept", mu=0, sigma=5)

    # Linear predictor on the log-odds scale
    eta = intercept + pm.math.dot(X_data, beta)

    # Symbolic success probability. The likelihood below is parameterised by
    # eta directly; `p` is kept defined only as a convenience handle.
    p = pm.math.sigmoid(eta)

    # Likelihood. Passing the log-odds via `logit_p` avoids computing
    # log(sigmoid(eta)) from a probability that has saturated to exactly 0 or 1
    # when |eta| is large.
    outcome = pm.Bernoulli("outcome", logit_p=eta, observed=y_data, dims="obs_id")

    # Sample from posterior
    trace_logit = pm.sample(
        draws=2000,
        tune=2000,
        target_accept=0.9,
        chains=4,
        random_seed=42,
    )

    # Compute posterior predictive for in-sample data (optional).
    # Seeded like the sampler so the predictive draws are reproducible too.
    ppc_logit = pm.sample_posterior_predictive(
        trace_logit,
        var_names=["outcome"],
        random_seed=42,
    )

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta, intercept]


ValueError: Not enough samples to build a trace.