In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report

import pymc as pm
import arviz as az  



In [3]:
# Indicator columns that are boolean in the CSV but must enter the model
# as numeric 0/1 features.
bool_cols = [
    "elisa_pos",
    "row_single_only_znt8",
    "row_single_only_znt8_dyn",
    "any_follow_up",
    "any_row_early",
    "any_row_early_dyn",
]

# Load the modelling table and cast the indicator columns (True/False → 1/0)
# in a single chained expression, so re-running this cell always starts from
# the file on disk rather than from a partially converted frame.
model_df = pd.read_csv("frida-model.csv").astype(
    {col: int for col in bool_cols}
)

In [4]:
# Build the design matrix, standardize it without test-set leakage, and
# produce the train/test split.
#
# Names defined here for later cells:
#   feature_cols, n_base, feature_names, X_full, y, scaler,
#   X_cont_scaled, X, X_train, X_test, y_train, y_test

# Label vector
y = model_df["label_early_stage"].astype(int).values

# Columns that get standardized ("continuous-ish": measurements plus the
# 0/1 indicators that are scaled together with them).
feature_cols = [
    "elisa", "gada_trunc", "ia2", "m_iaa", "znt8_c_arg", "znt8_c_tryp",
    "age_at_sample", "any_fdr", "elisa_pos",
    "effective_AB_positive", "effective_AB_positive_dyn",
    "row_single_only_znt8", "row_single_only_znt8_dyn",
    "any_follow_up", "span_days", "n_rows",
    "any_row_early", "any_row_early_dyn",
]
n_base = len(feature_cols)   # number of continuous-ish features

# Select columns by name instead of relying on the CSV column order: the
# first n_base columns of X_full are exactly feature_cols, followed by every
# remaining non-label column in its original order (the unscaled
# missing-value flags, per the original notes).
extra_cols = [
    col for col in model_df.columns
    if col != "label_early_stage" and col not in feature_cols
]
feature_names = feature_cols + extra_cols

# Force a float matrix; a frame mixing bool/int/float columns would otherwise
# come back as an object array.
X_full = model_df[feature_names].to_numpy(dtype=float)

# Fail early with a clear message rather than passing NaN/inf on to the
# sampler.
if not np.isfinite(X_full).all():
    raise ValueError(
        "Feature matrix contains NaN or infinite values; impute or drop them "
        "before scaling and modelling."
    )

# Split row indices first. The partition depends only on the number of rows,
# y, test_size and random_state, so it selects the same rows as splitting the
# feature matrix directly with these settings.
row_idx = np.arange(len(y))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_idx, y, test_size=0.25, stratify=y, random_state=42
)

# Fit the scaler on training rows only, so test-set statistics never leak
# into the features; then apply that same transform to every row.
scaler = StandardScaler()
scaler.fit(X_full[train_idx, :n_base])
X_cont_scaled = scaler.transform(X_full[:, :n_base])
X = np.hstack([X_cont_scaled, X_full[:, n_base:]])  # scaled cont + raw missing flags

X_train, X_test = X[train_idx], X[test_idx]


In [7]:
# Bayesian logistic regression fitted on the training split.
#
# Produces: n_features, bayes_logit (the model), idata (posterior draws,
# including the deterministic "p").

# Cast explicitly so the data container always receives a float matrix, even
# if an upstream step left X_train with object dtype.
X_train_arr = np.asarray(X_train, dtype=float)
n_features = X_train_arr.shape[1]

with pm.Model() as bayes_logit:
    # Data containers (train set); kept mutable so they can be replaced later
    # (e.g. with the test set via pm.set_data).
    # NOTE(review): newer PyMC releases may fold MutableData into pm.Data —
    # confirm against the installed version.
    X_data = pm.MutableData("X_data", X_train_arr)
    y_data = pm.MutableData("y_data", y_train)

    # Priors on coefficients and intercept.
    # Normal(0, 1) shrinks coefficients toward 0 unless data strongly supports them.
    beta = pm.Normal("beta", mu=0, sigma=1.0, shape=n_features)
    intercept = pm.Normal("intercept", mu=0, sigma=2.0)

    # Linear predictor
    logits = intercept + pm.math.dot(X_data, beta)

    # Probabilities are stored in the trace for inspection only; the
    # likelihood below does not depend on this node.
    p = pm.Deterministic("p", pm.math.sigmoid(logits))

    # Likelihood (Bernoulli outcomes), parameterized on the logit scale so
    # the log-probability stays finite when sigmoid(logits) would round to
    # exactly 0 or 1.
    y_obs = pm.Bernoulli("y_obs", logit_p=logits, observed=y_data)

    # Sample from the posterior.
    # NOTE(review): the recorded run ended with "Not enough samples to build
    # a trace" — presumably sampling stopped before any post-tuning draws
    # were collected (interrupt or worker failure); if it recurs, try
    # cores=1 to surface the underlying error — TODO confirm.
    idata = pm.sample(
        draws=2000, tune=1000, chains=4, target_accept=0.9, random_seed=42
    )


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta, intercept]


ValueError: Not enough samples to build a trace.