In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report

import pymc as pm
import arviz as az  

In [5]:
model_df = pd.read_csv("frida-model.csv")

bool_cols = [
    "elisa_pos","row_single_only_znt8","row_single_only_znt8_dyn",
    "any_follow_up","any_row_early","any_row_early_dyn",
]
model_df[bool_cols] = model_df[bool_cols].astype(int)

X_full = model_df.drop(columns=["label_early_stage"])
y = model_df["label_early_stage"].astype(int).values

In [6]:
feature_cols = [
    "elisa","gada_trunc","ia2","m_iaa","znt8_c_arg","znt8_c_tryp",
    "age_at_sample","any_fdr","elisa_pos",
    "effective_AB_positive","effective_AB_positive_dyn",
    "row_single_only_znt8","row_single_only_znt8_dyn",
    "any_follow_up","span_days","n_rows",
    "any_row_early","any_row_early_dyn"
]
n_base = len(feature_cols)

X_full = X_full.values
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X_full[:, :n_base])
X = np.hstack([X_scaled, X_full[:, n_base:]])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [7]:
coords = {"obs_id": np.arange(X_train.shape[0])}

with pm.Model(coords=coords) as bayes_logit:

    X_data = pm.MutableData("X_data", X_train, dims=("obs_id", "feature"))
    y_data = pm.MutableData("y_data", y_train, dims="obs_id")

    beta = pm.Normal("beta", mu=0, sigma=1, shape=X_train.shape[1], dims="feature")
    intercept = pm.Normal("intercept", mu=0, sigma=2)

    logits = intercept + pm.math.dot(X_data, beta)
    p = pm.Deterministic("p", pm.math.sigmoid(logits), dims="obs_id")

    y_obs = pm.Bernoulli("y_obs", p=p, observed=y_data)

    idata = pm.sample(
        draws=1500, tune=1000, chains=4, target_accept=0.9, random_seed=42
    )


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta, intercept]


Sampling 4 chains for 1_000 tune and 1_500 draw iterations (4_000 + 6_000 draws total) took 35 seconds.


In [8]:
coef_means = idata.posterior["beta"].mean(dim=("chain","draw")).values
coef_series = pd.Series(coef_means, index=model_df.drop(columns=["label_early_stage"]).columns)
coef_series.sort_values(key=np.abs, ascending=False).head(10)

n_rows                   2.794239
m_iaa_missing           -1.247875
gada_trunc_missing       0.645969
any_fdr_missing         -0.625668
ia2_missing              0.624946
age_at_sample_missing    0.576810
elisa                    0.560458
any_follow_up            0.553753
znt8_c_tryp_missing     -0.537060
span_days               -0.530464
dtype: float64

In [9]:
beta_draws = idata.posterior["beta"].stack(sample=("chain","draw")).values.T
intercept_draws = idata.posterior["intercept"].stack(sample=("chain","draw")).values

logits = intercept_draws[:, None] + beta_draws @ X_test.T
p_samples = 1 / (1 + np.exp(-logits))

p_mean = p_samples.mean(axis=0)

In [10]:
print("ROC AUC:", roc_auc_score(y_test, p_mean))
print("PR AUC:", average_precision_score(y_test, p_mean))
print(classification_report(y_test, (p_mean >= 0.5).astype(int)))

ROC AUC: 0.9793252085809828
PR AUC: 0.9431577122120716
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1078
           1       0.90      0.88      0.89       167

    accuracy                           0.97      1245
   macro avg       0.94      0.93      0.94      1245
weighted avg       0.97      0.97      0.97      1245



In [12]:
idata.to_netcdf("bayes_logit_model.nc")
np.save("scaler.npy", scaler)
np.save("X_train_order.npy", X_train.mean(axis=0))   # sanity check