In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append("../")
from PREDICT import PREDICT
from PREDICT.Models import *
from PREDICT.Metrics import *
from PREDICT.Triggers import *
from PREDICT.Plots import *
from Comparison.Detect_Functions import *
import numpy as np
import pandas as pd
from datetime import timedelta
import datetime
import datetime as dt
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az
import bambi as bmb

import warnings
warnings.filterwarnings('ignore')

%env PYTENSOR_FLAGS=exception_verbosity=high,floatX=float32

env: PYTENSOR_FLAGS=exception_verbosity=high,floatX=float32


In [None]:
# --- Simulation configuration ---------------------------------------------------
# Seed NumPy's global generator so the simulated cohorts are repeatable on
# "Restart & Run All".
# NOTE(review): this only covers code drawing from np.random's global state; the
# MCMC sampler and any helper with its own generator are not seeded here.
SEED = 42
np.random.seed(SEED)

recalthreshold = 0.851 # Paper has AUROC of 0.889, with lower CI at 0.851

# Scenario multipliers for the DM prevalence. np.arange with a float step
# produces values carrying floating-point noise, which would leak into the
# str(prev_increase) labels and output file names, so round to the step precision.
prev_increases = np.round(np.arange(1.005, 1.05, 0.005), 3).tolist() #[1.0001]
#prev_increases = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

# Running count of undetected changes per detection method (updated by the
# recalibration / Bayesian helpers in the main loop).
undetected = {"Static Threshold": 0, "Regular Testing": 0, "SPC3": 0, "SPC5": 0, "SPC7": 0, "Bayesian": 0}
bayes_dict = {"BayesianCoefficients":{}}

# Mean / standard deviation of each continuous covariate used to simulate patients.
mean_TGFB, std_TGFB = 13.23, 5.18
mean_ADMA, std_ADMA= 101.1, 64.8
mean_BUN, std_BUN = 5.45, 1.11
mean_age, std_age = 63.27, 10.09

# Coefficients of the data-generating logistic model (no intercept term is used).
TGFB_coef = 1.84
ADMA_coef = 1.137
DM_coef = 0.84
BUN_coef = 0.497
elderly_coef = 0.603

# Baseline DM prevalence used by the simulation (5%).
# NOTE(review): the previous comment here said 5.5% while the value is 0.05 — confirm which is intended.
perc_dm = 0.05


# Simulation window (day-first date strings).
startDate = pd.to_datetime('01-06-2019', dayfirst=True) # 01-06-2019
endDate = pd.to_datetime('31-12-2021', dayfirst=True) # 31-12-2021
numdays = (endDate - startDate).days

In [None]:
# Simulate a stationary cohort (fixed DM prevalence) to pre-train the reference model.
num_patients = 500
numdays_pretrain = 1000

pretrain_columns = ['date', 'outcome', 'prediction', 'TGFB', 'ADMA', 'DM', 'BUN', 'elderly']
mydict = {column: list() for column in pretrain_columns}

for day_offset in range(numdays_pretrain):
    curday = startDate + dt.timedelta(days=day_offset)

    # Covariate draws for the day. The order of these calls is kept fixed so the
    # random stream is consumed in the same sequence.
    # presumably get_binom_from_normal binarises a normal draw at the given cut-off — TODO confirm
    TGFB = get_binom_from_normal(mean_TGFB, std_TGFB, num_patients, 1.011)
    ADMA = get_binom_from_normal(mean_ADMA, std_ADMA, num_patients, 0.019)
    DM = np.random.binomial(1, perc_dm, num_patients)
    BUN = get_binom_from_normal(mean_BUN, std_BUN, num_patients, 5.9)
    elderly = get_binom_from_normal(mean_age, std_age, num_patients, 60)
    noise = np.random.normal(0, 0.2, num_patients)  # error term: mean 0, std 0.2

    # Log-odds from the non-genetic risk score model (paper coefficients).
    log_odds = TGFB_coef * TGFB + ADMA_coef * ADMA + DM_coef * DM + BUN_coef * BUN + elderly_coef * elderly

    # The stored prediction is noise-free; outcomes are sampled from the noisy probability.
    curpredictions = 1 / (1 + np.exp(-log_odds))
    noisy_prob = 1/(1+np.exp(-(log_odds + noise)))
    curoutcomes = np.random.binomial(1, noisy_prob)

    # Append the day's rows column by column.
    day_values = {
        'date': [curday] * num_patients,
        'outcome': curoutcomes,
        'prediction': curpredictions,
        'TGFB': TGFB,
        'ADMA': ADMA,
        'DM': DM,
        'BUN': BUN,
        'elderly': elderly,
    }
    for column, values in day_values.items():
        mydict[column].extend(values)

pretrain_data = pd.DataFrame(mydict)

In [8]:
# Fit the reference Bernoulli (logistic) model on the simulated pre-training cohort.
pretrain_formula = "outcome ~ TGFB + ADMA + DM + BUN + elderly"
prefit_model = bmb.Model(pretrain_formula, pretrain_data, family="bernoulli")

# Sampler settings are gathered in one place so they are easy to tune.
sampler_settings = dict(tune=2000, draws=15000, cores=8, chains=4, target_accept=0.9)
prefit_fitted = prefit_model.fit(**sampler_settings)

# Posterior summary table; its mean/sd columns are the source of the priors used later.
az.summary(prefit_fitted)

Modeling the probability that outcome==1


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, TGFB, ADMA, DM, BUN, elderly]


Output()

Sampling 4 chains for 2_000 tune and 15_000 draw iterations (8_000 + 60_000 draws total) took 302 seconds.


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
Intercept,-0.17,0.098,-0.354,0.014,0.0,0.0,79486.0,49266.0,1.0
TGFB,2.012,0.085,1.851,2.171,0.0,0.0,80931.0,46674.0,1.0
ADMA,1.074,0.051,0.98,1.171,0.0,0.0,80355.0,47475.0,1.0
DM,0.99,0.123,0.758,1.219,0.0,0.001,76601.0,43040.0,1.0
BUN,0.49,0.04,0.415,0.565,0.0,0.0,78541.0,48091.0,1.0
elderly,0.66,0.035,0.595,0.726,0.0,0.0,77872.0,47769.0,1.0


In [9]:
# (mean, sd) for each model term, transcribed from the posterior summary of the
# pre-training fit; used as priors for the Bayesian model in the simulation loop.
bayesian_priors = dict(
    Intercept=(-0.170, 0.098),
    TGFB=(2.012, 0.085),
    ADMA=(1.074, 0.051),
    DM=(0.990, 0.123),
    BUN=(0.490, 0.04),
    elderly=(0.660, 0.035),
)

In [10]:
# Get bootstrap OE with CI
# Bootstrap the observed/expected (O/E) ratio of the pre-trained model on the
# pre-training cohort. The spread of these values is what the static O/E
# recalibration thresholds are based on.

# Posterior means of the pre-trained model (first element of each prior tuple),
# reused here instead of repeating the numbers by hand.
prior_mean = {term: stats[0] for term, stats in bayesian_priors.items()}

# Linear predictor, then inverse-logit to get predicted probabilities.
preds = prior_mean["Intercept"] + prior_mean["TGFB"] * pretrain_data['TGFB'].values + prior_mean["ADMA"] * pretrain_data['ADMA'].values +\
    prior_mean["DM"] * pretrain_data['DM'].values + prior_mean["BUN"] * pretrain_data['BUN'].values + prior_mean["elderly"] * pretrain_data['elderly'].values
preds = 1 / (1 + np.exp(-preds))
outcome = pretrain_data['outcome'].values

n_bootstrap = 1000
n_obs = len(outcome)
oe_values = []
for _ in range(n_bootstrap):
    # Passing the population size as an int samples from arange(n_obs) without
    # rebuilding an index array from a Python range on every iteration.
    boot_indices = np.random.choice(n_obs, size=n_obs, replace=True)
    # O/E = mean observed outcome / mean predicted probability in the resample.
    oe_values.append(outcome[boot_indices].mean() / preds[boot_indices].mean())

print(f"Pretrain OE: {np.mean(oe_values)} with std: {np.std(oe_values)} and 95% CI: {np.percentile(oe_values, 2.5)} - {np.percentile(oe_values, 97.5)}")

Pretrain OE: 1.0000043910390126 with std: 0.0006133405921358241 and 95% CI: 0.9988624196578121 - 1.001185395447341


In [None]:
# Output folder for this experiment; created on first use.
resultsloc = "./Results/simulation/slow_change"
os.makedirs(resultsloc, exist_ok=True)

# Write a header-only metrics CSV once; later cells append rows to it without a header.
metrics_csv = os.path.join(resultsloc, 'performance_metrics.csv')
if not os.path.exists(metrics_csv):
    metric_columns = ['Time', 'Accuracy', 'AUROC', 'Precision', 'CalibrationSlope', 'CITL',
    'OE', 'AUPRC', 'F1Score', 'impact_or_prev', 'Method', 'Data_Type']
    header = pd.DataFrame(columns=metric_columns)
    header.to_csv(metrics_csv, index=False)

In [None]:
def _add_metric_hooks(test, model):
    """Attach the standard performance log hooks for `model` to a PREDICT run."""
    for hook in (Accuracy, AUROC, Precision, CalibrationSlope, CITL, OE, AUPRC):
        test.addLogHook(hook(model))


def _log_to_metrics(log, method, impact_label):
    """Convert a PREDICT log into a per-timestep metrics DataFrame.

    Args:
        log: mapping of metric name -> {time: value}, as returned by getLog().
        method: value written to the 'Method' column for every row.
        impact_label: scenario identifier; stored as a string in 'impact_or_prev'.

    Returns:
        DataFrame with one row per entry of log["Accuracy"].
    """
    n_rows = len(log["Accuracy"])
    return pd.DataFrame({
        'Time': list(log["Accuracy"].keys()),
        'Accuracy': list(log["Accuracy"].values()),
        'AUROC': list(log["AUROC"].values()),
        'Precision': list(log["Precision"].values()),
        'CalibrationSlope': list(log["CalibrationSlope"].values()),
        'CITL': list(log["CITL"].values()),
        'OE': list(log["O/E"].values()),  # the log stores this metric under the key "O/E"
        'AUPRC': list(log["AUPRC"].values()),
        'impact_or_prev': [str(impact_label)] * n_rows,
        'Method': [method] * n_rows,
    })


def _append_metrics(frame, path):
    """Append `frame` to the CSV at `path`, aligned by name to the file's header.

    Rows are written without a header, so a frame whose columns differ from the
    file header (e.g. one lacking 'F1Score') would otherwise be written into the
    wrong columns. Columns missing from the frame are written as empty values.
    """
    header_columns = pd.read_csv(path, nrows=0).columns.tolist()
    extra_columns = [col for col in frame.columns if col not in header_columns]
    if extra_columns:
        # Surface rather than silently drop data the file has no column for.
        print(f"Columns not present in {path} and therefore not written: {extra_columns}")
    frame.reindex(columns=header_columns).to_csv(path, mode='a', header=False, index=False)


metrics_path = os.path.join(resultsloc, 'performance_metrics.csv')

# Get OE thresholds for static recal from original model: mean +/- 3 std of the
# bootstrapped pre-training O/E values, computed from the bootstrap results rather
# than copied by hand (the previously hard-coded mean did not match the bootstrap output).
pretrain_oe_mean = np.mean(oe_values)
pretrain_oe_std = np.std(oe_values)
recalthreshold_lower = pretrain_oe_mean - 3 * pretrain_oe_std
recalthreshold_upper = pretrain_oe_mean + 3 * pretrain_oe_std
print(f"Using OE Threshold of {recalthreshold_lower} - {recalthreshold_upper}")

# Per-coefficient detection band: prior mean +/- 3 prior sd (same for every scenario).
bayes_coef_ci = {
    key: (bayesian_priors[key][0] - 3 * bayesian_priors[key][1], bayesian_priors[key][0] + 3 * bayesian_priors[key][1])
    for key in bayesian_priors
}

for prev_increase in prev_increases:
    # Fresh per-scenario state. bayes_dict is reset as well so coefficients logged
    # for one scenario can never be carried over into the next.
    regular_ttd = []
    static_ttd = []
    spc_ttd3 = []
    spc_ttd5 = []
    spc_ttd7 = []
    bayesian_ttd = []
    bayes_dict = {"BayesianCoefficients": {}}
    mydict = {
        'date': list(),
        'outcome': list(),
        'prediction': list(),
        'TGFB': list(),
        'ADMA':list(),
        'DM': list(),
        'BUN': list(),
        'elderly': list()
    }

    ########################################### Simulate data #######################################
    for i in range(numdays):
        curday = startDate + dt.timedelta(days=i)

        TGFB = get_binom_from_normal(mean_TGFB, std_TGFB, num_patients, 1.011)
        ADMA = get_binom_from_normal(mean_ADMA, std_ADMA, num_patients, 0.019)
        # DM prevalence changes once per 30-day block and is capped at 0.99.
        # NOTE(review): np.floor(i/30) is 0 for the first 30 days, so DM prevalence
        # is 0 in the first block, and afterwards it grows by roughly perc_dm per
        # block with prev_increase acting only as a scale factor. If a compounding
        # increase was intended this would be perc_dm * prev_increase ** np.floor(i/30) — confirm.
        DM = np.random.binomial(1, min(perc_dm * np.floor(i/30)*prev_increase, 0.99), num_patients)
        BUN = get_binom_from_normal(mean_BUN, std_BUN, num_patients, 5.9)
        elderly = get_binom_from_normal(mean_age, std_age, num_patients, 60)
        epsilon = np.random.normal(0, 0.2, num_patients) # Simulate error term (mean=0, std=0.2)

        # Calculate baseline log-odds
        # non_genetic_risk_score_model from paper
        lp = TGFB_coef * TGFB + ADMA_coef * ADMA + DM_coef * DM + BUN_coef * BUN + elderly_coef * elderly

        curpredictions = 1 / (1 + np.exp(-lp))  # Convert to probability
        mod_prob = 1/(1+np.exp(-(lp + epsilon)))  # outcome probability includes the error term
        curoutcomes = np.random.binomial(1, mod_prob)

        # Append the day's simulated patients to the column lists
        mydict['date'].extend([curday] * num_patients)
        mydict['outcome'].extend(curoutcomes)
        mydict['prediction'].extend(curpredictions)
        mydict['TGFB'].extend(TGFB)
        mydict['ADMA'].extend(ADMA)
        mydict['DM'].extend(DM)
        mydict['BUN'].extend(BUN)
        mydict['elderly'].extend(elderly)

    df = pd.DataFrame(mydict)
    df = prevent_constant_variable(df, startDate, endDate)

    ########################################### Baseline Testing #######################################
    model = EvaluatePredictions()
    mytest = PREDICT(data=df, model=model, startDate='min', endDate='max', timestep='month')
    _add_metric_hooks(mytest, model)
    mytest.run()
    log = mytest.getLog()

    baseline_metrics = _log_to_metrics(log, 'Baseline', prev_increase)

    ########################################### Save Metrics #######################################
    baseline_metrics["Data_Type"] = "Slow Change Simulation"
    _append_metrics(baseline_metrics, metrics_path)

    ############################################ Recalibration Testing #######################################
    # NOTE(review): perc_dm is passed as the second argument; if that parameter is
    # the scenario label written to the metrics, it should probably be prev_increase — confirm.
    slow_change_metrics_df = get_metrics_recal_methods(df, perc_dm, recalthreshold_lower, recalthreshold_upper, model_name='slow_change_datasim')
    undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7 = run_recalibration_tests(df, startDate, undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, recalthreshold_lower, recalthreshold_upper)

    ########################################### Bayesian Testing #######################################
    bay_model = BayesianModel(input_data=df,
                            model_formula = "outcome ~ TGFB + ADMA + DM + BUN + elderly",
                            priors = bayesian_priors,
                            verbose=False, draws=10000, tune=2000, chains=4, cores=8, target_accept=0.9)
    bay_model.trigger = AlwaysTrigger(model=bay_model)
    mytest = PREDICT(data=df, model=bay_model, startDate='min', endDate='max', timestep='month')
    _add_metric_hooks(mytest, bay_model)
    mytest.addLogHook(TrackBayesianCoefs(bay_model))
    mytest.run()
    log = mytest.getLog()

    if "BayesianCoefficients" in log:
        bayes_dict["BayesianCoefficients"].update(log["BayesianCoefficients"])

    ttd = find_bayes_coef_change(bayes_dict["BayesianCoefficients"], detectDate=startDate, undetected=undetected, thresholds=bayes_coef_ci)
    bayesian_ttd.append(ttd)

    # Label Bayesian rows with the scenario (prev_increase), matching the baseline
    # rows; labelling them with the constant perc_dm made scenarios indistinguishable.
    bayes_metrics = _log_to_metrics(log, 'Bayesian', prev_increase)

    ########################################### Save Metrics #######################################

    # concatenate all the dataframes into one
    slow_change_metrics_df = pd.concat([slow_change_metrics_df, bayes_metrics], ignore_index=True)
    slow_change_metrics_df["Data_Type"] = "Slow Change Simulation"

    _append_metrics(slow_change_metrics_df, metrics_path)
    update_ttd_table(regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, prev_increase, os.path.join(resultsloc, 'input_prev_ttd_tbl.csv'))

    # Generate plots
    plot_incidence_over_time(df, None, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, 'slow_change_'+str(prev_increase), fileloc=resultsloc)
    BayesianCoefsPlot(bayes_dict, 'slow_change_'+str(prev_increase), fileloc=resultsloc)
    pd.DataFrame(bayes_dict["BayesianCoefficients"]).to_csv(os.path.join(resultsloc, f'bayesian_coefficients_impact_{prev_increase}.csv'), index=False)

plot_time_to_detect(os.path.join(resultsloc, 'input_prev_ttd_tbl.csv'), 'slow_change')
