In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.append("../")
from PREDICT import PREDICT
from PREDICT.Models import *
from PREDICT.Metrics import *
from PREDICT.Triggers import *
from PREDICT.Plots import *
from Comparison.Detect_Functions import *
import numpy as np
import pandas as pd
from datetime import timedelta
import datetime
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az
import bambi as bmb

import warnings
warnings.filterwarnings('ignore')

%env PYTENSOR_FLAGS=exception_verbosity=high,floatX=float32

env: PYTENSOR_FLAGS=exception_verbosity=high,floatX=float32


In [None]:
# Configuration for the COVID data simulation: impact levels, switch date,
# undetected-counter per method, and the odds ratios used to build the linear predictor.

#recalthreshold = 0.86 # Paper has AUROC of 0.91, with lower CI at 0.86

# Seed NumPy's global RNG so the cohorts drawn with np.random.* in the cells below are
# repeatable on Restart & Run All. MCMC sampling may still vary between runs.
SEED = 42
np.random.seed(SEED)

# One full simulation is run per impact value.
# The second entry used to be 0.75, which duplicated the later 0.75 (so that impact was
# simulated and appended to the results twice) and broke the otherwise ascending order.
# NOTE(review): 0.075 is presumed to be the intended value — confirm against the study design.
custom_impacts = [0.05, 0.075, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 1.0]
#custom_impacts = [0.5]#, 0.6, 0.7, 0.8, 0.9, 1.0]  # Faster testing with fewer impact levels
switchDateStrings = ['01-04-2020'] # Keep this as just one switchDate as other methods only look at one startDate/deployment date

# Per-method counter, passed to the detection helpers in the main loop.
undetected = {"Static Threshold": 0, "Regular Testing": 0, "SPC3": 0, "SPC5": 0, "SPC7": 0, "Bayesian": 0}

# Odds ratios for the predictors; their natural logs are the coefficients of the linear predictor.
or_age = 1.05
or_ldh = 2.5
or_comorbidity = 3.9

log_age = np.log(or_age)
log_ldh = np.log(or_ldh)
log_comorbidity = np.log(or_comorbidity)

# Accumulator for the coefficients logged by the Bayesian runs.
bayes_dict = {"BayesianCoefficients": {}}

In [None]:
# Synthetic cohort used to pre-fit the model before the monitored period.
pretrain_samples = 8000

# Covariates, drawn in the same order as before: age, sex, comorbidity, LDH, noise.
age = np.random.normal(loc=44, scale=16.3, size=pretrain_samples).astype(int)  # mean 44, SD 16.3, truncated to int
sex = np.random.binomial(n=1, p=0.562, size=pretrain_samples)                  # 56.2% male
comorbidity = np.random.binomial(n=1, p=0.3, size=pretrain_samples)            # 30% with comorbidities
ldh_high = np.random.binomial(n=1, p=0.15, size=pretrain_samples)              # 15% with LDH >500 U/L
epsilon = np.random.normal(loc=0, scale=0.08, size=pretrain_samples)           # error term, mean 0, SD 0.08

# Baseline log-odds. The sex coefficient (1.2) is an assumed value because
# the paper does not provide one.
lp = -1.5 + log_age * age + log_ldh * ldh_high + log_comorbidity * comorbidity + 1.2 * sex

# The stored prediction is the noise-free probability ...
curpredictions = 1 / (1 + np.exp(-lp))

# ... while the simulated outcome is drawn from the probability with the error term added.
noisy_lp = lp + epsilon
outcome_probability = 1 / (1 + np.exp(-noisy_lp))
curoutcomes = np.random.binomial(1, outcome_probability)

# Every pre-training row carries the same placeholder date.
pretrain_date = pd.to_datetime('01-01-1999')
pretrain_columns = {
    'date': [pretrain_date] * pretrain_samples,
    'outcome': curoutcomes,
    'prediction': curpredictions,
    'age': age,
    'sex': sex,
    'comorbidity': comorbidity,
    'ldh_high': ldh_high,
}
pretrain_data = pd.DataFrame(pretrain_columns)

In [None]:
# Fit a Bernoulli-family Bambi model to the pre-training cohort and summarise the posterior.
prefit_formula = "outcome ~ age + sex + comorbidity + ldh_high"
prefit_model = bmb.Model(prefit_formula, pretrain_data, family="bernoulli")

# Sampler settings gathered in one place so they are easy to tune.
prefit_sampler_kwargs = dict(tune=2000, draws=15000, cores=8, chains=4, target_accept=0.9)
prefit_fitted = prefit_model.fit(**prefit_sampler_kwargs)

# Last expression of the cell: displays the posterior summary table.
az.summary(prefit_fitted)

Modeling the probability that outcome==1


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()

In [10]:
# Output folder for this experiment; created on demand.
resultsloc = "./Results/simulation/fast_change"
os.makedirs(resultsloc, exist_ok=True)

# Write a header-only CSV the first time through; later cells append rows without a header.
metrics_csv = os.path.join(resultsloc, 'performance_metrics.csv')
metric_columns = [
    'Time', 'Accuracy', 'AUROC', 'Precision', 'CalibrationSlope', 'CITL',
    'OE', 'AUPRC', 'F1Score', 'impact_or_prev', 'Method', 'Data_Type',
]
if not os.path.exists(metrics_csv):
    header = pd.DataFrame(columns=metric_columns)
    header.to_csv(metrics_csv, index=False)

In [None]:
# Simulation window (inclusive start, exclusive end when iterated day by day).
startDate = pd.Timestamp(year=2019, month=6, day=1)    # 01-06-2019
endDate = pd.Timestamp(year=2021, month=12, day=31)    # 31-12-2021

# Number of simulated patients generated for each daily timestep.
num_patients = 200

In [None]:
def _add_metric_hooks(test, hook_model):
    """Register the eight performance-metric log hooks on ``test``.

    Args:
        test: PREDICT instance the hooks are added to.
        hook_model: model object handed to every metric constructor. Using one
            argument for all hooks guarantees a run never mixes models.
    """
    for metric in (Accuracy, AUROC, Precision, CalibrationSlope, CITL, OE, AUPRC, F1Score):
        test.addLogHook(metric(hook_model))


def _log_to_metrics(log, impact, method):
    """Convert a PREDICT log into a tidy per-timestep metrics DataFrame.

    Args:
        log: mapping returned by ``getLog()``; read here under the keys
            "Accuracy", "AUROC", "Precision", "CalibrationSlope", "CITL",
            "O/E", "AUPRC" and "F1score".
        impact: impact value for this run; stored as a string in ``impact_or_prev``.
        method: label written to the ``Method`` column.

    Returns:
        DataFrame with one row per entry of ``log["Accuracy"]``.
    """
    n_steps = len(log["Accuracy"])
    return pd.DataFrame({
        'Time': list(log["Accuracy"].keys()),
        'Accuracy': list(log["Accuracy"].values()),
        'AUROC': list(log["AUROC"].values()),
        'Precision': list(log["Precision"].values()),
        'CalibrationSlope': list(log["CalibrationSlope"].values()),
        'CITL': list(log["CITL"].values()),
        'OE': list(log["O/E"].values()),
        'AUPRC': list(log["AUPRC"].values()),
        'F1Score': list(log["F1score"].values()),
        'impact_or_prev': [str(impact)] * n_steps,
        'Method': [method] * n_steps,
    })


metrics_path = os.path.join(resultsloc, 'performance_metrics.csv')
model_name = 'COVID_datasim'
bayes_formula = "outcome ~ age + sex + comorbidity + ldh_high"

# (mean, sd) priors for the Bayesian model.
# NOTE(review): presumably taken from the pre-fit posterior summary — confirm.
bayes_priors = {
    "Intercept": (-1.550, 0.095),
    "age": (0.049, 0.002),
    "sex": (1.191, 0.063),
    "comorbidity": (1.287, 0.082),
    "ldh_high": (0.984, 0.104),
}
# Detection band per coefficient: prior mean +/- 3 prior sd, derived from the priors
# so the two sets of numbers cannot drift apart.
bayes_coef_ci = {name: (mean - 3*sd, mean + 3*sd) for name, (mean, sd) in bayes_priors.items()}

# Dates that do not depend on the switch date or the impact level.
switchDate2 = pd.to_datetime('01-06-2020', dayfirst=True)  # Peak of the pandemic
recoveryDate = pd.to_datetime('01-06-2021', dayfirst=True)  # Start of recovery phase
numdays = (endDate - startDate).days
switch2Days = (switchDate2 - startDate).days
recoveryDays = (recoveryDate - startDate).days

for switchDateString in switchDateStrings:
    switchDate = pd.to_datetime(switchDateString, dayfirst=True)  # COVID starts spreading
    switchDays = (switchDate - startDate).days
    daystopeak = switch2Days - switchDays

    for custom_impact in custom_impacts:
        # Time-to-detect accumulators, reset for every impact level.
        regular_ttd = []
        static_ttd = []
        spc_ttd3 = []
        spc_ttd5 = []
        spc_ttd7 = []
        bayesian_ttd = []
        mydict = {
            'date': [],
            'outcome': [],
            'prediction': [],
            'age': [],
            'sex': [],
            'comorbidity': [],
            'ldh_high': [],
        }

        ########################################### Simulate data ##########################################
        for i in range(numdays):
            # `timedelta` is imported explicitly at the top of the notebook; the previous
            # `dt.timedelta` relied on an alias that is never explicitly imported.
            curday = startDate + timedelta(days=i)

            age = (np.random.normal(44, 16.3, num_patients)).astype(int)  # Mean age 44, SD 16.3
            sex = np.random.binomial(1, 0.562, num_patients) # 56.2% are male
            comorbidity = np.random.binomial(1, 0.3, num_patients)  # 30% have comorbidities
            ldh_high = np.random.binomial(1, 0.15, num_patients)  # 15% have LDH >500 U/L
            epsilon = np.random.normal(0, 0.08, num_patients) # Simulate error term (mean=0, std=0.08)

            # Calculate baseline log-odds
            # sex influence 1.2 due to not being provided in the paper
            lp = -1.5 + log_age * age +  log_ldh * ldh_high + log_comorbidity * comorbidity + 1.2 * sex
            # Predictions are frozen at the baseline log-odds, before any COVID shift is applied.
            curpredictions = 1 / (1 + np.exp(-lp))  # Convert to probability

            # Simulate COVID effects: linear ramp up to the peak, then constant until recovery.
            if switchDays <= i < switch2Days:
                lp += custom_impact * (i - switchDays) / daystopeak  # Initial impact of COVID ramping up
            elif switch2Days <= i < recoveryDays:
                lp += custom_impact  # Peak of the pandemic

            # Generate outcomes
            curoutcomes = np.random.binomial(1, 1 / (1 + np.exp(-(lp + epsilon))))  # Simulate COVID events

            # Append to dictionary
            mydict['date'].extend([curday] * num_patients)
            mydict['outcome'].extend(curoutcomes)
            mydict['prediction'].extend(curpredictions)
            mydict['age'].extend(age)
            mydict['sex'].extend(sex)
            mydict['comorbidity'].extend(comorbidity)
            mydict['ldh_high'].extend(ldh_high)

        df = pd.DataFrame(mydict)

        ########################################### Baseline Testing #######################################
        model = EvaluatePredictions()
        mytest = PREDICT(data=df, model=model, startDate='min', endDate='max', timestep='month')
        _add_metric_hooks(mytest, model)
        mytest.run()
        log = mytest.getLog()

        baseline_metrics = _log_to_metrics(log, custom_impact, 'Baseline')

        # Use baseline measure of OE score in time before switchDate to get CI (mean +/- 3 sd)
        pre_switch_oe = baseline_metrics.loc[baseline_metrics['Time'] < switchDate, 'OE']
        oe_mean = pre_switch_oe.mean()
        oe_std = pre_switch_oe.std()
        recalthreshold_lower = float(oe_mean - 3*oe_std)
        recalthreshold_upper = float(oe_mean + 3*oe_std)
        print(f"Using OE Threshold of {recalthreshold_lower} - {recalthreshold_upper} for impact {custom_impact}, mean was {oe_mean}")

        ########################################### Save Metrics #######################################
        baseline_metrics["Data_Type"] = "COVID Simulation"
        baseline_metrics.to_csv(metrics_path, mode='a', header=False, index=False)

        ########################################### Test models ##########################################
        covid_metrics_df = get_metrics_recal_methods(df, custom_impact, recalthreshold_lower, recalthreshold_upper, model_name=model_name)
        undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7 = run_recalibration_tests(df, switchDate, undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, recalthreshold_lower, recalthreshold_upper)

        ########################################### Bayesian Testing #######################################
        bay_model = BayesianModel(input_data=df,
                                  model_formula=bayes_formula,
                                  priors=dict(bayes_priors),  # fresh copy per run
                                  verbose=False, draws=10000, tune=2000, chains=4, cores=8)
        bay_model.trigger = AlwaysTrigger(bay_model)
        mytest = PREDICT(data=df, model=bay_model, startDate='min', endDate='max', timestep='month')
        # All metric hooks, F1Score included, now use bay_model; F1Score was previously
        # registered with the baseline `model` by mistake.
        _add_metric_hooks(mytest, bay_model)
        mytest.addLogHook(TrackBayesianCoefs(bay_model))
        mytest.run()
        log = mytest.getLog()

        # NOTE(review): bayes_dict is shared across impact levels and updated in place, so
        # entries from earlier impacts persist unless overwritten under the same key — confirm
        # this is intended.
        if "BayesianCoefficients" in log:
            bayes_dict["BayesianCoefficients"].update(log["BayesianCoefficients"])
            print(log["BayesianCoefficients"])

        # NOTE(review): the other methods are given switchDate, while this call passes
        # startDate as detectDate — confirm which reference date the helper expects.
        ttd = find_bayes_coef_change(bayes_dict["BayesianCoefficients"], detectDate=startDate, undetected=undetected, thresholds=bayes_coef_ci)
        bayesian_ttd.append(ttd)

        bayes_metrics = _log_to_metrics(log, custom_impact, 'Bayesian')

        ########################################### Save Metrics #######################################

        # concatenate all the dataframes into one
        covid_metrics_df = pd.concat([covid_metrics_df, bayes_metrics], ignore_index=True)
        covid_metrics_df["Data_Type"] = "COVID Simulation"

        covid_metrics_df.to_csv(metrics_path, mode='a', header=False, index=False)

        update_ttd_table(regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, custom_impact, os.path.join(resultsloc, 'covid_ttd_tbl.csv'))

        # Per-impact artefacts: coefficient plot, coefficient CSV and incidence plot.
        BayesianCoefsPlot(bayes_dict, model_name = f"fast_change_impact_{custom_impact}", fileloc=resultsloc)
        pd.DataFrame(bayes_dict["BayesianCoefficients"]).to_csv(os.path.join(resultsloc, f'bayesian_coefficients_impact_{custom_impact}.csv'), index=False)
        plot_incidence_over_time(df, switchDateStrings, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, f"fast_change_impact_{custom_impact}", fileloc=resultsloc)
        

Using OE Threshold of 0.9871497341194047 - 1.0136757259035873 for impact 0.5, mean was 1.000412730011496


Modeling the probability that outcome==1
Initializing NUTS using jitter+adapt_diag...


Model formula is set to:  outcome ~ age + sex + comorbidity + ldh_high


Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()

Sampling 4 chains for 2_000 tune and 25_000 draw iterations (8_000 + 100_000 draws total) took 26 seconds.
Modeling the probability that outcome==1
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()

Sampling 4 chains for 2_000 tune and 25_000 draw iterations (8_000 + 100_000 draws total) took 26 seconds.
Modeling the probability that outcome==1
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()

Sampling 4 chains for 2_000 tune and 25_000 draw iterations (8_000 + 100_000 draws total) took 28 seconds.
Modeling the probability that outcome==1
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()

Sampling 4 chains for 2_000 tune and 25_000 draw iterations (8_000 + 100_000 draws total) took 27 seconds.
Modeling the probability that outcome==1
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()

Sampling 4 chains for 2_000 tune and 25_000 draw iterations (8_000 + 100_000 draws total) took 25 seconds.
Modeling the probability that outcome==1
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()

Sampling 4 chains for 2_000 tune and 25_000 draw iterations (8_000 + 100_000 draws total) took 27 seconds.
Modeling the probability that outcome==1
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()

Sampling 4 chains for 2_000 tune and 25_000 draw iterations (8_000 + 100_000 draws total) took 26 seconds.
Modeling the probability that outcome==1
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()

Sampling 4 chains for 2_000 tune and 25_000 draw iterations (8_000 + 100_000 draws total) took 26 seconds.
Modeling the probability that outcome==1
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, age, sex, comorbidity, ldh_high]


Output()