In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.append("../")
from PREDICT import PREDICT
from PREDICT.Models import *
from PREDICT.Metrics import *
from PREDICT.Triggers import *
from PREDICT.Plots import *
from Comparison.Detect_Functions import *
import numpy as np
import pandas as pd
from datetime import timedelta
import datetime
import statistics
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%env PYTENSOR_FLAGS=exception_verbosity=high#,optimizer=fast_compile

env: PYTENSOR_FLAGS=exception_verbosity=high#,optimizer=fast_compile


In [2]:
# Directory that collects every artefact written by this notebook.
resultsloc = "./Results/simulation/fast_change"
os.makedirs(resultsloc, exist_ok=True)

# Create the shared metrics CSV with a header row only if it does not exist yet.
# Later cells append to it with header=False, so the column order below is the
# order every appended frame has to follow.
metrics_csv = os.path.join(resultsloc, 'performance_metrics.csv')
if not os.path.exists(metrics_csv):
    header = pd.DataFrame(
        columns=[
            'Time', 'Accuracy', 'AUROC', 'Precision', 'CalibrationSlope', 'CITL',
            'OE', 'AUPRC', 'F1Score', 'impact_or_prev', 'Method', 'Data_Type',
        ]
    )
    header.to_csv(metrics_csv, index=False)

In [3]:
# Simulation window: 1 June 2019 to 31 December 2021.
startDate = pd.Timestamp(year=2019, month=6, day=1)
endDate = pd.Timestamp(year=2021, month=12, day=31)

# Number of simulated patients generated for each timestep.
num_patients = 100

In [None]:
#recalthreshold = 0.86 # Paper has AUROC of 0.91, with lower CI at 0.86

# Full sweep of impact values, kept for reference; only a single impact is run below.
#custom_impacts = [0.04, 0.06, 0.08, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
custom_impacts = [0.5]
switchDateStrings = ['01-04-2020'] # Keep this as just one switchDate as other methods only look at one startDate/deployment date
# Per-method counter that is passed to (and returned from) the detection helpers below.
undetected = {"Static Threshold": 0, "Regular Testing": 0, "SPC3": 0, "SPC5": 0, "SPC7": 0, "Bayesian": 0}

# Odds ratios of the simulated risk model, converted to log-odds coefficients.
or_age = 1.05
or_ldh = 2.5
or_comorbidity = 3.9

log_age = np.log(or_age)
log_ldh = np.log(or_ldh)
log_comorbidity = np.log(or_comorbidity)
bayes_dict = {"BayesianCoefficients":{}}


def _attach_metric_hooks(runner, scored_model):
    """Register the eight performance-metric log hooks on ``runner``, all bound to ``scored_model``.

    Using one helper for every run guarantees that each hook is tied to the model
    that run actually evaluates.
    """
    for hook_cls in (Accuracy, AUROC, Precision, CalibrationSlope, CITL, OE, AUPRC, F1Score):
        runner.addLogHook(hook_cls(scored_model))


def _metrics_frame(log, impact, method):
    """Turn a run log into a metrics DataFrame with one row per logged timestep.

    Args:
        log (dict): Log returned by ``getLog()``; read keys are "Accuracy", "AUROC",
            "Precision", "CalibrationSlope", "CITL", "O/E", "AUPRC" and "F1score",
            each mapping a timestep to a value.
        impact: Impact value of the current simulation; stored as a string.
        method (str): Label written to the 'Method' column.

    Returns:
        pd.DataFrame: Columns in the order expected by performance_metrics.csv
        (without 'Data_Type', which the caller adds last).
    """
    n_steps = len(log["Accuracy"])
    return pd.DataFrame({
        'Time': list(log["Accuracy"].keys()),
        'Accuracy': list(log["Accuracy"].values()),
        'AUROC': list(log["AUROC"].values()),
        'Precision': list(log["Precision"].values()),
        'CalibrationSlope': list(log["CalibrationSlope"].values()),
        'CITL': list(log["CITL"].values()),
        'OE': list(log["O/E"].values()),
        'AUPRC': list(log["AUPRC"].values()),
        'F1Score': list(log["F1score"].values()),
        'impact_or_prev': [str(impact)] * n_steps,
        'Method': [method] * n_steps,
    })


# NOTE(review): no random seed is set in this cell, so every run simulates a different cohort.
for switchDateString in switchDateStrings:
    for custom_impact in custom_impacts:
        # Time-to-detect results collected per method for this impact value.
        regular_ttd = []
        static_ttd = []
        spc_ttd3 = []
        spc_ttd5 = []
        spc_ttd7 = []
        bayesian_ttd = []
        mydict = {
            'date': [],
            'outcome': [],
            'prediction': [],
            'age': [],
            'sex': [],
            'comorbidity': [],
            'ldh_high': [],
        }

        # Define date range and COVID shock periods
        switchDate = pd.to_datetime(switchDateString, dayfirst=True)  # COVID starts spreading
        switchDate2 = pd.to_datetime('01-06-2020', dayfirst=True)  # Peak of the pandemic
        recoveryDate = pd.to_datetime('01-06-2021', dayfirst=True)  # Start of recovery phase
        numdays = (endDate - startDate).days
        switchDays = (switchDate - startDate).days
        switch2Days = (switchDate2 - startDate).days
        recoveryDays = (recoveryDate - startDate).days
        # Length of the ramp-up phase; constant for the whole day loop.
        daystopeak = switch2Days - switchDays

        for i in range(numdays):
            # `timedelta` is the name this notebook imports explicitly from datetime.
            curday = startDate + timedelta(days=i)

            age = (np.random.normal(44, 16.3, num_patients)).astype(int)  # Mean age 44, SD 16.3
            sex = np.random.binomial(1, 0.562, num_patients) # 56.2% are male
            comorbidity = np.random.binomial(1, 0.3, num_patients)  # 30% have comorbidities
            ldh_high = np.random.binomial(1, 0.15, num_patients)  # 15% have LDH >500 U/L
            epsilon = np.random.normal(0, 0.2, num_patients) # Simulate error term (mean=0, std=0.2)

            # Calculate baseline log-odds
            # sex influence 1.2 due to not being provided in the paper
            lp = -1.5 + log_age * age +  log_ldh * ldh_high + log_comorbidity * comorbidity + 1.2 * sex  + epsilon
            # The stored prediction uses the baseline log-odds, i.e. before any shock is applied.
            curpredictions = 1 / (1 + np.exp(-lp))  # Convert to probability

            # Simulate COVID effects
            if switchDays <= i < switch2Days:
                lp += custom_impact * (i - switchDays) / daystopeak  # Initial impact of COVID ramping up
            elif switch2Days <= i < recoveryDays:
                lp += custom_impact  # Peak of the pandemic
            elif i >= recoveryDays:
                lp -= 1.0  # Recovery period - improved health outcomes

            # Generate outcomes
            curoutcomes = np.random.binomial(1, 1 / (1 + np.exp(-lp)))  # Simulate COVID events

            # Append to dictionary
            mydict['date'].extend([curday] * num_patients)
            mydict['outcome'].extend(curoutcomes)
            mydict['prediction'].extend(curpredictions)
            mydict['age'].extend(age)
            mydict['sex'].extend(sex)
            mydict['comorbidity'].extend(comorbidity)
            mydict['ldh_high'].extend(ldh_high)

        df = pd.DataFrame(mydict)

        ########################################### Baseline Testing #######################################
        model_name = 'COVID_datasim'
        model = EvaluatePredictions()
        mytest = PREDICT(data=df, model=model, startDate='min', endDate='max', timestep='month')
        _attach_metric_hooks(mytest, model)
        mytest.run()
        log = mytest.getLog()

        baseline_metrics = _metrics_frame(log, custom_impact, 'Baseline')
        # Use baseline measure of OE score in time before switchDate to get CI
        pre_switch_oe = baseline_metrics[baseline_metrics['Time'] < switchDate]['OE']
        recalthreshold_lower = float(pre_switch_oe.quantile(0.025))
        recalthreshold_upper = float(pre_switch_oe.quantile(0.975))
        print(f"Using OE Threshold of {recalthreshold_lower} - {recalthreshold_upper} for impact {custom_impact}, mean was {pre_switch_oe.mean()}")

        ########################################### Save Metrics #######################################
        baseline_metrics["Data_Type"] = "COVID Simulation"
        baseline_metrics.to_csv(os.path.join(resultsloc, 'performance_metrics.csv'), mode='a', header=False, index=False)

        ########################################### Test models ##########################################

        covid_metrics_df = get_metrics_recal_methods(df, custom_impact, recalthreshold_lower, recalthreshold_upper, model_name=model_name)
        undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7 = run_recalibration_tests(df, switchDate, undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, recalthreshold_lower, recalthreshold_upper)
        ########################################### Bayesian Testing #######################################
        # Priors are (mean, sd) pairs centred on the coefficients used to simulate the data above.
        bay_model = BayesianModel(input_data=df, priors={"Intercept": (-1.5, 0.1), "age": (log_age, 0.01), "sex": (1.2, 0.1), "comorbidity": (log_comorbidity, 0.5), "ldh_high": (log_ldh, 0.5)}, cores=2, verbose=False, draws=1000, tune=250, chains=4)
        bay_model.trigger = TimeframeTrigger(model=bay_model, updateTimestep='month', dataStart=startDate, dataEnd=endDate)
        mytest = PREDICT(data=df, model=bay_model, startDate='min', endDate='max', timestep='month')
        # Every hook, including F1Score, is bound to bay_model (F1Score was previously bound to the baseline model).
        _attach_metric_hooks(mytest, bay_model)
        mytest.addLogHook(TrackBayesianCoefs(bay_model))
        mytest.run()
        log = mytest.getLog()

        if "BayesianCoefficients" in log:
            bayes_dict["BayesianCoefficients"].update(log["BayesianCoefficients"])
            print(log["BayesianCoefficients"])

        ttd = find_bayes_coef_change(bayes_dict["BayesianCoefficients"], detectDate=switchDate, undetected=undetected, threshold=0.1)
        print(ttd)
        bayesian_ttd.append(ttd)

        bayes_metrics = _metrics_frame(log, custom_impact, 'Bayesian')

        ########################################### Save Metrics #######################################

        # concatenate all the dataframes into one
        covid_metrics_df = pd.concat([covid_metrics_df, bayes_metrics], ignore_index=True)
        covid_metrics_df["Data_Type"] = "COVID Simulation"

        covid_metrics_df.to_csv(os.path.join(resultsloc, 'performance_metrics.csv'), mode='a', header=False, index=False)

        update_ttd_table(regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, custom_impact, os.path.join(resultsloc, 'covid_ttd_tbl.csv'))

        # Plots are produced for the current impact value; the file name carries the impact.
        # NOTE(review): plot_incidence_over_time is also defined in a later cell of this notebook;
        # on a fresh top-to-bottom run this call presumably resolves to a star-imported version - confirm.
        BayesianCoefsPlot(bayes_dict, model_name = f"fast_change_impact_{custom_impact}", fileloc=resultsloc)
        plot_incidence_over_time(df, switchDateStrings, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, f"fast_change_impact_{custom_impact}", fileloc=resultsloc)
        

AttributeError: 'int' object has no attribute 'days'

In [15]:
# Re-draw both figures from the state left behind by the simulation cell
# (bayes_dict, df, the *_ttd lists and the last custom_impact), so plot styling
# can be adjusted without re-running the simulation.
BayesianCoefsPlot(
    bayes_dict,
    model_name=f"fast_change_impact_{custom_impact}",
    fileloc=resultsloc,
)
plot_incidence_over_time(
    df,
    switchDateStrings,
    regular_ttd,
    static_ttd,
    spc_ttd3,
    spc_ttd5,
    spc_ttd7,
    bayesian_ttd,
    f"fast_change_impact_{custom_impact}",
    fileloc=resultsloc,
)

In [14]:
def plot_incidence_over_time(df, switchDateStrings, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, sim_data=None, fileloc='./'):
    """Plot the daily incidence of the outcome with vertical markers for the shock and each detection time.

    Incidence is the sum of 'outcome' per 'date'. For every method, only the last
    entry of its time-to-detect (ttd) list is drawn, placed at
    ``reference date + ttd``. The reference date is the last switch date, or the
    earliest date in ``df`` when no switch date is given.

    Args:
        df (pd.DataFrame): Simulation data with 'date' and 'outcome' columns.
        switchDateStrings (list, str or None): Day-first switch date string(s); the last
            one is marked as the shock time. None or an empty list means no shock marker.
        regular_ttd (list): Time to detect (ttd) for regular testing model updates.
        static_ttd (list): Time to detect (ttd) for static threshold model updates.
        spc_ttd3 (list): Time to detect (ttd) for SPC 3 months model updates.
        spc_ttd5 (list): Time to detect (ttd) for SPC 5 months model updates.
        spc_ttd7 (list): Time to detect (ttd) for SPC 7 months model updates.
        bayesian_ttd (list): Time to detect (ttd) for Bayesian model changes.
            Each ttd entry is a number of days (Python or NumPy number), a
            ``timedelta``, or None when nothing was detected. A ttd list may itself
            be None or empty, in which case no marker is drawn for that method.
        sim_data (str or None): Identifier used in the output file name. Defaults to None,
            which produces 'incidence_over_time_None.png'.
        fileloc (str): Directory the PNG is saved to. Defaults to the current directory.
    """

    def _as_offset(ttd):
        """Convert a ttd entry to a timedelta measured from the reference date."""
        if isinstance(ttd, timedelta):
            return ttd
        # float() makes NumPy integers acceptable; timedelta(days=np.int64(..)) raises TypeError.
        return timedelta(days=float(ttd))

    fig = plt.figure(figsize=(10, 5))

    # Daily incidence: total number of outcomes per date.
    groupby_df = df.groupby('date').agg({'outcome': 'sum'}).reset_index()
    peak = groupby_df['outcome'].max()  # shared height of every vertical marker

    plt.plot(groupby_df['date'], groupby_df['outcome'], label='Incidence', color='blue')

    # A bare string is treated as a one-element list so that [-1] picks the date, not its last character.
    switch_dates = [switchDateStrings] if isinstance(switchDateStrings, str) else switchDateStrings
    if switch_dates is not None and len(switch_dates) > 0:
        switch_time = pd.to_datetime(switch_dates[-1], dayfirst=True)
        plt.vlines(x=switch_time, ymin=0, ymax=peak, color='orange', linestyle='-', label='Shock Time')
    else:
        # No switch date available: measure detection times from the first observed date.
        switch_time = df['date'].min()

    # (ttd list, legend label, line style) for every detection method, in legend order.
    update_markers = [
        (regular_ttd, 'Regular Testing Model Update Time', dict(color='black', linestyle='dashed', alpha=0.6)),
        (static_ttd, 'Static Threshold Model Update Time', dict(color='purple', linestyle='dashdot', alpha=0.6)),
        (spc_ttd3, 'SPC 3 months Model Update Time', dict(color='green', linestyle='dotted', alpha=0.6)),
        (spc_ttd5, 'SPC 5 months Model Update Time', dict(color='pink', linestyle='dotted', alpha=0.6)),
        (spc_ttd7, 'SPC 7 months Model Update Time', dict(color='grey', linestyle='dotted', alpha=0.6)),
        # Explicit colour so the marker is not drawn in a default blue next to the blue incidence curve.
        (bayesian_ttd, 'Bayesian Model Sig. Change', dict(color='red', linestyle='-')),
    ]
    for ttd_list, label, style in update_markers:
        if ttd_list is None or len(ttd_list) == 0 or ttd_list[-1] is None:
            continue  # method produced no detection for this run
        plt.vlines(x=switch_time + _as_offset(ttd_list[-1]), ymin=0, ymax=peak, label=label, **style)

    plt.xlabel("Date")
    plt.ylabel("Incidence")
    plt.legend()
    # save figure
    plt.savefig(os.path.join(fileloc, f"incidence_over_time_{sim_data}.png"), dpi=600, bbox_inches='tight')
    plt.show()
    # Release the figure; this function is called once per simulated impact value.
    plt.close(fig)
