### Import packages

In [None]:
import sys
sys.path.append("../")
from PREDICT import PREDICT
from PREDICT.Models import *
from PREDICT.Metrics import *
from PREDICT.Triggers import *
from PREDICT.Plots import *
from Comparison.Detect_Functions import *
import numpy as np
import pandas as pd
from datetime import timedelta
import datetime
import statistics
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
%env PYTENSOR_FLAGS=exception_verbosity=high,optimizer=None

## Comparing Methods to Detect Temporal Drift

In this notebook, four methods to detect temporal drift are compared:

1) Regular model testing
2) Statistical process control
3) Static threshold
4) Bayesian variable relative change


<img src="../docs/images/detect_methods.png" alt="Methods used to detect temporal drift" width="400">


These methods are compared for four scenarios:

1) Fast predictor change - COVID pandemic
2) Slow predictor change - population-based diabetes increase
3) Outcome drift - change in prevalence of diabetes mellitus
4) Multivariate drift - the diabetes prevalence increases whilst smoking prevalence decreases


In [None]:
# Simulation window and default cohort size used by the scenarios below.
# Dates are written day-month-year.
_DAY_FIRST_FORMAT = '%d-%m-%Y'
startDate = pd.to_datetime('01-06-2019', format=_DAY_FIRST_FORMAT)  # 1 June 2019
endDate = pd.to_datetime('31-12-2021', format=_DAY_FIRST_FORMAT)  # 31 December 2021
num_patients = 40  # patients simulated at each (daily) timestep

#### Fast Change - COVID Data Simulation

In [None]:
# Settings for the fast-change (COVID) scenario.
recalthreshold = 0.86  # Paper has AUROC of 0.91, with lower CI at 0.86

# Sizes of the log-odds shift applied to outcomes once COVID starts; one simulation per value.
custom_impacts = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 2.0, 2.5, 3.0]
# Day-first date strings at which the shift begins (alternatives: '15-04-2020', '01-05-2020').
switchDateStrings = ['01-04-2020']
# Per-method tally that is passed through the detection helpers.
undetected = dict.fromkeys(
    ("Static Threshold", "Regular Testing", "SPC3", "SPC5", "SPC7", "Bayesian"), 0)

# Hazard ratios for the predictors, and their logs used as coefficients.
hr_age, hr_ldh, hr_comorbidity = 0.5, 9.8, 3.9
log_age, log_ldh, log_comorbidity = (
    np.log(hazard_ratio) for hazard_ratio in (hr_age, hr_ldh, hr_comorbidity))

# Fast-change (COVID) scenario. For each impact size a fresh cohort is simulated
# day by day: the stored predictions come from the baseline linear predictor,
# while the outcomes are drawn after the COVID shift has been added to it, so
# the model and the data diverge from the switch date onwards. The time each
# detection method needs to flag that divergence is plotted against impact size.
plt.figure(figsize=(10, 5))
plt.title("Time to Detect Change in Outcomes")

bayes_dict = {"BayesianCoefficients":{}}

# Milestones that depend on neither the switch date nor the impact size.
switchDate2 = pd.to_datetime('01-06-2020', dayfirst=True)  # Peak of the pandemic
recoveryDate = pd.to_datetime('01-06-2021', dayfirst=True)  # Start of recovery phase
numdays = (endDate - startDate).days
switch2Days = (switchDate2 - startDate).days
recoveryDays = (recoveryDate - startDate).days

for switchDateString in switchDateStrings:
    switchDate = pd.to_datetime(switchDateString, dayfirst=True)  # COVID starts spreading
    switchDays = (switchDate - startDate).days

    # Time-to-detect series for this switch date, one entry per impact size.
    drop_sizes = []
    regular_ttd = []
    static_ttd = []
    spc_ttd3 = []
    spc_ttd5 = []
    spc_ttd7 = []
    bayesian_ttd = []
    for custom_impact in custom_impacts:
        mydict = {
            'date': [],
            'outcome': [],
            'prediction': [],
            'age': [],
            'sex': [],
            'comorbidity': [],
            'ldh_high': [],
        }

        for i in range(numdays):
            # `timedelta` is imported explicitly at the top of the notebook;
            # no `dt` alias is explicitly imported.
            curday = startDate + timedelta(days=i)

            age = (np.random.normal(44, 16.3, num_patients) - 44) / 16.3   # Mean age 44 years, std 16.3
            sex = np.random.binomial(1, 0.562, num_patients) # 56.2% are male
            comorbidity = np.random.binomial(1, 0.3, num_patients)  # 30% have comorbidities
            ldh_high = np.random.binomial(1, 0.15, num_patients)  # 15% have LDH >500 U/L
            epsilon = np.random.normal(0, 0.2, num_patients) # Simulate error term (mean=0, std=0.2)

            # Calculate baseline log-odds
            # sex influence 1.2 due to not being provided in the paper
            lp = -1.5 + log_age * age +  log_ldh * ldh_high + log_comorbidity * comorbidity + 1.2 * (sex - 0.562) + epsilon
            # Predictions are taken BEFORE the COVID shift: the model is unaware of the drift.
            curpredictions = 1 / (1 + np.exp(-lp))  # Convert to probability

            # Simulate COVID effects
            if switchDays <= i < switch2Days:
                lp += custom_impact  # Initial impact of COVID
            elif switch2Days <= i < recoveryDays:
                lp += custom_impact + 0.5  # Peak of the pandemic
            elif i >= recoveryDays:
                lp -= 1.0  # Recovery period - improved health outcomes

            # Generate outcomes from the shifted log-odds
            curoutcomes = np.random.binomial(1, 1 / (1 + np.exp(-lp)))  # Simulate COVID events

            # Append to dictionary
            mydict['date'].extend([curday] * num_patients)
            mydict['outcome'].extend(curoutcomes)
            mydict['prediction'].extend(curpredictions)
            mydict['age'].extend(age)
            mydict['sex'].extend(sex)
            mydict['comorbidity'].extend(comorbidity)
            mydict['ldh_high'].extend(ldh_high)

        df = pd.DataFrame(mydict)

        drop_sizes.append(custom_impact)
        ############################ Regular, static and SPC testing ############################
        undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7 = run_recalibration_tests(df, switchDate, undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, recalthreshold)
        ############################ Bayesian ############################
        bay_model = BayesianModel(input_data=df, priors={"Intercept": (-1, 2), "age": (log_age, 2), "sex": (1, 2), "comorbidity": (log_comorbidity, 2), "ldh_high": (log_ldh, 2)}, cores=1, verbose=False, draws=1000, tune=250, chains=4)
        undetected, bayesian_ttd, bayes_dict = run_bayes_model(undetected, bay_model, bayes_dict, df, bayesian_ttd, switchDate)

    # One curve per detection method for this switch date.
    plt.plot(drop_sizes, regular_ttd, color='#f781bf', label='Regular Testing', alpha=0.6, linewidth=1)
    plt.plot(drop_sizes, static_ttd, color='#984ea3', label='Static Threshold', alpha=0.6, linewidth=1)
    plt.plot(drop_sizes, spc_ttd3, color='#4daf4a', label='SPC 3 months', alpha=0.6, linewidth=1)
    plt.plot(drop_sizes, spc_ttd5, color='#377eb8', label='SPC 5 months', alpha=0.6, linewidth=1)
    plt.plot(drop_sizes, spc_ttd7, color='#ff7f00', label='SPC 7 months', alpha=0.6, linewidth=1)
    plt.plot(drop_sizes, bayesian_ttd, color='#a65628', label='Bayesian', alpha=0.6, linewidth=1)


# Finish the time-to-detect figure: axis labels plus one legend entry per method.
ax = plt.gca()
ax.set_xlabel("Impact Increase Size")
ax.set_ylabel("Time to Detect (days)")
handles, labels = ax.get_legend_handles_labels()
# Only the first plot of each model is labelled in the legend
ax.legend(handles[:6], labels[:6])

# Write the figure to disk, then display it.
plt.savefig("time_to_detect_change_fast_change.png", dpi=600, bbox_inches='tight')
plt.show()


# Only one switch date (the final one) can be shown at a time, because
# repeated Timestamp keys would be overwritten.
BayesianCoefsPlot(bayes_dict, "fast_change")
ttd_series = (regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd)
plot_prev_over_time(df, switchDateStrings, *ttd_series, "fast_change")



#### Outcome Prevalence - Diabetes Outcome Predicted

In [None]:
# Settings for the outcome-prevalence scenario (diabetes is the predicted outcome).
recalthreshold = 0.77  # Paper has AUROC of 0.81, with lower CI at 0.77

# Monthly multipliers for the outcome drift term; one simulation is run per value.
prev_increases = np.arange(1.0001, 1.003, 0.0002).tolist()
undetected = dict.fromkeys(
    ("Static Threshold", "Regular Testing", "SPC3", "SPC5", "SPC7", "Bayesian"), 0)

# Coefficients from the non-laboratory logistic regression model (lv = last visit).
bias_coef = -0.74
age_at_lv_coef = 0.16
bmi_coef = 0.68
hip_circ_coef = -0.05
sex_coef = -0.14
height_coef = -0.15
waist_circ_coef = 0.31
waist_hips_ratio_coef = 0.54
weight_coef = 0.03
time_between_visits_coef = 0.38

# Mean and standard deviation of each predictor, taken at the last visit.
mean_age, std_age = 62.9, 7.5
mean_bmi, std_bmi = 26.6, 4.4
mean_hip_circ, std_hip_circ = 101.6, 8.8
perc_male = 0.478
mean_height, std_height = 169, 9.2
mean_waist_circ, std_waist_circ = 88.7, 12.7
mean_weight, std_weight = 76.2, 15.2
mean_time_between_visits, std_time_between_visits = 7.3, 2.3

# The waist-hip ratio is derived from the two circumferences: its spread combines
# the relative spreads of waist and hip in quadrature.
mean_waist_hips_ratio = mean_waist_circ / mean_hip_circ
_rel_spread_waist = std_waist_circ / mean_waist_circ
_rel_spread_hip = std_hip_circ / mean_hip_circ
std_waist_hips_ratio = mean_waist_hips_ratio * np.sqrt(
    _rel_spread_waist ** 2 + _rel_spread_hip ** 2)

# Container handed to the Bayesian helper on every run of this scenario.
bayes_dict = {"BayesianCoefficients": {}}
# Outcome-prevalence scenario. For each monthly multiplier a cohort is simulated
# day by day. Predictions use the published linear predictor unchanged, while
# outcomes are drawn from the same linear predictor plus a growing offset
# (`dm_prev`), i.e. only the intercept drifts. Each detection method's
# time-to-detect is collected per run.
def _min_max_normalise(values):
    """Rescale *values* linearly so the smallest maps to 0 and the largest to 1."""
    lowest = np.min(values)
    return (values - lowest) / (np.max(values) - lowest)


plt.figure(figsize=(10, 5))
plt.title("Time to Detect Change in Outcomes")

# NOTE(review): dm_prev is initialised once, so the offset keeps compounding across
# successive prev_increase runs instead of restarting at 0.07 - confirm this is intended.
dm_prev = 0.07  # Initial diabetes prevalence = 7%
dm_increases = []
regular_ttd = []
static_ttd = []
spc_ttd3 = []
spc_ttd5 = []
spc_ttd7 = []
bayesian_ttd = []

num_patients = 60
numdays = (endDate - startDate).days

for prev_increase in prev_increases:
    mydict = {
        'date': [],
        'outcome': [],
        'prediction': [],
        'age': [],
        'bmi': [],
        'hip_circ': [],
        'sex': [],
        'height': [],
        'waist_circ': [],
        'waist_hips_ratio': [],
        'weight': [],
        'time_between_visits': [],
    }

    for i in range(numdays):
        # `timedelta` is imported explicitly at the top of the notebook;
        # no `dt` alias is explicitly imported.
        curday = startDate + timedelta(days=i)

        # Continuous predictors: draw from the published distribution, then
        # min-max normalise each day's draws to [0, 1].
        age = _min_max_normalise(np.random.normal(mean_age, std_age, num_patients))
        bmi = _min_max_normalise(np.random.normal(mean_bmi, std_bmi, num_patients))
        hip_circ = _min_max_normalise(np.random.normal(mean_hip_circ, std_hip_circ, num_patients))
        height = _min_max_normalise(np.random.normal(mean_height, std_height, num_patients))
        waist_circ = _min_max_normalise(np.random.normal(mean_waist_circ, std_waist_circ, num_patients))
        waist_hips_ratio = _min_max_normalise(np.random.normal(mean_waist_hips_ratio, std_waist_hips_ratio, num_patients))
        weight = _min_max_normalise(np.random.normal(mean_weight, std_weight, num_patients))
        time_between_visits = _min_max_normalise(np.random.normal(mean_time_between_visits, std_time_between_visits, num_patients))

        sex = np.random.binomial(1, perc_male, num_patients)

        epsilon = np.random.normal(0, 0.2, num_patients) # Simulate error term (mean=0, std=0.2)

        # Calculate baseline log-odds
        lp = bias_coef + age_at_lv_coef * age + bmi_coef * bmi + hip_circ_coef * hip_circ + sex_coef * (sex - perc_male) + height_coef * height + waist_circ_coef * waist_circ  + waist_hips_ratio_coef * waist_hips_ratio + weight_coef * weight  + time_between_visits_coef * time_between_visits + epsilon
        curpredictions = 1 / (1 + np.exp(-lp))  # Convert to probability

        # Simulate diabetes rates increasing over time
        if i % 30 == 0:
            dm_prev *= prev_increase # this increases the offset by x% each month

        # Intercept changed, but model weights constant: diabetes increases as an
        # outcome without being explained by the predictors. The logistic
        # function needs the NEGATED log-odds in the exponent; without the minus
        # sign outcomes would be anti-correlated with the predictions.
        mod_prob = 1 / (1 + np.exp(-(lp + dm_prev)))
        curoutcomes = np.random.binomial(1, mod_prob)

        # Append to dictionary from the distribution for each of the variables (Table 1)
        mydict['date'].extend([curday] * num_patients)
        mydict['outcome'].extend(curoutcomes)
        mydict['prediction'].extend(curpredictions)
        mydict['age'].extend(age)
        mydict['bmi'].extend(bmi)
        mydict['hip_circ'].extend(hip_circ)
        mydict['sex'].extend(sex)
        mydict['height'].extend(height)
        mydict['waist_circ'].extend(waist_circ)
        mydict['waist_hips_ratio'].extend(waist_hips_ratio)
        mydict['weight'].extend(weight)
        mydict['time_between_visits'].extend(time_between_visits)

    df = pd.DataFrame(mydict)

    # Offset reached at the end of this run; used as the x-axis value.
    dm_increases.append(dm_prev)

    undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7 = run_recalibration_tests(df, startDate, undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, recalthreshold)

    ############################ Bayesian ############################
    bay_model = BayesianModel(input_data=df, priors={"Intercept": (bias_coef, 2), "age": (age_at_lv_coef, 2), "bmi": (bmi_coef, 2), "hip_circ": (hip_circ_coef, 2),
                                                "sex": (sex_coef, 2), "height":(height_coef, 2), "waist_circ":(waist_circ_coef, 2),
                                                "waist_hips_ratio":(waist_hips_ratio_coef, 2), "weight":(weight_coef, 2), 
                                                "time_between_visits":(time_between_visits_coef, 2)}, cores=1, verbose=False)
    undetected, bayesian_ttd, bayes_dict = run_bayes_model(undetected, bay_model, bayes_dict, df, bayesian_ttd, startDate)


# plot the time to detect for all models
# Draw one time-to-detect curve per detection method, in a fixed order so the
# legend entries line up with the colours.
ttd_curves = (
    (regular_ttd, '#f781bf', 'Regular Testing'),
    (static_ttd, '#984ea3', 'Static Threshold'),
    (spc_ttd3, '#4daf4a', 'SPC 3 months'),
    (spc_ttd5, '#377eb8', 'SPC 5 months'),
    (spc_ttd7, '#ff7f00', 'SPC 7 months'),
    (bayesian_ttd, '#a65628', 'Bayesian'),
)
for curve_values, curve_colour, curve_name in ttd_curves:
    plt.plot(dm_increases, curve_values, color=curve_colour, label=curve_name, alpha=0.6, linewidth=1)

ax = plt.gca()
ax.set_xlabel("Prevalence of Diabetes")
ax.set_ylabel("Time to Detect (days)")
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[:6], labels[:6])

# Write the figure to disk, then display it.
plt.savefig("time_to_detect_change_outcome_prev.png", dpi=600, bbox_inches='tight')
plt.show()


# Follow-up plots built from the final simulated data frame.
BayesianCoefsPlot(bayes_dict, "outcome_prev")
plot_prev_over_time(df, None, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, "outcome_prev")



#### Slow change data simulation - Diabetes as a Predictor (increasing over time) with CKD as the predicted outcome.

In [None]:
# Settings for the slow-change scenario (diabetes as a predictor of CKD).
recalthreshold = 0.851  # Paper has AUROC of 0.889, with lower CI at 0.851

# Monthly multipliers for the diabetes prevalence; one simulation is run per value.
prev_increases = np.arange(1.0001, 1.0030, 0.0002).tolist()
undetected = dict.fromkeys(
    ("Static Threshold", "Regular Testing", "SPC3", "SPC5", "SPC7", "Bayesian"), 0)

# Mean and standard deviation of the continuous measurements.
mean_TGFB, std_TGFB = 13.23, 5.18
mean_ADMA, std_ADMA = 101.1, 64.8
mean_BUN, std_BUN = 5.45, 1.11
mean_age, std_age = 63.27, 10.09

# Model coefficients for the five predictors.
TGFB_coef, ADMA_coef, DM_coef = 1.84, 1.137, 0.84
BUN_coef, elderly_coef = 0.497, 0.603

# Container handed to the Bayesian helper on every run of this scenario.
bayes_dict = {"BayesianCoefficients": {}}

# Slow-change scenario. Diabetes (DM) is a binary predictor whose prevalence is
# multiplied by `prev_increase` every 30 simulated days. Predictions and outcomes
# share the same linear predictor, so any drift comes from the changing case mix.
# The time each detection method needs is plotted against the diabetes prevalence
# reached at the end of each run.
plt.figure(figsize=(10, 5))
plt.title("Time to Detect Change in Outcomes")

# NOTE(review): the value is 0.05 (5%) although the original comment said 5.5% -
# confirm which is intended. It is initialised once, so prevalence keeps compounding
# across successive prev_increase runs - confirm this is intended as well.
perc_dm = 0.05
dm_increases = []
regular_ttd = []
static_ttd = []
spc_ttd3 = []
spc_ttd5 = []
spc_ttd7 = []
bayesian_ttd = []

num_patients = 60
numdays = (endDate - startDate).days

for prev_increase in prev_increases:
    mydict = {
        'date': [],
        'outcome': [],
        'prediction': [],
        'TGFB': [],
        'ADMA': [],
        'DM': [],
        'BUN': [],
        'elderly': [],
    }

    for i in range(numdays):
        # `timedelta` is imported explicitly at the top of the notebook;
        # no `dt` alias is explicitly imported.
        curday = startDate + timedelta(days=i)

        # increase the prevalence of diabetes over time
        if i % 30 == 0:
            perc_dm *= prev_increase # this increases the probability by x% each month

        TGFB = get_binom_from_normal(mean_TGFB, std_TGFB, num_patients, 1.011)
        ADMA = get_binom_from_normal(mean_ADMA, std_ADMA, num_patients, 0.019)
        DM = np.random.binomial(1, perc_dm, num_patients)
        BUN = get_binom_from_normal(mean_BUN, std_BUN, num_patients, 5.9)
        elderly = get_binom_from_normal(mean_age, std_age, num_patients, 60)
        epsilon = np.random.normal(0, 0.2, num_patients) # Simulate error term (mean=0, std=0.2)

        # Calculate baseline log-odds
        # non_genetic_risk_score_model from paper
        lp = TGFB_coef * TGFB + ADMA_coef * ADMA + DM_coef * DM + BUN_coef * BUN + elderly_coef * elderly + epsilon

        curpredictions = 1 / (1 + np.exp(-lp))  # Convert to probability
        curoutcomes = np.random.binomial(1, curpredictions)

        # Append to dictionary from the distribution for each of the variables (Table 1)
        mydict['date'].extend([curday] * num_patients)
        mydict['outcome'].extend(curoutcomes)
        mydict['prediction'].extend(curpredictions)
        mydict['TGFB'].extend(TGFB)
        mydict['ADMA'].extend(ADMA)
        mydict['DM'].extend(DM)
        mydict['BUN'].extend(BUN)
        mydict['elderly'].extend(elderly)

    df = pd.DataFrame(mydict)

    # Record the diabetes prevalence reached at the end of this run. Without this
    # the list stays empty and the x-axis cannot show prevalence.
    dm_increases.append(perc_dm)

    undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7 = run_recalibration_tests(df, startDate, undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, recalthreshold)

    ############################ Bayesian ############################
    bay_model = BayesianModel(input_data=df, priors={"Intercept": (-1, 2), "TGFB": (TGFB_coef, 2), "ADMA": (ADMA_coef, 2), "DM": (DM_coef, 2), "BUN": (BUN_coef, 2),
                                                "elderly": (elderly_coef, 2)}, cores=1, verbose=False)
    undetected, bayesian_ttd, bayes_dict = run_bayes_model(undetected, bay_model, bayes_dict, df, bayesian_ttd, startDate)


dm_increases = [x * 100 for x in dm_increases]  # Convert to percentage
# plot the time to detect for all models against the prevalence named on the x-axis
plt.plot(dm_increases, regular_ttd, color='#f781bf', label='Regular Testing', alpha=0.6, linewidth=1)
plt.plot(dm_increases, static_ttd, color='#984ea3', label='Static Threshold', alpha=0.6, linewidth=1)
plt.plot(dm_increases, spc_ttd3, color='#4daf4a', label='SPC 3 months', alpha=0.6, linewidth=1)
plt.plot(dm_increases, spc_ttd5, color='#377eb8', label='SPC 5 months', alpha=0.6, linewidth=1)
plt.plot(dm_increases, spc_ttd7, color='#ff7f00', label='SPC 7 months', alpha=0.6, linewidth=1)
plt.plot(dm_increases, bayesian_ttd, color='#a65628', label='Bayesian', alpha=0.6, linewidth=1)
plt.xlabel("Prevalence of Diabetes Predictor (%)")
plt.ylabel("Time to Detect (days)")
handles, labels = plt.gca().get_legend_handles_labels()
# Ensure only the first plot of each model is labelled in the legend
plt.legend(handles[:6], labels[:6])
# save figures
plt.savefig("time_to_detect_change_slow_change.png", dpi=600, bbox_inches='tight')
plt.show()


BayesianCoefsPlot(bayes_dict, "slow_change")
plot_prev_over_time(df, None, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, "slow_change")



#### Multivariate Model - QRISK2 - Diabetes increasing whilst smoking is decreasing.


In [None]:
# Open a fresh figure for the QRISK2 time-to-detect curves and create the
# container handed to the Bayesian helper on every run of this scenario.
plt.figure(figsize=(10, 5))
plt.gca().set_title("Time to Detect Change in Outcomes")

bayes_dict = dict(BayesianCoefficients={})

recalthreshold = 0.811  # Paper has AUROC of 0.814, with lower CI at 0.811

# Hazard ratios per predictor; the `Age_*` entries are the age interaction terms.
hazard_ratios = {
    # Ethnic group (White is the reference category)
    "White": 1,
    "Indian": 1.43,
    "Pakistani": 1.8,
    "Bangladeshi": 1.35,
    "Other_Asian": 1.15,
    "Black_Caribbean": 1.08,
    "Black_African": 0.58,
    "Chinese": 0.69,
    "Other": 1.04,
    # Continuous risk factors
    "Age": 1.66,
    "BMI": 1.08,
    "Townsend": 1.37,
    "SBP": 1.2,
    "CholHDL_ratio": 1.17,
    # Binary risk factors
    "Family_CHD": 1.99,
    "Current_smoker": 1.8,
    "Treated_HTN": 1.54,
    "DM": 2.54,
    "RA": 1.5,
    "AF": 3.06,
    "Renal_disease": 1.7,
    # Interactions with age
    "Age_BMI": 0.976,
    "Age_Townsend": 0.938,
    "Age_SBP": 0.966,
    "Age_Family_CHD": 0.927,
    "Age_Smoking": 0.931,
    "Age_Treated_HTN": 0.952,
    "Age_DM": 0.904,
    "Age_AF": 0.858,
}

# Define the coefficients: the log of each hazard ratio is used as a log-odds weight.
coefs = {predictor: np.log(ratio) for predictor, ratio in hazard_ratios.items()}


# Cohort parameters for the QRISK2 multivariate scenario.

# Prevalence of the binary risk factors that stay constant (proportions, i.e. % / 100)
percent_family_history_chd = 0.126
percent_treated_hypertension = 0.0712
percent_rheumatoid_arthritis = 0.0093
percent_atrial_fibrillation = 0.0035
percent_renal_disease = 0.0016

# Age variable: reported as median and IQR, converted to a normal mean/std
# (IQR is roughly 1.35 standard deviations for a normal distribution).
median_age, IQR_age = 49, 19
mean_age, std_age = median_age, IQR_age / 1.35

# Mean and standard deviation variables
mean_bmi, std_bmi = 33.8, 6.1
mean_townsend, std_townsend = 17.67, 3.534
# NOTE(review): 26.6 / 4.4 are the BMI values used in the diabetes scenario and look
# copy-pasted for SBP. The simulation min-max normalises SBP, so confirm whether
# this matters.
mean_sbp, std_sbp = 26.6, 4.4
mean_chol_hdl_ratio, std_chol_hdl_ratio = 3.66, 0.144


intercept = None
baseline_prob = 0.233 # 23.3%

# Monthly multipliers: diabetes prevalence rises while smoking prevalence falls.
prev_increases = np.arange(1.0001, 1.003, 0.0002).tolist() # Increase in diabetes prevalence over time
# The simulation pairs smoking_decrease[k] with prev_increases[k], so both lists
# must have the same length. Two independent float-step np.arange calls do not
# guarantee that (the old stop value 0.9967 lay exactly on the grid), so the
# smoking multipliers are derived from the number of diabetes multipliers.
smoking_decrease = [0.9995 - 0.0002 * step for step in range(len(prev_increases))]  # Decrease in smoking prevalence over time
undetected = dict({"Static Threshold": 0, "Regular Testing": 0, "SPC3": 0, "SPC5": 0, "SPC7": 0, "Bayesian": 0})

# Starting prevalences of the two drifting predictors.
# NOTE(review): these are initialised once here and multiplied in place by the
# simulation loop, so they keep compounding across prev_increase runs - confirm
# whether they should be reset per run.
percent_type_2_diabetes = 0.017
percent_current_smoker = 0.228
dm_increases = []
regular_ttd = []
static_ttd = []
spc_ttd3 = []
spc_ttd5 = []
spc_ttd7 = []
bayesian_ttd = []
# QRISK2 multivariate scenario. For each pair of monthly multipliers a cohort is
# simulated day by day, with diabetes prevalence rising and smoking prevalence
# falling every 30 days. Predictions and outcomes share one linear predictor that
# includes the age interaction terms. Each detection method's time-to-detect is
# collected per run.
def _min_max_normalise(values):
    """Rescale *values* linearly so the smallest maps to 0 and the largest to 1."""
    lowest = np.min(values)
    return (values - lowest) / (np.max(values) - lowest)


num_patients = 60

# Define date range
numdays = (endDate - startDate).days

# Baseline log-odds. It is constant across days and runs, and is also the mean
# of the Intercept prior.
intercept = np.log(baseline_prob / (1 - baseline_prob))

# Predictor columns in the order they appear in the simulated data frame.
ethnic_groups = ['White', 'Indian', 'Pakistani', 'Bangladeshi', 'Other_Asian',
                 'Black_Caribbean', 'Black_African', 'Chinese', 'Other']
risk_factors = ['Age', 'BMI', 'Townsend', 'SBP', 'CholHDL_ratio', 'Family_CHD',
                'Current_smoker', 'Treated_HTN', 'DM', 'RA', 'AF', 'Renal_disease']
feature_cols = ethnic_groups + risk_factors

for num, prev_increase in enumerate(prev_increases):
    print("Prev increase:", prev_increase)
    mydict = {col: [] for col in ['date', 'outcome', 'prediction'] + feature_cols}

    for i in range(numdays):
        # `timedelta` is imported explicitly at the top of the notebook;
        # no `dt` alias is explicitly imported.
        curday = startDate + timedelta(days=i)

        # Every 30 days, raise the prevalence of diabetes and lower that of smoking
        if i % 30 == 0:
            percent_type_2_diabetes *= prev_increase # this increases the probability by x% each month
            percent_current_smoker *= smoking_decrease[num] # decrease the prevalence of smoking over time
        # Report prevalences that have left the valid probability range.
        if percent_type_2_diabetes < 0 or percent_type_2_diabetes > 1:
            print("Percentage of people with DM", percent_type_2_diabetes)
        if percent_current_smoker < 0 or percent_current_smoker > 1:
            print("Percentage of people who are current smokers", percent_current_smoker)


        # Generate random factors for patients; continuous values are min-max
        # normalised to [0, 1] within each day's draws.
        age = _min_max_normalise(np.random.normal(mean_age, std_age, num_patients))
        bmi = _min_max_normalise(np.random.normal(mean_bmi, std_bmi, num_patients))
        townsend = _min_max_normalise(np.random.normal(mean_townsend, std_townsend, num_patients))
        SBP = _min_max_normalise(np.random.normal(mean_sbp, std_sbp, num_patients))
        chol_hdl_ratio = _min_max_normalise(np.random.normal(mean_chol_hdl_ratio, std_chol_hdl_ratio, num_patients))
        pat_factors = {"Age": age, 
            "BMI": bmi,
            "Townsend": townsend,
            "SBP": SBP,
            "CholHDL_ratio": chol_hdl_ratio,
            "Family_CHD": np.random.binomial(1, percent_family_history_chd, num_patients),
            "Current_smoker": np.random.binomial(1, percent_current_smoker, num_patients),
            "Treated_HTN": np.random.binomial(1, percent_treated_hypertension, num_patients),
            "DM": np.random.binomial(1, percent_type_2_diabetes, num_patients),
            "RA": np.random.binomial(1, percent_rheumatoid_arthritis, num_patients),
            "AF": np.random.binomial(1, percent_atrial_fibrillation, num_patients),
            "Renal_disease": np.random.binomial(1, percent_renal_disease, num_patients)
        }
        epsilon = np.random.normal(0, 0.2, num_patients) # Simulate error term (mean=0, std=0.2)


        # Merge the per-patient ethnicity entries into the other patient factors;
        # the sums below read one entry per ethnic group from the combined dict.
        ethnicity_assignment = select_ethnic_group(num_patients)
        pat_factors.update(ethnicity_assignment)


        # Weighted sum of main effects, age interactions and the error term
        weighted_coef_sum = coefs['White']*pat_factors['White'] + coefs['Indian']*pat_factors['Indian'] + coefs['Pakistani']*pat_factors['Pakistani'] + coefs['Bangladeshi']*pat_factors['Bangladeshi'] 
        weighted_coef_sum += coefs['Other_Asian']*pat_factors['Other_Asian'] + coefs['Black_Caribbean']*pat_factors['Black_Caribbean'] + coefs['Black_African']*pat_factors['Black_African'] 
        weighted_coef_sum += coefs['Chinese']*pat_factors['Chinese'] + coefs['Other']*pat_factors['Other'] + coefs['Age']*(pat_factors['Age']) + coefs['BMI']*(pat_factors['BMI']) 
        weighted_coef_sum += coefs['Townsend']*(pat_factors['Townsend']) + coefs['SBP']*(pat_factors['SBP']) + coefs['CholHDL_ratio']*(pat_factors['CholHDL_ratio']) 
        weighted_coef_sum += coefs["Family_CHD"]*(pat_factors["Family_CHD"]) + coefs["Current_smoker"]*(pat_factors["Current_smoker"]) 
        weighted_coef_sum += coefs["Treated_HTN"]*(pat_factors["Treated_HTN"]) + coefs["DM"]*(pat_factors["DM"]) + coefs["RA"]*(pat_factors["RA"]) 
        weighted_coef_sum += coefs["AF"]*(pat_factors["AF"]) + coefs["Renal_disease"]*(pat_factors["Renal_disease"]) + (coefs["Age_BMI"] * pat_factors["Age"] * pat_factors["BMI"]) 
        weighted_coef_sum += (coefs["Age_Townsend"] * pat_factors["Age"] * pat_factors["Townsend"]) + (coefs["Age_SBP"] * pat_factors["Age"] * pat_factors["SBP"]) 
        weighted_coef_sum += (coefs["Age_Family_CHD"] * pat_factors["Age"] * pat_factors["Family_CHD"]) + (coefs["Age_Smoking"] * pat_factors["Age"] * pat_factors["Current_smoker"]) 
        weighted_coef_sum += (coefs["Age_Treated_HTN"] * pat_factors["Age"] * pat_factors["Treated_HTN"]) + (coefs["Age_DM"] * pat_factors["Age"] * pat_factors["DM"])
        weighted_coef_sum += (coefs["Age_AF"] * pat_factors["Age"] * pat_factors["AF"] + epsilon)

        # Compute log-odds
        lp = intercept + weighted_coef_sum

        curpredictions = 1 / (1 + np.exp(-lp))  # Convert to probability
        curoutcomes = np.random.binomial(1, curpredictions)

        # Append this day's cohort to the accumulated columns
        mydict['date'].extend([curday] * num_patients)
        mydict['outcome'].extend(curoutcomes)
        mydict['prediction'].extend(curpredictions)
        for col in feature_cols:
            mydict[col].extend(pat_factors[col])

    df = pd.DataFrame(mydict)
    df = prevent_constant_variable(df, startDate, endDate)

    # Diabetes prevalence reached at the end of this run; used as the x-axis value.
    dm_increases.append(percent_type_2_diabetes)
    undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7 = run_recalibration_tests(df, startDate, undetected, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, recalthreshold)

    ############################ Bayesian ############################
    # Priors: (mean, 2) tuples centred on the intercept and on each main-effect
    # coefficient, in data-frame column order.
    priors = {"Intercept": (intercept, 2)}
    priors.update({col: (coefs[col], 2) for col in feature_cols})
    bay_model = BayesianModel(input_data=df, priors=priors,
                              cores=1, verbose=False,
                              model_formula="outcome ~ White + Indian + Pakistani + Bangladeshi + Other_Asian + Black_Caribbean + Black_African + Chinese + Other + Age + BMI + Townsend + SBP + CholHDL_ratio + Family_CHD + Current_smoker + Treated_HTN + DM + RA + AF + Renal_disease + Age*BMI + Age*Townsend + Age*SBP + Age*Family_CHD + Age*Current_smoker + Age*Treated_HTN + Age*DM + Age*AF")
    undetected, bayesian_ttd, bayes_dict = run_bayes_model(undetected, bay_model, bayes_dict, df, bayesian_ttd, startDate)

        
# Express the recorded diabetes prevalences as percentages for the x-axis.
dm_increases = [prevalence * 100 for prevalence in dm_increases]

# Draw one time-to-detect curve per detection method, in a fixed order so the
# legend entries line up with the colours.
ttd_curves = (
    (regular_ttd, '#f781bf', 'Regular Testing'),
    (static_ttd, '#984ea3', 'Static Threshold'),
    (spc_ttd3, '#4daf4a', 'SPC 3 months'),
    (spc_ttd5, '#377eb8', 'SPC 5 months'),
    (spc_ttd7, '#ff7f00', 'SPC 7 months'),
    (bayesian_ttd, '#a65628', 'Bayesian'),
)
for curve_values, curve_colour, curve_name in ttd_curves:
    plt.plot(dm_increases, curve_values, color=curve_colour, label=curve_name, alpha=0.6, linewidth=1)

ax = plt.gca()
ax.set_xlabel("Prevalence of Diabetes (%)")
ax.set_ylabel("Time to Detect (days)")
handles, labels = ax.get_legend_handles_labels()
# Ensure only the first plot of each model is labelled in the legend
ax.legend(handles[:6], labels[:6])

# Write the figure to disk, then display it.
plt.savefig("time_to_detect_change_QRISK.png", dpi=600, bbox_inches='tight')
plt.show()


# Follow-up plots built from the final simulated data frame.
BayesianCoefsPlot(bayes_dict, "QRISK")
plot_prev_over_time(df, None, regular_ttd, static_ttd, spc_ttd3, spc_ttd5, spc_ttd7, bayesian_ttd, "QRISK")

