In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../")
from PREDICT import PREDICT
from PREDICT.Models import *
from PREDICT.Metrics import *
from PREDICT.Triggers import *
from PREDICT.Plots import *
from Comparison.Detect_Functions import *
import numpy as np
import pandas as pd
from datetime import timedelta
import datetime
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import arviz as az
import bambi as bmb

import warnings
warnings.filterwarnings('ignore')

%env PYTENSOR_FLAGS=exception_verbosity=high,floatX=float32

In [None]:
recalthreshold = 0.811 # Paper has AUROC of 0.814, with lower CI at 0.811

# Hazard ratios per predictor. The simulation cells use the natural log of each
# ratio as that predictor's coefficient in the linear predictor that is passed
# through the logistic function.
hazard_ratios = {
    # Ethnicity indicators (White has ratio 1, i.e. coefficient 0)
    "White": 1,
    "Indian": 1.43,
    "Pakistani": 1.8,
    "Bangladeshi": 1.35,
    "Other_Asian": 1.15,
    "Black_Caribbean": 1.08,
    "Black_African": 0.58,
    "Chinese": 0.69,
    "Other": 1.04,
    # Continuous predictors
    "Age": 1.66,
    "BMI": 1.08,
    "Townsend": 1.37,
    "SBP": 1.2,
    "CholHDL_ratio": 1.17,
    # Binary predictors
    "Family_CHD": 1.99,
    "Current_smoker": 1.8,
    "Treated_HTN": 1.54,
    "DM": 2.54,
    "RA": 1.5,
    "AF": 3.06,
    "Renal_disease": 1.7,
    # Age interaction terms
    "Age_BMI": 0.976,
    "Age_Townsend": 0.938,
    "Age_SBP": 0.966,
    "Age_Family_CHD": 0.927,
    "Age_Smoking": 0.931,
    "Age_Treated_HTN": 0.952,
    "Age_DM": 0.904,
    "Age_AF": 0.858,
}
coefs = {name: np.log(ratio) for name, ratio in hazard_ratios.items()}


# Prevalence of each fixed binary risk factor, as a proportion (percentage / 100)
percent_family_history_chd = 0.126
percent_treated_hypertension = 0.0712
percent_rheumatoid_arthritis = 0.0093
percent_atrial_fibrillation = 0.0035
percent_renal_disease = 0.0016

# Age is specified as median and IQR; the normal draw uses the median as the
# mean and IQR / 1.35 as the standard deviation.
median_age = 49
IQR_age = 19
mean_age = median_age
std_age = IQR_age / 1.35

# (mean, standard deviation) for the remaining normally distributed variables.
# The simulation cells min-max rescale every daily sample to [0, 1] after drawing.
mean_bmi, std_bmi = 33.8, 6.1
mean_townsend, std_townsend = 17.67, 3.534
mean_sbp, std_sbp = 26.6, 4.4
mean_chol_hdl_ratio, std_chol_hdl_ratio = 3.66, 0.144

# Simulated period: 1 June 2019 to 31 December 2021 (end date itself excluded
# from numdays, which is the plain day difference).
startDate = pd.to_datetime('01-06-2019', dayfirst=True)
endDate = pd.to_datetime('31-12-2021', dayfirst=True)
numdays = (endDate - startDate).days


# Placeholder; the simulation cells assign the logit of baseline_prob.
intercept = None
baseline_prob = 0.233 # 23.3%

# Candidate multipliers: diabetes prevalence from 1.0011 upwards in steps of
# 0.0002 (stop value 1.003 excluded), smoking prevalence from 0.9995 downwards
# in steps of 0.0002 (stop value 0.9967).
prev_increases = np.arange(1.0011, 1.003, 0.0002).tolist()
smoking_decrease = np.arange(0.9995, 0.9967, -0.0002).tolist()

# Starting prevalences for the two time-varying risk factors
# (reset these for each start date)
percent_type_2_diabetes = 0.017
percent_current_smoker = 0.228

In [None]:
# Pretrain on fake data
#
# Simulates `numdays_pretrain` days of `num_patients` synthetic patients per day
# and collects them in `pretrain_data`, including the Age interaction columns
# that the model formula refers to.
numdays_pretrain = 365
num_patients = 100

# Covariate columns in the order they appear in the output frame.
covariate_cols = [
    'White', 'Indian', 'Pakistani', 'Bangladeshi', 'Other_Asian',
    'Black_Caribbean', 'Black_African', 'Chinese', 'Other',
    'Age', 'BMI', 'Townsend', 'SBP', 'CholHDL_ratio',
    'Family_CHD', 'Current_smoker', 'Treated_HTN', 'DM', 'RA', 'AF', 'Renal_disease',
]

# Interaction term name -> the covariate that is multiplied by Age.
interaction_terms = {
    'Age_BMI': 'BMI',
    'Age_Townsend': 'Townsend',
    'Age_SBP': 'SBP',
    'Age_Family_CHD': 'Family_CHD',
    'Age_Smoking': 'Current_smoker',
    'Age_Treated_HTN': 'Treated_HTN',
    'Age_DM': 'DM',
    'Age_AF': 'AF',
}

# (column, mean, standard deviation) for the normally distributed covariates.
continuous_specs = [
    ('Age', mean_age, std_age),
    ('BMI', mean_bmi, std_bmi),
    ('Townsend', mean_townsend, std_townsend),
    ('SBP', mean_sbp, std_sbp),
    ('CholHDL_ratio', mean_chol_hdl_ratio, std_chol_hdl_ratio),
]

mydict = {col: list() for col in ['date', 'outcome', 'prediction'] + covariate_cols}

# Logit of the baseline probability; it does not change between days, so it is
# computed once instead of on every iteration.
intercept = np.log(baseline_prob / (1 - baseline_prob))

# The pretraining window is `numdays_pretrain` days long (the original loop ran
# over the full simulation length `numdays` and left `numdays_pretrain` unused).
for i in range(numdays_pretrain):
    # `timedelta` is imported explicitly at the top of the notebook; the name
    # `dt` used previously is not imported anywhere in this notebook.
    curday = startDate + timedelta(days=i)

    # Generate random factors for patients using min max normalization for non-binary values
    pat_factors = {}
    for col, mean, std in continuous_specs:
        draw = np.random.normal(mean, std, num_patients)
        pat_factors[col] = (draw - np.min(draw)) / (np.max(draw) - np.min(draw))

    # Binary risk factors: one Bernoulli draw per patient at the configured prevalence.
    # The prevalences are read inside the loop so a change between days is picked up.
    for col, prevalence in (
        ('Family_CHD', percent_family_history_chd),
        ('Current_smoker', percent_current_smoker),
        ('Treated_HTN', percent_treated_hypertension),
        ('DM', percent_type_2_diabetes),
        ('RA', percent_rheumatoid_arthritis),
        ('AF', percent_atrial_fibrillation),
        ('Renal_disease', percent_renal_disease),
    ):
        pat_factors[col] = np.random.binomial(1, prevalence, num_patients)

    epsilon = np.random.normal(0, 0.2, num_patients) # Simulate error term (mean=0, std=0.2)

    # Merge the ethnicity indicator columns into the patient factors
    ethnicity_assignment = select_ethnic_group(num_patients)
    pat_factors.update(ethnicity_assignment)

    # Linear predictor: main effects plus the Age interaction terms
    weighted_coef_sum = sum(coefs[col] * pat_factors[col] for col in covariate_cols)
    weighted_coef_sum += sum(
        coefs[term] * pat_factors['Age'] * pat_factors[partner]
        for term, partner in interaction_terms.items()
    )

    # Compute log-odds
    lp = intercept + weighted_coef_sum
    curpredictions = 1 / (1 + np.exp(-lp))  # Convert to probability
    # Outcomes are drawn from the noisy probability; the stored prediction is noise-free
    mod_pred = 1 / (1 + np.exp(-(lp + epsilon)))
    curoutcomes = np.random.binomial(1, mod_pred)

    # Append to dictionary from the distribution for each of the variables (Table 1)
    mydict['date'].extend([curday] * num_patients)
    mydict['outcome'].extend(curoutcomes)
    mydict['prediction'].extend(curpredictions)
    for col in covariate_cols:
        mydict[col].extend(pat_factors[col])


pretrain_data = pd.DataFrame(mydict)
# Explicit interaction columns, named as the model formula expects them
for term, partner in interaction_terms.items():
    pretrain_data[term] = pretrain_data['Age'] * pretrain_data[partner]

In [None]:
# Bernoulli regression of the simulated outcome on every main effect and
# Age interaction column of `pretrain_data`.
# NOTE(review): all nine ethnicity indicators enter the formula together; if
# select_ethnic_group returns one-hot columns and the formula adds its default
# intercept, they are collinear with it - confirm this is intended.
prefit_formula = (
    "outcome ~ White + Indian + Pakistani + Bangladeshi + Other_Asian + "
    "Black_Caribbean + Black_African + Chinese + Other + "
    "Age + BMI + Townsend + SBP + CholHDL_ratio + "
    "Family_CHD + Current_smoker + Treated_HTN + DM + RA + AF + Renal_disease + "
    "Age_BMI + Age_Townsend + Age_SBP + Age_Family_CHD + "
    "Age_Smoking + Age_Treated_HTN + Age_DM + Age_AF"
)
prefit_model = bmb.Model(prefit_formula, pretrain_data, family="bernoulli")

# Sampler settings passed straight through to fit()
sampler_settings = dict(tune=2000, draws=15000, cores=2, chains=4, target_accept=0.9)
prefit_fitted = prefit_model.fit(**sampler_settings)

# Last expression of the cell: displays the posterior summary table
az.summary(prefit_fitted)

In [None]:
# Simulate one full dataset per entry of `prev_increases`, run the baseline
# PREDICT evaluation on it and append the monthly metrics to
# performance_metrics.csv.

# Covariate columns in the order they appear in the simulated frame.
covariate_cols = [
    'White', 'Indian', 'Pakistani', 'Bangladeshi', 'Other_Asian',
    'Black_Caribbean', 'Black_African', 'Chinese', 'Other',
    'Age', 'BMI', 'Townsend', 'SBP', 'CholHDL_ratio',
    'Family_CHD', 'Current_smoker', 'Treated_HTN', 'DM', 'RA', 'AF', 'Renal_disease',
]

# Interaction term name -> the covariate that is multiplied by Age.
interaction_terms = {
    'Age_BMI': 'BMI',
    'Age_Townsend': 'Townsend',
    'Age_SBP': 'SBP',
    'Age_Family_CHD': 'Family_CHD',
    'Age_Smoking': 'Current_smoker',
    'Age_Treated_HTN': 'Treated_HTN',
    'Age_DM': 'DM',
    'Age_AF': 'AF',
}

# (column, mean, standard deviation) for the normally distributed covariates.
continuous_specs = [
    ('Age', mean_age, std_age),
    ('BMI', mean_bmi, std_bmi),
    ('Townsend', mean_townsend, std_townsend),
    ('SBP', mean_sbp, std_sbp),
    ('CholHDL_ratio', mean_chol_hdl_ratio, std_chol_hdl_ratio),
]

# Loop-invariant values, computed once instead of per run / per day.
numdays = (endDate - startDate).days  # Define date range
intercept = np.log(baseline_prob / (1 - baseline_prob))

for num, prev_increase in enumerate(prev_increases):
    # NOTE(review): `prev_increase` only reaches the 'impact_or_prev' label below.
    # Neither it, `num`, nor `smoking_decrease` modifies percent_type_2_diabetes or
    # percent_current_smoker in this cell, so every run draws from the same
    # distributions. TODO confirm whether the prevalence drift should be applied here.

    # Reset on every run; nothing in this cell appends to these lists.
    # NOTE(review): confirm whether a later cell reads them, otherwise drop them.
    regular_ttd = []
    static_ttd = []
    spc_ttd3 = []
    spc_ttd5 = []
    spc_ttd7 = []
    bayesian_ttd = []

    mydict = {col: list() for col in ['date', 'outcome', 'prediction'] + covariate_cols}

    for i in range(numdays):
        # `timedelta` is imported explicitly at the top of the notebook; the name
        # `dt` used previously is not imported anywhere in this notebook.
        curday = startDate + timedelta(days=i)

        # Generate random factors for patients using min max normalization for non-binary values
        pat_factors = {}
        for col, mean, std in continuous_specs:
            draw = np.random.normal(mean, std, num_patients)
            pat_factors[col] = (draw - np.min(draw)) / (np.max(draw) - np.min(draw))

        # Binary risk factors: one Bernoulli draw per patient at the configured prevalence.
        # The prevalences are read inside the loop so a change between days is picked up.
        for col, prevalence in (
            ('Family_CHD', percent_family_history_chd),
            ('Current_smoker', percent_current_smoker),
            ('Treated_HTN', percent_treated_hypertension),
            ('DM', percent_type_2_diabetes),
            ('RA', percent_rheumatoid_arthritis),
            ('AF', percent_atrial_fibrillation),
            ('Renal_disease', percent_renal_disease),
        ):
            pat_factors[col] = np.random.binomial(1, prevalence, num_patients)

        epsilon = np.random.normal(0, 0.2, num_patients) # Simulate error term (mean=0, std=0.2)

        # Merge the ethnicity indicator columns into the patient factors
        ethnicity_assignment = select_ethnic_group(num_patients)
        pat_factors.update(ethnicity_assignment)

        # Linear predictor: main effects plus the Age interaction terms
        weighted_coef_sum = sum(coefs[col] * pat_factors[col] for col in covariate_cols)
        weighted_coef_sum += sum(
            coefs[term] * pat_factors['Age'] * pat_factors[partner]
            for term, partner in interaction_terms.items()
        )

        # Compute log-odds. The noise term is part of the log-odds here, so the stored
        # prediction includes it.
        # NOTE(review): the pretraining cell keeps its stored prediction noise-free and
        # only adds epsilon when drawing outcomes - confirm which behaviour is intended.
        lp = intercept + weighted_coef_sum + epsilon
        lp = np.clip(lp, -500, 500)  # Clip to avoid overflow issues

        curpredictions = 1 / (1 + np.exp(-lp))  # Convert to probability
        curoutcomes = np.random.binomial(1, curpredictions)

        # Append to dictionary from the distribution for each of the variables (Table 1)
        mydict['date'].extend([curday] * num_patients)
        mydict['outcome'].extend(curoutcomes)
        mydict['prediction'].extend(curpredictions)
        for col in covariate_cols:
            mydict[col].extend(pat_factors[col])

    df = pd.DataFrame(mydict)

    ########################################### Baseline Testing #######################################
    model = RecalibratePredictions()
    model.trigger = TimeframeTrigger(model=model, updateTimestep=900, dataStart=df['date'].min(), dataEnd=df['date'].max())
    mytest = PREDICT(data=df, model=model, startDate='min', endDate='max', timestep='month', model_name='QRISK_datasim')
    # Register the metric hooks in the same order as the columns written below
    for metric_hook in (Accuracy, AUROC, Precision, CalibrationSlope, CITL, OE, AUPRC):
        mytest.addLogHook(metric_hook(model))
    mytest.run()
    log = mytest.getLog()

    # One row per logged time step; the O/E metric is logged under the key "O/E"
    n_steps = len(log["Accuracy"])
    baseline_metrics = pd.DataFrame({
        'Time': list(log["Accuracy"].keys()),
        'Accuracy': list(log["Accuracy"].values()),
        'AUROC': list(log["AUROC"].values()),
        'Precision': list(log["Precision"].values()),
        'CalibrationSlope': list(log["CalibrationSlope"].values()),
        'CITL': list(log["CITL"].values()),
        'OE': list(log["O/E"].values()),
        'AUPRC': list(log["AUPRC"].values()),
        'impact_or_prev': [str(prev_increase)] * n_steps,
        'Method': ['Baseline'] * n_steps,
    })

    ########################################### Save Metrics #######################################
    baseline_metrics["Data_Type"] = "Multivariate Simulation"

    # Rows are appended without a header row, so the column order above must match
    # whatever is already in performance_metrics.csv.
    baseline_metrics.to_csv('performance_metrics.csv', mode='a', header=False, index=False)