In [1]:
import starsim as ss
import pandas as pd
import sciris as sc
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams["svg.fonttype"] = "none"
import datetime as dt
import matplotlib as mpl
import seaborn as sns
mpl.rcParams["figure.dpi"] = 300  # Increase resolution inside notebook itself
from matplotlib import colors
import os
import zipfile
import glob
import re
import statsmodels.stats.api as sms
from sklearn.neighbors import KernelDensity
# Shorthand: days(x) returns a datetime.timedelta spanning x days.
days = lambda x: dt.timedelta(days=x)
%matplotlib inline

Starsim 0.5.1 (2024-05-15) — © 2023-2024 by IDM


In [2]:
import sys

# Point the import path at the syphilis_analyses checkout instead of the
# gavi-outbreaks one.
# NOTE(review): both paths are absolute and user-specific; this cell needs
# editing (or replacing with an installed package) on any other machine.
STALE_REPO_PATH = 'c:\\users\\alina.muellenmeister\\documents\\github\\gavi-outbreaks'
# Raw string with single backslashes: escaping them again inside a raw string
# would put doubled separators into the sys.path entry.
ANALYSIS_REPO_PATH = r'c:\users\alina.muellenmeister\documents\github\syphilis_analyses'

# Guarded so the cell survives a fresh kernel (entry absent) and re-runs
# (entry already appended) without raising or duplicating entries.
if STALE_REPO_PATH in sys.path:
    sys.path.remove(STALE_REPO_PATH)
if ANALYSIS_REPO_PATH not in sys.path:
    sys.path.append(ANALYSIS_REPO_PATH)

In [4]:
import stisim as sti

STIsim 0.0.1 (2024-05-15) — © 2024 by IDM


# Data

## Read in calibration data

In [48]:
location = 'zimbabwe'
# NOTE(review): absolute, user-specific path; consider making it relative to
# the repository so the notebook runs on other machines.
data_dir = r'C:\Users\alina.muellenmeister\Documents\GitHub\syphilis_analyses\data'

# Calibration targets, one row per year. The frame is indexed by year while the
# `year` column is kept, because later cells use both year-label lookups on the
# index and data["year"] as a plotting axis.
data = pd.read_csv(os.path.join(data_dir, f'{location}_calib.csv')).set_index("year", drop=False)
data

Unnamed: 0_level_0,year,pop_size,hiv_prev,plhiv,new_infections,new_deaths
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990,1990,10432409.0,0.069974,730000,210000,25000.0
1991,1991,10681008.0,0.085198,910000,220000,33000.0
1992,1992,10900511.0,0.100913,1100000,220000,43000.0
1993,1993,11092775.0,0.108179,1200000,210000,53000.0
1994,1994,11261752.0,0.115435,1300000,200000,63000.0
1995,1995,11410721.0,0.122692,1400000,180000,74000.0
1996,1996,11541215.0,0.129969,1500000,160000,84000.0
1997,1997,11653254.0,0.128719,1500000,150000,94000.0
1998,1998,11747079.0,0.136204,1600000,140000,100000.0
1999,1999,11822722.0,0.135333,1600000,130000,110000.0


# Model Output

In [149]:
# Summary Files
# Every scenario is stored as one zip archive in results/Calibration; each
# archive holds a summary.csv. Only *.zip entries are kept so that a stray file
# or sub-directory does not crash ZipFile, and the list is sorted so the row
# order is deterministic. `zipfiles` is reused by the time-series cell below.
calibration_results_dir = os.path.join("results", "Calibration")
zipfiles = sorted(
    entry for entry in os.listdir(calibration_results_dir)
    if entry.lower().endswith(".zip")
)

# Collect the per-scenario frames and concatenate once instead of re-copying
# the growing result inside the loop.
summary_frames = []
for zipfile_name in zipfiles:
    with zipfile.ZipFile(os.path.join(calibration_results_dir, zipfile_name)) as z:
        with z.open("summary.csv") as summary_file:
            summary_frames.append(pd.read_csv(summary_file))

# An empty results directory yields an empty frame rather than a concat error.
results_calibration = pd.concat(summary_frames, ignore_index=True) if summary_frames else pd.DataFrame()

In [150]:
# Time Series Files
# For every scenario archive, read each member whose name contains "seed" and
# tag its rows with the scenario name (archive name without ".zip") and the
# calendar year (floor of the fractional time `t`).
timeseries_frames = []
for zipfile_name in zipfiles:
    scenario_name = zipfile_name.removesuffix(".zip")
    with zipfile.ZipFile(os.path.join("results", "Calibration", zipfile_name)) as z:
        for member_name in z.namelist():
            if "seed" not in member_name:
                continue
            with z.open(member_name) as timeseries_file:
                timeseries_df = pd.read_csv(timeseries_file)
            timeseries_df.insert(0, "scenario", scenario_name)
            timeseries_df.insert(2, "year", np.floor(timeseries_df['t']))
            timeseries_frames.append(timeseries_df)

# Single concat at the end; the per-file row index is kept (it repeats across
# files), which matches the frame shown below. Empty input gives an empty frame.
calibration_timeseries_df = pd.concat(timeseries_frames) if timeseries_frames else pd.DataFrame()
calibration_timeseries_df

Unnamed: 0,scenario,t,year,yearvec,pregnancy.pregnancies,pregnancy.births,pregnancy.cbr,deaths.new,deaths.cumulative,deaths.cmr,...,m0_prob,m1_prob,m2_prob,f0_conc,f1_conc,f2_conc,m0_conc,m1_conc,m2_conc,p_pair_form
0,scenario_0,1990.000000,1990.0,1990.000000,32937.2967,0.0000,0.000000,12975.2987,1.297530e+04,0.015602,...,0.78,0.21,0.01,0.0001,0.01,0.1,0.01,0.2,0.5,0.5
1,scenario_0,1990.083333,1990.0,1990.083333,27946.7972,0.0000,0.000000,9980.9990,2.295630e+04,0.011990,...,0.78,0.21,0.01,0.0001,0.01,0.1,0.01,0.2,0.5,0.5
2,scenario_0,1990.166667,1990.0,1990.166667,21958.1978,0.0000,0.000000,6986.6993,2.994300e+04,0.008386,...,0.78,0.21,0.01,0.0001,0.01,0.1,0.01,0.2,0.5,0.5
3,scenario_0,1990.250000,1990.0,1990.250000,24952.4975,0.0000,0.000000,13973.3986,4.391640e+04,0.016771,...,0.78,0.21,0.01,0.0001,0.01,0.1,0.01,0.2,0.5,0.5
4,scenario_0,1990.333333,1990.0,1990.333333,32937.2967,0.0000,0.000000,7984.7992,5.190119e+04,0.009571,...,0.78,0.21,0.01,0.0001,0.01,0.1,0.01,0.2,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,scenario_9,2029.666667,2029.0,2029.666667,54895.4945,39923.9960,25.656101,7984.7992,5.443637e+06,0.005131,...,0.78,0.21,0.01,0.0001,0.01,0.1,0.01,0.2,0.5,0.5
477,scenario_9,2029.750000,2029.0,2029.750000,44914.4955,45912.5954,29.447853,7984.7992,5.451622e+06,0.005121,...,0.78,0.21,0.01,0.0001,0.01,0.1,0.01,0.2,0.5,0.5
478,scenario_9,2029.833333,2029.0,2029.833333,45912.5954,39923.9960,25.564550,13973.3986,5.465595e+06,0.008948,...,0.78,0.21,0.01,0.0001,0.01,0.1,0.01,0.2,0.5,0.5
479,scenario_9,2029.916667,2029.0,2029.916667,29942.9970,41920.1958,26.814216,9980.9990,5.475576e+06,0.006384,...,0.78,0.21,0.01,0.0001,0.01,0.1,0.01,0.2,0.5,0.5


# Goodness of Fit

In [151]:
# Goodness of fit (GOF) of every calibration scenario against the observed data.
#
# For each scenario the seeds are averaged per time step, the result is reduced
# to one value per year, and that yearly series is compared with each column of
# `data` on the years where `data` is not NaN. The five GOF values and their
# sum are stored in one row per scenario.

# The trailing columns of every time-series file are treated as the scenario's
# parameter values.
# NOTE(review): presumably 'hiv_beta' plus 17 further parameters - the plotting
# cell handles the last 17 separately from 'hiv_beta'. Confirm against the
# writer of these files.
N_PARAM_COLUMNS = 18

# Stock-type outputs are sampled every STEPS_PER_YEAR-th step.
# NOTE(review): assumes monthly steps that start on a year boundary - confirm.
STEPS_PER_YEAR = 12

# target column in `data` -> (model output column, reduction to yearly values)
#   "first": value at the first time step of each year
#   "sum"  : sum over all time steps of the year
GOF_TARGETS = {
    "pop_size": ("n_alive", "first"),
    "hiv_prev": ("hiv.prevalence", "first"),
    "plhiv": ("hiv.n_infected", "first"),
    "new_infections": ("hiv.new_infections", "sum"),
    "new_deaths": ("hiv.new_deaths", "sum"),
}


def yearly_model_values(seed_mean, how):
    """Reduce a per-time-step series (indexed by fractional year `t`) to one value per year.

    Args:
        seed_mean: Series of seed-averaged model output, indexed by `t` in ascending order.
        how: "first" to take every STEPS_PER_YEAR-th value, "sum" to add up all
            steps whose floor(t) falls in the same year.

    Returns:
        Series indexed by the year as a float (e.g. 1990.0).
    """
    if how == "sum":
        step_years = np.floor(seed_mean.index.to_numpy()).astype(int)
        yearly = seed_mean.groupby(step_years).sum()
    else:
        # NOTE(review): this is the FIRST step of each year; the comment that used
        # to sit here said "last value for each year" - confirm which is intended.
        # Copy so that re-labelling the index never touches the source series.
        yearly = seed_mean.iloc[::STEPS_PER_YEAR].copy()
    # Round before flooring so that e.g. 1990.9999999 is labelled 1991.
    yearly.index = np.floor(np.round(yearly.index, 5))
    return yearly


gof_columns = [f"{target}_gof" for target in GOF_TARGETS]
gof_rows = []

# Loop over considered scenarios
for scenario in np.unique(calibration_timeseries_df['scenario']):
    this_scenario_df = calibration_timeseries_df[calibration_timeseries_df['scenario'] == scenario]
    per_step = this_scenario_df.groupby("t")

    # Scenario label plus its parameter values, read from the first row.
    # 'hiv_beta' is left out of the table.
    # NOTE(review): presumably because it differs between seeds - the plotting
    # cell reports its mean instead. Confirm.
    record = {"scenario": scenario}
    for param in this_scenario_df.columns[-N_PARAM_COLUMNS:]:
        if param != 'hiv_beta':
            record[param] = this_scenario_df[param].iloc[0]

    for target, (model_column, how) in GOF_TARGETS.items():
        observed = data[target].dropna()
        # Combine seeds to a single metric using the mean, then go to yearly values.
        # Each target is re-indexed from its own series (the new-deaths series
        # used to borrow the new-infections index).
        modelled = yearly_model_values(per_step[model_column].mean(), how)
        # NOTE(review): `calibration` is not imported in any visible cell - it must
        # be brought into scope for a fresh-kernel run.
        # NOTE(review): if compute_gof follows the usual (actual, predicted)
        # argument order, the observed data would normally come first; the
        # order is left as it was - confirm.
        record[f"{target}_gof"] = calibration.compute_gof(
            modelled.loc[observed.index].values, observed.values, as_scalar="sum")

    # Every row keeps index label 0 because other cells look rows up with [0].
    this_row = pd.DataFrame(record, index=[0])
    this_row['total_sum'] = this_row[gof_columns].sum(axis=1)
    gof_rows.append(this_row)

# Concatenate once; an empty scenario list gives an empty frame.
gof_estimates = pd.concat(gof_rows) if gof_rows else pd.DataFrame()

gof_estimates.sort_values(by=['total_sum'])

Unnamed: 0,scenario,maternal_hiv_beta,init_prev,duration_on_ART,cd4_start_dist,f0_prob,f1_prob,f2_prob,m0_prob,m1_prob,...,m0_conc,m1_conc,m2_conc,p_pair_form,pop_size_gof,hiv_prev_gof,plhiv_gof,new_infections_gof,new_deaths_gof,total_sum
0,scenario_36,0.4,0.15,18.0,400.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,2.022716,9.962765,13.870556,6.921414,7.573883,40.351334
0,scenario_42,0.6,0.15,18.0,800.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,1.993815,10.3739,14.355629,7.921286,7.283317,41.927948
0,scenario_41,0.6,0.15,18.0,600.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,1.940156,10.513341,14.497317,8.557039,7.306152,42.814004
0,scenario_38,0.4,0.15,18.0,800.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,1.86907,10.80609,14.833333,9.43057,7.198182,44.137244
0,scenario_39,0.4,0.15,18.0,1000.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,1.819713,11.001016,15.077904,9.78055,7.228334,44.907516
0,scenario_37,0.4,0.15,18.0,600.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,1.763701,11.423962,15.577526,10.811984,7.026839,46.604012
0,scenario_26,0.05,0.15,18.0,800.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,1.613602,11.75868,15.897152,12.889141,7.114511,49.273086
0,scenario_31,0.1,0.15,18.0,1000.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,1.586869,11.999749,16.217999,13.55098,7.24602,50.601617
0,scenario_35,0.2,0.15,18.0,1000.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,1.579251,12.154653,16.376787,13.538966,7.067601,50.717259
0,scenario_30,0.1,0.15,18.0,800.0,0.85,0.14,0.01,0.78,0.21,...,0.01,0.2,0.5,0.5,1.576288,12.143444,16.398452,13.759285,7.187878,51.065346


In [245]:
# Ad-hoc check of one population GOF value. `scenario` is not set in this cell:
# it is whatever value the most recently run scenario loop left behind.
# .iloc[0] takes the first matching row by position, so the lookup does not
# depend on the rows of gof_estimates happening to carry index label 0.
round(gof_estimates.loc[gof_estimates['scenario'] == scenario, "pop_size_gof"].iloc[0], 2)

1.33

# Plots

In [252]:
# One 2x3 figure per scenario: five panels comparing model output (seed mean
# with a low-high quantile band) against the calibration data, plus a table of
# the scenario's parameter values. Figures are written to figures/<scenario>.png.

# Quantiles across seeds that bound the shaded band.
low = 0.25
high = 0.75

FIGURE_DIR = "figures"

# (panel title, column in `data` / GOF prefix, model output column,
#  sum monthly values to yearly totals?, draw the data as a line instead of points?)
PANELS = [
    ("Population", "pop_size", "n_alive", False, True),
    ("PLHIV", "plhiv", "hiv.n_infected", False, False),
    ("HIV Prevalence", "hiv_prev", "hiv.prevalence", False, False),
    ("New Infections", "new_infections", "hiv.new_infections", True, False),
    ("New Deaths", "new_deaths", "hiv.new_deaths", True, False),
]

fill_args = {"alpha": 0.3}

# savefig does not create missing directories.
os.makedirs(FIGURE_DIR, exist_ok=True)

for scenario in np.unique(calibration_timeseries_df['scenario']):
    this_scenario_df = calibration_timeseries_df[calibration_timeseries_df['scenario'] == scenario]

    # Combine seeds per time step: mean plus the low/high quantiles.
    per_step = this_scenario_df.groupby("t")
    summary_columns = {}
    for _, target, model_column, _, _ in PANELS:
        summary_columns[f"{target}_pt"] = per_step[model_column].mean()
        summary_columns[f"{target}_pt_low"] = per_step[model_column].quantile(q=low)
        summary_columns[f"{target}_pt_high"] = per_step[model_column].quantile(q=high)
    this_scenario_df_combined = pd.DataFrame(summary_columns)
    this_scenario_df_combined.insert(0, "year", np.floor(this_scenario_df_combined.index).astype(int))

    # Yearly totals for the flow-type panels, computed once. The band here is
    # the sum of the per-step quantiles, not a quantile of the yearly totals.
    yearly_totals = this_scenario_df_combined.groupby("year").sum()

    # GOF values for this scenario; .iloc[0] picks the first matching row by position.
    scenario_gof = gof_estimates.loc[gof_estimates['scenario'] == scenario]

    ####################################################################################################################################################################
    # Plots
    ####################################################################################################################################################################

    fig, ax = plt.subplots(2, 3)
    fig.set_size_inches(15, 10)

    ax = ax.ravel()

    for panel_ax, (title, target, _, use_yearly_totals, data_as_line) in zip(ax, PANELS):
        curves = yearly_totals if use_yearly_totals else this_scenario_df_combined
        panel_ax.fill_between(curves.index, curves[f"{target}_pt_low"], curves[f"{target}_pt_high"], **fill_args)
        panel_ax.plot(curves.index, curves[f"{target}_pt"], color="b", alpha=1)
        if data_as_line:
            panel_ax.plot(data["year"], data[target], color="tab:red")
        else:
            panel_ax.scatter(data["year"], data[target], color="tab:red")
        panel_gof = round(scenario_gof[f"{target}_gof"].iloc[0], 2)
        panel_ax.set_title(title + ', GOF ' + str(panel_gof))

    # Last panel: parameter table. 'hiv_beta' is shown as its mean over the
    # scenario's rows; the other values are read from the first row of the
    # last 17 columns.
    param_values = this_scenario_df.iloc[:, -17:]
    ax[5].set_axis_off()
    ax[5].table(rowLabels=['hiv_beta'] + param_values.columns.tolist(),
                colLabels=["Value"],
                colWidths=[0.4], loc='center',
                cellText=[[round(np.mean(this_scenario_df['hiv_beta']), 5)]] + [[value] for value in param_values.iloc[0]])

    fig.suptitle('Sum GOF: ' + str(round(scenario_gof["total_sum"].iloc[0], 2)))
    fig.savefig(os.path.join(FIGURE_DIR, str(scenario) + ".png"), dpi=100)
    # Close this specific figure so figures do not accumulate over the loop.
    plt.close(fig)