In [None]:
# Calibration
import numpy as np
import pandas as pd
import old.nat_hist_calibration_mp as nh
import configs as c

In [None]:
# Simulated annealing schedule (read by anneal()):
#   starting_T   - initial temperature
#   final_T      - the temperature loop runs while T > final_T
#   cooling_rate - T is multiplied by this factor after each temperature level
#   iterations   - number of candidate parameter sets tried per temperature level
sim_anneal_params = dict(
    starting_T=1.0,
    final_T=0.01,
    cooling_rate=0.9,
    iterations=100,
)

# NOTE(review): an earlier note in this cell listed final_T = 0.1 and
# cooling_rate = 0.01, which does not match the values above - confirm which
# schedule is intended.

# Ratios that tie related transition parameters together during calibration
# (applied in apply_factors()).
ls_ss_ratio = 1.2 / 3.0             # long-segment relative to short-segment
norm_gerd_be_ratio = 1 / 5.0        # Normal-origin relative to GERD-origin transitions
ss_ls_progression = 1 / (2**(1/3))  # short-segment progression relative to long-segment

# Goodness-of-fit weights, chosen per cohort type.
# NOTE(review): only the 'wm' cohort is handled in this cell; for any other
# c.COHORT_TYPE these names stay undefined here - confirm they are set elsewhere.
if c.COHORT_TYPE == 'wm':
    # EAC incidence: overall, then localized / regional / distant
    inc_weight = 5
    loc_inc_weight = reg_inc_weight = dist_inc_weight = 0.1
    # EAC mortality: overall, then localized / regional / distant
    mort_weight = 10
    loc_mort_weight = 5
    reg_mort_weight = dist_mort_weight = 0.1
    # Prevalence targets
    be_weight = 10000
    gerd_weight = 30000
    nd_weight = lgd_weight = hgd_weight = 5000
    # Unstaged targets get zero weight, so they never contribute to the fit
    unst_inc_weight = unst_mort_weight = 0

In [None]:


# LOAD TARGET DATA AS NUMPY MATRICES
def get_targets(pop):
    '''Return the calibration target for `pop` as a numpy matrix.

    pop: one of
        'gerd', 'nd', 'lgd', 'hgd'  - constant-valued targets of shape
                                      (c.NUM_TARGET_YEARS, c.NUM_TARGET_AGES)
        'be'                        - BE prevalence read from Excel
        '<stage>_eac'               - EAC incidence read from Excel
        '<stage>_mort'              - EAC mortality read from Excel
      where <stage> is all, loc, reg, dis or unst.

    Any other value prints an error message and returns None.
    '''
    # Flat targets: the same value for every year and every age.
    #   gerd - GERD prevalence
    #   nd / lgd / hgd - proportion of patients with BE that have nd / lgd / hgd
    flat_values = {'gerd': 0.2, 'nd': 0.9, 'lgd': 0.07, 'hgd': 0.03}
    if pop in flat_values:
        return np.full((c.NUM_TARGET_YEARS, c.NUM_TARGET_AGES), flat_values[pop])

    if pop == 'be':
        # BE prevalence
        # Row index: years 1975-2013 (1975-2009 for am and af)
        # Column index: ages 20-84
        be_sheet = pd.read_excel("data/be_prev/BE_prevalence_target.xlsx",
                                 sheet_name=c.COHORT_TYPE, index_col="Year")
        return be_sheet.to_numpy()

    # EAC incidence: one workbook per stage, one sheet per cohort type
    # Row index: 1975-2016
    # Column index: 20-84
    incidence_workbooks = {
        'all_eac': "data/eac_incidence/EAC_SEER9_Incidence_20190430_Incid_All.xlsx",
        'loc_eac': "data/eac_incidence/EAC_SEER9_Incidence_20190430_Incid_Local.xlsx",
        'reg_eac': "data/eac_incidence/EAC_SEER9_Incidence_20190430_Incid_Regional.xlsx",
        'dis_eac': "data/eac_incidence/EAC_SEER9_Incidence_20190430_Incid_Distant.xlsx",
        'unst_eac': "data/eac_incidence/EAC_SEER9_Incidence_20190430_Incid_Unstaged.xlsx",
    }
    if pop in incidence_workbooks:
        incidence_sheet = pd.read_excel(incidence_workbooks[pop],
                                        index_col=0, sheet_name=c.COHORT_TYPE)
        return incidence_sheet.to_numpy()

    # EAC mortality: one workbook per cohort type, one sheet per stage
    # Row index: 1975-2016
    # Column index: 20-84
    mortality_sheets = {
        'all_mort': "All",
        'loc_mort': "Localized",
        'reg_mort': "Regional",
        'dis_mort': "Distant",
        'unst_mort': "Unstaged",
    }
    if pop in mortality_sheets:
        mortality_sheet = pd.read_excel("data/eac_mort/EAC_SEER9_5YrMort_20190430_" + c.COHORT_TYPE + ".xlsx",
                                        index_col=0, sheet_name=mortality_sheets[pop])
        return mortality_sheet.to_numpy()

    print("Error: Input not recognized")

In [None]:
# Get alive population, gerd population, and be population from state matrix for plotting
# Outputs numpy matrix
def get_prevalence_pop(state, pop):
    '''Extract one population of interest from the Markov model state matrix.

    state: numpy array indexed as state[year, age, month, health_state]
           (assumed 4-D; axis meaning taken from the original comments in this
           notebook - TODO confirm against nh.run_markov).
    pop:   'alive', 'gerd', 'be', 'nd', 'lgd' or 'hgd'.

    Returns a float64 matrix of shape (c.NUM_TARGET_YEARS, c.NUM_TARGET_AGES)
    taken at month index 0:
        'alive' - 1 minus the sum of health states 15 and 16 (the death states)
        'gerd'  - health state 1
        'be'    - sum of health states 2..6
        'nd'    - health state 2
        'lgd'   - health state 4
        'hgd'   - health state 6

    An unrecognized `pop` prints an error message and returns None.
    Raises IndexError if `state` has fewer years or ages than the target window.
    '''
    # Selector along the last axis for every supported population.
    state_selectors = {
        'alive': [15, 16],
        'gerd': 1,
        'be': slice(2, 7),
        'nd': slice(2, 3),
        'lgd': slice(4, 5),
        'hgd': slice(6, 7),
    }
    if pop not in state_selectors:
        print("Error: Input not recognized")
        return None

    n_years, n_ages = c.NUM_TARGET_YEARS, c.NUM_TARGET_AGES
    # Slicing would silently truncate; fail loudly instead when the state
    # matrix does not cover the whole target window.
    if state.shape[0] < n_years or state.shape[1] < n_ages:
        raise IndexError(
            "state matrix of shape %s does not cover the %d x %d target window"
            % (state.shape, n_years, n_ages))

    # Month-0 snapshot restricted to the target years and ages.
    month0 = state[:n_years, :n_ages, 0]
    selected = month0[:, :, state_selectors[pop]]
    # A single-state selector ('gerd') is already 2-D; the others need summing
    # over the selected health states.
    totals = selected.sum(axis=-1) if selected.ndim == 3 else selected
    if pop == 'alive':
        totals = 1 - totals
    # Always hand back a fresh float64 matrix, never a view into `state`.
    return totals.astype(np.float64)

In [None]:
def get_eac_pop(array, pop):
    '''Pick one stage's matrix of new cancer cases or deaths out of `array`.

    array: numpy array whose third axis is indexed by stage
           (0 = all, 1 = localized, 2 = regional, 3 = distant, 4 = unstaged).
    pop:   '<stage>_eac' or '<stage>_mort', where <stage> is
           all, loc, reg, dis or unst.

    Returns array[:, :, stage_index]. An unrecognized `pop` prints an error
    message and returns None.
    '''
    stage_positions = (('all', 0), ('loc', 1), ('reg', 2), ('dis', 3), ('unst', 4))
    for stage, position in stage_positions:
        # Incidence and mortality names share the same stage index.
        if pop in (stage + '_eac', stage + '_mort'):
            return array[:, :, position]
    print("Error: No stage specified")

In [None]:
# Goodness-of-fit functions
def gof(obs, exp):
    '''Sum of squared differences between observed and expected values.

    Note: despite the historical "chi-squared" label, the squared differences
    are NOT divided by the expected values; this is a plain sum of squared errors.

    obs, exp: array-likes of the same (or broadcastable) shape.

    Returns the squared differences summed along the first axis: a 1-D array of
    per-column totals for 2-D input, or a scalar for 1-D input. Callers that
    want a single number for a matrix call .sum() on the result.
    '''
    # np.asarray lets plain lists through as well as numpy arrays.
    squared_error = (np.asarray(obs) - np.asarray(exp)) ** 2
    # np.sum(axis=0) reduces in numpy instead of iterating rows in Python,
    # which is what the builtin sum() did here.
    return np.sum(squared_error, axis=0)

def calc_total_gof(state, all_diagnosed_cancer, all_cancer_deaths):
    '''Weighted total goodness-of-fit across every calibration target.

    state:                Markov state matrix (passed to get_prevalence_pop)
    all_diagnosed_cancer: per-stage new cancer cases (passed to get_eac_pop)
    all_cancer_deaths:    per-stage cancer deaths (passed to get_eac_pop)

    For each target the model output is compared with get_targets(name) via
    gof(), summed to a scalar, multiplied by that target's module-level weight
    and added to the running total, which is returned.
    '''
    # Denominator for prevalence among the living and for per-100,000 rates
    alive_pop = get_prevalence_pop(state, 'alive')

    def among_alive(name):
        # gerd / be: proportion of the alive population
        return np.divide(get_prevalence_pop(state, name), alive_pop)

    def among_be(name):
        # nd / lgd / hgd: proportion of the BE population
        numerator = get_prevalence_pop(state, name)
        be_pop = get_prevalence_pop(state, 'be')
        return np.divide(numerator, be_pop)

    def incidence_rate(name):
        # new cancer cases per 100,000 alive
        return np.divide(get_eac_pop(all_diagnosed_cancer, name), alive_pop) * 100_000

    def mortality_rate(name):
        # cancer deaths per 100,000 alive
        return np.divide(get_eac_pop(all_cancer_deaths, name), alive_pop) * 100_000

    # (target name, weight, how to build the model matrix); evaluated in this order
    schedule = (
        ('gerd', gerd_weight, among_alive),
        ('be', be_weight, among_alive),
        ('nd', nd_weight, among_be),
        ('lgd', lgd_weight, among_be),
        ('hgd', hgd_weight, among_be),
        ('all_eac', inc_weight, incidence_rate),
        ('loc_eac', loc_inc_weight, incidence_rate),
        ('reg_eac', reg_inc_weight, incidence_rate),
        ('dis_eac', dist_inc_weight, incidence_rate),
        ('unst_eac', unst_inc_weight, incidence_rate),
        ('all_mort', mort_weight, mortality_rate),
        ('loc_mort', loc_mort_weight, mortality_rate),
        ('reg_mort', reg_mort_weight, mortality_rate),
        ('dis_mort', dist_mort_weight, mortality_rate),
        ('unst_mort', unst_mort_weight, mortality_rate),
    )

    running_total = 0
    for name, weight, build_model in schedule:
        model = build_model(name)
        target = get_targets(name)
        running_total += gof(model, target).sum() * weight
    return running_total

In [None]:
# Functions for running simulated annealing algorithm
def select_new_params(step, old_param):
    '''Draw a new parameter uniformly from old_param +/- (step * old_param).

    step:      proportion by which the parameter may change (between 0 and 1);
               does not depend on temperature
    old_param: current parameter value

    Returns the sampled value.
    '''
    half_width = old_param * step
    lower_bound = old_param - half_width
    upper_bound = old_param + half_width
    return np.random.uniform(lower_bound, upper_bound)

def change_trans_vals(row, column, step):
    '''Return a perturbed value for row[column] of a transition-parameter row.

    Rows whose 'calibrate' entry is not 'Yes' keep their value unchanged.
    For calibrated rows, a missing value yields np.nan; otherwise a new value
    is drawn with select_new_params(step, row[column]).
    '''
    # Not flagged for calibration: pass the value through untouched
    if row['calibrate'] != 'Yes':
        return row[column]
    # Nothing to perturb when the entry is missing
    if pd.isnull(row[column]):
        return np.nan
    return select_new_params(step, row[column])

def generate_trans_df(current_params, step = 0.3):
    '''Build a randomly perturbed copy of the transition-parameter dataframe.

    current_params: pandas dataframe with 'calibrate', 'b' and 'm' columns
    step:           proportional step size handed to change_trans_vals

    Column 'b' is perturbed first, then column 'm'; the input dataframe is
    copied and the copy is returned.
    '''
    candidate = current_params.copy()
    for col in ('b', 'm'):
        candidate[col] = candidate.apply(change_trans_vals, step=step, column=col, axis=1)
    return candidate

def change_trend_val(row, column, step):
    '''Return a perturbed value for row[column] of a trend-parameter row.

    A missing value yields np.nan; otherwise a new value is drawn with
    select_new_params(step, row[column]).
    '''
    # Nothing to perturb when the entry is missing
    if pd.isnull(row[column]):
        return np.nan
    return select_new_params(step, row[column])

def generate_trend_df(current_params, step = 0.3):
    '''Build a randomly perturbed copy of the trend-parameter dataframe.

    current_params: pandas dataframe holding the trend columns listed below
    step:           proportional step size handed to change_trend_val

    The columns are perturbed one after another in the listed order; the input
    dataframe is copied and the copy is returned.
    '''
    trend_columns = ('base', 'increase', 'offset', 'spread',
                     'increase2', 'offset2', 'spread2')
    candidate = current_params.copy()
    for col in trend_columns:
        candidate[col] = candidate.apply(change_trend_val, step=step, column=col, axis=1)
    return candidate


def apply_factors(trans_params):
    '''Recompute the ratio-linked rows of the transition-parameter table.

    trans_params: table of transition parameters with 'm' (slope) and 'b'
                  (intercept) columns, addressed by row label via .loc
                  (presumably a pandas DataFrame - confirm against caller).

    Works on trans_params.copy() and returns that copy. After the call, for
    both 'm' and 'b':
        row 1 = row 3 * norm_gerd_be_ratio
        row 2 = row 1 * ls_ss_ratio        (using the row 1 value just set)
        row 4 = row 3 * ls_ss_ratio
        row 5 = row 8 * ss_ls_progression
        row 6 = row 9 * ss_ls_progression
        row 7 = row 10 * ss_ls_progression
    The statement order matters: row 2 is assigned twice and only the second
    assignment survives.
    '''
    trans = trans_params.copy()
    # Apply norm_gerd_be_ratio
    # Makes it so transition to SSND or LSND is higher from GERD than Norm
    # Changing Norm_SSND based on GERD_SSND (logistic function)
    trans.loc[1,'m'] = trans.loc[3,'m'] * norm_gerd_be_ratio # slope
    trans.loc[1,'b'] = trans.loc[3,'b'] * norm_gerd_be_ratio # intercept
    # Changing Norm_LSND based on GERD_LSND (logistic function)
    # NOTE: these two row-2 values are overwritten by the ls_ss_ratio step below
    trans.loc[2,'m'] = trans.loc[4,'m'] * norm_gerd_be_ratio # slope
    trans.loc[2,'b'] = trans.loc[4,'b'] * norm_gerd_be_ratio # intercept

    # Apply ls_ss_ratio
    # Makes it so long-segment is less prevalent than short-segment
    # Changing Norm_LSND based on Norm_SSND (logistic function)
    # Row 1 was already rescaled above, so row 2 ends up as
    # row 3 * norm_gerd_be_ratio * ls_ss_ratio
    trans.loc[2,'m'] = trans.loc[1,'m'] * ls_ss_ratio # slope
    trans.loc[2,'b'] = trans.loc[1,'b'] * ls_ss_ratio # intercept
    # Changing GERD_LSND based on GERD_SSND (logistic function)
    trans.loc[4,'m'] = trans.loc[3,'m'] * ls_ss_ratio # slope
    trans.loc[4,'b'] = trans.loc[3,'b'] * ls_ss_ratio # intercept

    # Apply ls/ss progression rate
    # Makes it so short-segment has smaller progression rate than long-segment
    # Changing SSND_SSLD based on LSND_LSLD (linear function)
    trans.loc[5,'m'] = trans.loc[8,'m'] * ss_ls_progression # slope
    trans.loc[5,'b'] = trans.loc[8,'b'] * ss_ls_progression # intercept
    # Changing SSLD_SSHD based on LSHD_LSHD (linear function)
    # NOTE(review): "LSHD_LSHD" is presumably a typo for LSLD_LSHD - confirm
    trans.loc[6,'m'] = trans.loc[9,'m'] * ss_ls_progression # slope
    trans.loc[6,'b'] = trans.loc[9,'b'] * ss_ls_progression # intercept
    # Changing SSHD_ULoc based on LSHD_ULoc (double logistic)
    trans.loc[7,'m'] = trans.loc[10,'m'] * ss_ls_progression # slope
    trans.loc[7,'b'] = trans.loc[10,'b'] * ss_ls_progression # intercept
    return trans

In [None]:
def acceptance_prob(old_gof, new_gof, T):
    '''Probability of accepting a candidate solution at temperature T.

    old_gof: goodness-of-fit of the current solution (lower is better)
    new_gof: goodness-of-fit of the candidate solution
    T:       current annealing temperature

    Returns 1 when the candidate is strictly better, otherwise
    exp((old_gof - new_gof) / T).
    '''
    is_improvement = new_gof < old_gof
    return 1 if is_improvement else np.exp((old_gof - new_gof) / T)

In [None]:

# Simulated annealing algorithm
def anneal(init_trans_params, init_trend_params):
    '''Run simulated annealing over the transition and trend parameters.

    init_trans_params: starting transition-parameter dataframe
    init_trend_params: starting trend-parameter dataframe

    At every temperature level, sim_anneal_params['iterations'] candidates are
    generated from the current solution, scored with calc_total_gof and
    accepted with probability acceptance_prob(...). The temperature starts at
    'starting_T', is multiplied by 'cooling_rate' after each level, and the
    loop ends once it is no longer above 'final_T'.

    Returns the last accepted (trans_df, trend_df) pair.
    '''
    # Score the starting point
    current_trans, current_trend = init_trans_params, init_trend_params
    state, cancer_cases, cancer_deaths = nh.run_markov(current_trans, current_trend)
    current_gof = calc_total_gof(state, cancer_cases, cancer_deaths)
    print("old_gof:", current_gof)

    temperature = sim_anneal_params['starting_T']

    # Annealing schedule
    while temperature > sim_anneal_params['final_T']:
        # Sampling at this temperature
        for attempt in range(sim_anneal_params['iterations']):
            # Propose candidate parameters, then re-impose the ratio constraints
            candidate_trans = generate_trans_df(current_trans)
            candidate_trend = generate_trend_df(current_trend)
            candidate_trans = apply_factors(candidate_trans)

            # Score the candidate
            cand_state, cand_cases, cand_deaths = nh.run_markov(candidate_trans, candidate_trend)
            candidate_gof = calc_total_gof(cand_state, cand_cases, cand_deaths)
            print("new_gof:", candidate_gof)
            accept_p = acceptance_prob(current_gof, candidate_gof, temperature)
            print("ap:", accept_p)

            # Rejected candidates leave the current solution untouched
            if not (np.random.uniform() < accept_p):
                continue

            current_trans = candidate_trans
            current_trend = candidate_trend
            current_gof = candidate_gof
            print(temperature, attempt, candidate_gof)

        temperature = temperature * sim_anneal_params['cooling_rate']

    return current_trans, current_trend