DR CRC Calibration

Author: Sophie Wagner, sw3767@cumc.columbia.edu

In [1]:
# Required Packages
import numpy as np  # For matrix manipulation
import pandas as pd  # For output/input data processing
import matplotlib.pyplot as plt  # For visualizations
from csaps import csaps
from scipy.interpolate import interp1d
from tqdm import tqdm
from datetime import datetime

# Add the src directory to the Python path
import sys
import os
sys.path.append(os.path.abspath('../src'))

# Load .py files
import common_functions as func
import calibration_plots as p
import configs as c
import gof


# Some aesthetic options
np.set_printoptions(suppress=True, linewidth=300, formatter={'float': '{: 0.9f}'.format})
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
c.acm_rate.shape

(80,)

In [8]:
def row_normalize(matrix):
    for age_layer in range(matrix.shape[0]):  # Loop over each age layer
        layer = matrix[age_layer]
        # Calculate the sum of non-diagonal elements for each row
        sum_of_columns = np.sum(layer, axis=1) - np.diag(layer)
        # Set the diagonal elements
        np.fill_diagonal(layer, 1 - sum_of_columns)
    return matrix


def create_matrix():
    matrix = np.zeros((len(c.age_layers), len(c.health_states), len(c.health_states)))
    matrix[:, 0, 1] = func.probtoprob(0.005)  # Healthy to LR
    matrix[:, 1, 2] = func.probtoprob(0.015)  # LR to HR
    matrix[:, 2, 3] = func.probtoprob(0.05)  # HR to uLoc
    matrix[:, 3, 4] = func.probtoprob(0.45)  # uLoc to uReg
    matrix[:, 4, 5] = func.probtoprob(0.50)  # uReg to uDis
    matrix[:, 3, 6] = func.probtoprob(0.20)  # uLoc to dLoc
    matrix[:, 4, 7] = func.probtoprob(0.60)  # uReg to dReg
    matrix[:, 5, 8] = func.probtoprob(0.90)  # uDis to dDis

    matrix = add_acm(matrix)  # ACM
    matrix = add_csd(matrix)  # CSD
    matrix = constrain_matrix(matrix)  # constrain
    matrix = row_normalize(matrix)  # normalize

    return matrix


def constrain_matrix(matrix):
    matrix = np.clip(matrix, 0.0, 0.5)

    # Progression Block
    matrix[:, 0, 1] = np.maximum(0.000001, matrix[:, 0, 1])  # not below 0
    matrix[:, 1, 2] = np.maximum(matrix[:, 0, 1], matrix[:, 1, 2])  
    matrix[:, 2, 3] = np.maximum(matrix[:, 1, 2], matrix[:, 2, 3])
    matrix[:, 3, 4] = np.maximum(matrix[:, 2, 3], matrix[:, 3, 4])
    matrix[:, 4, 5] = np.maximum(matrix[:, 3, 4], matrix[:, 4, 5])

    # Detection Block
    matrix[:, 3, 6] = np.maximum(0, matrix[:, 3, 6])
    matrix[:, 4, 7] = np.maximum(matrix[:, 3, 6], matrix[:, 4, 7])
    matrix[:, 5, 8] = np.maximum(matrix[:, 4, 7], matrix[:, 5, 8])
    
    # Age dependencies
    matrix[:, 0, 1] = np.maximum.accumulate(matrix[:,0,1])
    matrix[:, 1, 2] = np.maximum.accumulate(matrix[:,0,1])
    matrix[:, 2, 3] = np.maximum.accumulate(matrix[:,0,1])
    
    # Limit potential increase
    matrix[11,:,:] = np.minimum(matrix[10:,:],matrix[11,:,:])
    matrix[12,:,:] = np.minimum(matrix[11,:,:],matrix[12,:,:])

    return matrix


def add_acm(matrix):
    matrix[:, 0, 10] = c.acm_rate  # Healthy to ACM
    matrix[:, 1:3, 12] = c.acm_rate[:, np.newaxis]  # Polyp to ACM
    matrix[:, 3:6, 13] = c.acm_rate[:, np.newaxis]  # Undiagnosed to ACM
    matrix[:, 6:9, 11] = c.acm_rate[:, np.newaxis]  # Cancer to ACM
    matrix[:, 9, 9] = 1  # Stay in CSD
    matrix[:, 10, 10] = 1  # Stay in ACM
    matrix[:, 11, 11] = 1  # Stay in Cancer ACM
    matrix[:, 12, 12] = 1  # Stay in Polyp ACM
    matrix[:, 13, 13] = 1  # Stay in uCRC ACM

    return matrix


def add_csd(matrix):
    matrix[:, 6, 9] = c.csd_rate[:, 0]
    matrix[:, 7, 9] = c.csd_rate[:, 1]
    matrix[:, 8, 9] = c.csd_rate[:, 2]
    return matrix

In [9]:
def step(matrix, step_size, num_adj=11):
    new_matrix = np.copy(matrix)
    step_mat = np.random.choice(len(c.points), size=num_adj, replace=True)
    step_age = np.random.choice(len(c.age_layers), size=num_adj, replace=True)

    for i in range(num_adj):
        from_state, to_state = c.points[step_mat[i]][0], c.points[step_mat[i]][1]
        step_param = np.mean(matrix[:, from_state, to_state]) * step_size
        new_matrix[step_age[i], from_state, to_state] += np.random.uniform(
            low=-step_param, high=step_param
        )

    new_matrix = constrain_matrix(new_matrix)
    new_matrix = add_acm(new_matrix)
    new_matrix = add_csd(new_matrix)
    new_matrix = row_normalize(new_matrix)

    return new_matrix

In [10]:
def run_markov(matrix, starting_age=20, max_age=84):
    
    current_age = starting_age
    stage = age_layer = 0
    month_pop, pop_log = c.starting_pop, c.starting_pop
    inc_log = np.zeros(pop_log.shape)  # to track new incidences in each state
    matrixT = matrix.transpose(0,2,1)
    inflow_matrix = np.tril(matrixT, k=-1)
    
    while current_age <= max_age:
        
        inflow_mat = inflow_matrix[age_layer]
        month_inc = np.matmul(inflow_mat, month_pop)  # (14,14)(14,1)->(14,1)
        month_pop = np.matmul(matrixT, month_pop)  # (14,14)(14,1)->(14,1)
        
        # Add to log
        inc_log = np.concatenate((inc_log, month_inc), axis=1)
        pop_log = np.concatenate((pop_log, month_pop), axis=1)
        
        stage += 1
        if stage % 12 == 0:
            current_age += 1
            if current_age in c.ages_5y:
                age_layer += 1

    incidence = inc_log.copy()  # make (14,960)
    dead_factor = np.divide(c.N, c.N - pop_log[9:, :].sum(axis=0))  # inc and prev denominator is out of living only
    prevalence = np.zeros(pop_log.shape)  # (14,80)

    for state in range(14):
        incidence[state, :] = np.multiply(incidence[state, :], dead_factor)
        prevalence[state, :] = np.multiply(pop_log[state, :], dead_factor)

    incidence = incidence.reshape(len(c.health_states), len(c.age_layers), 12).sum(axis=2)  # getting annual incidence (rate per 100k)
    incidence_unadj = inc_log.reshape(len(c.health_states), len(c.age_layers), 12).sum(axis=2)  # getting inc unadjusted
    prevalence = prevalence.reshape(len(c.health_states), len(c.age_layers), 12).mean(axis=2)  # getting mean annual prevalence

    return incidence, prevalence, incidence_unadj

In [11]:
def simulated_annealing(
    n_iterations, step_size, start_tmat=None, n_adj=7, verbose=False
):

    if start_tmat is None:
        start_tmat = create_matrix()

    best_t = np.copy(start_tmat)
    best_log = run_markov(best_t)
    best_eval = gof.objective(best_log, 1)  # evaluate the initial point
    curr_t, curr_eval = best_t, best_eval  # current working solution
    ticker = 0

    with tqdm(total=n_iterations, desc="Simulated annealing progress", unit="iteration") as pbar:
        
        for i in range(n_iterations):

            # Run model
            candidate_t = np.copy(curr_t)
            candidate_t = step(candidate_t, step_size, n_adj)
            candidate_log = run_markov(candidate_t)
            candidate_eval = gof.objective(candidate_log, i)  # Evaluate candidate point

            # Update "best" if better than candidate
            if candidate_eval < best_eval:
                ticker = 0
                best_t, best_eval = np.copy(candidate_t), np.copy(candidate_eval)
                best_log = run_markov(best_t)

            else:
                ticker += 1

            # t = 10 / float(i+1)  # calculate temperature for current epoch
            t = 1 / (1 + np.log(i + 1))

            # Progress report
            if verbose and i % 1000 == 0:
                inc_log = best_log[3]
                total_dxd = np.sum(inc_log[6:9, :]) / c.N
                print(i, ": ", best_eval, "   CRC: ", round(total_dxd, 5),"   tick:",ticker)
                if i % 10000 == 0:
                    transition_probs = p.extract_transition_probs(
                        best_t, c.health_states, c.desired_transitions
                    )
                    print(f"Progress report, i = {i}")
                    p.print_trans_probs(transition_probs)

            # Check if we should update "curr"
            diff = (candidate_eval - curr_eval)  # difference between candidate and current point evaluation
            metropolis = np.exp(-diff / t)  # calculate metropolis acceptance criterion
            if (diff < 0 or np.random.random() < metropolis):  # check if we should keep the new point
                curr_t, curr_eval = np.copy(candidate_t), np.copy(candidate_eval)  # store the new current point
                ticker = 0

            pbar.update(1)

    print(best_eval)
    
    return best_t

In [12]:
def run_sa(save_all=False):
    tmat = create_matrix()
    result = simulated_annealing(n_iterations=100000, step_size=0.01, start_tmat=tmat, n_adj=7, verbose=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    
    
    curr_tmat = result.copy()
    curr_log = run_markov(curr_tmat)
    log_adj, log_prev, log_inc = curr_log

    # Extract transition probabilities
    transition_probs = p.extract_transition_probs(curr_tmat, c.health_states, c.desired_transitions)

    # Saving
    if save_all:
        # Save the with the timestamp in the filenames
        output_dir = c.OUTPUT_PATHS[type]
        np.save(f"{output_dir}/tmats/{timestamp}_tmat.npy", curr_tmat)
        pd.DataFrame(log_adj).to_csv(f"{output_dir}/logs/{timestamp}_inc_adj.csv")
        pd.DataFrame(log_prev).to_csv(f"{output_dir}/logs/{timestamp}_prev.csv")
        pd.DataFrame(log_inc).to_csv(f"{output_dir}/logs/{timestamp}_inc_unadj.csv")

        p.print_trans_probs(transition_probs, save_imgs=True, outpath=f"{output_dir}/probs/", timestamp=timestamp)
        p.plot_tps(curr_tmat, save_imgs=True, outpath=f"{output_dir}/plots", timestamp=timestamp)
        p.plot_vs_seer(curr_log, c.seer_inc, save_imgs=True, outpath=f"{output_dir}/plots", timestamp=timestamp)
        p.plot_vs_seer_total(curr_log, c.seer_inc, save_imgs=True, outpath=f"{output_dir}/plots", timestamp=timestamp)
        
        out = np.zeros((len(c.points), len(c.age_layers)))
        for idx, (from_state, to_state) in enumerate(c.points):
            out[idx] = curr_tmat[:, from_state, to_state]

        pd.DataFrame(out).to_csv(f"{output_dir}/tmats/{timestamp}_tps.csv")

    else:
        p.print_trans_probs(transition_probs)
        p.plot_tps(curr_tmat)
        p.plot_vs_seer(curr_log, c.seer_inc)
        p.plot_vs_seer_total(curr_log, c.seer_inc)

    return curr_tmat

In [15]:
result = run_sa()

ValueError: could not broadcast input array from shape (80,) into shape (16,)