# Lynch Syndrome CRC calibration
Author: Sophie Wagner <br>
Contact: sw3767@cumc.columbia.edu

## Notebook configuration

In [1]:
import sys
import os

import numpy as np  # For matrix manipulation
import pandas as pd  # For output/input data processing
import matplotlib.pyplot as plt  # For visualizations
from csaps import csaps  # For smoothing splines
from scipy.interpolate import interp1d # For interpolation
from tqdm import tqdm  # For progress bars


sys.path.append(os.path.abspath("../src"))

# Remove previously imported modules to ensure fresh imports
for m in ["utils", "calibration", "configs"]:
    if m in sys.modules:
        del sys.modules[m]

import utils
import calibration
import configs.global_configs as c
import configs.inputs as inputs
import utils.common_functions as func
import calibration.plots as pl
import calibration.gof as gof


# Some aesthetic options
utils.add_cell_timer()
np.set_printoptions(
    suppress=True, linewidth=300, formatter={"float": "{: 0.9f}".format}
)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

## Load inputs

In [10]:
# Lifetables (monthly ACM rate in 5-year age groups)
lt_f = inputs.lt_5y["prob_death_female_m"].iloc[4:15]
lt_m = inputs.lt_5y["prob_death_male_m"].iloc[4:15]

# Starting population vector
starting_pop = np.zeros((len(c.SEXES),len(c.GENES),c.n_states,1))
starting_pop[:,:,0,:] = c.POPULATION_SIZE

## Matrix configuration

In [12]:
def row_normalize(matrix):
    matrix = np.clip(matrix, 0.0, 0.6)
        # Iterate over the first three dimensions to access each (n_states, n_states) matrix
    for sex in range(matrix.shape[0]):
        for gene in range(matrix.shape[1]):
            for age_layer in range(matrix.shape[2]): 
                layer = matrix[sex, gene, age_layer]
                # Calculate the sum of non-diagonal elements for each row
                sum_of_rows = np.sum(layer, axis=1) - np.diag(layer)
                # Set the diagonal elements
                np.fill_diagonal(layer, 1 - sum_of_rows)
    return matrix


def create_matrix():
    # Initialize matrix 2 x 4 x n_age_layers x n_states x n_states
    # This gives us a n_states x n_states matrix for each age layer for each gene-gender combination
    matrix = np.zeros(
        (len(c.SEXES), len(c.GENES), len(c.AGE_LAYERS), c.n_states, c.n_states)
    )

    matrix[:, :, :, 0, 1] = func.probtoprob(0.005)  # healthy to lr_polyp
    matrix[:, :, :, 1, 2] = func.probtoprob(0.025)  # lr_polyp to hr_polyp
    matrix[:, :, :, 2, 3] = func.probtoprob(0.075)  # hr_polyp to u_stage_1
    matrix[:, :, :, 3, 4] = func.probtoprob(0.10)  # u_stage_1 to u_stage_2
    matrix[:, :, :, 4, 5] = func.probtoprob(0.30)  # u_stage_2 to u_stage_3
    matrix[:, :, :, 5, 6] = func.probtoprob(0.60)  # u_stage_3 to u_stage_4
    matrix[:, :, :, 3, 7] = func.probtoprob(0.15)  # u_stage_1 to d_stage_1
    matrix[:, :, :, 4, 8] = func.probtoprob(0.2)  # u_stage_2 to d_stage_2
    matrix[:, :, :, 5, 9] = func.probtoprob(0.4)  # u_stage_3 to d_stage_3
    matrix[:, :, :, 6, 10] = func.probtoprob(0.5)  # u_stage_4 to d_stage_4

    matrix = add_acm(matrix)  # ACM
    matrix = add_csd(matrix)  # CSD
    matrix = constrain_matrix(matrix)  # constrain
    matrix = row_normalize(matrix)  # normalize

    return matrix


def constrain_matrix(matrix):
    matrix = np.clip(matrix, 0.0, 0.6)

    # Progression Block
    # TODO: come back 
    matrix[:, :, :, 0, 1] = np.maximum(0.000001, matrix[:, :, :, 0, 1])  # not below 0
    matrix[:, :, :, 1, 2] = np.maximum(matrix[:, :, :, 0, 1], matrix[:, :, :, 1, 2])
    matrix[:, :, :, 2, 3] = np.maximum(matrix[:, :, :, 1, 2], matrix[:, :, :, 2, 3])
    # Cancer progression
    matrix[:, :, :, 3, 4] = np.maximum(matrix[:, :, :, 2, 3], matrix[:, :, :, 3, 4])
    matrix[:, :, :, 4, 5] = np.maximum(matrix[:, :, :, 3, 4], matrix[:, :, :, 4, 5])
    matrix[:, :, :, 5, 6] = np.maximum(matrix[:, :, :, 5, 6], matrix[:, :, :, 4, 5])

    # Detection Block
    matrix[:, :, :, 3, 7] = np.maximum(0.000001, matrix[:, :, :, 3, 6])
    matrix[:, :, :, 4, 8] = np.maximum(matrix[:, :, :, 3, 7], matrix[:, :, :, 4, 8])
    matrix[:, :, :, 5, 9] = np.maximum(matrix[:, :, :, 4, 8], matrix[:, :, :, 5, 9])
    matrix[:, :, :, 6, 10] = np.maximum(matrix[:, :, :, 5, 9], matrix[:, :, :, 6, 10])

    # Age dependencies
    for s in range(c.n_sexes):
        for g in range(c.n_genes):
            matrix[s, g, :, 0, 1] = np.maximum.accumulate(matrix[s, g, :, 0, 1]) # healthy to lr_polyp

    return matrix


def add_acm(matrix):
    for sex, lt in enumerate([lt_m, lt_f]):
        matrix[sex, :, :, :11, 12] = lt  # ACM
        matrix[sex, :, :, 11, 11] = 1  # Stay in CSD
        matrix[sex, :, :, 12, 12] = 1  # Stay in ACM
        matrix[sex, :, :, 13, 13] = 1  # Stay in Colo death (unused during calibration)
    return matrix


def add_csd(matrix):
    matrix[:, :, :, 7, 11] = func.probtoprob(0.05)
    matrix[:, :, :, 8, 11] = func.probtoprob(0.19)
    matrix[:, :, :, 9, 11] = func.probtoprob(0.75)
    matrix[:, :, :, 10, 11] = func.probtoprob(0.90)
    return matrix

## Markov model

In [13]:
def run_markov(tmat, starting_age=20, max_age=74):
    
    current_age = starting_age
    stage, age_layer_idx = 1, 0
    pop_log, month_pop = starting_pop, starting_pop  # (2, 4, 14, 1)
    inc_log = np.zeros(pop_log.shape)  # to track new incidences in each state
    n_years = starting_age - max_age + 1 

    # Initial matrix is (2, 4, 11, 14, 14) (SEX, GENE, AGE_GROUP, HEALTH_STATE, HEALTH_STATE)
    # Transpose health state sub-matrix s.t. we get inverted matrix for multiplication
    tmat_T = tmat.transpose(0,1,2,4,3)
    inflow_tmat = np.tril(tmat_T, k=-1)
    
    while current_age <= max_age:
        
        # Matrix multiplication (state transition) -- batched for each (SEX, GENE) combination
        month_inc = np.matmul(inflow_tmat[:,:,age_layer_idx,:,:], month_pop)  # (2, 4, 14, 14)(2, 4, 14, 1)->(2, 4, 14, 1)
        month_pop = np.matmul(tmat_T[:,:,age_layer_idx,:,:], month_pop)  # (2, 4, 14, 14)(2, 4, 14, 1)->(2, 4, 14, 1)
        
        # Add to log
        inc_log = np.concatenate((inc_log, month_inc), axis=3)
        pop_log = np.concatenate((pop_log, month_pop), axis=3)
        
        stage += 1
        if stage % 12 == 0:
            current_age += 1
            if current_age in c.AGE_LAYERS.keys(): 
                age_layer_idx = age_layer_idx+1

    # Adjustment: incidence and prevalence should be out of living only
    dead_factor = np.divide(c.POPULATION_SIZE, c.POPULATION_SIZE - pop_log[:,:,11:,:].sum(axis=2))
    prevalence_adj = np.zeros(pop_log.shape)  # (2, 4, 14, 55)
    incidence_adj, incidence_unadj = inc_log.copy(), inc_log.copy()

    for state in range(14):
        incidence_adj[:,:,state,:] = np.multiply(inc_log[:,:,state,:], dead_factor)
        prevalence_adj[:,:,state,:] = np.multiply(pop_log[:,:,state,:], dead_factor)

    # Transform into annual counts. For incidence, we sum; for prevalence, we average. 
    incidence_unadj = incidence_unadj.reshape(c.n_sexes, c.n_genes, c.n_states, n_years, 12).sum(axis=4)  # getting inc unadjusted
    incidence_adj = incidence_adj.reshape(c.n_sexes, c.n_genes, c.n_states, n_years, 12).sum(axis=4)  # getting annual incidence (rate per 100k)
    prevalence_adj = prevalence_adj.reshape(c.n_sexes, c.n_genes, c.n_states, n_years, 12).mean(axis=4)  # getting mean annual prevalence
    
    return incidence_adj, prevalence_adj, incidence_unadj, pop_log

## Calibration

### Step function
1. Select random combination of (sex, gene, age layer, transition prob)
2. Take a random step (using step size) to adjust the selected transition probability
3. (Optional / later into calibration) Spline across age groups such that we don't get too abnormal of a value / smoother transitions across age
4. Check matrix constraints

In [14]:
calibration_tps = list(c.calibration_tps_itos.keys())
n_calibration_tps = len(calibration_tps)
age_mids = [22.5,27.5,32.5,37.5,42.5,47.5,52.5,57.5,62.5,67.5,72.5]

def step(matrix, step_size, num_adj=10, smooth=0.01):
    new_matrix = np.copy(matrix)
    step_tp = np.random.choice(n_calibration_tps, size=num_adj, replace=True)
    step_age = np.random.choice(c.n_age_layers, size=num_adj, replace=True)
    step_sex = np.random.choice(c.n_sexes, size=num_adj, replace=True)
    step_gene = np.random.choice(c.n_genes, size=num_adj, replace=True)

    for i in range(num_adj):
        (from_state, to_state) = calibration_tps[step_tp[i]]
        new_matrix[step_sex[i], step_gene[i], step_age[i], from_state, to_state] += np.random.uniform(low=-step_size, high=step_size)
    
    # Limit potential increase before splining across ages 
    new_matrix[:,:,c.n_age_layers-1,:,:] = np.minimum(new_matrix[:,:,c.n_age_layers-2,:,:],new_matrix[:,:,c.n_age_layers-1,:,:])  
    new_matrix = csaps(age_mids, new_matrix, smooth=smooth, axis=2)(age_mids).clip(0.0,1.0)
    
    new_matrix = constrain_matrix(new_matrix)
    new_matrix = add_acm(new_matrix)
    new_matrix = add_csd(new_matrix)
    new_matrix = row_normalize(new_matrix)

    return new_matrix

### Objective function (goodness of fit)
Here, we calculate how far off we are from our model targets, and produce a goodness-of-fit score. This is calculated by the mean-squared error between the model output and the target (i.e. mean((prediction-target)**2)). Our calibration targets are:
1. Stage distribution at diagnosis -- from Myles
2. Cumulative CRC incidence by gene and sex -- from Dominguez et al 2024 splined curves (see "data cleaning" folder)
3. Cumulative adenoma risk by gene at age 60 -- from Myles

In [None]:
stage_target = np.array([
    inputs.stage_dist_target_dict['stage_1'],
    inputs.stage_dist_target_dict['stage_2'],
    inputs.stage_dist_target_dict['stage_3'],
    inputs.stage_dist_target_dict['stage_4'],
])  # shape (4,)

def objective(log, weight_type="sqrt"):
    inc_unadj = log[2]  # (sex, gene, state, age/month)

    # --- CRC stage distribution error (PER GENE, same target vector for all) ---
    stage_idx = [7, 8, 9, 10]  # stage_1..stage_4

    # Sum over sex and time; keep (gene, stage)
    # Result shape: (n_genes, 4)
    stage_totals_gene = inc_unadj[:, :, stage_idx, :].sum(axis=(0, 3))
    denom = np.maximum(stage_totals_gene.sum(axis=1, keepdims=True), 1e-12)  # per-gene totals
    stage_props_gene = stage_totals_gene / denom  # (n_genes, 4)

    # MSE across the 4 stages -> (n_genes,)
    stage_err_by_gene = ((stage_props_gene*100 - stage_target*100)**2).mean(axis=1)
    
    # --- CRC Incidence Error (per gene, sums over sex with sex-specific targets) ---
    err_inc_by_gene = np.zeros(c.n_genes)
    for g, gene in enumerate(c.GENES):
        e = 0.0
        for s, sex in enumerate(c.SEXES):
            # sum cancer states, cum over time, per 100 persons
            model_inc = (inc_unadj[s, g, 7:11, :].sum(axis=0).cumsum() / c.POPULATION_SIZE) * 100
            target_inc = inputs.incidence_target[gene][sex]
            e += np.square(model_inc - target_inc).sum()
        err_inc_by_gene[g] = e

    # --- Adenoma Risk Error by age 60 (per gene) ---
    # 2:4 = LR/HR adenoma 
    # :40 = ages/months to age 60 
    err_adn_by_gene = np.zeros(c.n_genes)
    for g, gene in enumerate(c.GENES):
        model_val = inc_unadj[:, g, 2:4, :40].sum() / (c.POPULATION_SIZE * c.n_sexes) * 100
        target_val = inputs.polyp_targets_dict[gene] * 100
        err_adn_by_gene[g] = (model_val - target_val)**2

    # --- Combine ---
    score_by_gene = err_inc_by_gene + err_adn_by_gene + stage_err_by_gene # (n_genes,)
    score_by_gene = np.asarray(score_by_gene, dtype=float)
    total_score = score_by_gene.sum()  # Total score (scalar)
    return total_score, score_by_gene

def simulated_annealing(n_iterations, step_size, num_adj = 10, smooth=0.01, starting_temp = 1, starting_tmat=None, verbose=False):

    if starting_tmat is None:
        best_tmat = create_matrix()
    else: 
        best_tmat = starting_tmat.copy()
    best_eval_total, best_eval_by_gene = objective(run_markov(best_tmat)) # evaluate the initial point
    curr_tmat, curr_eval_total, curr_eval_by_gene = best_tmat.copy(), best_eval_total.copy(), best_eval_by_gene.copy()  # current working solution

    for i in range(n_iterations):  # Running algorithm

        # Run and evaluate new candidate transition matrix
        candidate_tmat = curr_tmat.copy()
        candidate_tmat = step(candidate_tmat, step_size, num_adj, smooth)
        candidate_eval_total, candidate_eval_by_gene = objective(run_markov(candidate_tmat))  # Evaluate candidate point

        # If outcomes are better, accept it as current best (per gene selective update)
        improve_idx = np.where(candidate_eval_by_gene < best_eval_by_gene)[0]
        if improve_idx.size > 0:
            for idx in improve_idx:
                best_tmat[:, idx, :, :, :] = candidate_tmat[:, idx, :, :, :].copy()
                curr_tmat[:, idx, :, :, :] = candidate_tmat[:, idx, :, :, :].copy()
                best_eval_by_gene[idx] = candidate_eval_by_gene[idx]
                curr_eval_by_gene[idx] = candidate_eval_by_gene[idx]
                best_eval_total = best_eval_by_gene.sum()
                curr_eval_total = curr_eval_by_gene.sum()

        # Report progress
        if verbose and i%(10_000)==0:
            print('>%d score = %.5f' % (i, best_eval_total))     

        # Accept worse solution with a small probability
        t = starting_temp /(1+np.log(i+1))  
        diff = candidate_eval_total - curr_eval_total  # difference between candidate and current point evaluation
        metropolis = np.exp(-diff / t)  # calculate metropolis acceptance criterion
        if diff < 0 or np.random.random() < metropolis:  # check if we should keep the new point
            curr_tmat, curr_eval_total, curr_eval_by_gene = candidate_tmat, candidate_eval_total, curr_eval_total  # store the new current point

    print(best_eval_total)
    return best_tmat

### Iterate
Below, we use the simulated annealing process to select the next candidate parameters.
1. Run markov model and calculate GOF (initial)
2. Adjust parameters randomly using the `step()` function
3. Run markov model with new tmat and calculate GOF (candidate)
4. If candidate GOF is better than initial GOF, use candidate
5. If candidate GOF is worse than initial GOF, mainly use initial, but with small probability we may use candidate (worse)
    (This introduces more randomness to escape local minima, and is why simulated annealing is said to have convergence to optimal parameter set with infinite time and loose enough constraints for it to actually search the parameter space)

## Run model

In [48]:
tmat = create_matrix()  # Create initial transition matrix
result = simulated_annealing(
    n_iterations=10000, 
    step_size=0.01,
    starting_tmat=tmat, 
    verbose=True)

>0 score = 168240.85283


TypeError: 'numpy.float64' object does not support item assignment

## Plot outputs

In [None]:
result_log = run_markov(result)
inc_unadj = result_log[2]
# (2,1,14,55)

ages = np.arange(20,75,1)

plt.plot(ages, (inputs.incidence_target['MLH1']['male'])/100, '--', label="Target")
plt.plot(ages, (inc_unadj[0,0,7:11,:].sum(axis=0).cumsum())/100_000, label="Model")
plt.title("Male MLH1 CRC incidence")
plt.show()

plt.plot(ages, (inputs.incidence_target['MLH1']['female'])/100, '--', label="Target")
plt.plot(ages, (inc_unadj[1,0,7:11,:].sum(axis=0).cumsum())/100_000, label="Model")
plt.title("Female MLH1 CRC incidence")
plt.legend()
plt.show()

plt.plot(ages, (inc_unadj[0,0,1:3,:].sum(axis=0).cumsum())/100_000, label="Model")
plt.title("Male MLH1 polyp incidence")
plt.show()

plt.plot(ages, (inc_unadj[1,0,1:3,:].sum(axis=0).cumsum())/100_000, label="Model")
plt.title("Female MLH1 polyp incidence")
plt.legend()
plt.show()

plt.plot(list(c.AGE_LAYERS.keys()), result[0, 0, :, 0, 1], color="blue")
plt.plot(list(c.AGE_LAYERS.keys()), result[1, 0, :, 0, 1], color="red")
plt.title("Transition from healthy to low-risk polyp")
plt.show()