# Lynch Syndrome CRC calibration
Author: Sophie Wagner <br>
Contact: sw3767@cumc.columbia.edu

## Notebook configuration

In [None]:
import sys
import os

import numpy as np  # For matrix manipulation
import pandas as pd  # For output/input data processing
import matplotlib.pyplot as plt  # For visualizations
from csaps import csaps  # For smoothing splines
from scipy.interpolate import interp1d
from tqdm import tqdm  # For progress bars


# Make the project's source tree importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
_src_dir = os.path.abspath("../src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Load .py files
# import utils.common_functions as func
# import calibration.plots as p
# import configs.global_configs as c
# import calibration.gof

# Drop the cached project packages *and their submodules* so the imports
# below re-execute the source currently on disk. Removing only the top-level
# package leaves entries such as "utils.common_functions" in sys.modules, and
# the import system would hand those stale objects straight back.
# list(...) snapshots the keys because the dict is mutated while looping.
for m in list(sys.modules):
    if m.split(".", 1)[0] in {"utils", "calibration", "configs"}:
        del sys.modules[m]

import utils
import calibration
import configs.global_configs as c
import utils.common_functions as func
import calibration.plots as pl
import calibration.gof as gof

# Call the project's cell-timer helper from the `utils` package.
# NOTE(review): presumably this registers per-cell execution timing for the
# notebook -- confirm against utils.add_cell_timer.
utils.add_cell_timer()

# Display preferences: never truncate DataFrame rows/columns, and print NumPy
# floats in fixed-point notation (nine decimals, sign-padded) on wide lines.
for _option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_option, None)

np.set_printoptions(
    formatter={"float": "{: 0.9f}".format},
    linewidth=300,
    suppress=True,
)

Matplotlib is building the font cache; this may take a moment.


## Matrix configuration

In [None]:
def row_normalize(matrix):
    """Set every diagonal so each row of each transition matrix sums to 1.

    The last two axes of ``matrix`` are treated as a square
    (from-state, to-state) matrix; any number of leading axes (age layer
    only, or sex x gene x age layer, ...) is supported. For every row the
    diagonal entry is overwritten with ``1 - sum(off-diagonal entries)``.

    Args:
        matrix: Float array of shape ``(..., n_states, n_states)``.

    Returns:
        The same array object, modified in place.
    """
    state_idx = np.arange(matrix.shape[-1])
    # Row totals minus the current diagonal leave the off-diagonal mass.
    off_diagonal_sum = matrix.sum(axis=-1) - matrix[..., state_idx, state_idx]
    # The "stay" probability is whatever is left after all exits.
    matrix[..., state_idx, state_idx] = 1 - off_diagonal_sum
    return matrix


def create_matrix():
    """Build the starting transition-probability array for calibration.

    The array has one ``n_states x n_states`` matrix per
    (sex, gene, age layer) combination, i.e. shape
    ``(len(c.SEXES), len(c.GENES), len(c.AGE_LAYERS), c.n_states, c.n_states)``.
    The same seed values are written for every combination, then the
    mortality, constraint and normalisation steps are applied in that order.

    Returns:
        The array returned by the final ``row_normalize`` step.
    """
    dims = (len(c.SEXES), len(c.GENES), len(c.AGE_LAYERS), c.n_states, c.n_states)
    matrix = np.zeros(dims)

    # (from_state, to_state, value passed through func.probtoprob)
    # NOTE(review): probtoprob presumably rescales a probability to the
    # model's cycle length -- confirm in utils.common_functions.
    seed_transitions = (
        (0, 1, 0.005),  # healthy to lr_polyp
        (1, 2, 0.015),  # lr_polyp to hr_polyp
        (2, 3, 0.05),  # hr_polyp to u_stage_1
        (3, 4, 0.30),  # u_stage_1 to u_stage_2
        (4, 5, 0.70),  # u_stage_2 to u_stage_3
        (5, 6, 0.80),  # u_stage_3 to u_stage_4
        (3, 7, 0.2),  # u_stage_1 to d_stage_1
        (4, 8, 0.5),  # u_stage_2 to d_stage_2
        (5, 9, 0.5),  # u_stage_3 to d_stage_3
        (6, 10, 0.5),  # u_stage_4 to d_stage_4
    )
    for from_state, to_state, prob in seed_transitions:
        matrix[:, :, :, from_state, to_state] = func.probtoprob(prob)

    # Order matters: mortality first, then the monotonicity/clip constraints,
    # and finally the diagonals are recomputed so rows sum to one.
    for step in (add_acm, add_csd, constrain_matrix, row_normalize):
        matrix = step(matrix)

    return matrix


def constrain_matrix(matrix):
    """Clip transition probabilities and enforce monotonic ordering.

    Works on a clipped copy of ``matrix`` (the input is not modified) whose
    last two axes are (from-state, to-state) and whose axis immediately
    before them is the age layer, e.g. ``(sex, gene, age, state, state)``.

    Constraints applied, in order:

    1. Every entry is clipped to ``[0, 0.5]``.
    2. Progression block: 0->1 is at least 1e-6 and each later step along
       1->2, 2->3, 3->4, 4->5, 5->6 is at least as large as the step
       before it.
    3. Detection block: 3->7 is at least 1e-6 and 4->8, 5->9, 6->10 are each
       at least as large as the previous stage's detection probability.
    4. Age dependency: 0->1 is made non-decreasing along the age-layer axis.

    Args:
        matrix: Array of shape ``(..., n_age_layers, n_states, n_states)``.

    Returns:
        A new constrained array with the same shape as ``matrix``.
    """
    matrix = np.clip(matrix, 0.0, 0.5)  # np.clip returns a copy
    floor = 0.000001

    # Progression block: healthy -> polyps -> undetected stages 1..4.
    matrix[..., 0, 1] = np.maximum(floor, matrix[..., 0, 1])  # strictly above 0
    for src in range(1, 6):
        # Each step must be at least as likely as the step leading into it.
        matrix[..., src, src + 1] = np.maximum(
            matrix[..., src - 1, src], matrix[..., src, src + 1]
        )

    # Detection block: undetected stage k (3..6) -> detected stage k (7..10).
    matrix[..., 3, 7] = np.maximum(floor, matrix[..., 3, 7])
    for src in range(4, 7):
        # Later stages are detected at least as readily as earlier ones.
        matrix[..., src, src + 4] = np.maximum(
            matrix[..., src - 1, src + 3], matrix[..., src, src + 4]
        )

    # Age dependencies: after selecting cell (0, 1) the last remaining axis is
    # the age layer, so accumulate along it (the default axis=0 would run the
    # maximum across the leading axis instead).
    # NOTE(review): this can raise 0->1 above 1->2 for later age layers because
    # the progression ordering above is not re-applied afterwards.
    matrix[..., 0, 1] = np.maximum.accumulate(matrix[..., 0, 1], axis=-1)

    return matrix


def add_acm(matrix):
    """Write all-cause mortality and the absorbing death states, per sex.

    For each entry of ``c.SEXES`` (used directly as the index on the first
    axis) the value ``c.acm_rate[sex]`` is written into column 12 (ACM) for
    states 0-10 and into column 11 (CSD) for states 7-9. States 11, 12 and
    13 are then given a self-transition of 1.

    NOTE(review): the 7:10 slice stops before state 10 and reuses the ACM
    rate for cancer-specific death; ``add_csd`` writes its own values into
    column 11 for states 7-10 -- confirm whether this assignment is still
    wanted.

    Args:
        matrix: Array of shape ``(sex, gene, age layer, state, state)``.

    Returns:
        The same array object, modified in place.
    """
    absorbing_states = (
        11,  # Stay in CSD
        12,  # Stay in ACM
        13,  # Stay in Colo death (unused during calibration)
    )
    for sex in c.SEXES:
        rate = c.acm_rate[sex]
        matrix[sex, :, :, :11, 12] = rate  # ACM
        matrix[sex, :, :, 7:10, 11] = rate  # Cancer to CSD
        for state in absorbing_states:
            matrix[sex, :, :, state, state] = 1
    return matrix


def add_csd(matrix):
    """Write the transitions from states 7-10 into state 11.

    The same value is used for every sex, gene and age layer. Each input is
    passed through ``func.probtoprob`` before being stored.

    Args:
        matrix: Array of shape ``(sex, gene, age layer, state, state)``.

    Returns:
        The same array object, modified in place.
    """
    csd_state = 11
    # from-state -> value handed to func.probtoprob
    csd_inputs = {7: 0.05, 8: 0.19, 9: 0.75, 10: 0.90}
    for from_state, prob in csd_inputs.items():
        matrix[:, :, :, from_state, csd_state] = func.probtoprob(prob)
    return matrix

In [None]:
def run_markov(matrix, starting_age=20, max_age=75):
    """Run the monthly Markov cohort model and summarise it by year.

    The cohort starts entirely in state 0 at ``starting_age`` and is advanced
    one month at a time through ``max_age`` inclusive. The monthly logs hold
    ``12 * (max_age - starting_age + 1)`` columns; column 0 is the starting
    distribution (with zero incidence).

    Args:
        matrix: Transition probabilities of shape
            ``(..., n_age_layers, n_states, n_states)`` with the "from" state
            on the second-to-last axis. Leading axes (for example the sex
            and gene axes produced by ``create_matrix``) are carried through
            unchanged, so a single ``(n_age_layers, n_states, n_states)``
            stack yields 2-D outputs.
        starting_age: Age at model start; the first age layer is used here.
        max_age: Last age simulated (inclusive).

    Returns:
        Tuple ``(incidence, prevalence, incidence_unadj, pop_log)``:

        * ``incidence`` -- monthly inflow into each state, scaled by the
          dead factor and summed per year, ``(..., n_states, n_years)``.
        * ``prevalence`` -- monthly occupancy, scaled by the dead factor and
          averaged per year, ``(..., n_states, n_years)``.
        * ``incidence_unadj`` -- yearly sum of the unscaled monthly inflow.
        * ``pop_log`` -- unscaled monthly occupancy,
          ``(..., n_states, 12 * n_years)``.

    Raises:
        ValueError: If ``max_age`` is smaller than ``starting_age``.
    """
    if max_age < starting_age:
        raise ValueError("max_age must be greater than or equal to starting_age")

    n_years = max_age - starting_age + 1
    n_months = 12 * n_years
    n_layers, n_states = matrix.shape[-3], matrix.shape[-1]
    lead_shape = matrix.shape[:-3]
    first_death_state = 11  # states 11+ are the death states (CSD, ACM, ...)

    # Transpose so that (to, from) @ population gives next month's population.
    matrixT = np.swapaxes(matrix, -1, -2)
    # Strictly-lower triangle of the transpose: flow from a lower-numbered
    # state into a higher-numbered one, i.e. new arrivals only (no "stay").
    inflow_matrix = np.tril(matrixT, k=-1)

    # Pre-allocated logs; the last axis is the month index.
    pop_log = np.zeros(lead_shape + (n_states, n_months))
    inc_log = np.zeros_like(pop_log)  # to track new incidences in each state
    month_pop = np.zeros(lead_shape + (n_states,))
    month_pop[..., 0] = 1.0  # everyone starts in state 0
    pop_log[..., 0] = month_pop

    current_age, age_layer = starting_age, 0
    for month in range(1, n_months):
        # Matrix multiplication (state transition)
        mat = matrixT[..., age_layer, :, :]
        inflow_mat = inflow_matrix[..., age_layer, :, :]
        column = month_pop[..., None]  # (..., n_states, 1) column vector
        month_inc = np.matmul(inflow_mat, column)[..., 0]
        month_pop = np.matmul(mat, column)[..., 0]

        # Add to log
        inc_log[..., month] = month_inc
        pop_log[..., month] = month_pop

        # Twelve logged columns complete one year of age.
        if (month + 1) % 12 == 0:
            current_age += 1
            if current_age in c.AGE_LAYERS:
                # Never step past the last age layer present in the matrix.
                age_layer = min(age_layer + 1, n_layers - 1)

    # inc and prev denominator is out of living only
    # NOTE(review): the cohort starts at 1.0, so this factor only equals
    # 1 / (share alive) if c.POPULATION_SIZE is 1 -- confirm the intended
    # scale of pop_log versus POPULATION_SIZE.
    dead = pop_log[..., first_death_state:, :].sum(axis=-2)
    dead_factor = np.divide(c.POPULATION_SIZE, c.POPULATION_SIZE - dead)
    scale = dead_factor[..., None, :]  # broadcast across the state axis

    incidence = inc_log * scale
    prevalence = pop_log * scale

    yearly_shape = lead_shape + (n_states, n_years, 12)
    incidence = incidence.reshape(yearly_shape).sum(axis=-1)  # annual incidence
    incidence_unadj = inc_log.reshape(yearly_shape).sum(axis=-1)  # unadjusted
    prevalence = prevalence.reshape(yearly_shape).mean(axis=-1)  # mean annual prevalence

    return incidence, prevalence, incidence_unadj, pop_log