# DR CRC Calibration

Author: Sophie Wagner, sw3767@cumc.columbia.edu

In [1]:
# Required Packages
import numpy as np  # For matrix manipulation
import pandas as pd  # For output/input data processing
import matplotlib.pyplot as plt  # For visualizations
from csaps import csaps
from scipy.interpolate import interp1d
from tqdm import tqdm
from datetime import datetime

# Add the src directory to the Python path
import sys
import os
sys.path.append(os.path.abspath('../src'))

# Load .py files
import common_functions as func
import calibration_plots as p
import configs as c
import gof


# Display settings: print NumPy floats in fixed-point notation with nine
# decimals on wide lines, and let pandas show every row and column.
np.set_printoptions(
    suppress=True,
    linewidth=300,
    formatter={"float": "{: 0.9f}".format},
)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

### Matrix setup, normalization, constraints

In [8]:
def row_normalize(matrix):
    """
    Reset each diagonal entry so that every row of every layer sums to one.

    Args:
        matrix (numpy.ndarray): Stack of square matrices indexed as
            (layer, from_state, to_state). Modified in place.

    Returns:
        numpy.ndarray: The same array object that was passed in.
    """
    for layer in matrix:  # each item is a writable view of one layer
        # Row total without the current diagonal value
        off_diagonal_mass = layer.sum(axis=1) - layer.diagonal()
        # The diagonal takes whatever probability is left over
        np.fill_diagonal(layer, 1 - off_diagonal_mass)
    return matrix


def create_matrix():
    """
    Build the starting transition matrix for calibration.

    The matrix has one (state x state) layer per entry in c.age_layers. Every
    layer starts from the same seed probabilities, each passed through
    func.probtoprob (presumably a conversion to the model's cycle length --
    confirm in common_functions). Mortality is then added, the constraints are
    applied, and the diagonals are filled so each row sums to one.

    Returns:
        numpy.ndarray: Array of shape
            (len(c.age_layers), len(c.health_states), len(c.health_states)).
    """
    n_states = len(c.health_states)
    matrix = np.zeros((len(c.age_layers), n_states, n_states))

    # (from_state, to_state) -> seed probability handed to func.probtoprob
    seed_probs = {
        (0, 1): 0.005,  # Healthy to LR
        (1, 2): 0.015,  # LR to HR
        (2, 3): 0.05,   # HR to uLoc
        (3, 4): 0.45,   # uLoc to uReg
        (4, 5): 0.50,   # uReg to uDis
        (3, 6): 0.20,   # uLoc to dLoc
        (4, 7): 0.60,   # uReg to dReg
        (5, 8): 0.90,   # uDis to dDis
    }
    for (from_state, to_state), prob in seed_probs.items():
        matrix[:, from_state, to_state] = func.probtoprob(prob)

    # ACM -> CSD -> constrain -> normalize, applied in that order
    for transform in (add_acm, add_csd, constrain_matrix, row_normalize):
        matrix = transform(matrix)

    return matrix


def constrain_matrix(matrix):
    """
    Clip a transition matrix and enforce ordering along two transition chains.

    All entries are first clipped to [0, 0.5]. Along each chain the first
    transition is floored at 1e-6 and every later transition is raised to at
    least the value of the one before it, separately for each age layer.

    Args:
        matrix (numpy.ndarray): Array indexed as (age, from_state, to_state).
            It is not modified; np.clip returns a new array.

    Returns:
        numpy.ndarray: The clipped and constrained copy.
    """
    floor = 0.000001
    bounded = np.clip(matrix, 0.0, 0.5)

    chains = (
        ((0, 1), (1, 2), (2, 3), (3, 4), (4, 5)),  # Progression block
        ((3, 6), (4, 7), (5, 8)),                  # Detection block
    )
    for chain in chains:
        head_from, head_to = chain[0]
        bounded[:, head_from, head_to] = np.maximum(floor, bounded[:, head_from, head_to])
        # Each later link may not fall below its (already constrained) predecessor
        for (prev_from, prev_to), (next_from, next_to) in zip(chain, chain[1:]):
            bounded[:, next_from, next_to] = np.maximum(
                bounded[:, prev_from, prev_to], bounded[:, next_from, next_to]
            )

    # Age dependencies (disabled): an earlier draft forced some transitions to
    # be non-decreasing with age via np.maximum.accumulate.

    return bounded


def add_acm(matrix):
    """
    Write all-cause mortality (ACM) transitions into the matrix, in place.

    c.acm_rate is indexed along the age axis (one value per layer -- TODO
    confirm shape in configs). State 0 goes to column 10, states 1-2 to
    column 12, states 3-5 to column 13 and states 6-8 to column 11. States
    9-13 are made absorbing with a self-transition of 1.

    Args:
        matrix (numpy.ndarray): Array indexed as (age, from_state, to_state).

    Returns:
        numpy.ndarray: The same array object that was passed in.
    """
    matrix[:, 0, 10] = c.acm_rate  # Healthy to ACM

    acm_per_age = c.acm_rate[:, np.newaxis]  # broadcast across a block of rows
    acm_targets = (
        (slice(1, 3), 12),  # Polyp to ACM
        (slice(3, 6), 13),  # Undiagnosed to ACM
        (slice(6, 9), 11),  # Cancer to ACM
    )
    for from_states, acm_col in acm_targets:
        matrix[:, from_states, acm_col] = acm_per_age

    # Stay in CSD (9), ACM (10), Cancer ACM (11), Polyp ACM (12), uCRC ACM (13)
    for absorbing in range(9, 14):
        matrix[:, absorbing, absorbing] = 1

    return matrix


def add_csd(matrix):
    """
    Write the transitions from states 6, 7 and 8 into state 9, in place.

    Column k of c.csd_rate supplies the per-age values for state 6 + k
    (state 9 is labelled CSD elsewhere in this file).

    Args:
        matrix (numpy.ndarray): Array indexed as (age, from_state, to_state).

    Returns:
        numpy.ndarray: The same array object that was passed in.
    """
    for rate_col, dx_state in enumerate((6, 7, 8)):
        matrix[:, dx_state, 9] = c.csd_rate[:, rate_col]
    return matrix

### Markov model

In [9]:
def run_markov(matrix, starting_age=20, max_age=100):
    """
    Run the monthly Markov cohort model and summarise it by year.

    Args:
        matrix (numpy.ndarray): Transition matrix indexed as
            (age_layer, from_state, to_state).
        starting_age (int): Age at the first cycle.
        max_age (int): Last age that is simulated (inclusive).

    Returns:
        tuple: (incidence, prevalence, incidence_unadj, pop_log) where
            incidence is the annual sum of monthly inflows, rescaled to the
            living population; prevalence is the annual mean of the monthly
            state occupancy, rescaled the same way; incidence_unadj is the
            annual sum of monthly inflows without rescaling; and pop_log is
            the monthly state occupancy with one column per month, including
            the starting population.
    """
    last_layer = matrix.shape[0] - 1  # hold the final age layer once ages run out
    current_age = starting_age
    stage, age_layer = 1, 0
    month_pop = c.starting_pop

    # Collect monthly columns in lists and concatenate once at the end;
    # growing an array with np.concatenate inside the loop is quadratic.
    pop_columns = [c.starting_pop]
    inc_columns = [np.zeros(c.starting_pop.shape)]  # no inflow recorded at time zero

    matrixT = matrix.transpose(0, 2, 1)
    # Strictly lower triangle of the transposed matrix: moves from a
    # lower-indexed state into a higher-indexed one, i.e. new arrivals only.
    inflow_matrix = np.tril(matrixT, k=-1)

    while current_age <= max_age:
        # State transition for one month
        month_inc = np.matmul(inflow_matrix[age_layer], month_pop)
        month_pop = np.matmul(matrixT[age_layer], month_pop)

        inc_columns.append(month_inc)
        pop_columns.append(month_pop)

        # stage starts at 1, so the first year holds the starting column plus
        # 11 transitions and every later year holds 12; the log therefore
        # always has a multiple of 12 columns.
        stage += 1
        if stage % 12 == 0:
            current_age += 1
            age_layer = min(age_layer + 1, last_layer)

    inc_log = np.concatenate(inc_columns, axis=1)
    pop_log = np.concatenate(pop_columns, axis=1)

    # Incidence and prevalence denominators count the living only:
    # rows 9 and above are the death states.
    dead_factor = np.divide(c.N, c.N - pop_log[9:, :].sum(axis=0))
    incidence = inc_log * dead_factor  # broadcasts the per-month factor over states
    prevalence = pop_log * dead_factor

    # Fold months into (state, year, month); the year count is inferred so
    # non-default age ranges work too.
    n_states = len(c.health_states)
    incidence = incidence.reshape(n_states, -1, 12).sum(axis=2)  # annual incidence
    incidence_unadj = inc_log.reshape(n_states, -1, 12).sum(axis=2)  # annual incidence, unadjusted
    prevalence = prevalence.reshape(n_states, -1, 12).mean(axis=2)  # mean annual prevalence

    return incidence, prevalence, incidence_unadj, pop_log

### Simulated annealing

In [10]:
def step(matrix, step_size, num_adj=5):
    """
    Propose a neighbouring transition matrix for simulated annealing.

    Every fifth age layer starting at index 2 (up to index 62) is used as a
    spline knot. num_adj randomly chosen (transition, knot) entries are
    perturbed uniformly within +/- step_size times that transition's mean
    across knots. The knots are then smoothed back onto ages 20-99 with csaps,
    and the result is constrained, given mortality entries and re-normalised.

    Args:
        matrix (numpy.ndarray): Current matrix indexed as
            (age, from_state, to_state). It is not modified.
        step_size (float): Relative width of the perturbation.
        num_adj (int): Number of entries to perturb (drawn with replacement).

    Returns:
        numpy.ndarray: The proposed transition matrix.
    """
    # Draw order (transitions, ages, then one uniform per adjustment) is kept
    # so seeded runs reproduce.
    picked_transitions = np.random.choice(len(c.points), size=num_adj, replace=True)
    picked_knots = np.random.choice(len(c.age_layers_5y), size=num_adj, replace=True)

    # View into a private copy, so the caller's matrix is untouched
    knots = np.copy(matrix)[2:65:5, :, :]

    for transition_idx, knot_idx in zip(picked_transitions, picked_knots):
        from_state, to_state = c.points[transition_idx]
        half_width = np.mean(knots[:, from_state, to_state]) * step_size
        knots[knot_idx, from_state, to_state] += np.random.uniform(low=-half_width, high=half_width)

    # Limit potential increase before splining: the last knot may not exceed
    # the one before it.
    knots[12, :, :] = np.minimum(knots[11, :], knots[12, :, :])

    knot_ages = [22.5, 27.5, 32.5, 37.5, 42.5, 47.5, 52.5, 57.5, 62.5, 67.5, 72.5, 77.5, 82.5]
    spline = csaps(knot_ages, knots, smooth=0.01, axis=0)
    proposal = spline(np.arange(20, 100, 1)).clip(0.0, 1.0)

    for transform in (constrain_matrix, add_acm, add_csd, row_normalize):
        proposal = transform(proposal)

    return proposal

In [11]:
def progress_report(iteration, best_eval, best_log, ticker, best_t):
    """
    Print a one-line status during simulated annealing.

    Every 50000th iteration the transition probabilities of the best matrix
    are printed as well.

    Args:
        iteration (int): Current iteration number.
        best_eval (float): Best objective value so far.
        best_log (tuple): The four-element result of run_markov for the best
            matrix; only the third element (unadjusted incidence) is used.
        ticker (int): Counter shown as "Tick".
        best_t (numpy.ndarray): Best transition matrix so far.
    """
    _, _, unadjusted_inc, _ = best_log
    # Total inflow into states 6-8 over the whole run, as a share of c.N
    crc_share = np.sum(unadjusted_inc[6:9, :]) / c.N
    print(f"{iteration}: Best Eval: {best_eval:.5f}, CRC: {crc_share:.5f}, Tick: {ticker}")

    if iteration % 50000 != 0:
        return

    transition_probs = p.extract_transition_probs(best_t, c.health_states, c.desired_transitions)
    print(f"Detailed Progress Report, Iteration = {iteration}")
    p.print_trans_probs(transition_probs)

In [12]:
def simulated_annealing(n_iterations, step_size, start_tmat=None, n_adj=7, verbose=False, starting_temp=1, print_interval=2500, obj=""):
    """
    Performs simulated annealing to optimize a transition matrix.

    Args:
        n_iterations (int): Number of iterations for optimization.
        step_size (float): Step size for parameter adjustments.
        start_tmat (numpy.ndarray): Initial transition matrix. When None, the
            matrix from create_matrix() is used.
        n_adj (int): Number of parameters to adjust per step.
        verbose (bool): Whether to print progress reports.
        starting_temp (float): Initial temperature for annealing.
        print_interval (int): Interval for progress reporting.
        obj (str): Passed through unchanged to gof.objective.

    Returns:
        numpy.ndarray: The best transition matrix found.
    """
    if start_tmat is None:
        start_tmat = create_matrix()

    best_t = np.copy(start_tmat)
    best_log = run_markov(best_t)
    # Score the log already computed instead of running the model a second time
    # (assumes gof.objective does not modify the log it is given).
    best_eval = gof.objective(best_log, -1, obj)
    curr_t, curr_eval = best_t, best_eval
    ticker = 0  # iterations since the best solution last improved

    with tqdm(total=n_iterations, desc="Simulated annealing progress", unit="iteration") as pbar:

        for i in range(n_iterations):

            # Propose a neighbour of the current point and evaluate it.
            # step() works on its own copy, so curr_t is never modified.
            candidate_t = step(curr_t, step_size, n_adj)
            candidate_log = run_markov(candidate_t)
            candidate_eval = gof.objective(candidate_log, i, obj)

            # Track the best point seen so far
            if candidate_eval < best_eval:
                ticker = 0
                best_t, best_eval, best_log = candidate_t, candidate_eval, candidate_log
            else:
                ticker += 1

            # Metropolis acceptance, applied exactly once per iteration.
            # Improvements are always accepted; the exponential is only
            # evaluated for diff >= 0, where it lies in (0, 1] and cannot
            # overflow.
            t = starting_temp / (1 + np.log(i + 1))
            diff = candidate_eval - curr_eval
            if diff < 0 or np.random.random() < np.exp(-diff / t):
                curr_t, curr_eval = candidate_t, candidate_eval

            # Print progress report
            if verbose and i > 0 and i % print_interval == 0:
                progress_report(i, best_eval, best_log, ticker, best_t)

            pbar.update(1)

    print(best_eval)

    return best_t

In [13]:
def run_sa(tmat=None, save_all=False, n_iterations=50000, step_size=0.2, n_adj=5, obj=""):
    """
    Run one simulated-annealing calibration and report (optionally save) the result.

    Args:
        tmat (numpy.ndarray): Starting transition matrix. When None, the
            matrix from create_matrix() is used.
        save_all (bool): When True, save the matrix (.npy), the plots and a
            CSV of the calibrated transitions under c.OUTPUT_PATHS, with a
            timestamp in the file names. When False, only print the
            transition probabilities.
        n_iterations (int): Number of annealing iterations.
        step_size (float): Step size forwarded to simulated_annealing.
        n_adj (int): Number of parameters adjusted per step.
        obj (str): Passed through unchanged to gof.objective.

    Returns:
        numpy.ndarray: The calibrated transition matrix.
    """
    start_tmat = tmat if tmat is not None else create_matrix()
    initial_score = gof.objective(run_markov(start_tmat), -1, obj)
    print(f"Initial score: {round(initial_score, 5)}")
    print("Starting calibration...")

    # Hand over start_tmat (not tmat) so the create_matrix() fallback is used
    # when no matrix was supplied.
    result = simulated_annealing(n_iterations=n_iterations, step_size=step_size, start_tmat=start_tmat, n_adj=n_adj, verbose=True, obj=obj)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")

    curr_tmat = result.copy()
    curr_log = run_markov(curr_tmat)

    # Extract transition probabilities
    transition_probs = p.extract_transition_probs(curr_tmat, c.health_states, c.desired_transitions)

    # Saving
    if save_all:
        # Save with the timestamp in the filenames
        tmat_path, plot_path, probs_path = c.OUTPUT_PATHS["tmats"], c.OUTPUT_PATHS["plots"], c.OUTPUT_PATHS["probs"]
        np.save(f"{tmat_path}/{timestamp}_tmat.npy", curr_tmat)

        p.print_trans_probs(transition_probs, save_imgs=True, outpath=plot_path, timestamp=timestamp)
        p.print_trans_probs(transition_probs)
        p.plot_tps(curr_tmat, save_imgs=True, outpath=plot_path, timestamp=timestamp)
        p.plot_vs_seer(curr_log, c.seer_inc, save_imgs=True, outpath=plot_path, timestamp=timestamp)
        p.plot_vs_seer_total(curr_log, c.seer_inc, save_imgs=True, outpath=plot_path, timestamp=timestamp)

        # One row per calibrated transition, one column per age layer
        out = np.zeros((len(c.points), curr_tmat.shape[0]))
        for idx, (from_state, to_state) in enumerate(c.points):
            out[idx] = curr_tmat[:, from_state, to_state]

        pd.DataFrame(out).to_csv(f"{probs_path}/{timestamp}_tps.csv")

    else:
        p.print_trans_probs(transition_probs)

    return curr_tmat

1/2/2025 10am: Using our US 1243 model, calibrate to DR incidence and HGPS stage distribution. 

In [None]:
# Calibration driver: start from a previously saved transition matrix and
# refine it over 5 epochs of 100 batches each.
result=np.load("../out/DR/HGPS_I1_P1/tmats/20240924_2148_tmat.npy")
print(f"{c.model_version}, {c.stage}:: incidence {c.inc_factor} | polyp {c.polyp_factor}")

for epoch in range(5):
    print(f"EPOCH {epoch+1}/5 -----------------------------------------------------------------------------")
    for batch in range(100):
        print(f"BATCH {batch+1}/100 ----------------------------------------------")
        # Only the last batch of each epoch writes its outputs to disk
        save = True if batch==99 else False
        # Short run (100 iterations) with obj="pol"; never saved.
        # NOTE(review): "pol" presumably selects a polyp-focused objective -- confirm in gof.objective.
        result = run_sa(result, False, n_iterations=100, step_size = 0.25, n_adj=5, obj="pol")
        # Longer run (1000 iterations) with the default objective, continuing from the result above
        result = run_sa(result, save, n_iterations=1000, step_size = 0.25, n_adj=5)
print("DONE")

DR, HGPS:: incidence 1 | polyp 2
EPOCH 0/2 -----------------------------------------------------------------------------
Initial score: 149649.88833
Starting calibration...


  metropolis = np.exp(-diff / t)
Simulated annealing progress:   0%|          | 6/50000 [00:00<14:18, 58.20iteration/s]

SEER contribution:171.0, (0.0%)
Polyp contribution:151010.0, (100.0%)


Simulated annealing progress:   5%|▌         | 2510/50000 [00:45<19:42, 40.16iteration/s]

2500: Best Eval: 17084.11552, CRC: 0.01573, Tick: 769


Simulated annealing progress:  10%|█         | 5008/50000 [01:28<11:47, 63.58iteration/s]

SEER contribution:15142.0, (93.0%)
Polyp contribution:1186.0, (7.0%)
5000: Best Eval: 15730.46932, CRC: 0.01577, Tick: 77


Simulated annealing progress:  15%|█▌        | 7509/50000 [02:10<15:45, 44.93iteration/s]

7500: Best Eval: 14985.17936, CRC: 0.01585, Tick: 1529


Simulated annealing progress:  20%|██        | 10008/50000 [02:53<14:42, 45.34iteration/s]

SEER contribution:14685.0, (81.0%)
Polyp contribution:3403.0, (19.0%)
10000: Best Eval: 14799.96588, CRC: 0.01629, Tick: 641


Simulated annealing progress:  25%|██▌       | 12512/50000 [03:36<08:02, 77.67iteration/s]

12500: Best Eval: 14031.19924, CRC: 0.01587, Tick: 1304


Simulated annealing progress:  30%|███       | 15011/50000 [04:18<07:36, 76.61iteration/s]

SEER contribution:14502.0, (94.0%)
Polyp contribution:878.0, (6.0%)
15000: Best Eval: 14025.87483, CRC: 0.01563, Tick: 1751


Simulated annealing progress:  35%|███▌      | 17514/50000 [05:01<06:44, 80.21iteration/s]

17500: Best Eval: 14025.87483, CRC: 0.01563, Tick: 4251


Simulated annealing progress:  40%|████      | 20006/50000 [05:44<12:29, 40.00iteration/s]

SEER contribution:15240.0, (94.0%)
Polyp contribution:1048.0, (6.0%)
20000: Best Eval: 13995.48419, CRC: 0.01586, Tick: 259


Simulated annealing progress:  45%|████▌     | 22515/50000 [06:25<06:06, 75.05iteration/s]

22500: Best Eval: 13709.94912, CRC: 0.01634, Tick: 860


Simulated annealing progress:  50%|█████     | 25015/50000 [07:07<05:25, 76.74iteration/s]

SEER contribution:14536.0, (89.0%)
Polyp contribution:1718.0, (11.0%)
25000: Best Eval: 13709.94912, CRC: 0.01634, Tick: 3360


Simulated annealing progress:  55%|█████▌    | 27504/50000 [07:47<06:30, 57.56iteration/s]

27500: Best Eval: 13709.94912, CRC: 0.01634, Tick: 5860


Simulated annealing progress:  60%|██████    | 30014/50000 [08:31<04:09, 80.01iteration/s]

SEER contribution:14553.0, (91.0%)
Polyp contribution:1505.0, (9.0%)
30000: Best Eval: 13709.94912, CRC: 0.01634, Tick: 8360


Simulated annealing progress:  65%|██████▌   | 32514/50000 [09:17<03:43, 78.37iteration/s]

32500: Best Eval: 13709.94912, CRC: 0.01634, Tick: 10860


Simulated annealing progress:  70%|███████   | 35011/50000 [10:00<03:05, 80.75iteration/s]

SEER contribution:15919.0, (96.0%)
Polyp contribution:655.0, (4.0%)
35000: Best Eval: 13476.33552, CRC: 0.01635, Tick: 480


Simulated annealing progress:  75%|███████▌  | 37507/50000 [10:43<04:02, 51.54iteration/s]

37500: Best Eval: 13476.33552, CRC: 0.01635, Tick: 2980


Simulated annealing progress:  80%|████████  | 40003/50000 [12:05<04:52, 34.12iteration/s]

SEER contribution:15053.0, (97.0%)
Polyp contribution:502.0, (3.0%)
40000: Best Eval: 13476.33552, CRC: 0.01635, Tick: 5480


Simulated annealing progress:  85%|████████▌ | 42503/50000 [14:02<08:45, 14.28iteration/s]

42500: Best Eval: 13476.33552, CRC: 0.01635, Tick: 7980


Simulated annealing progress:  90%|█████████ | 45009/50000 [16:00<01:18, 63.84iteration/s]

SEER contribution:15350.0, (97.0%)
Polyp contribution:516.0, (3.0%)
45000: Best Eval: 13476.33552, CRC: 0.01635, Tick: 10480


Simulated annealing progress:  95%|█████████▌| 47516/50000 [16:41<00:30, 80.46iteration/s]

47500: Best Eval: 13476.33552, CRC: 0.01635, Tick: 12980


Simulated annealing progress: 100%|██████████| 50000/50000 [17:21<00:00, 48.00iteration/s]


13476.33552277643
Monthly transition probabilities
healthy to LR_polyp: Min: 0.00000, Max: 0.00039, Average: 0.00023
LR_polyp to HR_polyp: Min: 0.00038, Max: 0.00116, Average: 0.00088
HR_polyp to u_CRC_loc: Min: 0.00038, Max: 0.00417, Average: 0.00168
u_CRC_loc to u_CRC_reg: Min: 0.00309, Max: 0.12876, Average: 0.06274
u_CRC_reg to u_CRC_dis: Min: 0.07917, Max: 0.12876, Average: 0.09930
u_CRC_loc to d_CRC_loc: Min: 0.00000, Max: 0.00974, Average: 0.00469
u_CRC_reg to d_CRC_reg: Min: 0.03388, Max: 0.15985, Average: 0.10628
u_CRC_dis to d_CRC_dis: Min: 0.21147, Max: 0.49143, Average: 0.40838

Annual transition probabilities
healthy to LR_polyp: Min: 0.00001, Max: 0.00462, Average: 0.00280
LR_polyp to HR_polyp: Min: 0.00455, Max: 0.01384, Average: 0.01046
HR_polyp to u_CRC_loc: Min: 0.00455, Max: 0.04892, Average: 0.01993
u_CRC_loc to u_CRC_reg: Min: 0.03647, Max: 0.80873, Average: 0.46960
u_CRC_reg to u_CRC_dis: Min: 0.62835, Max: 0.80873, Average: 0.71049
u_CRC_loc to d_CRC_loc: Min: 0.

In [None]:
# Start: DR HGPS tmat
# NOTE(review): matrices saved by run_sa are named "<timestamp>_tmat.npy" and the
# calibration cell loads ".../20240924_2148_tmat.npy"; this name has no "_tmat"
# suffix -- confirm the file exists under this name.
# NOTE(review): this rebinds the notebook-global name `t`, which is also the name
# imported by `from scipy.stats import sem, t` further down.
t = np.load("../out/DR/HGPS_I1_P1/tmats/20240924_2148.npy")
# One row per calibrated transition in c.points, one column per age layer (80 assumed)
out = np.zeros((len(c.points), 80))
for idx, (from_state, to_state) in enumerate(c.points):
    out[idx] = t[:, from_state, to_state]

# Disabled export; probs_path and timestamp are not defined at notebook level.
#pd.DataFrame(out).to_csv(f"{probs_path}/{timestamp}_tps.csv")

In [28]:
# Write the per-transition probabilities to CSV. The dictionary key uses single
# quotes because reusing the f-string's own quote character inside a replacement
# field is a SyntaxError before Python 3.12.
pd.DataFrame(out).to_csv(f"{c.OUTPUT_PATHS['probs']}/20250102_1233_tps.csv")

## Post-processing

In [54]:

def objective_cp(tmat):
    """
    Score a matrix by how far cancer_progression(tmat) is from 3.0.

    Args:
        tmat (numpy.ndarray): Transition matrix indexed as
            (age, from_state, to_state).

    Returns:
        float: Sum over ages of the squared deviation from 3.0.
    """
    deviation = cancer_progression(tmat) - 3.0
    return 0 + np.square(deviation).sum()
    
def cancer_progression(tmat):
    """
    Per-age progression-time measure over states 3 (loc), 4 (reg) and 5 (dis).

    With T_i = 1 / (1 - p_stay_i), the value computed for each age is
        T_loc * (1 + p(loc->reg) * T_reg * (1 + p(reg->dis) * T_dis)).

    Args:
        tmat (numpy.ndarray): Transition matrix indexed as
            (age, from_state, to_state).

    Returns:
        numpy.ndarray: One value per age layer, in model cycles.
    """
    loc_to_reg = tmat[:, 3, 4]
    reg_to_dis = tmat[:, 4, 5]

    # Expected number of cycles spent in a state once it has been entered
    dwell_loc = 1 / (1 - tmat[:, 3, 3])
    dwell_reg = 1 / (1 - tmat[:, 4, 4])
    dwell_dis = 1 / (1 - tmat[:, 5, 5])

    from_reg = 1 + reg_to_dis * dwell_dis
    from_loc = 1 + loc_to_reg * from_reg * dwell_reg

    return from_loc * dwell_loc
    
    
def sojourn_time_weighted(tm, metric="mean"):
    """
    Cumulative time along the local, regional and distant paths, per age.

    With T_i = 1 / (1 - p_stay_i) for states 3 (loc), 4 (reg) and 5 (dis):
        local    = T_loc
        regional = T_loc + T_reg
        distant  = T_loc + T_reg * p(loc->reg) + T_dis

    Args:
        tm (numpy.ndarray): Transition matrix indexed as
            (age, from_state, to_state).
        metric (str): "mean" returns the average of the three paths per age;
            any other value returns the three paths separately.

    Returns:
        numpy.ndarray: Shape (n_ages,) for "mean", otherwise (3, n_ages).
    """
    dwell = {state: 1 / (1 - tm[:, state, state]) for state in (3, 4, 5)}

    path_loc = dwell[3]
    path_reg = dwell[3] + dwell[4]
    path_dis = (dwell[3] + dwell[4] * tm[:, 3, 4]) + dwell[5]

    paths = np.array([path_loc, path_reg, path_dis])
    if metric == "mean":  # Mean across paths per age
        return np.mean(paths, axis=0)
    return paths  # Each path per age

def sojourn_time_weighted2(tm):
    """
    Time along the local, regional and distant paths, with earlier stages
    weighted by the transition probability into the next stage.

    With T_i = 1 / (1 - p_stay_i) for states 3 (loc), 4 (reg) and 5 (dis):
        local    = T_loc
        regional = T_loc * p(loc->reg) + T_reg
        distant  = T_loc * p(loc->reg) + T_reg * p(reg->dis) + T_dis

    Args:
        tm (numpy.ndarray): Transition matrix indexed as
            (age, from_state, to_state).

    Returns:
        numpy.ndarray: Array of shape (3, n_ages), rows ordered loc, reg, dis.
    """
    dwell_loc = 1 / (1 - tm[:, 3, 3])
    dwell_reg = 1 / (1 - tm[:, 4, 4])
    dwell_dis = 1 / (1 - tm[:, 5, 5])
    loc_to_reg = tm[:, 3, 4]
    reg_to_dis = tm[:, 4, 5]

    return np.array([
        dwell_loc,
        dwell_loc * loc_to_reg + dwell_reg,
        dwell_loc * loc_to_reg + dwell_reg * reg_to_dis + dwell_dis,
    ])

def sojourn_time_in_stage(tmat):
    """
    Mean time spent in each of states 3, 4 and 5, per age layer.

    For each state the value is 1 / (1 - p_stay), where p_stay is the
    self-transition probability. No averaging across states is done.

    Args:
        tmat (numpy.ndarray): Transition matrix indexed as
            (age, from_state, to_state).

    Returns:
        numpy.ndarray: Array of shape (3, n_ages); row k belongs to state 3 + k.
    """
    # Size the result from the matrix so any number of age layers works
    sojourn_times = np.zeros((3, tmat.shape[0]))
    for row, state in enumerate(range(3, 6)):
        p_stay = tmat[:, state, state]
        sojourn_times[row] = 1 / (1 - p_stay)
    return sojourn_times

In [4]:
# Load a previously saved transition matrix (US interp run 20240923_1243) for post-processing
tmat = np.load("../out/US/interp/tmats/20240923_1243_tmat.npy")

In [5]:
from scipy.stats import sem, t

def summarize_data(data):
    """
    Returns the min, max, median, mean, and 95% confidence interval of a dataset.

    The confidence interval is for the mean and uses the Student t
    distribution with n - 1 degrees of freedom.

    Parameters:
        data (list or numpy array): Input data

    Returns:
        dict: Summary statistics with keys "min", "max", "median", "mean" and
            "95% CI" (a (lower, upper) tuple). For empty input the string
            "Data is empty." is returned instead.
    """
    # Resolve the scipy names locally: the notebook-level names `sem` and `t`
    # are easy to overwrite (another cell assigns `t = np.load(...)`).
    from scipy import stats

    if len(data) == 0:
        return "Data is empty."

    # Convert to numpy array for convenience
    data = np.array(data)

    # Summary statistics
    min_val = np.min(data)
    max_val = np.max(data)
    median_val = np.median(data)
    mean_val = np.mean(data)

    # Compute 95% confidence interval
    confidence = 0.95
    n = len(data)
    if n > 1:
        std_err = stats.sem(data)
        # Two-sided interval: half-width is the standard error times the
        # upper (1 + confidence) / 2 quantile of t
        h = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
        ci_lower = mean_val - h
        ci_upper = mean_val + h
    else:
        ci_lower = ci_upper = mean_val  # No confidence interval for a single data point

    return {
        "min": min_val,
        "max": max_val,
        "median": median_val,
        "mean": mean_val,
        "95% CI": (ci_lower, ci_upper)
    }

In [None]:
# Cumulative path times per age layer: the three paths separately (any metric
# other than "mean" returns all three rows) and their per-age mean.
loc, reg, dis = sojourn_time_weighted(tmat, metric="")
total = sojourn_time_weighted(tmat)
print(summarize_data(loc))
print(summarize_data(reg))
print(summarize_data(dis))
print(summarize_data(total))

# Line plot against the age-layer index.
# NOTE(review): the x values are 0-79 (layer index) while the axis is labelled
# "Age"; other cells in this notebook map layers to ages 20-99 -- confirm intent.
plt.plot(np.arange(0,80,1), loc, color="blue", label="L")
plt.plot(np.arange(0,80,1), reg, color="red", label="R")
plt.plot(np.arange(0,80,1), dis, color="green", label="D")
plt.plot(np.arange(0,80,1), total, color="grey", label="All")
plt.title("Sojourn Time by Age (cumulative uL->dX)")
plt.xlabel("Age")
plt.ylabel("Months")
plt.legend()
plt.show()

# seaborn is imported here; the next plotting cell relies on this import having run
import seaborn as sns
# Distribution of the per-age values: normalised histograms with KDE overlays
# (KDE support clipped at zero). Only the histograms carry legend labels here.
plt.hist(loc, bins=30, density=True, alpha=0.6, color="blue", label="Loc")
plt.hist(reg, bins=30, density=True, alpha=0.6, color="red", label="Reg")
plt.hist(dis, bins=30, density=True, alpha=0.6, color="green", label="Dis ")
sns.kdeplot(loc, fill=True, color="blue", alpha=0.3, clip=(0, None))  
sns.kdeplot(reg, fill=True, color="red", alpha=0.3, clip=(0, None))  
sns.kdeplot(dis, fill=True, color="green", alpha=0.3, clip=(0, None))
plt.title("Density Distribution of Sojourn Time by Stage at DX (cumulative uL->dX)")
plt.xlabel("Months")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Same summaries and plots as the previous cell, using the variant that weights
# earlier stages by the transition probability into the next stage.
loc, reg, dis = sojourn_time_weighted2(tmat)
print(summarize_data(loc))
print(summarize_data(reg))
print(summarize_data(dis))
# x values are the age-layer index 0-79 (see the NOTE in the previous plotting cell)
plt.plot(np.arange(0,80,1), loc, color="blue", label="L")
plt.plot(np.arange(0,80,1), reg, color="red", label="R")
plt.plot(np.arange(0,80,1), dis, color="green", label="D")
plt.title("Sojourn Time by Age")
plt.xlabel("Age")
plt.ylabel("Months")
plt.legend()
plt.show()

# Requires `sns` from the seaborn import in the previous plotting cell.
# Here the legend labels are attached to the KDE curves, not the histograms.
plt.hist(loc, bins=30, density=True, alpha=0.6, color="blue")
plt.hist(reg, bins=30, density=True, alpha=0.6, color="red")
plt.hist(dis, bins=30, density=True, alpha=0.6, color="green")
sns.kdeplot(loc, fill=True, color="blue", alpha=0.3, clip=(0, None), label="loc")  
sns.kdeplot(reg, fill=True, color="red", alpha=0.3, clip=(0, None), label="reg")  
sns.kdeplot(dis, fill=True, color="green", alpha=0.3, clip=(0, None), label="dis")  
plt.title("Density Distribution of Sojourn Time")
plt.xlabel("Months")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Summary statistics of the per-age cancer_progression values (shown as the cell output)
summarize_data(cancer_progression(tmat))

In [100]:
def convert_to_conditional_probs(matrix):
    """
    Converts a transition matrix into conditional probabilities for TreeAge.

    Every transition listed in c.points is divided by the probability of
    surviving all-cause mortality from its origin state. Progression
    transitions out of the undiagnosed cancer states (3, 4, 5 -> next state)
    are additionally divided by the probability of not being diagnosed.

    Parameters:
        matrix (numpy.ndarray): Transition matrix of shape (n_ages, n_states, n_states).

    Returns:
        numpy.ndarray: Conditional transition matrix of the same shape.
            The input matrix is not modified.
    """
    conditional_matrix = np.copy(matrix)

    # Loop through all transitions to adjust probabilities. The state names
    # from c.desired_transitions are not needed; the zip is kept so the loop
    # length is unchanged.
    for (from_idx, to_idx), _ in zip(c.points, c.desired_transitions):
        # Survival probability (1 - ACM). The clip is applied to the
        # denominator itself so the division can never hit zero.
        p_survive = np.clip(1 - matrix[:, from_idx, c.acm_states[from_idx]], 1e-10, 1.0)

        # Normalize by survival probability
        conditional_matrix[:, from_idx, to_idx] /= p_survive

        # If transition is progression (e.g., u_CRC_x -> u_CRC_x+1), normalize by p(no_dx)
        if from_idx in [3, 4, 5] and to_idx == from_idx + 1:  # Progression
            dx_state = from_idx + 3  # Corresponding diagnosed state
            p_no_dx = np.clip(1 - matrix[:, from_idx, dx_state], 1e-10, 1.0)
            conditional_matrix[:, from_idx, to_idx] /= p_no_dx
    return conditional_matrix

In [102]:
# Reload the saved US interp transition matrix (run 20240923_1243) before converting it
tmat = np.load("../out/US/interp/tmats/20240923_1243_tmat.npy")

In [103]:
# Conditional version of the matrix, for TreeAge
tmat_c = convert_to_conditional_probs(tmat)

In [107]:
def extract_transition_probs(tmat, type="markov", metric="all"):
    """
    Extracts transition probabilities from a transition matrix into a dataframe.

    Parameters:
        tmat (numpy.ndarray): Transition probability matrix of shape (n_ages, n_states, n_states).
        type (str): "treeage" converts the matrix with
            convert_to_conditional_probs first; any other value uses it as is.
        metric (str): "all" gives one row per (transition, age) with columns
            Age, From State, To State, Probability. "avg" gives one row per
            transition with the values at ages 30 and 75 plus min, max and
            mean over ages, all rounded to 5 decimals.

    Returns:
        pd.DataFrame: Transition probabilities dataframe, or None (after
            printing a message) when metric is not recognised.
    """
    tmat = convert_to_conditional_probs(tmat) if type == "treeage" else tmat
    first_age = 20
    age_range = np.arange(first_age, 100, 1)
    data = []
    df = None

    if metric == "all":
        for (from_idx, to_idx), (from_state, to_state) in zip(c.points, c.desired_transitions):
            for age, probs in zip(age_range, tmat[:, from_idx, to_idx]):
                data.append({
                    "Age": age,
                    "From State": from_state,
                    "To State": to_state,
                    "Probability": probs
                })

        df = pd.DataFrame(data)

    elif metric == "avg":
        for (from_idx, to_idx), (from_state, to_state) in c.transitions_itos.items():
            probs = [round(prob, 5) for prob in tmat[:, from_idx, to_idx]]
            data.append({
                "From State": from_state,
                "To State": to_state,
                # Index by age offset so each column holds the age its label
                # names (the earlier probs[-10] was age 90 on a 20-99 grid).
                "Age 30": probs[30 - first_age],
                "Age 75": probs[75 - first_age],
                "Min": min(probs),
                "Max": max(probs),
                "Avg": round(np.mean(probs), 5)
            })
        df = pd.DataFrame(data)

    else:
        print("Wrong metric specified in extract_transition_probs. Need [avg, all]")

    return df
        

In [108]:
# Long-format table (defaults: type="markov", metric="all"). tmat_c is already
# conditional, so the default type avoids converting it a second time.
df = extract_transition_probs(tmat_c)

In [110]:
# Save the conditional transition probabilities next to the source run's outputs
df.to_csv("../out/US/interp/probs/20240923_1243_c.csv")