In [1]:
import numpy as np
import pandas as pd
rng = np.random.default_rng(12345)

from lymph.models import Unilateral
from lymixture import LymphMixture
from lymixture.utils import binom_pmf, late_binomial, normalize
from fixtures import get_patient_data
# Two fixed parameter sets, labelled C1 and C2.
# NOTE(review): presumably the values the synthetic data set was generated
# with; neither constant is referenced in the cells visible below - confirm.
PARAMS_C1 = dict(
    TtoII_spread=0.5,
    TtoIII_spread=0.25,
    TtoIV_spread=0.1,
    IItoIII_spread=0.4,
    IIItoIV_spread=0.3,
    late_p=0.5,
)
PARAMS_C2 = dict(
    TtoII_spread=0.65,
    TtoIII_spread=0.15,
    TtoIV_spread=0.05,
    IItoIII_spread=0.5,
    IIItoIV_spread=0.4,
    late_p=0.5,
)

In [2]:
# The CSV has three header rows, which pandas combines into a three-level
# column MultiIndex (e.g. ("path", "ipsi", "II") or ("tumor", "1", "subsite")).
data = pd.read_csv("data/mixture.csv", header=list(range(3)))

data


Unnamed: 0_level_0,path,path,tumor,tumor
Unnamed: 0_level_1,ipsi,ipsi,1,1
Unnamed: 0_level_2,II,III,t_stage,subsite
0,False,True,late,c1
1,False,True,early,c1
2,False,True,early,c1
3,False,False,early,c1
4,False,True,early,c1
...,...,...,...,...
2995,True,True,early,c3
2996,True,True,late,c3
2997,False,False,early,c3
2998,True,False,early,c3


In [3]:
# Number of mixture components to fit.
num_components = 2

# Graph passed to every component model: keys are (node type, name) tuples,
# values list the names of the nodes that node points to.
graph = {
    ("tumor", "T"): ["II", "III"],
    ("lnl", "II"): ["III"],
    ("lnl", "III"): [],
}

mixture = LymphMixture(
    model_cls=Unilateral,
    model_kwargs={"graph_dict": graph},
    num_components=num_components,
    universal_p=False,
)

# Patients are split on the ("tumor", "1", "subsite") column; the identity
# mapping hands each label through unchanged.
mixture.load_patient_data(
    data,
    split_by=("tumor", "1", "subsite"),
    mapping=lambda label: label,
)

Set the diagnostic modality to be the same as in the generated dataset.

In [4]:
# Register a single modality called "path" with both numeric arguments set
# to 1.0 (the repr of the result reports them as spec and sens).
mixture.set_modality("path", 1.0, 1.0)
mixture.get_all_modalities()

{'path': Clinical(spec=1.0, sens=1.0, is_trinary=False)}

Fix the distribution over diagnosis times for early T-stage (T1 & T2) to be a binomial distribution with parameter $p=0.3$.

The late T-stage's diagnosis time distribution is a binomial one with a free model parameter that needs to be learned as well.

In [5]:
# Early T-stage: a fixed probability vector, binom_pmf evaluated on the
# support 0..10 with the arguments 10 and 0.3.
mixture.set_distribution("early", binom_pmf(np.arange(0, 11), 10, 0.3))
# Late T-stage: the callable itself is passed instead of a fixed vector.
mixture.set_distribution("late", late_binomial)
mixture.get_all_distributions()

{'early': Distribution([0.0282475249, 0.121060821, 0.23347444050000002, 0.266827932, 0.20012094900000002, 0.1029193452, 0.03675690900000001, 0.009001692, 0.0014467005000000002, 0.00013778100000000004, 5.904900000000001e-06]),
 'late': Distribution([0.0009765625, 0.009765625, 0.0439453125, 0.1171875, 0.205078125, 0.24609375, 0.205078125, 0.1171875, 0.0439453125, 0.009765625, 0.0009765625])}

Initialize random model parameters and latent variables/responsibilities.

In [6]:
from lymixture.em import expectation, maximization, maximization_component_wise

# One uniform draw from the seeded generator per model parameter.
params = {name: rng.uniform() for name in mixture.get_params()}
mixture.set_params(**params)
mixture.normalize_mixture_coefs()
# NOTE(review): `params` still holds the raw draws here; it is not refreshed
# from the mixture after the coefficients were normalized - confirm intended.

# Random initial responsibilities with the same shape as the mixture's.
# The transposed array is normalized along axis 0 and transposed back.
latent = normalize(
    rng.uniform(size=mixture.get_resps().shape).T,
    axis=0,
).T

In [7]:
def to_numpy(params: dict[str, float]) -> np.ndarray:
    """Return the values of ``params``, in the dict's iteration order, as an array."""
    values = list(params.values())
    return np.array(values)

In [13]:
def check_convergence(params_history, likelihood_history, steps_back_list):
    """Decide whether the iteration has converged.

    The newest entry of each history is compared with the entries lying
    ``steps_back`` positions before it, for every value in ``steps_back_list``.
    Convergence is reported as soon as either

    * the newest parameter values are ``np.allclose`` to an earlier set, or
    * the newest likelihood is within an absolute tolerance of 0.01 of an
      earlier one *and* is at least as large as every recorded likelihood.

    Parameters
    ----------
    params_history : sequence of dict
        Parameter dicts, oldest first. Their values are compared positionally,
        so all dicts must have the same number of entries in the same order.
    likelihood_history : sequence of float
        Likelihood values, oldest first, aligned with ``params_history``.
    steps_back_list : iterable of int
        Look-back distances to test. Distances that reach beyond the available
        history are skipped instead of raising ``IndexError``.

    Returns
    -------
    bool
        ``True`` if one of the criteria is met (a message naming the criterion
        is printed), ``False`` otherwise.
    """
    # Loop-invariant quantities are computed once, up front.
    current_values = np.array(list(params_history[-1].values()))
    current_likelihood = likelihood_history[-1]
    is_best_so_far = np.all(current_likelihood >= np.array(likelihood_history))

    for steps_back in steps_back_list:
        # Index -steps_back - 1 only exists if the history is long enough.
        if steps_back >= len(params_history) or steps_back >= len(likelihood_history):
            continue

        previous_values = np.array(list(params_history[-steps_back - 1].values()))
        if np.allclose(current_values, previous_values):
            print('stopped due to parameter similarity')
            return True  # Return True if any of the steps is close

        previous_likelihood = likelihood_history[-steps_back - 1]
        if is_best_so_far and np.isclose(
            current_likelihood, previous_likelihood, rtol=0, atol=0.01
        ):
            print('stopped due to likelihood similarity')
            return True
    return False

Iterate the computation of the expectation value of the latent variables (E-step) and the maximization of the (complete) data log-likelihood w.r.t. the model parameters (M-step).

In [10]:
# Number of steps to look back for convergence
look_back_steps = 3

# Both histories start with the initial state, so entry i + 1 belongs to the
# result of EM iteration i.
params_history = [params.copy()]
likelihood_history = [mixture.likelihood(use_complete=False)]

is_converged = False
count = 0
while not is_converged:
    print(count)
    print(likelihood_history[-1])

    latent = expectation(mixture, params)   # E-step
    params = maximization(mixture, latent)  # M-step

    # Append current params and likelihood to history
    params_history.append(params.copy())
    likelihood_history.append(mixture.likelihood(use_complete=False))

    # Only test for convergence once enough history is available for every
    # look-back distance (the bound follows `look_back_steps`, not a literal).
    if count >= look_back_steps:
        is_converged = check_convergence(
            params_history,
            likelihood_history,
            list(range(1, look_back_steps + 1)),
        )
    count += 1

0
-5813.194372245713
1
-4084.220667757898
2
-4050.880519942173
3
-4021.507938763605
4
-3984.5774800092263
5
-3939.993293088375
6
-3894.9257237766406
7
-3857.3210093687558
8
-3829.899459408352
9
-3811.459326568134
10
-3799.8175591598165
11
-3792.860590895957
12
-3788.729645262283
13
-3786.79249002104
14
-3785.7250309161027
15
-3785.1415100261192
16
-3784.822618143944
17
-3784.6485580441563
18
-3784.552739521725
19
-3784.498385854542
20
-3784.4667313970867
21
-3784.4468904822897
22
-3784.433764288513
stopped due to likelihood similarity


In [14]:
# Display the parameter dict returned by the last M-step of the loop above.
params

{'0_TtoII_spread': 0.2541706932445177,
 '0_TtoIII_spread': 0.08153734768917623,
 '0_IItoIII_spread': 0.044918400622614645,
 '0_late_p': 0.42363422817323076,
 '0_c1_coef': 0.0005849904436662762,
 '0_c2_coef': 0.9999995334201446,
 '0_c3_coef': 0.45360786467209496,
 '1_TtoII_spread': 0.07361096211878104,
 '1_TtoIII_spread': 0.2518831807057421,
 '1_IItoIII_spread': 6.610696135189607e-05,
 '1_late_p': 0.47329409796776206,
 '1_c1_coef': 0.9994150095563338,
 '1_c2_coef': 4.665798554192631e-07,
 '1_c3_coef': 0.546392135327905}

Faster version using component-wise maximization

In [14]:
# Number of steps to look back for convergence
look_back_steps = 3

# Both histories start with the current state (the parameters left behind by
# the previous cell), so entry i + 1 belongs to the result of iteration i.
params_history = [params.copy()]
likelihood_history = [mixture.likelihood(use_complete=False)]

is_converged = False
count = 0
while not is_converged:
    print(count)
    print(likelihood_history[-1])

    latent = expectation(mixture, params)                  # E-step
    params = maximization_component_wise(mixture, latent)  # M-step

    # Append current params and likelihood to history
    params_history.append(params.copy())
    likelihood_history.append(mixture.likelihood(use_complete=False))

    # Only test for convergence once enough history is available for every
    # look-back distance (the bound follows `look_back_steps`, not a literal).
    if count >= look_back_steps:
        is_converged = check_convergence(
            params_history,
            likelihood_history,
            list(range(1, look_back_steps + 1)),
        )
    count += 1

0
-3783.9044529312505
1
-3783.9044529312505
2
-3783.9023468430787
3
-3783.900254092534
stopped due to likelihood similarity


In [15]:
# Read the current parameters back from the mixture object as a dict.
mixture.get_params(as_dict=True)

{'0_TtoII_spread': 0.25867734764522227,
 '0_TtoIII_spread': 0.07462686584696727,
 '0_IItoIII_spread': 0.05806141219185204,
 '0_late_p': 0.42474186931682645,
 '0_c1_coef': 0.06288631180588661,
 '0_c2_coef': 0.9803864662413907,
 '0_c3_coef': 0.4783617554889759,
 '1_TtoII_spread': 0.06533842194762927,
 '1_TtoIII_spread': 0.2600007248774467,
 '1_IItoIII_spread': 0.13315742875163752,
 '1_late_p': 0.4735048782595678,
 '1_c1_coef': 0.9371136881941134,
 '1_c2_coef': 0.019613533758609303,
 '1_c3_coef': 0.521638244511024}

In [29]:
# Likelihood with default arguments; the EM loops above pass
# use_complete=False instead, so the values are not directly comparable.
mixture.likelihood()

-3783.4990931760067

## Sample model parameters

In [19]:
from lymixture.em import sample_model_params, get_complete_samples

samples = sample_model_params(mixture, steps=20)

# Pick 50 distinct samples. The notebook's seeded generator `rng` is used
# instead of the unseeded global `np.random`, so the subset is the same on
# every Restart & Run All.
indices = rng.choice(len(samples), 50, replace=False)
reduced_set = samples[indices]
complete_samples = get_complete_samples(mixture, reduced_set)

100%|██████████| 20/20 [00:04<00:00,  4.87it/s]


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
