In [3]:
import warnings

import numpy as np
import pandas as pd

from lymph.models import Unilateral
from lymixture import LymphMixture
from lymixture.em import expectation, maximization
from lymixture.utils import binom_pmf, late_binomial, normalize
from fixtures import get_patient_data

rng = np.random.default_rng(12345)
# Two fixed parameter sets, labelled C1 and C2.
# NOTE(review): neither dict is read anywhere in the visible part of this
# notebook; presumably they are the values the synthetic data set was generated
# with -- confirm. They also carry level-IV spread keys, whereas the graph
# defined further down only contains the LNLs II and III.
PARAMS_C1 = dict(
    TtoII_spread=0.5,
    TtoIII_spread=0.25,
    TtoIV_spread=0.1,
    IItoIII_spread=0.4,
    IIItoIV_spread=0.3,
    late_p=0.5,
)
PARAMS_C2 = dict(
    TtoII_spread=0.65,
    TtoIII_spread=0.15,
    TtoIV_spread=0.05,
    IItoIII_spread=0.5,
    IIItoIV_spread=0.4,
    late_p=0.5,
)

In [4]:
# The CSV has three header rows, which pandas turns into a three-level column
# MultiIndex, e.g. ("path", "ipsi", "II") or ("tumor", "1", "subsite").
data = pd.read_csv(
    "data/mixture.csv",
    header=list(range(3)),
)

data


Unnamed: 0_level_0,path,path,tumor,tumor
Unnamed: 0_level_1,ipsi,ipsi,1,1
Unnamed: 0_level_2,II,III,t_stage,subsite
0,False,True,late,c1
1,False,True,early,c1
2,False,True,early,c1
3,False,False,early,c1
4,False,True,early,c1
...,...,...,...,...
2995,True,True,early,c3
2996,True,True,late,c3
2997,False,False,early,c3
2998,True,False,early,c3


In [5]:
# Number of mixture components to fit.
num_components = 2

# Lymphatic graph handed to every component model: the tumor "T" points to the
# LNLs II and III, and II additionally points to III.
graph = {
    ("tumor", "T"): ["II", "III"],
    ("lnl", "II"): ["III"],
    ("lnl", "III"): [],
}

mixture = LymphMixture(
    model_cls=Unilateral,
    model_kwargs=dict(graph_dict=graph),
    num_components=num_components,
    universal_p=False,
)

# Split the patients into subgroups by the ("tumor", "1", "subsite") column.
# The mapping is the identity.
# NOTE(review): presumably `mapping` translates T-stage labels -- confirm against
# the `load_patient_data` documentation.
mixture.load_patient_data(
    data,
    split_by=("tumor", "1", "subsite"),
    mapping=lambda value: value,
)

Set the diagnostic modality to be the same as in the generated dataset.

In [6]:
# Register one modality named "path" with both numeric arguments set to 1.0;
# the stored output reports it as spec=1.0, sens=1.0.
mixture.set_modality("path", 1.0, 1.0)
mixture.get_all_modalities()

{'path': Clinical(spec=1.0, sens=1.0, is_trinary=False)}

Fix the distribution over diagnosis times for early T-stage (T1 & T2) to be a binomial distribution with parameter $p=0.3$.

The late T-stage's diagnosis time distribution is a binomial one with a free model parameter that needs to be learned as well.

In [7]:
# Early T-stage: a fixed probability vector over the time steps 0..10, computed
# by `binom_pmf` with arguments (support, 10, 0.3).
mixture.set_distribution(
    "early",
    binom_pmf(np.arange(0, 11), 10, 0.3),
)
# Late T-stage: pass the `late_binomial` callable itself rather than a fixed
# vector. NOTE(review): presumably this is what makes `late_p` a learnable
# parameter -- confirm.
mixture.set_distribution(
    "late",
    late_binomial,
)
mixture.get_all_distributions()

{'early': Distribution([0.0282475249, 0.121060821, 0.23347444050000002, 0.266827932, 0.20012094900000002, 0.1029193452, 0.03675690900000001, 0.009001692, 0.0014467005000000002, 0.00013778100000000004, 5.904900000000001e-06]),
 'late': Distribution([0.0009765625, 0.009765625, 0.0439453125, 0.1171875, 0.205078125, 0.24609375, 0.205078125, 0.1171875, 0.0439453125, 0.009765625, 0.0009765625])}

Initialize random model parameters and latent variables/responsibilities.

In [17]:
# `expectation` and `maximization` are imported in the notebook's import cell at
# the top, so that all dependencies are visible in one place.

# Draw a random start value for every parameter name the mixture exposes
# (`Generator.uniform()` samples from [0, 1) by default) and load them.
params = {name: rng.uniform() for name in mixture.get_params()}
mixture.set_params(**params)

# NOTE(review): presumably this rescales the mixture coefficients inside the
# model so they sum to one per subgroup -- confirm. The local `params` dict is
# not refreshed afterwards and still holds the raw draws.
mixture.normalize_mixture_coefs()

# Random start values for the latent variables, same shape as the model's
# responsibility table and rescaled with `normalize`.
# NOTE(review): the EM loop below reassigns `latent` in its first E-step before
# reading it, so this value only matters if other code uses it first.
latent = normalize(rng.uniform(size=mixture.get_resps().shape).T, axis=0).T

In [20]:
def to_numpy(params: dict[str, float]) -> np.ndarray:
    """Pack the values of ``params`` into a 1D array, in the dict's own order.

    The keys are dropped, so two arrays produced by this helper are only
    comparable element-wise when both dicts list their entries in the same
    order.
    """
    values = list(params.values())
    return np.array(values)

In [12]:
# Show the current parameters. In the stored output every key is prefixed with
# the component index ("0_", "1_"), and there is one "<subgroup>_coef" entry
# per subgroup and component.
mixture.get_params()

{'0_TtoII_spread': 0.020849019688845005,
 '0_TtoIII_spread': 0.4812701641134445,
 '0_IItoIII_spread': 0.9822470072535524,
 '0_late_p': 0.625756249862707,
 '0_c1_coef': 0.9659539609011183,
 '0_c2_coef': 0.7890128239753627,
 '0_c3_coef': 0.5713461630202451,
 '1_TtoII_spread': 0.760006539229159,
 '1_TtoIII_spread': 0.2032222711690339,
 '1_IItoIII_spread': 0.3293068234312516,
 '1_late_p': 0.44985180128356617,
 '1_c1_coef': 0.03404603909888172,
 '1_c2_coef': 0.21098717602463735,
 '1_c3_coef': 0.4286538369797549}

Iterate the computation of the expectation value of the latent variables (E-step) and the maximization of the (complete) data log-likelihood w.r.t. the model parameters (M-step).

In [21]:
# Upper bound on EM iterations so that a run which never satisfies the
# convergence test cannot loop forever. The recorded run stopped after 80.
MAX_EM_ITERATIONS = 1_000

is_converged = False
count = 0

while not is_converged and count < MAX_EM_ITERATIONS:
    print(count)
    old_params = params
    # E-step: latent variables given the current parameters.
    latent = expectation(mixture, params)
    # M-step: new parameters given those latent variables.
    params = maximization(mixture, latent)
    # Converged once no parameter moved by more than `np.allclose`'s default
    # tolerances. Both dicts must list their entries in the same order.
    is_converged = np.allclose(to_numpy(params), to_numpy(old_params))
    count += 1

if not is_converged:
    warnings.warn(
        f"EM did not converge within {MAX_EM_ITERATIONS} iterations; "
        "the parameters below are from the last iteration.",
        RuntimeWarning,
    )

count

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79


80

In [22]:
# Parameters held by the mixture after the EM loop, returned as a dict.
mixture.get_params(as_dict=True)

{'0_TtoII_spread': 0.2541983818385646,
 '0_TtoIII_spread': 0.08155469169240345,
 '0_IItoIII_spread': 0.04484739670390461,
 '0_late_p': 0.4236116039214868,
 '0_c1_coef': 6.610696135189607e-05,
 '0_c2_coef': 0.9999338930386481,
 '0_c3_coef': 0.4532666847788751,
 '1_TtoII_spread': 0.07368960284054812,
 '1_TtoIII_spread': 0.25177421262273714,
 '1_IItoIII_spread': 6.610696135189607e-05,
 '1_late_p': 0.47323689720695056,
 '1_c1_coef': 0.9999338930386481,
 '1_c2_coef': 6.61069613518972e-05,
 '1_c3_coef': 0.5467333152211249}

In [23]:
# Mixture coefficients as a plain NumPy array. The stored output has shape
# (2, 3); judging by the labelled table that `get_mixture_coefs()` displays
# further down, rows are components and columns are the subgroups c1, c2, c3.
mixture_coefs = mixture.get_mixture_coefs().to_numpy()
mixture_coefs

array([[6.61069614e-05, 9.99933893e-01, 4.53266685e-01],
       [9.99933893e-01, 6.61069614e-05, 5.46733315e-01]])

In [None]:
# Responsibilities held by the mixture. The stored output has one row per
# patient and one column per component (0 and 1); the displayed rows each sum
# to one.
mixture.get_resps()

Unnamed: 0,0,1
0,0.999989,0.000011
1,0.999987,0.000013
2,0.999987,0.000013
3,0.999932,0.000068
4,0.999987,0.000013
...,...,...
2995,0.503241,0.496759
2996,0.499489,0.500511
2997,0.543768,0.456232
2998,0.193783,0.806217


In [32]:
# Parameters of the first component model only. Unlike the mixture-level
# dict, the keys carry no component prefix and there are no "_coef" entries.
mixture.components[0].get_params()

{'TtoII_spread': 0.2541983818385646,
 'TtoIII_spread': 0.08155469169240345,
 'IItoIII_spread': 0.04484739670390461,
 'late_p': 0.4236116039214868}

In [29]:
# Likelihood of the loaded data under the current parameters.
# NOTE(review): the stored value is negative, so this is presumably a
# log-likelihood -- confirm against the `likelihood` defaults.
mixture.likelihood()

-3783.4990931760067

In [None]:
# Mixture coefficients as a labelled table (rows: components, columns: subgroups).
# NOTE(review): the stored output of this cell disagrees with the array stored
# for `mixture_coefs` earlier (component rows are swapped and the c3 values
# differ), so the saved outputs come from different runs. Restart the kernel
# and run all cells to get a consistent set of outputs.
mixture.get_mixture_coefs()

Unnamed: 0,c1,c2,c3
0,0.999934,6.6e-05,0.549033
1,6.6e-05,0.999934,0.450967


In [None]:
# State distribution for subgroup "c1". The stored output has four entries.
# NOTE(review): presumably one per joint involvement state of the two LNLs
# (II, III) -- confirm the state ordering.
mixture.state_dist(subgroup='c1')

array([0.37156805, 0.42267438, 0.07545927, 0.13029829])

In [None]:
# Temporarily register a second modality "diagnose" for this one risk query.
# NOTE(review): presumably the two numbers are specificity 1.0 and sensitivity
# 0.81 -- confirm the argument order of `set_modality`.
mixture.set_modality("diagnose", 1., 0.81)
try:
    # Risk of involvement in II (III left unspecified) for subgroup c1, given a
    # "diagnose" finding that is negative for both LNLs.
    print(
        mixture.risk(
            subgroup='c1',
            involvement={'II': True, 'III': None},
            given_diagnosis={'diagnose': {'II': False, 'III': False}},
        )
    )
finally:
    # Always remove the temporary modality, even if the risk call raises, so
    # that later cells see the same modalities as before.
    mixture.del_modality("diagnose")

0.04043392233510655


In [None]:
# Risk of involvement in III (II left unspecified) for subgroup c1, without
# conditioning on any diagnosis.
mixture.risk(
    subgroup='c1',
    involvement={'II': None, 'III': True},
    given_diagnosis=None,
)

0.5529726723375737

In [None]:
# Column-wise sum over the "path" columns of subgroup c1's internal "_model"
# patient table.
# NOTE(review): presumably the entries are booleans, so the sums count the
# patients with involvement in II and in III -- confirm.
mixture.subgroups['c1'].patient_data['_model']['path'].sum()

II     235
III    609
dtype: int64

In [None]:
# Transposed observation matrix of the c1 subgroup model. The stored output is
# the 4x4 identity.
# NOTE(review): presumably that reflects the only registered modality having
# spec = sens = 1.0 -- confirm.
mixture.subgroups['c1'].observation_matrix().T

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [None]:
# Encode a diagnosis that reports both LNLs as involved under the "diagnose"
# modality.
# NOTE(review): the stored result is True for all four entries. An earlier cell
# removes the "diagnose" modality again after using it, so this diagnosis may
# simply be ignored here -- confirm that this is intended.
diagnosis = {
    'diagnose': {
        'II': True,
        'III': True,
    },
}
mixture.subgroups['c1'].compute_encoding(diagnosis)

array([ True,  True,  True,  True])