In [27]:
import numpy as np
import pandas as pd
# Seeded generator so the random initial parameters drawn further down are reproducible.
rng = np.random.default_rng(12345)

from lymph.models import Unilateral
from lymixture import LymphMixture
from lymixture.utils import binom_pmf, late_binomial, normalize
from fixtures import get_patient_data
# Reference parameter sets for two components. They are not read anywhere in
# the visible cells of this notebook.
# NOTE(review): presumably the ground-truth values behind data/mixture.csv;
# they include LNL IV arcs that the graph used in this notebook does not have - confirm.
PARAMS_C1 = dict(
    TtoII_spread=0.5,
    TtoIII_spread=0.25,
    TtoIV_spread=0.1,
    IItoIII_spread=0.4,
    IIItoIV_spread=0.3,
    late_p=0.5,
)
PARAMS_C2 = dict(
    TtoII_spread=0.65,
    TtoIII_spread=0.15,
    TtoIV_spread=0.05,
    IItoIII_spread=0.5,
    IIItoIV_spread=0.4,
    late_p=0.5,
)

In [28]:
# Load the patient table. The first three CSV rows form a three-level column
# header, e.g. ("path", "ipsi", "II") or ("tumor", "1", "subsite").
data = pd.read_csv("data/mixture.csv", header=[0,1,2])

# Show the loaded frame as the cell output.
data


Unnamed: 0_level_0,path,path,tumor,tumor
Unnamed: 0_level_1,ipsi,ipsi,1,1
Unnamed: 0_level_2,II,III,t_stage,subsite
0,False,True,late,c1
1,False,True,early,c1
2,False,True,early,c1
3,False,False,early,c1
4,False,True,early,c1
...,...,...,...,...
2995,True,True,early,c3
2996,True,True,late,c3
2997,False,False,early,c3
2998,True,False,early,c3


In [29]:
# Spread graph: each key is a (node type, name) tuple and each value lists the
# lymph node levels (LNLs) that node has an outgoing arc to. The tumor T feeds
# LNLs II and III, LNL II feeds LNL III, and LNL III has no outgoing arcs.
graph = {
    ("tumor", "T"): ["II", "III"],
    ("lnl", "II"): ["III"],
    ("lnl", "III"): [],
}
# Number of mixture components to fit.
num_components = 2

# Mixture of `num_components` models of class `Unilateral`, each constructed
# with the graph above passed as `graph_dict`.
mixture = LymphMixture(
    model_cls=Unilateral,
    model_kwargs={"graph_dict": graph},
    num_components=num_components,
)
# Attach the patient data, split into subgroups by the ("tumor", "1", "subsite")
# column. `mapping` is the identity function.
# NOTE(review): presumably `mapping` is applied to the subsite values, so the
# subgroups keep the raw labels (c1, c2, c3) - confirm against load_patient_data.
mixture.load_patient_data(
    data,
    split_by=("tumor", "1", "subsite"),
    mapping=lambda x: x,
)

Set the diagnostic modality to be the same as in the generated dataset.

In [30]:
# Register the "path" modality with both numeric arguments set to 1.0
# (shown in the cell output as spec=1.0, sens=1.0), i.e. an error-free observation.
mixture.set_modality("path", 1., 1.)
# The imperfect "diagnose" modality is deliberately not registered here; a
# later cell adds it temporarily for a risk query and removes it again.
mixture.get_all_modalities()

{'path': Clinical(spec=1.0, sens=1.0, is_trinary=False)}

Fix the distribution over diagnosis times for early T-stage (T1 & T2) to be a binomial distribution with parameter $p=0.3$.

The late T-stage's diagnosis time distribution is also binomial, but with a free model parameter that needs to be learned as well.

In [31]:
# Early T-stage: a fixed distribution over the 11 time steps 0..10.
# NOTE(review): presumably binom_pmf takes (support, n, p), i.e. n=10 and p=0.3 - confirm.
mixture.set_distribution("early", binom_pmf(np.arange(11), 10, 0.3))
# Late T-stage: `late_binomial` itself is passed in rather than a fixed array.
# NOTE(review): presumably a parametrized callable whose parameter is the
# `late_p` entry seen in mixture.get_params() - confirm.
mixture.set_distribution("late", late_binomial)
mixture.get_all_distributions()

{'early': Distribution([0.0282475249, 0.121060821, 0.23347444050000002, 0.266827932, 0.20012094900000002, 0.1029193452, 0.03675690900000001, 0.009001692, 0.0014467005000000002, 0.00013778100000000004, 5.904900000000001e-06]),
 'late': Distribution([0.0009765625, 0.009765625, 0.0439453125, 0.1171875, 0.205078125, 0.24609375, 0.205078125, 0.1171875, 0.0439453125, 0.009765625, 0.0009765625])}

In [32]:
# Inspect the model object stored for subgroup "c1".
mixture.subgroups['c1']

Unilateral(graph_dict={('tumor', 'T'): ['II', 'III'], ('lnl', 'II'): ['III'], ('lnl', 'III'): []}, tumor_state=1, allowed_states=[0, 1], max_time=10)

Initialize random model parameters and latent variables/responsibilities.

In [33]:
from lymixture.em import expectation, maximization

# Draw one uniform random value per parameter name reported by the mixture and
# push those values into the model.
params = {k: rng.uniform() for k in mixture.get_params()}
mixture.set_params(**params)
# NOTE(review): `params` still holds the raw draws after this call; presumably
# only the coefficients stored inside `mixture` are rescaled - confirm.
mixture.normalize_mixture_coefs()
# Random responsibilities with the same shape as mixture.get_resps().
# NOTE(review): `latent` is reassigned by the first E-step of the EM loop below
# before it is ever read, so this initial value has no effect on the fit.
latent = normalize(rng.uniform(size=mixture.get_resps().shape).T, axis=0).T

In [34]:
def to_numpy(params: dict[str, float]) -> np.ndarray:
    """Return the values of ``params`` as a 1D array, in the dict's iteration order.

    The keys are discarded, so two arrays produced by this function are only
    comparable element-wise if both dicts list their keys in the same order.
    """
    values = list(params.values())
    return np.array(values)

Iterate the computation of the expectation value of the latent variables (E-step) and the maximization of the (complete) data log-likelihood w.r.t. the model parameters (M-step).

In [47]:
# Upper bound on EM iterations so that a run that never satisfies the
# convergence test fails loudly instead of hanging the kernel.
MAX_EM_ITERATIONS = 10_000

is_converged = False
count = 0

while not is_converged and count < MAX_EM_ITERATIONS:
    old_params = params
    # E-step: recompute the latent variables under the current parameters.
    latent = expectation(mixture, params)
    # M-step: re-estimate the parameters given those latent variables.
    params = maximization(mixture, latent)
    # Converged once no parameter moved beyond np.allclose's default tolerances.
    # The comparison is positional, so it relies on both dicts sharing key order.
    is_converged = np.allclose(to_numpy(params), to_numpy(old_params))
    count += 1

if not is_converged:
    raise RuntimeError(
        f"EM did not converge within {MAX_EM_ITERATIONS} iterations"
    )

# Number of EM iterations that were needed.
count

108

In [36]:
# Show all parameters currently stored in the mixture after the EM loop.
mixture.get_params()

{'0_TtoII_spread': 0.05877846240055626,
 '0_TtoIII_spread': 0.27356531381708626,
 '0_IItoIII_spread': 0.4558248342219044,
 '0_c1_coef': 0.8813834596335033,
 '0_c2_coef': 0.020526748939364358,
 '0_c3_coef': 0.49364946381159097,
 '1_TtoII_spread': 0.25323291477043985,
 '1_TtoIII_spread': 0.0711475788559298,
 '1_IItoIII_spread': 0.23606797749978967,
 '1_c1_coef': 0.11861654036649671,
 '1_c2_coef': 0.9794732510606357,
 '1_c3_coef': 0.5063505361884091,
 'late_p': 0.44800882921179597}

In [37]:
# Show the responsibilities table: one row per patient, one column per component.
mixture.get_resps()

Unnamed: 0,0,1
0,0.982947,0.017053
1,0.979775,0.020225
2,0.979775,0.020225
3,0.877893,0.122107
4,0.979775,0.020225
...,...,...
2995,0.444552,0.555448
2996,0.430317,0.569683
2997,0.485411,0.514589
2998,0.078260,0.921740


In [38]:
# Parameters of the second component (index 1) on its own.
mixture.components[1].get_params()

{'TtoII_spread': 0.25323291477043985,
 'TtoIII_spread': 0.0711475788559298,
 'IItoIII_spread': 0.23606797749978967,
 'late_p': 0.44800882921179597}

In [39]:
# Likelihood of the loaded data under the current parameters.
# NOTE(review): the recorded value is negative, so this is presumably a log-likelihood - confirm.
mixture.likelihood()

-3841.015890213266

In [40]:
# Mixture coefficients as a table: one row per component, one column per subgroup.
mixture.get_mixture_coefs()

Unnamed: 0,c1,c2,c3
0,0.881383,0.020527,0.493649
1,0.118617,0.979473,0.506351


In [41]:
# State distribution for subgroup "c1".
# NOTE(review): presumably one probability per hidden involvement state
# (four states for the two binary LNLs II and III) - confirm.
mixture.state_dist(subgroup='c1')

array([0.37049099, 0.42107142, 0.06651523, 0.14192236])

In [48]:
# Temporarily register a second modality named "diagnose" (numeric arguments
# 1.0 and 0.81) so the risk can be conditioned on a diagnosis made with it.
mixture.set_modality("diagnose", 1., 0.81)
try:
    # Risk for subgroup c1 of LNL II being involved (LNL III left unspecified),
    # given that "diagnose" reported both II and III as negative.
    risk_II_given_negative = mixture.risk(
        subgroup='c1',
        involvement={'II': True, 'III': None},
        given_diagnosis={'diagnose': {'II': False, 'III': False}},
    )
finally:
    # Always remove the temporary modality, even if risk() raises, so it cannot
    # leak into later cells.
    mixture.del_modality("diagnose")

print(risk_II_given_negative)

0.04043392233510655


In [45]:
# Risk for subgroup c1 of LNL III being involved (LNL II left unspecified),
# without conditioning on any diagnosis.
mixture.risk(
    subgroup='c1',
    involvement={'II': None, 'III': True},
    given_diagnosis=None,
)

0.5629937801809121

In [16]:
# Column-wise sum of the "path" columns in subgroup c1's internal patient table.
# NOTE(review): presumably these are boolean involvement flags, so the sums
# count involved patients per LNL - confirm.
mixture.subgroups['c1'].patient_data['_model']['path'].sum()

II     235
III    609
dtype: int64

In [17]:
# This cell needs the imperfect "diagnose" modality registered next to "path":
# the 0.81 / 0.19 entries in the matrix cannot come from the error-free "path"
# modality alone. An earlier cell removes "diagnose" again, so register it
# here and clean up afterwards to keep the cell reproducible on a fresh
# top-to-bottom run.
# NOTE(review): assumes mixture.set_modality propagates to the subgroup models - confirm.
mixture.set_modality("diagnose", 1., 0.81)
try:
    obs_matrix_T = mixture.subgroups['c1'].observation_matrix().T
finally:
    # Restore the "path"-only setup used by the rest of the notebook.
    mixture.del_modality("diagnose")

# Transposed observation matrix of subgroup c1.
obs_matrix_T

array([[1.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.19  , 0.    , 0.    ],
       [0.    , 0.81  , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.19  , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.81  , 0.    ],
       [0.    , 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , 0.0361],
       [0.    , 0.    , 0.    , 0.1539],
       [0.    , 0.    , 0.    , 0.1539],
       [0.    , 0.    , 0.    , 0.6561]])

In [18]:
# Diagnosis in which the "diagnose" modality reports both LNLs as involved.
diagnosis = {'diagnose': {'II': True, 'III': True}}

# The encoding refers to the "diagnose" modality, which an earlier cell removes
# again after its risk query. Register it for the duration of this call so the
# cell does not depend on leftover kernel state.
# NOTE(review): assumes mixture.set_modality propagates to the subgroup models - confirm.
mixture.set_modality("diagnose", 1., 0.81)
try:
    encoding = mixture.subgroups['c1'].compute_encoding(diagnosis)
finally:
    # Restore the "path"-only setup used by the rest of the notebook.
    mixture.del_modality("diagnose")

# Boolean encoding of `diagnosis` as computed by subgroup c1.
encoding

array([False, False, False,  True, False, False, False,  True, False,
       False, False,  True, False, False, False,  True])