diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d87b24..bcc8030 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,48 @@ All notable changes to this project will be documented in this file. + +## [1.2.0] - 2024-03-29 + +### Bug Fixes + +- (**mid**) `obs_dist` may return 3D array. + + +### Documentation + +- Fix unknown version in title. +- Add missing blank before list. +- (**mid**) Add comment about midext marginalizing. + + +### Features + +- (**mid**) Add `posterior_state_dist()` method.\ + The `Midline` model now has a `posterior_state_dist()` method, too. +- (**types**) Base `Model` has state dist methods.\ + Both `state_dist()` and `posterior_state_dist()` have been added to the + `types.Model` base class. +- Add `marginalize()` method.\ + With this new method, one can marginalize a (prior or posterior) state + distribution over all states that match a provided involvement.\ + It is used e.g. to refactor the code of the `risk()` methods. +- (**types**) Add `obs_dist` and `marginalize`.\ + The `types.Model` base abstract base class now also has the methods + `obs_dist` and `marginalize` for better autocomplete support in editors. + + +### Testing + +- Remove plain test risk. + + +### Change + +- (**types**) Improve type hints for inv. pattern. +- Rename "diagnose" to "diagnosis" when noun.\ + When used as a noun, "diagnosis" is correct, not "diagnose". + ## [1.1.0] - 2024-03-20 @@ -626,7 +668,8 @@ Almost the entire API has changed. I'd therefore recommend to have a look at the - add pre-commit hook to check commit msg -[Unreleased]: https://github.com/rmnldwg/lymph/compare/1.1.0...HEAD +[Unreleased]: https://github.com/rmnldwg/lymph/compare/1.2.0...HEAD +[1.2.0]: https://github.com/rmnldwg/lymph/compare/1.1.0...1.2.0 [1.1.0]: https://github.com/rmnldwg/lymph/compare/1.0.0...1.1.0 [1.0.0]: https://github.com/rmnldwg/lymph/compare/1.0.0.rc2...1.0.0 [1.0.0.rc2]: https://github.com/rmnldwg/lymph/compare/1.0.0.rc1...1.0.0.rc2 diff --git a/README.rst b/README.rst index 6264881..643256a 100644 --- a/README.rst +++ b/README.rst @@ -26,13 +26,13 @@ HNSCC spreads though the lymphatic system of the neck and forms metastases in re To account for this microscopic involvement, parts of the lymphatic system are often irradiated electively to increase tumor control. Which parts are included in this elective clinical target volume is currently decided based on guidelines [1]_ [2]_ [3]_ [4]_. These in turn are derived from reports of the prevalence of involvement per lymph node level (LNL), i.e. the portion of patients that were diagnosed with metastases in any given LNL, stratified by primary tumor location. It is recommended to include a LNL in the elective target volume if 10 - 15% of patients showed involvement in that particular level. -However, while the prevalence of involvement has been reported in the literature [5]_ [6]_, and the general lymph drainage pathways are understood well, the detailed progression patterns of HNSCC remain poorly quantified. We believe that the risk for microscopic involvement in an LNL depends highly on the specific diagnose of a particular patient and their treatment can hence be personalized if the progression patterns were better quantified. +However, while the prevalence of involvement has been reported in the literature [5]_ [6]_, and the general lymph drainage pathways are understood well, the detailed progression patterns of HNSCC remain poorly quantified. We believe that the risk for microscopic involvement in an LNL depends highly on the specific diagnosis of a particular patient and their treatment can hence be personalized if the progression patterns were better quantified. Our Goal ======== -With this Python package we want to provide a framework to accurately predict the risk for microscopic metastases in any lymph node level for the specific diagnose a particular patient presents with. +With this Python package we want to provide a framework to accurately predict the risk for microscopic metastases in any lymph node level for the specific diagnosis a particular patient presents with. The implemented model is highly interpretable and was developed together with clinicians to accurately represent the anatomy of the lymphatic drainiage. It can be trained with data that reports the patterns of lymphatic progression in detail, like the `dataset(s) `_ we collected at our institution, the University Hospital Zurich (USZ). diff --git a/docs/source/components.rst b/docs/source/components.rst index c392ab5..31efd3b 100644 --- a/docs/source/components.rst +++ b/docs/source/components.rst @@ -18,10 +18,10 @@ Diagnostic Modalities :show-inheritance: -Marginalization over Diagnose Times +Marginalization over Diagnosis Times ----------------------------------- -.. automodule:: lymph.diagnose_times +.. automodule:: lymph.diagnosis_times :members: :special-members: __init__, __hash__ :show-inheritance: diff --git a/docs/source/conf.py b/docs/source/conf.py index 02af062..80f7d1e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -3,36 +3,19 @@ # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/main/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys - -from pkg_resources import DistributionNotFound, get_distribution - -sys.path.insert(0, os.path.abspath('../..')) - -try: - __version__ = get_distribution("lymph").version -except DistributionNotFound: - __version__ = "unknown version" - +import lymph # -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = 'lymph' copyright = '2022, Roman Ludwig' author = 'Roman Ludwig' gh_username = 'rmnldwg' -version = __version__ +version = lymph.__version__ # The full version, including alpha/beta/rc tags -release = __version__ +release = lymph.__version__ # -- General configuration --------------------------------------------------- diff --git a/docs/source/quickstart_bilateral.ipynb b/docs/source/quickstart_bilateral.ipynb index fda1214..dd656d3 100644 --- a/docs/source/quickstart_bilateral.ipynb +++ b/docs/source/quickstart_bilateral.ipynb @@ -222,9 +222,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Distribution over Diagnose Times\n", + "## Distribution over Diagnosis Times\n", "\n", - "Just as with the modalities, the distributions over diagnose times are delegated to the two sides via the exact same API as in the `Unilateral` model:" + "Just as with the modalities, the distributions over diagnosis times are delegated to the two sides via the exact same API as in the `Unilateral` model:" ] }, { @@ -276,7 +276,7 @@ "\n", ":::{note}\n", "\n", - "You cannot set the diagnose time distributions asymmetrically! With the modalities this may make sense (although it is not really supported, you may try), but for the diagnose times, this will surely break!\n", + "You cannot set the diagnosis time distributions asymmetrically! With the modalities this may make sense (although it is not really supported, you may try), but for the diagnosis times, this will surely break!\n", ":::\n", "\n", "## Likelihood\n", diff --git a/docs/source/quickstart_unilateral.ipynb b/docs/source/quickstart_unilateral.ipynb index 779587a..20540a0 100644 --- a/docs/source/quickstart_unilateral.ipynb +++ b/docs/source/quickstart_unilateral.ipynb @@ -6,7 +6,7 @@ "source": [ "# Getting started\n", "\n", - "A lot of people get diagnosed with squamous cell carcinoma in the head & neck region ([HNSCC](https://en.wikipedia.org/wiki/Head_and_neck_cancer)), which frequently metastasizes via the lymphatic system. We set out to develop a methodology to predict the risk of a new patient having metastases in so-called lymph node levels (LNLs), based on their personal diagnose (e.g. findings from a CT scan) and information of previously diagnosed and treated patients. And that's exactly what this code enables you to do as well.\n", + "A lot of people get diagnosed with squamous cell carcinoma in the head & neck region ([HNSCC](https://en.wikipedia.org/wiki/Head_and_neck_cancer)), which frequently metastasizes via the lymphatic system. We set out to develop a methodology to predict the risk of a new patient having metastases in so-called lymph node levels (LNLs), based on their personal diagnosis (e.g. findings from a CT scan) and information of previously diagnosed and treated patients. And that's exactly what this code enables you to do as well.\n", "\n", "As mentioned, this package is meant to be a relatively simple-to-use frontend. The math is done under the hood and one does not need to worry about it a lot. But let's have a quick look at what we're doing here.\n", "\n", @@ -152,7 +152,7 @@ "source": [ "## Diagnostic Modalities\n", "\n", - "To ultimately compute the likelihoods of observations, we need to fix the sensitivities and specificities of the obtained diagnoses. And since we might have multiple diagnostic modalities available, we need to tell the system which of them comes with which specificity and sensitivity. We do this by adding specificity/sensitivity pairs to our model:" + "To ultimately compute the likelihoods of observations, we need to fix the sensitivities and specificities of the obtained diagnosis. And since we might have multiple diagnostic modalities available, we need to tell the system which of them comes with which specificity and sensitivity. We do this by adding specificity/sensitivity pairs to our model:" ] }, { @@ -256,7 +256,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To feed the dataset into the system, we assign the dataset to the attribute `patient_data`. What the system then does here is creating a diagnose matrix for every T-stage in the data." + "To feed the dataset into the system, we assign the dataset to the attribute `patient_data`. What the system then does here is creating a diagnosis matrix for every T-stage in the data." ] }, { @@ -275,7 +275,7 @@ "source": [ ":::{note}\n", "\n", - "The data now has an additional top-level header `\"_model\"` which stores only the information the model actually needs. In this case, it only stores the ipsilateral CT diagnoses of the LNLs I, II, III, and IV, as well as the mapped T-stage of the patients. Note that from the original T-stages 1, 2, 3, and 4, only \"early\" and \"late\" are left. This is the default transformation, but it can be changed by providing a function to the `mapping` keyword argument in the `load_patient_data()` method.\n", + "The data now has an additional top-level header `\"_model\"` which stores only the information the model actually needs. In this case, it only stores the ipsilateral CT diagnosis of the LNLs I, II, III, and IV, as well as the mapped T-stage of the patients. Note that from the original T-stages 1, 2, 3, and 4, only \"early\" and \"late\" are left. This is the default transformation, but it can be changed by providing a function to the `mapping` keyword argument in the `load_patient_data()` method.\n", ":::" ] }, @@ -283,9 +283,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Distribution over Diagnose Times\n", + "## Distribution over Diagnosis Times\n", "\n", - "The last ingredient to set up (at least when using the hidden Markov model) would now be the distribution over diagnose times. Our dataset contains two different T-stages \"early\" and \"late\". One of the underlying assumptions with our model is that earlier T-stage patients have been - on average - diagnosed at an earlier time-point, compared to late T-stage patients. We can reflect that using distributions over the diagnosis time:" + "The last ingredient to set up (at least when using the hidden Markov model) would now be the distribution over diagnosis times. Our dataset contains two different T-stages \"early\" and \"late\". One of the underlying assumptions with our model is that earlier T-stage patients have been - on average - diagnosed at an earlier time-point, compared to late T-stage patients. We can reflect that using distributions over the diagnosis time:" ] }, { @@ -313,7 +313,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can now set a fixed prior for the distribution over diagnose times of early T-stage patients (i.e., patients with T1 and T2 tumors)." + "We can now set a fixed prior for the distribution over diagnosis times of early T-stage patients (i.e., patients with T1 and T2 tumors)." ] }, { @@ -330,11 +330,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's define a parametrized PMF over diagnose times for patients with late T-stage tumors (T3 and T4) to show this functionality. For that, we first define a parametrized function with the signature\n", + "Let's define a parametrized PMF over diagnosis times for patients with late T-stage tumors (T3 and T4) to show this functionality. For that, we first define a parametrized function with the signature\n", "\n", "```python\n", "def distribution(support: list[float] | np.ndarray, a=1, b=2, c=3, ...) -> np.ndarray:\n", - " \"\"\"PMF over diagnose times (``support``) with parameters ``a``, ``b``, and ``c``.\"\"\"\n", + " \"\"\"PMF over diagnosis times (``support``) with parameters ``a``, ``b``, and ``c``.\"\"\"\n", " ...\n", " return result\n", "```\n", @@ -405,7 +405,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note how the set of adjustable parameters now also contains the `p` parameter for the late T-stage's distribution over diagnose times. For the early T-stage, it is not present, because that one was provided as a fixed array." + "Note how the set of adjustable parameters now also contains the `p` parameter for the late T-stage's distribution over diagnosis times. For the early T-stage, it is not present, because that one was provided as a fixed array." ] }, { diff --git a/lymph/__init__.py b/lymph/__init__.py index 73df9a3..7278928 100644 --- a/lymph/__init__.py +++ b/lymph/__init__.py @@ -16,11 +16,11 @@ # nopycln: file -from lymph import diagnose_times, graph, matrix, models +from lymph import diagnosis_times, graph, matrix, models from lymph.utils import clinical, pathological __all__ = [ - "diagnose_times", "matrix", + "diagnosis_times", "matrix", "graph", "models", "clinical", "pathological", ] diff --git a/lymph/diagnose_times.py b/lymph/diagnosis_times.py similarity index 98% rename from lymph/diagnose_times.py rename to lymph/diagnosis_times.py index 168a540..58adde4 100644 --- a/lymph/diagnose_times.py +++ b/lymph/diagnosis_times.py @@ -1,5 +1,5 @@ """ -Module for marginalizing over diagnose times. +Module for marginalizing over diagnosis times. The hidden Markov model we implement assumes that every patient started off with a healthy neck, meaning no lymph node levels harboured any metastases. This is a valid @@ -33,22 +33,22 @@ class SupportError(Exception): class Distribution: - """Class that provides a way of storing distributions over diagnose times.""" + """Class that provides a way of storing distributions over diagnosis times.""" def __init__( self, distribution: Iterable[float] | callable, max_time: int | None = None, **kwargs, ) -> None: - """Initialize a distribution over diagnose times. + """Initialize a distribution over diagnosis times. This object can either be created by passing a parametrized function (e.g., ``scipy.stats`` distribution) or by passing a list of probabilities for each - diagnose time. + diagnosis time. The signature of the function must be ``func(support, **kwargs)``, where ``support`` is the support of the distribution from 0 to ``max_time``. The - function must return a list of probabilities for each diagnose time. + function must return a list of probabilities for each diagnosis time. Note: All arguments except ``support`` must have default values and if some @@ -214,7 +214,7 @@ def get_params( """If updateable, return the dist's ``param`` value or all params in a dict. See Also: - :py:meth:`lymph.diagnose_times.DistributionsUserDict.get_params` + :py:meth:`lymph.diagnosis_times.DistributionsUserDict.get_params` :py:meth:`lymph.graph.Edge.get_params` :py:meth:`lymph.models.Unilateral.get_params` :py:meth:`lymph.models.Bilateral.get_params` @@ -264,7 +264,7 @@ def draw_diag_times( rng: np.random.Generator | None = None, seed: int = 42, ) -> np.ndarray: - """Draw ``num`` samples of diagnose times from the stored PMF. + """Draw ``num`` samples of diagnosis times from the stored PMF. A random number generator can be provided as ``rng``. If ``None``, a new one is initialized with the given ``seed`` (or ``42``, by default). diff --git a/lymph/graph.py b/lymph/graph.py index a4188bc..ffb79ac 100644 --- a/lymph/graph.py +++ b/lymph/graph.py @@ -406,8 +406,8 @@ def get_params( """Return the value of the parameter ``param`` or all params in a dict. See Also: - :py:meth:`lymph.diagnose_times.Distribution.get_params` - :py:meth:`lymph.diagnose_times.DistributionsUserDict.get_params` + :py:meth:`lymph.diagnosis_times.Distribution.get_params` + :py:meth:`lymph.diagnosis_times.DistributionsUserDict.get_params` :py:meth:`lymph.models.Unilateral.get_params` :py:meth:`lymph.models.Bilateral.get_params` """ diff --git a/lymph/matrix.py b/lymph/matrix.py index cccbef9..9abc149 100644 --- a/lymph/matrix.py +++ b/lymph/matrix.py @@ -11,9 +11,9 @@ import numpy as np import pandas as pd -from lymph import graph -from lymph.utils import get_state_idx_matrix, row_wise_kron, tile_and_repeat +from lymph import graph, types from lymph.modalities import Modality +from lymph.utils import get_state_idx_matrix, row_wise_kron, tile_and_repeat @lru_cache(maxsize=128) @@ -94,17 +94,18 @@ def generate_observation( def compute_encoding( lnls: list[str], - pattern: pd.Series | dict[str, bool | int | str], + pattern: pd.Series | dict[str, types.InvolvementIndicator], base: int = 2, ) -> np.ndarray: """Compute the encoding of a particular ``pattern`` of involvement. A ``pattern`` holds information about the involvement of each LNL and the function transforms this into a binary encoding which is ``True`` for all possible complete - states/diagnoses that are compatible with the given ``pattern``. + states/diagnosis that are compatible with the given ``pattern``. In the binary case (``base=2``), the value behind ``pattern[lnl]`` can be one of the following things: + - ``False``: The LNL is healthy. - ``"healthy"``: The LNL is healthy. - ``True``: The LNL is involved. @@ -113,6 +114,7 @@ def compute_encoding( In the trinary case (``base=3``), the value behind ``pattern[lnl]`` can be one of these things: + - ``False``: The LNL is healthy. - ``"healthy"``: The LNL is healthy. - ``True``: The LNL is involved (micro- or macroscopic). @@ -211,12 +213,12 @@ def generate_data_encoding( if modality_name not in patient_row: warnings.warn(f"Modality {modality_name} not in data. Skipping.") continue - diagnose_encoding = compute_encoding( + diagnosis_encoding = compute_encoding( lnls=lnls, pattern=patient_row[modality_name], base=2, # observations are always binary! ) - patient_encoding = np.kron(patient_encoding, diagnose_encoding) + patient_encoding = np.kron(patient_encoding, diagnosis_encoding) result[:,i] = patient_encoding diff --git a/lymph/models/bilateral.py b/lymph/models/bilateral.py index 899c5c2..eef5174 100644 --- a/lymph/models/bilateral.py +++ b/lymph/models/bilateral.py @@ -7,14 +7,14 @@ import numpy as np import pandas as pd -from lymph import diagnose_times, matrix, modalities, models, types, utils +from lymph import diagnosis_times, matrix, modalities, models, types, utils warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning) logger = logging.getLogger(__name__) class Bilateral( - diagnose_times.Composite, + diagnosis_times.Composite, modalities.Composite, types.Model, ): @@ -23,7 +23,7 @@ class Bilateral( This is achieved by creating two instances of the :py:class:`~lymph.models.Unilateral` model, one for the ipsi- and one for the contralateral side of the neck. The two sides are assumed to be independent of each - other, given the diagnose time over which we marginalize. + other, given the diagnosis time over which we marginalize. See Also: :py:class:`~lymph.models.Unilateral` @@ -78,7 +78,7 @@ def __init__( self.is_symmetric = is_symmetric - diagnose_times.Composite.__init__( + diagnosis_times.Composite.__init__( self, distribution_children={"ipsi": self.ipsi, "contra": self.contra}, is_distribution_leaf=False, @@ -377,7 +377,7 @@ def set_params(self, *args: float, **kwargs: float) -> tuple[float]: sides. 3. The parameters of the parametric distributions for marginalizing over - diagnose times. + diagnosis times. When still some positional arguments remain after that, they are returned in a tuple. @@ -409,7 +409,7 @@ def state_dist( This computes the state distributions of both sides and returns their outer product. In case ``mode`` is ``"HMM"`` (default), the state distributions are - first marginalized over the diagnose time distribtions of the respective + first marginalized over the diagnosis time distribtions of the respective ``t_stage``. See Also: @@ -437,6 +437,7 @@ def state_dist( def obs_dist( self, + given_state_dist: np.ndarray | None = None, t_stage: str = "early", mode: Literal["HMM", "BN"] = "HMM", ) -> np.ndarray: @@ -448,10 +449,12 @@ def obs_dist( a 2D array, because it computes the probability of any possible combination of ipsi- and contralateral observations. """ - joint_state_dist = self.state_dist(t_stage=t_stage, mode=mode) + if given_state_dist is None: + given_state_dist = self.state_dist(t_stage=t_stage, mode=mode) + return ( self.ipsi.observation_matrix().T - @ joint_state_dist + @ given_state_dist @ self.contra.observation_matrix() ) @@ -464,8 +467,8 @@ def patient_likelihoods( """Compute the likelihood of each patient individually.""" joint_state_dist = self.state_dist(t_stage=t_stage, mode=mode) return matrix.fast_trace( - self.ipsi.diagnose_matrix(t_stage), - joint_state_dist @ self.contra.diagnose_matrix(t_stage).T, + self.ipsi.diagnosis_matrix(t_stage), + joint_state_dist @ self.contra.diagnosis_matrix(t_stage).T, ) @@ -473,8 +476,8 @@ def _bn_likelihood(self, log: bool = True, t_stage: str | None = None) -> float: """Compute the BN likelihood of data, using the stored params.""" joint_state_dist = self.state_dist(mode="BN") patient_llhs = matrix.fast_trace( - self.ipsi.diagnose_matrix(t_stage), - joint_state_dist @ self.contra.diagnose_matrix(t_stage).T, + self.ipsi.diagnosis_matrix(t_stage), + joint_state_dist @ self.contra.diagnosis_matrix(t_stage).T, ) return np.sum(np.log(patient_llhs)) if log else np.prod(patient_llhs) @@ -503,8 +506,8 @@ def _hmm_likelihood(self, log: bool = True, t_stage: str | None = None) -> float @ contra_dist_evo ) patient_llhs = matrix.fast_trace( - self.ipsi.diagnose_matrix(stage), - joint_state_dist @ self.contra.diagnose_matrix(stage).T, + self.ipsi.diagnosis_matrix(stage), + joint_state_dist @ self.contra.diagnosis_matrix(stage).T, ) llh = utils.add_or_mult(llh, patient_llhs, log) @@ -515,8 +518,8 @@ def likelihood( self, given_params: types.ParamsType | None = None, log: bool = True, + t_stage: str | None = None, mode: Literal["HMM", "BN"] = "HMM", - for_t_stage: str | None = None, ): """Compute the (log-)likelihood of the stored data given the model (and params). @@ -543,9 +546,9 @@ def likelihood( return -np.inf if log else 0. if mode == "HMM": - return self._hmm_likelihood(log, for_t_stage) + return self._hmm_likelihood(log, t_stage) if mode == "BN": - return self._bn_likelihood(log, for_t_stage) + return self._bn_likelihood(log, t_stage) raise ValueError("Invalid mode. Must be either 'HMM' or 'BN'.") @@ -554,17 +557,17 @@ def posterior_state_dist( self, given_params: types.ParamsType | None = None, given_state_dist: np.ndarray | None = None, - given_diagnoses: dict[str, types.DiagnoseType] | None = None, + given_diagnosis: dict[str, types.DiagnosisType] | None = None, t_stage: str | int = "early", mode: Literal["HMM", "BN"] = "HMM", ) -> np.ndarray: - """Compute joint post. dist. over ipsi & contra states, ``given_diagnoses``. + """Compute joint post. dist. over ipsi & contra states, ``given_diagnosis``. - The ``given_diagnoses`` is a dictionary storing one :py:obj:`.types.DiagnoseType` + The ``given_diagnosis`` is a dictionary storing one :py:obj:`.types.DiagnosisType` each for the ``"ipsi"`` and ``"contra"`` side of the neck. Essentially, this is the risk for any possible combination of ipsi- and - contralateral involvement, given the provided diagnoses. + contralateral involvement, given the provided diagnosis. Warning: As in the :py:meth:`.Unilateral.posterior_state_dist` method, one may @@ -581,29 +584,66 @@ def posterior_state_dist( utils.safe_set_params(self, given_params) given_state_dist = self.state_dist(t_stage=t_stage, mode=mode) - if given_diagnoses is None: - given_diagnoses = {} + if given_diagnosis is None: + given_diagnosis = {} - diagnose_given_state = {} + diagnosis_given_state = {} for side in ["ipsi", "contra"]: - if side not in given_diagnoses: - warnings.warn(f"No diagnoses given for {side}lateral side.") + if side not in given_diagnosis: + warnings.warn(f"No diagnosis given for {side}lateral side.") - diagnose_encoding = getattr(self, side).compute_encoding( - given_diagnoses.get(side, {}) + diagnosis_encoding = getattr(self, side).compute_encoding( + given_diagnosis.get(side, {}) ) observation_matrix = getattr(self, side).observation_matrix() # vector with P(Z=z|X) for each state X. A data matrix for one "patient" - diagnose_given_state[side] = diagnose_encoding @ observation_matrix.T + diagnosis_given_state[side] = diagnosis_encoding @ observation_matrix.T # matrix with P(Zi=zi,Zc=zc|Xi,Xc) * P(Xi,Xc) for all states Xi,Xc. - joint_diagnose_and_state = np.outer( - diagnose_given_state["ipsi"], - diagnose_given_state["contra"], + joint_diagnosis_and_state = np.outer( + diagnosis_given_state["ipsi"], + diagnosis_given_state["contra"], ) * given_state_dist # Following Bayes' theorem, this is P(Xi,Xc|Zi=zi,Zc=zc) which is given by # P(Zi=zi,Zc=zc|Xi,Xc) * P(Xi,Xc) / P(Zi=zi,Zc=zc) - return joint_diagnose_and_state / np.sum(joint_diagnose_and_state) + return joint_diagnosis_and_state / np.sum(joint_diagnosis_and_state) + + + def marginalize( + self, + involvement: dict[str, types.PatternType] | None = None, + given_state_dist: np.ndarray | None = None, + t_stage: str = "early", + mode: Literal["HMM", "BN"] = "HMM", + ) -> float: + """Marginalize ``given_state_dist`` over matching ``involvement`` patterns. + + Any state that matches the provided ``involvement`` pattern is marginalized + over. For this, the :py:func:`.matrix.compute_encoding` function is used. + + If ``given_state_dist`` is ``None``, it will be computed by calling + :py:meth:`.state_dist` with the given ``t_stage`` and ``mode``. These arguments + are ignored if ``given_state_dist`` is provided. + """ + if involvement is None: + return 1. + + if given_state_dist is None: + given_state_dist = self.state_dist(t_stage=t_stage, mode=mode) + + marginalize_over_states = {} + for side in ["ipsi", "contra"]: + side_graph = getattr(self, side).graph + marginalize_over_states[side] = matrix.compute_encoding( + lnls=side_graph.lnls.keys(), + pattern=involvement.get(side, {}), + base=3 if self.is_trinary else 2, + ) + return ( + marginalize_over_states["ipsi"] + @ given_state_dist + @ marginalize_over_states["contra"] + ) def risk( @@ -611,11 +651,11 @@ def risk( involvement: dict[str, types.PatternType] | None = None, given_params: types.ParamsType | None = None, given_state_dist: np.ndarray | None = None, - given_diagnoses: dict[str, types.DiagnoseType] | None = None, + given_diagnosis: dict[str, types.DiagnosisType] | None = None, t_stage: str = "early", mode: Literal["HMM", "BN"] = "HMM", ) -> float: - """Compute risk of the ``involvement`` patterns, given parameters and diagnoses. + """Compute risk of the ``involvement`` patterns, given parameters and diagnosis. The ``involvement`` of interest is expected to be a :py:obj:`.PatternType` for each side of the neck (``"ipsi"`` and ``"contra"``). This method then @@ -627,30 +667,15 @@ def risk( its docstring for more details on the remaining arguments. """ # TODO: test this method - posterior_state_probs = self.posterior_state_dist( + posterior_state_dist = self.posterior_state_dist( given_params=given_params, given_state_dist=given_state_dist, - given_diagnoses=given_diagnoses, + given_diagnosis=given_diagnosis, t_stage=t_stage, mode=mode, ) - if involvement is None: - return posterior_state_probs - - marginalize_over_states = {} - for side in ["ipsi", "contra"]: - side_graph = getattr(self, side).graph - marginalize_over_states[side] = matrix.compute_encoding( - lnls=side_graph.lnls.keys(), - pattern=involvement.get(side, {}), - base=3 if self.is_trinary else 2, - ) - return ( - marginalize_over_states["ipsi"] - @ posterior_state_probs - @ marginalize_over_states["contra"] - ) + return self.marginalize(involvement, posterior_state_dist) def draw_patients( @@ -664,10 +689,10 @@ def draw_patients( """Draw ``num`` random patients from the parametrized model. See Also: - :py:meth:`.diagnose_times.Distribution.draw_diag_times` - Method to draw diagnose times from a distribution. - :py:meth:`.Unilateral.draw_diagnoses` - Method to draw individual diagnoses from a unilateral model. + :py:meth:`.diagnosis_times.Distribution.draw_diag_times` + Method to draw diagnosis times from a distribution. + :py:meth:`.Unilateral.draw_diagnosis` + Method to draw individual diagnosis from a unilateral model. :py:meth:`.Unilateral.draw_patients` The unilateral method to draw a synthetic dataset. """ @@ -688,12 +713,12 @@ def draw_patients( for t_stage in drawn_t_stages ] - drawn_obs_ipsi = self.ipsi.draw_diagnoses(drawn_diag_times, rng=rng) - drawn_obs_contra = self.contra.draw_diagnoses(drawn_diag_times, rng=rng) + drawn_obs_ipsi = self.ipsi.draw_diagnosis(drawn_diag_times, rng=rng) + drawn_obs_contra = self.contra.draw_diagnosis(drawn_diag_times, rng=rng) drawn_obs = np.concatenate([drawn_obs_ipsi, drawn_obs_contra], axis=1) # construct MultiIndex with "ipsi" and "contra" at top level to allow - # concatenation of the two separate drawn diagnoses + # concatenation of the two separate drawn diagnosis sides = ["ipsi", "contra"] modality_names = list(self.get_all_modalities().keys()) lnl_names = [lnl for lnl in self.ipsi.graph.lnls.keys()] diff --git a/lymph/models/midline.py b/lymph/models/midline.py index dff1562..0a31a0c 100644 --- a/lymph/models/midline.py +++ b/lymph/models/midline.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from lymph import diagnose_times, matrix, modalities, models, types, utils +from lymph import diagnosis_times, matrix, modalities, models, types, utils warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning) logger = logging.getLogger(__name__) @@ -19,7 +19,7 @@ class Midline( - diagnose_times.Composite, + diagnosis_times.Composite, modalities.Composite, types.Model, ): @@ -80,7 +80,7 @@ def __init__( data is stored in a :py:class:`~.Bilateral` instance accessible via the attribute ``"unknown"``. Note that this bilateral instance does not get updated parameters or any other kind of attention. It is solely used to store the data - and generate diagnose matrices for those data. + and generate diagnosis matrices for those data. The ``uni_kwargs`` are passed to all bilateral models. @@ -148,7 +148,7 @@ def __init__( self.midext_prob = 0. - diagnose_times.Composite.__init__( + diagnosis_times.Composite.__init__( self, distribution_children={"ext": self.ext, "noext": self.noext, **other_children}, is_distribution_leaf=False, @@ -343,7 +343,7 @@ def get_params( This includes the spread parameters from the call to :py:meth:`get_spread_params` and the distribution parameters from the call to - :py:meth:`~.diagnose_times.Composite.get_distribution_params`. + :py:meth:`~.diagnosis_times.Composite.get_distribution_params`. """ params = {} params["midext_prob"] = self.midext_prob @@ -552,8 +552,8 @@ def contra_state_dist_evo(self) -> tuple[np.ndarray, np.ndarray]: def state_dist( self, t_stage: str = "early", - central: bool = False, mode: Literal["HMM", "BN"] = "HMM", + central: bool = False, ) -> np.ndarray: """Compute the joint over ipsi- & contralaleral hidden states and midline ext. @@ -580,6 +580,40 @@ def state_dist( raise NotImplementedError("Only HMM mode is supported as of now.") + def obs_dist( + self, + given_state_dist: np.ndarray | None = None, + t_stage: str = "early", + mode: Literal["HMM", "BN"] = "HMM", + central: bool = False, + ) -> np.ndarray: + """Compute the joint distribution over the ipsi- & contralateral observations. + + If ``given_state_dist`` is provided, ``t_stage``, ``mode``, and ``central`` are + ignored. The provided state distribution may be 2D or 3D. The returned + distribution will have the same dimensionality. + + See Also: + :py:meth:`.Unilateral.obs_dist` + The corresponding unilateral function. Note that this method returns + a 2D array, because it computes the probability of any possible + combination of ipsi- and contralateral observations. + """ + if given_state_dist is None: + given_state_dist = self.state_dist(t_stage=t_stage, mode=mode, central=central) + + if given_state_dist.ndim == 2: + return self.ext.obs_dist(given_state_dist=given_state_dist) + + # Theoretically, we also have a sensitivity and specificity for observing + # midline extension, but realisitically, these values will just be 1. + obs_dist = [ + self.ext.obs_dist(given_state_dist=given_state_dist[0]), + self.ext.obs_dist(given_state_dist=given_state_dist[1]), + ] + return np.stack(obs_dist) + + def _hmm_likelihood(self, log: bool = True, for_t_stage: str | None = None) -> float: """Compute the likelihood of the stored data under the hidden Markov model.""" llh = 0. if log else 1. @@ -603,15 +637,15 @@ def _hmm_likelihood(self, log: bool = True, for_t_stage: str | None = None) -> f marg_joint_state_dist += joint_state_dist _model = getattr(self, case) patient_llhs = matrix.fast_trace( - _model.ipsi.diagnose_matrix(stage), - joint_state_dist @ _model.contra.diagnose_matrix(stage).T + _model.ipsi.diagnosis_matrix(stage), + joint_state_dist @ _model.contra.diagnosis_matrix(stage).T ) llh = utils.add_or_mult(llh, patient_llhs, log=log) try: marg_patient_llhs = matrix.fast_trace( - self.unknown.ipsi.diagnose_matrix(stage), - marg_joint_state_dist @ self.unknown.contra.diagnose_matrix(stage).T + self.unknown.ipsi.diagnosis_matrix(stage), + marg_joint_state_dist @ self.unknown.contra.diagnosis_matrix(stage).T ) llh = utils.add_or_mult(llh, marg_patient_llhs, log=log) except AttributeError: @@ -621,9 +655,9 @@ def _hmm_likelihood(self, log: bool = True, for_t_stage: str | None = None) -> f if self.use_central: if log: - llh += self.central.likelihood(log=log, for_t_stage=for_t_stage) + llh += self.central.likelihood(log=log, t_stage=for_t_stage) else: - llh *= self.central.likelihood(log=log, for_t_stage=for_t_stage) + llh *= self.central.likelihood(log=log, t_stage=for_t_stage) return llh @@ -632,8 +666,8 @@ def likelihood( self, given_params: types.ParamsType | None = None, log: bool = True, + t_stage: str | None = None, mode: Literal["HMM", "BN"] = "HMM", - for_t_stage: str | None = None, ) -> float: """Compute the (log-)likelihood of the stored data given the model (and params). @@ -660,23 +694,119 @@ def likelihood( return -np.inf if log else 0. if mode == "HMM": - return self._hmm_likelihood(log, for_t_stage) + return self._hmm_likelihood(log, t_stage) raise NotImplementedError("Only HMM mode is supported as of now.") + def posterior_state_dist( + self, + given_params: types.ParamsType | None = None, + given_state_dist: np.ndarray | None = None, + given_diagnosis: dict[str, types.DiagnosisType] | None = None, + t_stage: str = "early", + mode: Literal["HMM", "BN"] = "HMM", + midext: bool | None = None, + central: bool = False, + ) -> float: + """Compute the posterior state distribution. + + Using either the ``given_params`` or the ``given_state_dist`` argument, this + method computes the posterior state distribution of the model for the + ``given_diagnosis``, a specific ``t_stage``, whether the tumor extends over the + mid-sagittal line (``midext``), and whether it is central (``central``, only + used if :py:attr:`use_central` is ``True``). + + See Also: + :py:meth:`.types.Model.posterior_state_dist` + The corresponding method in the base class. + :py:meth:`.Bilateral.posterior_state_dist` + The bilateral method that is ultimately called by this one. + """ + # NOTE: When given a 2D state distribution, it does not matter which of the + # Bilateral models is used to compute the risk, since the state dist is + # is the only thing that could differ between models. + if given_state_dist is None: + utils.safe_set_params(self, given_params) + given_state_dist = self.state_dist(t_stage=t_stage, mode=mode, central=central) + + if given_state_dist.ndim == 2: + return self.ext.posterior_state_dist( + given_state_dist=given_state_dist, + given_diagnosis=given_diagnosis, + ) + + if central: + raise ValueError("The `given_state_dist` must be 2D for the central model.") + + if midext is None: + given_state_dist = np.sum(given_state_dist, axis=0) + else: + given_state_dist = given_state_dist[int(midext)] + given_state_dist = given_state_dist / given_state_dist.sum() + + return self.ext.posterior_state_dist( + given_state_dist=given_state_dist, + given_diagnosis=given_diagnosis, + ) + + + def marginalize( + self, + involvement: types.PatternType | None = None, + given_state_dist: np.ndarray | None = None, + t_stage: str = "early", + mode: Literal["HMM", "BN"] = "HMM", + midext: bool | None = None, + central: bool = False, + ) -> float: + """Marginalize ``given_state_dist`` over matching ``involvement`` patterns. + + Any state that matches the provided ``involvement`` pattern is marginalized + over. For this, the :py:func:`.matrix.compute_encoding` function is used. + + The arguments ``t_stage``, ``mode``, and ``central`` are only used if + ``given_state_dist`` is ``None``. In this case they are passed to the + :py:meth:`.state_dist` method. + """ + if involvement is None: + return 1. + + if given_state_dist is None: + given_state_dist = self.state_dist(t_stage=t_stage, mode=mode, central=central) + + if given_state_dist.ndim == 2: + return self.ext.marginalize( + involvement=involvement, + given_state_dist=given_state_dist, + ) + + if midext is None: + given_state_dist = np.sum(given_state_dist, axis=0) + else: + given_state_dist = given_state_dist[int(midext)] + # I think I don't need to normalize here, since I am not computing a + # probability of something *given* midext, but only sum up all states that + # match the involvement pattern (which includes the midext status). + + return self.ext.marginalize( + involvement=involvement, + given_state_dist=given_state_dist, + ) + + def risk( self, involvement: types.PatternType | None = None, given_params: types.ParamsType | None = None, given_state_dist: np.ndarray | None = None, - given_diagnoses: dict[str, types.DiagnoseType] | None = None, + given_diagnosis: dict[str, types.DiagnosisType] | None = None, t_stage: str = "early", midext: bool | None = None, central: bool = False, mode: Literal["HMM", "BN"] = "HMM", ) -> float: - """Compute the risk of nodal involvement ``given_diagnoses``. + """Compute the risk of nodal involvement ``given_diagnosis``. In addition to the arguments of the :py:meth:`.Bilateral.risk` method, this also allows specifying if the patient's tumor extended over the mid-sagittal @@ -698,33 +828,20 @@ def risk( distribution (when ``True`` or ``False``), or marginalize over the midline extension status (when ``midext=None``). """ - # NOTE: When given a 2D state distribution, it does not matter which of the - # Bilateral models is used to compute the risk, since the state dist is - # is the only thing that could differ between models. - if given_state_dist is None: - utils.safe_set_params(self, given_params) - given_state_dist = self.state_dist(t_stage, central, mode) - - if given_state_dist.ndim == 2: - return self.ext.risk( - involvement=involvement, - given_state_dist=given_state_dist, - given_diagnoses=given_diagnoses, - ) - - if central: - raise ValueError("The `given_state_dist` must be 2D for the central model.") - - if midext is None: - given_state_dist = np.sum(given_state_dist, axis=0) - else: - given_state_dist = given_state_dist[int(midext)] - given_state_dist = given_state_dist / given_state_dist.sum() + posterior_state_dist = self.posterior_state_dist( + given_params=given_params, + given_state_dist=given_state_dist, + given_diagnosis=given_diagnosis, + t_stage=t_stage, + midext=midext, + central=central, + mode=mode, + ) - return self.ext.risk( + return self.marginalize( involvement=involvement, - given_state_dist=given_state_dist, - given_diagnoses=given_diagnoses, + given_state_dist=posterior_state_dist, + midext=midext, ) @@ -776,19 +893,19 @@ def draw_patients( drawn_diags = np.empty(shape=(num, len(self.ext.ipsi.obs_list))) for case in ["ext", "noext"]: case_model = getattr(self, case) - drawn_ipsi_diags = utils.draw_diagnoses( - diagnose_times=drawn_diag_times[drawn_midexts == (case == "ext")], + drawn_ipsi_diags = utils.draw_diagnosis( + diagnosis_times=drawn_diag_times[drawn_midexts == (case == "ext")], state_evolution=ipsi_evo, observation_matrix=case_model.ipsi.observation_matrix(), - possible_diagnoses=case_model.ipsi.obs_list, + possible_diagnosis=case_model.ipsi.obs_list, rng=rng, seed=seed, ) - drawn_contra_diags = utils.draw_diagnoses( - diagnose_times=drawn_diag_times[drawn_midexts == (case == "ext")], + drawn_contra_diags = utils.draw_diagnosis( + diagnosis_times=drawn_diag_times[drawn_midexts == (case == "ext")], state_evolution=case_model.contra.state_dist_evo(), observation_matrix=case_model.contra.observation_matrix(), - possible_diagnoses=case_model.contra.obs_list, + possible_diagnosis=case_model.contra.obs_list, rng=rng, seed=seed, ) @@ -796,7 +913,7 @@ def draw_patients( drawn_diags[drawn_midexts == (case == "ext")] = drawn_case_diags # construct MultiIndex with "ipsi" and "contra" at top level to allow - # concatenation of the two separate drawn diagnoses + # concatenation of the two separate drawn diagnosis sides = ["ipsi", "contra"] modality_names = list(self.get_all_modalities().keys()) lnl_names = [lnl for lnl in self.ext.ipsi.graph.lnls.keys()] @@ -809,6 +926,6 @@ def draw_patients( dataset = dataset.sort_index(axis="columns", level=0) dataset["tumor", "1", "t_stage"] = drawn_t_stages dataset["tumor", "1", "extension"] = drawn_midexts - dataset["patient", "#", "diagnose_time"] = drawn_diag_times + dataset["patient", "#", "diagnosis_time"] = drawn_diag_times return dataset diff --git a/lymph/models/unilateral.py b/lymph/models/unilateral.py index e1bfac1..2d9662e 100644 --- a/lymph/models/unilateral.py +++ b/lymph/models/unilateral.py @@ -8,13 +8,13 @@ import pandas as pd from cachetools import LRUCache -from lymph import diagnose_times, graph, matrix, modalities, types, utils +from lymph import diagnosis_times, graph, matrix, modalities, types, utils # pylint: disable=unused-import from lymph.utils import ( # nopycln: import add_or_mult, dict_to_func, - draw_diagnoses, + draw_diagnosis, early_late_mapping, flatten, get_params_from, @@ -29,7 +29,7 @@ class Unilateral( - diagnose_times.Composite, + diagnosis_times.Composite, modalities.Composite, types.Model, ): @@ -97,12 +97,12 @@ def __init__( allowed_states=allowed_states, ) - diagnose_times.Composite.__init__(self, max_time=max_time, is_distribution_leaf=True) + diagnosis_times.Composite.__init__(self, max_time=max_time, is_distribution_leaf=True) modalities.Composite.__init__(self, is_modality_leaf=True) self._patient_data: pd.DataFrame | None = None self._cache_version: int = 0 self._data_matrix_cache: LRUCache = LRUCache(maxsize=64) - self._diagnose_matrix_cache: LRUCache = LRUCache(maxsize=64) + self._diagnosis_matrix_cache: LRUCache = LRUCache(maxsize=64) @classmethod @@ -243,7 +243,7 @@ def set_params(self, *args: float, **kwargs: float) -> tuple[float]: :py:meth:`lymph.models.Unilateral.set_distribution_params` method. The keyword arguments can be of the format ``"_"`` or - ``"_"`` for the distributions over diagnose times. If only + ``"_"`` for the distributions over diagnosis times. If only a ``""`` is provided, it is assumed to be a global parameter and is sent to all edges or distributions. But the more specific keyword arguments override the global ones, which in turn override the positional arguments. @@ -303,34 +303,34 @@ def transition_prob( return trans_prob - def diagnose_prob( + def diagnosis_prob( self, - diagnoses: pd.Series | dict[str, dict[str, bool]] + diagnosis: pd.Series | dict[str, dict[str, bool]] ) -> float: - """Compute the probability to observe a diagnose given the current state. + """Compute the probability to observe a diagnosis given the current state. - The ``diagnoses`` is either a pandas ``Series`` object corresponding to one row + The ``diagnosis`` is either a pandas ``Series`` object corresponding to one row of a patient data table, or a dictionary with keys of diagnostic modalities and values of dictionaries holding the observation for each LNL under the respective key. It returns the probability of observing this particular combination of - diagnoses, given the current state of the system. + diagnosis, given the current state of the system. """ prob = 1. for name, modality in self.get_all_modalities().items(): - if name in diagnoses: - mod_diagnose = diagnoses[name] + if name in diagnosis: + mod_diagnosis = diagnosis[name] for lnl in self.graph.lnls: try: - lnl_diagnose = mod_diagnose[lnl.name] + lnl_diagnosis = mod_diagnosis[lnl.name] except KeyError: continue except IndexError as idx_err: raise ValueError( - "diagnoses were not provided in the correct format" + "diagnosis were not provided in the correct format" ) from idx_err - prob *= lnl.comp_obs_prob(lnl_diagnose, modality.confusion_matrix) + prob *= lnl.comp_obs_prob(lnl_diagnosis, modality.confusion_matrix) return prob @@ -409,7 +409,7 @@ def observation_matrix(self) -> np.ndarray: """The matrix encoding the probabilities to observe a certain diagnosis. Every element in this matrix holds a probability to observe a certain diagnosis - (or combination of diagnoses, when using multiple diagnostic modalities) given + (or combination of diagnosis, when using multiple diagnostic modalities) given the current state of the system. It has the shape :math:`2^N \\times 2^\\{N \\times M\\}` where :math:`N` is the number of nodes in the graph and :math:`M` is the number of diagnostic modalities. @@ -432,10 +432,10 @@ def data_matrix(self, t_stage: str | None = None) -> np.ndarray: it encodes the information which observational state could have led to the observed diagnosis. If a diagnosis is complete, i.e., for every diagnostic modality and every LNL we have an observation, the data matrix is a one-hot - encoding of the observed diagnoses. Otherwise it may contain multiple 1s, + encoding of the observed diagnosis. Otherwise it may contain multiple 1s, indicating over which observational state one should marginalize. - The data matrix is used to compute the :py:attr:`~diagnose_matrix`, which in + The data matrix is used to compute the :py:attr:`~diagnosis_matrix`, which in turn is used to compute the likelihood of the model given the patient data. See Also: @@ -466,23 +466,23 @@ def data_matrix(self, t_stage: str | None = None) -> np.ndarray: return self._data_matrix_cache[t_hash] - def diagnose_matrix(self, t_stage: str | None = None) -> np.ndarray: - """Extract the diagnose matrix for a given ``t_stage``. + def diagnosis_matrix(self, t_stage: str | None = None) -> np.ndarray: + """Extract the diagnosis matrix for a given ``t_stage``. For every patient this matrix stores the probability to observe this patient's diagnosis, given one of the possible hidden states of the model. It is computed by multiplying the :py:meth:`.data_matrix` with the :py:meth:`.observation_matrix`. """ - # Compute the entire diagnose matrix if it is not in the cache. Note that this + # Compute the entire diagnosis matrix if it is not in the cache. Note that this # requires the data matrix to be computed as well. _hash = hash((t_stage, self.modalities_hash(), self._cache_version)) - if _hash not in self._diagnose_matrix_cache: - self._diagnose_matrix_cache[_hash] = ( + if _hash not in self._diagnosis_matrix_cache: + self._diagnosis_matrix_cache[_hash] = ( self.observation_matrix() @ self.data_matrix(t_stage).T ) - return self._diagnose_matrix_cache[_hash].T + return self._diagnosis_matrix_cache[_hash].T def load_patient_data( @@ -499,13 +499,13 @@ def load_patient_data( With the ``mapping`` function or dictionary, the reported T-stages (usually 0, 1, 2, 3, and 4) can be mapped to any keys also used to access the corresponding - distribution over diagnose times. The default mapping is to map 0, 1, and 2 to + distribution over diagnosis times. The default mapping is to map 0, 1, and 2 to "early" and 3 and 4 to "late". What this method essentially does is to copy the entire data frame, check all necessary information is present, and add a new top-level header ``"_model"`` to the data frame. Under this header, columns are assembled that contain all the - information necessary to compute the observation and diagnose matrices. + information necessary to compute the observation and diagnosis matrices. .. _LyProX: https://lyprox.org/ """ @@ -581,14 +581,14 @@ def patient_data(self) -> pd.DataFrame: Additionally, it holds the data encodings and probability of diagnosis given the hidden states for each patient under the headers ``("_model", "_encoding", - )`` and ``("_model", "_diagnose_prob", )``, + )`` and ``("_model", "_diagnosis_prob", )``, respectively. """ if self._patient_data is None: raise AttributeError("No patient data loaded yet.") - # if not present, this will recompute the full data and diagnose matrices - _ = self.diagnose_matrix() + # if not present, this will recompute the full data and diagnosis matrices + _ = self.diagnosis_matrix() return self._patient_data @@ -612,10 +612,10 @@ def state_dist_evo(self) -> np.ndarray: This returns a matrix with the distribution over the possible states for each time step from :math:`t = 0` to :math:`t = T`, where :math:`T` is the - maximum diagnose time stored in the model's attribute ``max_time``. + maximum diagnosis time stored in the model's attribute ``max_time``. Note that at this point, the distributions are not weighted with the - distribution over diagnose times that are stored and managed for each T-stage + distribution over diagnosis times that are stored and managed for each T-stage in the dictionary returned by :py:meth:`.get_all_distributions`. """ state_dists = np.zeros(shape=(self.max_time + 1, len(self.graph.state_list))) @@ -637,7 +637,7 @@ def state_dist( Do this either for a given ``t_stage``, when ``mode`` is set to ``"HMM"``, which is essentially a marginalization of the evolution over the possible states as computed by :py:meth:`.state_dist_evo` with the distribution - over diagnose times for the given T-stage from the dictionary returned by + over diagnosis times for the given T-stage from the dictionary returned by :py:meth:`.get_all_distributions`. Or, when ``mode`` is set to ``"BN"``, compute the distribution over states for @@ -662,6 +662,7 @@ def state_dist( def obs_dist( self, + given_state_dist: np.ndarray | None = None, t_stage: str = "early", mode: Literal["HMM", "BN"] = "HMM", ) -> np.ndarray: @@ -673,17 +674,19 @@ def obs_dist( Note that since the :py:attr:`.observation_matrix` can become very large, this method is not very efficient for inference. Instead, we compute the - :py:meth:`.diagnose_matrix` from the :py:meth:`.observation_matrix` and + :py:meth:`.diagnosis_matrix` from the :py:meth:`.observation_matrix` and the :py:meth:`.data_matrix` and use these to compute the likelihood. """ - state_dist = self.state_dist(t_stage=t_stage, mode=mode) - return state_dist @ self.observation_matrix() + if given_state_dist is None: + given_state_dist = self.state_dist(t_stage=t_stage, mode=mode) + + return given_state_dist @ self.observation_matrix() def _bn_likelihood(self, log: bool = True, t_stage: str | None = None) -> float: """Compute the BN likelihood, using the stored params.""" state_dist = self.state_dist(mode="BN") - patient_llhs = state_dist @ self.diagnose_matrix(t_stage).T + patient_llhs = state_dist @ self.diagnosis_matrix(t_stage).T return np.sum(np.log(patient_llhs)) if log else np.prod(patient_llhs) @@ -702,7 +705,7 @@ def _hmm_likelihood(self, log: bool = True, t_stage: str | None = None) -> float patient_llhs = ( self.get_distribution(t_stage).pmf @ evolved_model - @ self.diagnose_matrix(t_stage).T + @ self.diagnosis_matrix(t_stage).T ) llh = add_or_mult(llh, patient_llhs, log) @@ -713,8 +716,8 @@ def likelihood( self, given_params: types.ParamsType | None = None, log: bool = True, + t_stage: str | None = None, mode: Literal["HMM", "BN"] = "HMM", - for_t_stage: str | None = None, ) -> float: """Compute the (log-)likelihood of the stored data given the model (and params). @@ -733,49 +736,49 @@ def likelihood( return -np.inf if log else 0. if mode == "HMM": - return self._hmm_likelihood(log, for_t_stage) + return self._hmm_likelihood(log, t_stage) if mode == "BN": - return self._bn_likelihood(log, for_t_stage) + return self._bn_likelihood(log, t_stage) raise ValueError("Invalid mode. Must be either 'HMM' or 'BN'.") def compute_encoding( self, - given_diagnoses: types.DiagnoseType | None = None, + given_diagnosis: types.DiagnosisType | None = None, ) -> np.ndarray: """Compute one-hot vector encoding of a given diagnosis.""" - diagnose_encoding = np.array([True], dtype=bool) + diagnosis_encoding = np.array([True], dtype=bool) for modality in self.get_all_modalities().keys(): - diagnose_encoding = np.kron( - diagnose_encoding, + diagnosis_encoding = np.kron( + diagnosis_encoding, matrix.compute_encoding( lnls=self.graph.lnls.keys(), - pattern=given_diagnoses.get(modality, {}), - base=2, # diagnoses are always binary! + pattern=given_diagnosis.get(modality, {}), + base=2, # diagnosis are always binary! ), ) - return diagnose_encoding + return diagnosis_encoding def posterior_state_dist( self, given_params: types.ParamsType | None = None, given_state_dist: np.ndarray | None = None, - given_diagnoses: types.DiagnoseType | None = None, + given_diagnosis: types.DiagnosisType | None = None, t_stage: str | int = "early", mode: Literal["HMM", "BN"] = "HMM", ) -> np.ndarray: """Compute the posterior distribution over hidden states given a diagnosis. - The ``given_diagnoses`` is a dictionary of diagnoses for each modality. E.g., + The ``given_diagnosis`` is a dictionary of diagnosis for each modality. E.g., this could look like this: .. code-block:: python - given_diagnoses = { + given_diagnosis = { "MRI": {"II": True, "III": False, "IV": False}, "PET": {"II": True, "III": True, "IV": None}, } @@ -799,19 +802,49 @@ def posterior_state_dist( # vector P(X=x) of probs of arriving in state x (marginalized over time) given_state_dist = self.state_dist(t_stage, mode=mode) - if given_diagnoses is None: - given_diagnoses = {} + if given_diagnosis is None: + given_diagnosis = {} - diagnose_encoding = self.compute_encoding(given_diagnoses) + diagnosis_encoding = self.compute_encoding(given_diagnosis) # vector containing P(Z=z|X). Essentially a data matrix for one patient - diagnose_given_state = diagnose_encoding @ self.observation_matrix().T + diagnosis_given_state = diagnosis_encoding @ self.observation_matrix().T # multiply P(Z=z|X) * P(X) elementwise to get vector of joint probs P(Z=z,X) - joint_diagnose_and_state = given_state_dist * diagnose_given_state + joint_diagnosis_and_state = given_state_dist * diagnosis_given_state # compute vector of probabilities for all possible involvements given the # specified diagnosis P(X|Z=z) = P(Z=z,X) / P(X), where P(X) = sum_z P(Z=z,X) - return joint_diagnose_and_state / np.sum(joint_diagnose_and_state) + return joint_diagnosis_and_state / np.sum(joint_diagnosis_and_state) + + + def marginalize( + self, + involvement: types.PatternType | None = None, + given_state_dist: np.ndarray | None = None, + t_stage: str = "early", + mode: Literal["HMM", "BN"] = "HMM", + ) -> float: + """Marginalize ``given_state_dist`` over matching ``involvement`` patterns. + + Any state that matches the provided ``involvement`` pattern is marginalized + over. For this, the :py:func:`.matrix.compute_encoding` function is used. + + If ``given_state_dist`` is ``None``, it will be computed by calling + :py:meth:`.state_dist` with the given ``t_stage`` and ``mode``. These arguments + are ignored if ``given_state_dist`` is provided. + """ + if involvement is None: + return 1. + + if given_state_dist is None: + given_state_dist = self.state_dist(t_stage=t_stage, mode=mode) + + marginalize_over_states = matrix.compute_encoding( + lnls=self.graph.lnls.keys(), + pattern=involvement, + base=3 if self.is_trinary else 2, + ) + return marginalize_over_states @ given_state_dist def risk( @@ -819,50 +852,42 @@ def risk( involvement: types.PatternType | None = None, given_params: types.ParamsType | None = None, given_state_dist: np.ndarray | None = None, - given_diagnoses: dict[str, types.PatternType] | None = None, + given_diagnosis: dict[str, types.PatternType] | None = None, t_stage: str = "early", mode: Literal["HMM", "BN"] = "HMM", - ) -> float | np.ndarray: - """Compute risk of a certain ``involvement``, using the ``given_diagnoses``. + ) -> float: + """Compute risk of a certain ``involvement``, using the ``given_diagnosis``. If an ``involvement`` pattern of interest is provided, this method computes the risk of seeing just that pattern for the set of given parameters and a - dictionary of diagnoses for each modality. + dictionary of diagnosis for each modality. If no ``involvement`` is provided, this will simply return the posterior - distribution over hidden states, given the diagnoses, as computed by the + distribution over hidden states, given the diagnosis, as computed by the :py:meth:`.posterior_state_dist` method. See its documentaiton for more details about the arguments and the return value. """ posterior_state_dist = self.posterior_state_dist( given_params=given_params, given_state_dist=given_state_dist, - given_diagnoses=given_diagnoses, + given_diagnosis=given_diagnosis, t_stage=t_stage, mode=mode, ) - if involvement is None: - return posterior_state_dist - # if a specific involvement of interest is provided, marginalize the # resulting vector of hidden states to match that involvement of # interest - marginalize_over_states = matrix.compute_encoding( - lnls=self.graph.lnls.keys(), - pattern=involvement, - base=3 if self.is_trinary else 2, - ) - return marginalize_over_states @ posterior_state_dist + return self.marginalize(involvement, posterior_state_dist) - def draw_diagnoses( + def draw_diagnosis( self, diag_times: list[int], rng: np.random.Generator | None = None, seed: int = 42, ) -> np.ndarray: - """Given some ``diag_times``, draw diagnoses for each LNL. + """Given some ``diag_times``, draw diagnosis for each LNL. >>> model = Unilateral(graph_dict={ ... ("tumor", "T"): ["II" , "III"], @@ -870,17 +895,17 @@ def draw_diagnoses( ... ("lnl", "III"): [], ... }) >>> model.set_modality("CT", spec=0.8, sens=0.8) - >>> model.draw_diagnoses([0, 1, 2, 3, 4]) # doctest: +NORMALIZE_WHITESPACE + >>> model.draw_diagnosis([0, 1, 2, 3, 4]) # doctest: +NORMALIZE_WHITESPACE array([[False, True], [False, False], [ True, False], [False, True], [False, False]]) - >>> draw_diagnoses( # this is the same as the previous example - ... diagnose_times=[0, 1, 2, 3, 4], + >>> draw_diagnosis( # this is the same as the previous example + ... diagnosis_times=[0, 1, 2, 3, 4], ... state_evolution=model.state_dist_evo(), ... observation_matrix=model.observation_matrix(), - ... possible_diagnoses=model.obs_list, + ... possible_diagnosis=model.obs_list, ... ) array([[False, True], [False, False], @@ -922,10 +947,10 @@ def draw_patients( is initialized with the given ``seed`` (or ``42``, by default). See Also: - :py:meth:`lymph.diagnose_times.Distribution.draw_diag_times` - Method to draw diagnose times from a distribution. - :py:meth:`lymph.models.Unilateral.draw_diagnoses` - Method to draw individual diagnoses. + :py:meth:`lymph.diagnosis_times.Distribution.draw_diag_times` + Method to draw diagnosis times from a distribution. + :py:meth:`lymph.models.Unilateral.draw_diagnosis` + Method to draw individual diagnosis. :py:meth:`lymph.models.Bilateral.draw_patients` The corresponding bilateral method. """ @@ -947,7 +972,7 @@ def draw_patients( for t_stage in drawn_t_stages ] - drawn_obs = self.draw_diagnoses(drawn_diag_times, rng=rng) + drawn_obs = self.draw_diagnosis(drawn_diag_times, rng=rng) modality_names = list(self.get_all_modalities().keys()) lnl_names = list(self.graph.lnls.keys()) diff --git a/lymph/types.py b/lymph/types.py index 5def3aa..6652505 100644 --- a/lymph/types.py +++ b/lymph/types.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd -from pandas._libs.missing import NAType class DataWarning(UserWarning): @@ -45,10 +44,21 @@ def get_params( ParamsType = Iterable[float] | dict[str, float] """Type alias for how parameters are passed around. -This is e.g. the type that the :py:meth:`Model.get_params` method returns. +This is e.g. the type that the :py:meth:`.Model.get_params` method returns. """ -PatternType = dict[str, bool | str | NAType | None] +InvolvementIndicator = Literal[ + False, 0, "healthy", + True, 1, "involved", + "micro", "macro", "notmacro", +] +"""Type alias for how to encode lymphatic involvement for a single lymph node level. + +The choices ``"micro"``, ``"macro"``, and ``"notmacro"`` are only relevant for the +trinary models. +""" + +PatternType = dict[str, InvolvementIndicator | None] """Type alias for an involvement pattern. An involvement pattern is a dictionary with keys for the lymph node levels and values @@ -61,10 +71,10 @@ def get_params( >>> pattern = {"I": True, "II": False, "III": None} """ -DiagnoseType = dict[str, PatternType] -"""Type alias for a diagnose, which is an involvement pattern per diagnostic modality. +DiagnosisType = dict[str, PatternType] +"""Type alias for a diagnosis, which is an involvement pattern per diagnostic modality. ->>> diagnose = { +>>> diagnosis = { ... "CT": {"I": True, "II": False, "III": None}, ... "MRI": {"I": True, "II": True, "III": None}, ... } @@ -101,7 +111,7 @@ def get_num_dims(self: ModelT, mode: Literal["HMM", "BN"] = "HMM") -> int: A hidden Markov model (``mode="HMM"``) typically has more parameters than a Bayesian network (``mode="BN"``), because it we need parameters for the distributions over diagnosis times. Your can read more about that in the - :py:mod:`lymph.diagnose_times` module. + :py:mod:`lymph.diagnosis_times` module. """ # pylint: disable=no-member num = len(self.get_params()) @@ -119,6 +129,35 @@ def set_params(self: ModelT, *args: float, **kwargs: float) -> tuple[float]: """ raise NotImplementedError + @abstractmethod + def state_dist( + self: ModelT, + t_stage: str, + mode: Literal["HMM", "BN"] = "HMM", + ) -> np.ndarray: + """Return the prior state distribution of the model. + + The state distribution is the probability of the model being in any of the + possible hidden states. + """ + raise NotImplementedError + + def obs_dist( + self: ModelT, + given_state_dist: np.ndarray | None = None, + t_stage: str = "early", + mode: Literal["HMM", "BN"] = "HMM", + ) -> np.ndarray: + """Return the distribution over observational states. + + If ``given_state_dist`` is ``None``, it will first compute the + :py:meth:`.state_dist` using the arguments ``t_stage`` and ``mode`` (which are + otherwise ignored). Then it multiplies the distribution over (hidden) states + with the specificity and sensitivity values stored in the model (see + :py:meth:`.modalities.Composite`) and marginalizes over the hidden states. + """ + raise NotImplementedError + @abstractmethod def load_patient_data( self: ModelT, @@ -144,13 +183,48 @@ def likelihood( """ raise NotImplementedError + @abstractmethod + def posterior_state_dist( + self: ModelT, + given_params: ParamsType | None = None, + given_state_dist: np.ndarray | None = None, + given_diagnosis: dict[str, PatternType] | None = None, + ) -> np.ndarray: + """Return the posterior state distribution using the ``given_diagnosis``. + + The posterior state distribution is the probability of the model being in a + certain state given the diagnosis. The ``given_params`` are passed to the + :py:meth:`set_params` method. Alternatively to parameters, one may also pass + a ``given_state_dist``, which is effectively the precomputed prior from calling + :py:meth:`.state_dist`. + """ + raise NotImplementedError + + def marginalize( + self, + involvement: dict[str, PatternType] | None = None, + given_state_dist: np.ndarray | None = None, + t_stage: str = "early", + mode: Literal["HMM", "BN"] = "HMM", + ) -> float: + """Marginalize ``given_state_dist`` over matching ``involvement`` patterns. + + Any state that matches the provided ``involvement`` pattern is marginalized + over. For this, the :py:func:`.matrix.compute_encoding` function is used. + + If ``given_state_dist`` is ``None``, it will be computed by calling + :py:meth:`.state_dist` with the given ``t_stage`` and ``mode``. These arguments + are ignored if ``given_state_dist`` is provided. + """ + raise NotImplementedError + @abstractmethod def risk( self, involvement: PatternType | None = None, given_params: ParamsType | None = None, given_state_dist: np.ndarray | None = None, - given_diagnoses: dict[str, PatternType] | None = None, - ) -> float | np.ndarray: - """Return the risk of ``involvement``, given params/state_dist and diagnoses.""" + given_diagnosis: dict[str, PatternType] | None = None, + ) -> float: + """Return the risk of ``involvement``, given params/state_dist and diagnosis.""" raise NotImplementedError diff --git a/lymph/utils.py b/lymph/utils.py index 88ada41..4183a41 100644 --- a/lymph/utils.py +++ b/lymph/utils.py @@ -374,26 +374,26 @@ def synchronize_params( obj.set_params(**get_from[key].get_params(as_dict=True)) -def draw_diagnoses( - diagnose_times: list[int], +def draw_diagnosis( + diagnosis_times: list[int], state_evolution: np.ndarray, observation_matrix: np.ndarray, - possible_diagnoses: np.ndarray, + possible_diagnosis: np.ndarray, rng: np.random.Generator | None = None, seed: int = 42, ) -> np.ndarray: - """Given the ``diagnose_times`` and a hidden ``state_evolution``, draw diagnoses.""" + """Given the ``diagnosis_times`` and a hidden ``state_evolution``, draw diagnosis.""" if rng is None: rng = np.random.default_rng(seed) - state_dists_given_time = state_evolution[diagnose_times] + state_dists_given_time = state_evolution[diagnosis_times] observation_dists_given_time = state_dists_given_time @ observation_matrix drawn_observation_idxs = [ - rng.choice(a=np.arange(len(possible_diagnoses)), p=dist) + rng.choice(a=np.arange(len(possible_diagnosis)), p=dist) for dist in observation_dists_given_time ] - return possible_diagnoses[drawn_observation_idxs].astype(bool) + return possible_diagnosis[drawn_observation_idxs].astype(bool) def add_or_mult(llh: float, arr: np.ndarray, log: bool = True) -> float: diff --git a/tests/binary_bilateral_test.py b/tests/binary_bilateral_test.py index d18cfa0..efc198a 100644 --- a/tests/binary_bilateral_test.py +++ b/tests/binary_bilateral_test.py @@ -137,7 +137,7 @@ def test_modality_reset(self): ) def test_diag_time_dists_delegation(self): - """Test that the diagnose time distributions are delegated.""" + """Test that the diagnosis time distributions are delegated.""" self.assertEqual( list(self.model.get_distribution("early").pmf), list(self.model.ipsi.get_distribution("early").pmf), @@ -299,28 +299,28 @@ def setUp(self): super().setUp() self.model.replace_all_modalities(fixtures.MODALITIES) - def create_random_diagnoses(self): + def create_random_diagnosis(self): """Create a random diagnosis for each modality and LNL.""" - diagnoses = {} + diagnosis = {} for side in ["ipsi", "contra"]: - diagnoses[side] = {} + diagnosis[side] = {} side_model = getattr(self.model, side) lnl_names = side_model.graph.lnls.keys() for modality in side_model.get_all_modalities(): - diagnoses[side][modality] = fixtures.create_random_pattern(lnl_names) + diagnosis[side][modality] = fixtures.create_random_pattern(lnl_names) - return diagnoses + return diagnosis def test_posterior_state_dist(self): """Test that the posterior state distribution is computed correctly.""" num_states = len(self.model.ipsi.graph.state_list) random_parameters = self.create_random_params() - random_diagnoses = self.create_random_diagnoses() + random_diagnosis = self.create_random_diagnosis() posterior = self.model.posterior_state_dist( given_params=random_parameters, - given_diagnoses=random_diagnoses, + given_diagnosis=random_diagnosis, ) self.assertEqual(posterior.shape, (num_states, num_states)) self.assertEqual(posterior.dtype, float) @@ -329,7 +329,7 @@ def test_posterior_state_dist(self): def test_risk(self): """Test that the risk is computed correctly.""" random_parameters = self.create_random_params() - random_diagnoses = self.create_random_diagnoses() + random_diagnosis = self.create_random_diagnosis() random_pattern = { "ipsi": fixtures.create_random_pattern(self.model.ipsi.graph.lnls.keys()), "contra": fixtures.create_random_pattern(self.model.contra.graph.lnls.keys()), @@ -339,7 +339,7 @@ def test_risk(self): risk = self.model.risk( involvement=random_pattern, given_params=random_parameters, - given_diagnoses=random_diagnoses, + given_diagnosis=random_diagnosis, t_stage=random_t_stage, ) self.assertLessEqual(risk, 1.) diff --git a/tests/binary_midline_test.py b/tests/binary_midline_test.py index c949278..c65ace0 100644 --- a/tests/binary_midline_test.py +++ b/tests/binary_midline_test.py @@ -119,12 +119,6 @@ def setUp(self) -> None: def test_risk(self) -> None: """Check that the risk method works correctly.""" - plain_risk = self.model.risk() - self.assertEqual(plain_risk.shape, (4,4)) - self.assertTrue(np.isclose(plain_risk.sum(), 1.0)) - self.assertTrue(np.allclose(plain_risk[1,:], 0.)) - self.assertTrue(np.allclose(plain_risk[:,1], 0.)) - lnlIII_risk = self.model.risk(involvement={"ipsi": {"II": False, "III": True}}) self.assertTrue(np.isscalar(lnlIII_risk)) self.assertAlmostEqual(lnlIII_risk, 0.0) diff --git a/tests/binary_unilateral_test.py b/tests/binary_unilateral_test.py index f7d2326..c2df921 100644 --- a/tests/binary_unilateral_test.py +++ b/tests/binary_unilateral_test.py @@ -258,42 +258,42 @@ def test_data_matrices(self): has_t_stage.sum(), ) - def test_diagnose_matrices(self): - """Make sure the diagnose matrices are generated correctly.""" + def test_diagnosis_matrices(self): + """Make sure the diagnosis matrices are generated correctly.""" for t_stage in ["early", "late"]: has_t_stage = self.raw_data["tumor", "1", "t_stage"].isin({ "early": [0,1,2], "late": [3,4], }[t_stage]) - diagnose_matrix = self.model.diagnose_matrix(t_stage).T + diagnosis_matrix = self.model.diagnosis_matrix(t_stage).T self.assertEqual( - diagnose_matrix.shape[0], + diagnosis_matrix.shape[0], self.model.transition_matrix().shape[1], ) self.assertEqual( - diagnose_matrix.shape[1], + diagnosis_matrix.shape[1], has_t_stage.sum(), ) - # some times, entries in the diagnose matrix are almost one, but just + # some times, entries in the diagnosis matrix are almost one, but just # slightly larger. That's why we also have to have the `isclose` here. self.assertTrue(np.all( - np.isclose(diagnose_matrix, 1.) - | np.less_equal(diagnose_matrix, 1.) + np.isclose(diagnosis_matrix, 1.) + | np.less_equal(diagnosis_matrix, 1.) )) def test_modality_replacement(self) -> None: - """Check if the data & diagnose matrices get updated when modalities change.""" + """Check if the data & diagnosis matrices get updated when modalities change.""" data_matrix = self.model.data_matrix() - diagnose_matrix = self.model.diagnose_matrix() + diagnosis_matrix = self.model.diagnosis_matrix() self.model.replace_all_modalities({"PET": Clinical(spec=0.8, sens=0.8)}) self.assertNotEqual( hash(data_matrix.tobytes()), hash(self.model.data_matrix().tobytes()), ) self.assertNotEqual( - hash(diagnose_matrix.tobytes()), - hash(self.model.diagnose_matrix().tobytes()), + hash(diagnosis_matrix.tobytes()), + hash(self.model.diagnosis_matrix().tobytes()), ) @@ -348,32 +348,32 @@ def setUp(self): self.init_diag_time_dists(early="frozen", late="parametric") self.model.set_params(**self.create_random_params()) - def create_random_diagnoses(self): + def create_random_diagnosis(self): """Create a random diagnosis for each modality and LNL.""" lnl_names = list(self.model.graph.lnls.keys()) - diagnoses = {} + diagnosis = {} for modality in self.model.get_all_modalities(): - diagnoses[modality] = fixtures.create_random_pattern(lnl_names) + diagnosis[modality] = fixtures.create_random_pattern(lnl_names) - return diagnoses + return diagnosis def test_compute_encoding(self): - """Check computation of one-hot encoding of diagnoses.""" - random_diagnoses = self.create_random_diagnoses() + """Check computation of one-hot encoding of diagnosis.""" + random_diagnosis = self.create_random_diagnosis() num_lnls = len(self.model.graph.lnls) num_mods = len(self.model.get_all_modalities()) - num_posible_diagnoses = 2**(num_lnls * num_mods) + num_posible_diagnosis = 2**(num_lnls * num_mods) - diagnose_encoding = self.model.compute_encoding(random_diagnoses) - self.assertEqual(diagnose_encoding.shape, (num_posible_diagnoses,)) - self.assertEqual(diagnose_encoding.dtype, bool) + diagnosis_encoding = self.model.compute_encoding(random_diagnosis) + self.assertEqual(diagnosis_encoding.shape, (num_posible_diagnosis,)) + self.assertEqual(diagnosis_encoding.dtype, bool) def test_posterior_state_dist(self): """Make sure the posterior state dist is correctly computed.""" posterior_state_dist = self.model.posterior_state_dist( given_params=self.create_random_params(), - given_diagnoses=self.create_random_diagnoses(), + given_diagnosis=self.create_random_diagnosis(), t_stage=self.rng.choice(["early", "late"]), ) self.assertEqual(posterior_state_dist.shape, (2**len(self.model.graph.lnls),)) @@ -383,14 +383,14 @@ def test_posterior_state_dist(self): def test_risk(self): """Make sure the risk is correctly computed.""" random_pattern = fixtures.create_random_pattern(self.model.graph.lnls.keys()) - random_diagnoses = self.create_random_diagnoses() + random_diagnosis = self.create_random_diagnosis() random_t_stage = self.rng.choice(["early", "late"]) random_params = self.create_random_params() risk = self.model.risk( involvement=random_pattern, given_params=random_params, - given_diagnoses=random_diagnoses, + given_diagnosis=random_diagnosis, t_stage=random_t_stage, ) self.assertEqual(risk.dtype, float) @@ -441,7 +441,7 @@ def test_distribution_of_patients(self): for lnl_edge in self.model.graph.lnl_edges.values(): lnl_edge.set_spread_prob(0.) - # make all patients diagnosed after exactly one time-step + # make all patients diagnosisd after exactly one time-step self.model.set_distribution("early", [0,1,0,0,0,0,0,0,0,0,0]) # assign only one pathology modality diff --git a/tests/distribution_test.py b/tests/distribution_test.py index e117871..a8575c1 100644 --- a/tests/distribution_test.py +++ b/tests/distribution_test.py @@ -1,10 +1,10 @@ -"""Check functionality of the distribution over diagnose times.""" +"""Check functionality of the distribution over diagnosis times.""" import warnings import numpy as np import scipy as sp -from lymph.diagnose_times import Distribution +from lymph.diagnosis_times import Distribution from . import fixtures diff --git a/tests/doc_test.py b/tests/doc_test.py index 140dd43..f771860 100644 --- a/tests/doc_test.py +++ b/tests/doc_test.py @@ -4,13 +4,13 @@ import doctest import unittest -from lymph import diagnose_times, graph, matrix, modalities, utils +from lymph import diagnosis_times, graph, matrix, modalities, utils from lymph.models import bilateral, unilateral def load_tests(loader, tests: unittest.TestSuite, ignore): """Load doctests from the lymph package.""" - tests.addTests(doctest.DocTestSuite(diagnose_times)) + tests.addTests(doctest.DocTestSuite(diagnosis_times)) tests.addTests(doctest.DocTestSuite(graph)) tests.addTests(doctest.DocTestSuite(utils)) tests.addTests(doctest.DocTestSuite(matrix)) diff --git a/tests/fixtures.py b/tests/fixtures.py index 4273961..7dfffcd 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -11,7 +11,7 @@ import pandas as pd import scipy as sp -from lymph import diagnose_times +from lymph import diagnosis_times from lymph.modalities import Clinical, Modality, Pathological from lymph.models import Bilateral, Midline, Unilateral from lymph.types import DataWarning, PatternType @@ -80,19 +80,19 @@ def _create_random_frozen_dist( max_time: int, rng: np.random.Generator = RNG, ) -> np.ndarray: - """Create a random frozen diagnose time distribution.""" + """Create a random frozen diagnosis time distribution.""" unnormalized = rng.random(size=max_time + 1) return unnormalized / np.sum(unnormalized) def _create_random_parametric_dist( max_time: int, rng: np.random.Generator = RNG, -) -> diagnose_times.Distribution: - """Create a binomial diagnose time distribution with random params.""" +) -> diagnosis_times.Distribution: + """Create a binomial diagnosis time distribution with random params.""" def _pmf(support: np.ndarray, p: float = rng.random()) -> np.ndarray: return sp.stats.binom.pmf(support, p=p, n=max_time + 1) - return diagnose_times.Distribution( + return diagnosis_times.Distribution( distribution=_pmf, max_time=max_time, ) @@ -150,7 +150,7 @@ def create_random_params(self) -> dict[str, float]: def init_diag_time_dists(self, **dists) -> None: - """Init the diagnose time distributions.""" + """Init the diagnosis time distributions.""" for t_stage, type_ in dists.items(): self.model.set_distribution( t_stage, @@ -186,7 +186,7 @@ def setUp(self): def init_diag_time_dists(self, **dists) -> None: - """Init the diagnose time distributions.""" + """Init the diagnosis time distributions.""" for t_stage, type_ in dists.items(): self.model.set_distribution( t_stage, @@ -245,7 +245,7 @@ def create_random_params(self) -> dict[str, float]: def init_diag_time_dists(self, **dists) -> None: - """Init the diagnose time distributions.""" + """Init the diagnosis time distributions.""" for t_stage, type_ in dists.items(): self.model.set_distribution( t_stage, @@ -301,7 +301,7 @@ def setUp( def init_diag_time_dists(self, **dists) -> None: - """Init the diagnose time distributions.""" + """Init the diagnosis time distributions.""" for t_stage, type_ in dists.items(): self.model.set_distribution( t_stage, diff --git a/tests/trinary_unilateral_test.py b/tests/trinary_unilateral_test.py index d022019..363c2dc 100644 --- a/tests/trinary_unilateral_test.py +++ b/tests/trinary_unilateral_test.py @@ -89,11 +89,11 @@ def test_observation_matrix(self) -> None: self.assertTrue(np.allclose(row_sums, 1.0)) -class TrinaryDiagnoseMatricesTestCase( +class TrinaryDiagnosisMatricesTestCase( fixtures.TrinaryFixtureMixin, fixtures.IgnoreWarningsTestCase, ): - """Test the diagnose matrix of a trinary model.""" + """Test the diagnosis matrix of a trinary model.""" def setUp(self): super().setUp() @@ -104,13 +104,13 @@ def get_patient_data(self) -> pd.DataFrame: """Load an example dataset that has both clinical and pathology data.""" return pd.read_csv("tests/data/2021-usz-oropharynx.csv", header=[0, 1, 2]) - def test_diagnose_matrices_shape(self) -> None: - """Test the diagnose matrix of the model.""" + def test_diagnosis_matrices_shape(self) -> None: + """Test the diagnosis matrix of the model.""" for t_stage in ["early", "late"]: num_lnls = len(self.model.graph.lnls) num_patients = (self.model.patient_data["_model", "#", "t_stage"] == t_stage).sum() - diagnose_matrix = self.model.diagnose_matrix(t_stage).T - self.assertEqual(diagnose_matrix.shape, (3 ** num_lnls, num_patients)) + diagnosis_matrix = self.model.diagnosis_matrix(t_stage).T + self.assertEqual(diagnosis_matrix.shape, (3 ** num_lnls, num_patients)) class TrinaryParamAssignmentTestCase( @@ -184,21 +184,21 @@ def setUp(self): self.init_diag_time_dists(early="frozen", late="parametric") self.load_patient_data(filename="2021-usz-oropharynx.csv") - def create_random_diagnoses(self): + def create_random_diagnosis(self): """Create a random diagnosis for each modality and LNL.""" lnl_names = list(self.model.graph.lnls.keys()) - diagnoses = {} + diagnosis = {} for modality in self.model.get_all_modalities(): - diagnoses[modality] = fixtures.create_random_pattern(lnl_names) + diagnosis[modality] = fixtures.create_random_pattern(lnl_names) - return diagnoses + return diagnosis def test_risk_is_probability(self): """Make sure the risk is a probability.""" risk = self.model.risk( involvement=fixtures.create_random_pattern(lnls=list(self.model.graph.lnls.keys())), - given_diagnoses=self.create_random_diagnoses(), + given_diagnosis=self.create_random_diagnosis(), given_params=self.create_random_params(), t_stage=self.rng.choice(["early", "late"]), )