In [1]:
# Run this cell if you wish to load the environment variables contained in a `.env` file.
# Comment out these lines if you do not need them.
%pip install python-dotenv
%load_ext dotenv
%dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os


# Either add the path here replacing `None` or through the environment variable ICEDYN_DATA
ICEDYN_DATA = None
ICEDYN_DATA = ICEDYN_DATA or os.environ.get("ICEDYN_DATA")
%ls -hs {ICEDYN_DATA} | grep -E "(csv|txt)"

 20M [0m[01;31madmissions.csv.gz[0m
2.5M [01;31mADMISSIONS.csv.gz[0m
 33M [01;31mdiagnoses_icd.csv.gz[0m
4.5M [01;31mDIAGNOSES_ICD.csv.gz[0m
856K [01;31md_icd_diagnoses.csv.gz[0m
280K [01;31mD_ICD_DIAGNOSES.csv.gz[0m
2.8M [01;31mpatients.csv.gz[0m
560K [01;31mPATIENTS.csv.gz[0m
4.0K README.txt


In [3]:
import logging
import os
import shutil


# The low-level log is written to logs/data_proc.log, relative to the current
# working directory. The logs/ directory is recreated from scratch on every run.
#
# Detach and close whatever handlers the root logger already has before touching
# the directory: `logging.basicConfig` does nothing when the root logger is already
# configured, so on a re-run of this cell the previous file handler would keep
# writing to the log file that is deleted just below.
root_logger = logging.getLogger()
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)
    stale_handler.close()

if os.path.isdir("logs"):
    shutil.rmtree("logs")
os.makedirs("logs")
logging.basicConfig(
    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
    encoding="utf-8",
    level=logging.DEBUG,
    filename="logs/data_proc.log",
    filemode="w",
)

import ehrax as rx
from ehrax.example_datasets import study_mimic_dx_summary as rxd

DEBUG:2025-08-14 15:54:40,449:jax._src.path:37: etils.epath was not found. Using pathlib for file I/O.


In [4]:
%%time
m3_dataset_init, m3_schemes = rxd.mimiciii_from_paths(
    patients=f"{ICEDYN_DATA}/PATIENTS.csv.gz",
    admissions=f"{ICEDYN_DATA}/ADMISSIONS.csv.gz",
    diagnoses_icd=f"{ICEDYN_DATA}/DIAGNOSES_ICD.csv.gz",
    d_icd_diagnoses=f"{ICEDYN_DATA}/D_ICD_DIAGNOSES.csv.gz",
)

m4_dataset_init, m4_schemes = rxd.mimiciv_from_paths(
    patients=f"{ICEDYN_DATA}/patients.csv.gz",
    admissions=f"{ICEDYN_DATA}/admissions.csv.gz",
    diagnoses_icd=f"{ICEDYN_DATA}/diagnoses_icd.csv.gz",
    d_icd_diagnoses=f"{ICEDYN_DATA}/d_icd_diagnoses.csv.gz",
)

CPU times: user 2min 50s, sys: 7.51 s, total: 2min 58s
Wall time: 2min 58s


In [5]:
%%time

# Combine both coding schemes managers into one manager
mimic_schemes = m3_schemes + m4_schemes
dataset_pipeline = rxd.default_dataset_pipeline()
m3_dataset = m3_dataset_init.execute_pipeline(dataset_pipeline, mimic_schemes)
m4_dataset = m4_dataset_init.execute_pipeline(dataset_pipeline, mimic_schemes)

Transforming Dataset:   0%|          | 0/9 [00:00<?, ?transformations/s]

Transforming Dataset:   0%|          | 0/9 [00:00<?, ?transformations/s]

CPU times: user 2min 4s, sys: 887 ms, total: 2min 5s
Wall time: 2min 5s


In [6]:
%%time
m3_dataset = m3_dataset_init.execute_pipeline(dataset_pipeline, mimic_schemes)
m4_dataset = m4_dataset_init.execute_pipeline(dataset_pipeline, mimic_schemes)

Transforming Dataset:   0%|          | 0/9 [00:00<?, ?transformations/s]

Transforming Dataset:   0%|          | 0/9 [00:00<?, ?transformations/s]

CPU times: user 1min 55s, sys: 390 ms, total: 1min 55s
Wall time: 1min 55s


In [13]:
# Target-histogram statistics of each processed dataset, MIMIC-III first.
m3_hist_stats, m4_hist_stats = (
    dataset.stats(mimic_schemes).target_hist for dataset in (m3_dataset, m4_dataset)
)

In [35]:
import pandas as pd


# Three routes into dx_ccs: the two direct maps, and icd10cm chained through icd9cm.
m0 = mimic_schemes.map[("icd10cm", "dx_ccs")]
m1 = mimic_schemes.map[("icd9cm", "dx_ccs")]
m2 = mimic_schemes.make_chained_map(("icd10cm", "icd9cm", "dx_ccs"))

# One row per route, holding the sizes of the map's domain and range.
dx_ccs_quality = pd.DataFrame(
    {
        "domain": [len(code_map.domain) for code_map in (m0, m1, m2)],
        "range": [len(code_map.range) for code_map in (m0, m1, m2)],
    },
    index=["icd10cm->dx_ccs", "icd9cm->dx_ccs", "icd10cm->icd9cm->dx_ccs"],
)

In [36]:
# Three routes into dx_flat_ccs: the two direct maps, and icd10cm chained through icd9cm.
# Note that m0, m1 and m2 are reassigned here.
m0 = mimic_schemes.map[("icd10cm", "dx_flat_ccs")]
m1 = mimic_schemes.map[("icd9cm", "dx_flat_ccs")]
m2 = mimic_schemes.make_chained_map(("icd10cm", "icd9cm", "dx_flat_ccs"))

# One row per route, holding the sizes of the map's domain and range.
dx_flat_ccs_quality = pd.DataFrame(
    {
        "domain": [len(code_map.domain) for code_map in (m0, m1, m2)],
        "range": [len(code_map.range) for code_map in (m0, m1, m2)],
    },
    index=["icd10cm->dx_flat_ccs", "icd9cm->dx_flat_ccs", "icd10cm->icd9cm->dx_flat_ccs"],
)

In [37]:
# Display the domain and range sizes of the three routes into dx_ccs.
dx_ccs_quality

Unnamed: 0,domain,range
icd10cm->dx_ccs,71844,283
icd9cm->dx_ccs,15045,589
icd10cm->icd9cm->dx_ccs,70678,588


In [38]:
# Display the domain and range sizes of the three routes into dx_flat_ccs.
dx_flat_ccs_quality

Unnamed: 0,domain,range
icd10cm->dx_flat_ccs,71844,283
icd9cm->dx_flat_ccs,15045,283
icd10cm->icd9cm->dx_flat_ccs,70678,283


In [40]:
from scipy.stats import entropy


# For each histogram: fetch it with its accompanying count, divide by that count,
# and take the entropy of the result.

# MIMIC-III, target "dx_ccs".
m3_dx_hist0, m3_dx_n0 = m3_hist_stats.dx_discharge("dx_ccs")
m3_dx_p0 = m3_dx_hist0 / m3_dx_n0
m3_dx_e0 = entropy(m3_dx_p0)

# MIMIC-IV, target "dx_ccs".
m4_dx_hist0, m4_dx_n0 = m4_hist_stats.dx_discharge("dx_ccs")
m4_dx_p0 = m4_dx_hist0 / m4_dx_n0
m4_dx_e0 = entropy(m4_dx_p0)

# MIMIC-IV, target given as the ("icd9cm", "dx_ccs") route.
m4_dx_hist1, m4_dx_n1 = m4_hist_stats.dx_discharge(("icd9cm", "dx_ccs"))
m4_dx_p1 = m4_dx_hist1 / m4_dx_n1
m4_dx_e1 = entropy(m4_dx_p1)

# MIMIC-IV, target given as the ("icd10cm", "dx_ccs") route.
m4_dx_hist2, m4_dx_n2 = m4_hist_stats.dx_discharge(("icd10cm", "dx_ccs"))
m4_dx_p2 = m4_dx_hist2 / m4_dx_n2
m4_dx_e2 = entropy(m4_dx_p2)

# Entropies collected under one label per dataset/route combination.
dx_e = pd.Series(
    {
        "m3:mix->dx_ccs": m3_dx_e0,
        "m4:mix->dx_ccs": m4_dx_e0,
        "m4:mix->icd9cm->dx_ccs": m4_dx_e1,
        "m4:mix->icd10cm->dx_ccs": m4_dx_e2,
    }
)

In [39]:
# For each histogram: fetch it with its accompanying count, divide by that count,
# and take the entropy of the result.

# MIMIC-III, outcome "dx_flat_ccs_v1".
m3_o_hist0, m3_o_n0 = m3_hist_stats.outcome("dx_flat_ccs_v1")
m3_o_p0 = m3_o_hist0 / m3_o_n0
m3_o_e0 = entropy(m3_o_p0)

# MIMIC-IV, outcome "dx_flat_ccs_v1".
m4_o_hist0, m4_o_n0 = m4_hist_stats.outcome("dx_flat_ccs_v1")
m4_o_p0 = m4_o_hist0 / m4_o_n0
m4_o_e0 = entropy(m4_o_p0)

# MIMIC-IV, outcome given as the ("icd9cm", "dx_flat_ccs_v1") route.
m4_o_hist1, m4_o_n1 = m4_hist_stats.outcome(("icd9cm", "dx_flat_ccs_v1"))
m4_o_p1 = m4_o_hist1 / m4_o_n1
m4_o_e1 = entropy(m4_o_p1)

# MIMIC-IV, outcome given as the ("icd10cm", "dx_flat_ccs_v1") route.
m4_o_hist2, m4_o_n2 = m4_hist_stats.outcome(("icd10cm", "dx_flat_ccs_v1"))
m4_o_p2 = m4_o_hist2 / m4_o_n2
m4_o_e2 = entropy(m4_o_p2)

# Entropies collected under one label per dataset/route combination.
o_e = pd.Series(
    {
        "m3:mix->outcome": m3_o_e0,
        "m4:mix->outcome": m4_o_e0,
        "m4:mix->icd9cm->outcome": m4_o_e1,
        "m4:mix->icd10cm->outcome": m4_o_e2,
    }
)

In [41]:
# Display the entropy of each dx_ccs histogram, one row per dataset/route label.
dx_e

m3:mix->dx_ccs             5.161669
m4:mix->dx_ccs             5.237830
m4:mix->icd9cm->dx_ccs     5.218789
m4:mix->icd10cm->dx_ccs    2.434947
dtype: float64

In [42]:
# Display the entropy of each outcome histogram, one row per dataset/route label.
o_e

m3:mix->outcome             4.533171
m4:mix->outcome             4.663448
m4:mix->icd9cm->outcome     4.654050
m4:mix->icd10cm->outcome    4.643892
dtype: float64

In [None]:
# Conclusion:
# - M3 -> dx_ccs / dx_flat_ccs: use the direct AHRQ maps.
# - M4 -> dx_ccs: use the chained map for ICD-10 codes (ICD10CM -> ICD9CM -> dx_ccs).
# - M4 -> dx_flat_ccs: use the direct AHRQ map.

In [7]:
# Effective mapping from the mixed scheme (icd9cm, icd10cm) to dx_flat_ccs directly
# through the tables from AHRQ: (`icd9cm` -> `dx_flat_ccs`) \union (`icd10cm` -> `dx_flat_ccs`)
# Joint statistics over the two processed datasets.
m_stats0 = rx.Dataset.two_stats(m3_dataset, m4_dataset, coding_schemes_manager=mimic_schemes)
# p-tests for the "dx_flat_ccs_v1" outcome; the second argument ("admission") is
# presumably the level at which the tests are run -- TODO confirm against ehrax.
p_tests0 = m_stats0.target_p_tests.outcome("dx_flat_ccs_v1", "admission")
# Summary of the p-tests (displayed in a later cell next to p_summary1).
p_summary0 = m_stats0.summerise_p_tests(p_tests0)

In [8]:
# Effective mapping from the mixed scheme (icd9cm, icd10cm) to dx_flat_ccs through
# (`icd9cm` -> `dx_flat_ccs`) \union (`icd10cm` -> `icd9cm` -> `dx_flat_ccs`)
# NOTE(review): this two_stats call takes exactly the same arguments as the one that
# builds m_stats0 in the previous cell; the result could presumably be reused if
# two_stats is deterministic -- confirm before changing.
m_stats1 = rx.Dataset.two_stats(m3_dataset, m4_dataset, coding_schemes_manager=mimic_schemes)
# Same p-tests as for p_tests0, but with the outcome given as the ("icd9cm", "dx_flat_ccs_v1") route.
p_tests1 = m_stats1.target_p_tests.outcome(("icd9cm", "dx_flat_ccs_v1"), "admission")
p_summary1 = m_stats1.summerise_p_tests(p_tests1)

In [9]:
import pandas as pd


# Put the two p-test summaries side by side: p_summary0 first, then p_summary1.
pd.concat((p_summary0, p_summary1), axis="columns")

Unnamed: 0,0,1
total,241,241
divergent,168,172
convergent,39,35
skip_test,34,34
