In [1]:
# NOTE(review): both installs are unpinned. The captured output below shows pip
# resolving numpy>=2.0.2 and pandas>=2.2.3 for skimpy into the user
# site-packages, followed by an sklearn ImportError. Presumably the upgrade
# left an sklearn-dependent package incompatible — TODO confirm, then pin
# versions or install into a dedicated environment.
%pip install skimpy
%pip install ydata_profiling
from summit import SummitView
import summit
import pandas as pd
import numpy as np

Defaulting to user installation because normal site-packages is not writeable
Collecting skimpy
  Using cached skimpy-0.0.18-py3-none-any.whl.metadata (34 kB)
Collecting click>=8.1.7 (from skimpy)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting numpy>=2.0.2 (from skimpy)
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pandas-stubs>=2.2.2.240807 (from skimpy)
  Downloading pandas_stubs-2.3.2.250926-py3-none-any.whl.metadata (10 kB)
Collecting pandas>=2.2.3 (from skimpy)
  Downloading pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting polars>=1.17.1 (from skimpy)
  Downloading polars-1.34.0-py3-none-any.whl.metadata (10 kB)
Collecting rich>=13.9.4 (from skimpy)
  Downloading rich-14.1.0-py3-none-any.whl.metadata (18 kB)
Collecting typeguard>=4.4.1 (from skimpy)
  Downloading typeguard-4.4.4-py3-none-any.whl.metadata (3.3 kB)
Collecting pyarrow>=17.0.0

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


ImportError: cannot import name '_check_fit_params' from 'sklearn.utils.validation' (/home/reblocke/.local/lib/python3.10/site-packages/sklearn/utils/validation.py)

Visualization of the MIMIC dataset

In [None]:
# Connect Summit to the local MIMIC SQLite database.
# NOTE(review): presumably a SQLAlchemy-style URL, in which case "mimic.db" is
# resolved relative to the kernel's working directory — confirm.
s = SummitView("sqlite:///mimic.db")

In [None]:
# Show Summit's view of the connected database (assumed to be an interactive
# table/column browser — confirm against the summit documentation).
s.view()

## Code to pull needed data elements: 

The minimum required data elements are:

- Age, Sex/Gender, Race/Ethnicity
- Whether they were ultimately admitted, admitted to ICU, length-of-stay, died
- Triage - Chief complaint (and vital signs, acuity)
- ED and ICU diagnosis codes: E66.2, J96.02, J96.12, J96.22, J96.92
- Lab test results: first arterial blood gas (both central lab and POC), first venous blood gas (both central lab and POC)

First pass: identify all patients with ICD codes for hypercapnia, then gather needed data elements. 

[ ] TODO: will want to expand this to include all patients who underwent blood gas sampling that is suggestive of possible hypercapnia. 

**Rationale:** Extract the legacy ICD-only cohort and supporting fields.


### Patient Identification

Currently, just with ICD codes

**Rationale:** Define the cohort IDs and linkage keys.


In [None]:
# Hypercapnic respiratory failure / obesity-hypoventilation ICD-10 codes,
# stored without dots.
CODES_10 = {"E662", "J9602", "J9612", "J9622", "J9692"}


def _hypercap_diagnoses(table, visit_key):
    """Pull one diagnosis table and keep only its ICD-10 rows in CODES_10.

    Parameters
    ----------
    table : str
        Table name as used in the ``"<table> - <column>"`` materialize syntax.
    visit_key : str
        Encounter identifier column of that table ("stay_id" or "hadm_id").

    Returns
    -------
    pandas.DataFrame
        Columns subject_id (int64), ``visit_key`` (nullable Int64),
        hypercap_icd (dot-free, upper-case code) and icd_version.

    Raises
    ------
    ValueError
        If a subject_id of a matching row cannot be parsed as an integer.
    """
    raw = s.materialize(
        columns={
            "subject_id": f"{table} - subject_id",
            visit_key: f"{table} - {visit_key}",
            "icd_code": f"{table} - icd_code",
            "icd_version": f"{table} - icd_version",
        }
    )

    # Normalise before filtering so the match does not silently return nothing
    # when the backend hands back text columns, padded codes or dotted codes.
    version = pd.to_numeric(raw["icd_version"], errors="coerce")
    code = (
        raw["icd_code"]
        .astype(str)
        .str.strip()
        .str.replace(".", "", regex=False)
        .str.upper()
    )
    keep = version.eq(10) & code.isin(CODES_10)

    hits = (
        raw.loc[keep]
        .assign(icd_code=code[keep])
        .rename(columns={"icd_code": "hypercap_icd"})
    )

    # Force integer dtypes so the merge keys line up; the encounter id stays
    # nullable because the outer merge below introduces missing values anyway.
    hits["subject_id"] = pd.to_numeric(hits["subject_id"], errors="raise").astype("int64")
    hits[visit_key] = pd.to_numeric(hits[visit_key], errors="coerce").astype("Int64")
    return hits


# ED codes
diag_ed = _hypercap_diagnoses("diagnosis", "stay_id")
print(diag_ed.shape)

# inpatient codes
diag_hosp = _hypercap_diagnoses("diagnoses_icd", "hadm_id")
print(diag_hosp.shape)

# Merge and keep one column with the ICD code (the ED code wins when both exist).
# NOTE(review): joining on subject_id alone pairs every qualifying ED stay of a
# patient with every qualifying admission of that patient, whether or not the
# ED stay led to that admission. Linking stay_id -> hadm_id through the ED
# stays table would be more precise — TODO confirm the table is available.
cohort_ids = (
    diag_ed[["subject_id", "stay_id", "hypercap_icd"]]
    .merge(
        diag_hosp[["subject_id", "hadm_id", "hypercap_icd"]],
        on="subject_id",
        how="outer",
        suffixes=("_ed", "_hosp"),
    )
    .assign(
        hypercap_icd=lambda df: df["hypercap_icd_ed"].combine_first(df["hypercap_icd_hosp"])
    )
    .drop(columns=["hypercap_icd_ed", "hypercap_icd_hosp"])
    # Collapsing the two code columns leaves rows that differed only in the
    # discarded code as exact duplicates; keep one of each.
    .drop_duplicates(ignore_index=True)
)

# TODO: figure how to select into cohort based on first blood gas.

### Add needed data-elements:

**Rationale:** Attach demographic and clinical covariates for analysis.


In [None]:
# ───────────────────────────────────────────────
# 2. demographics and outcomes
# ───────────────────────────────────────────────
patients = s.materialize(
    columns={
        "subject_id": "patients - subject_id",
        "age"       : "patients - anchor_age",
        "sex"       : "patients - gender",
    },
    join_type="inner", # allegedly, join_type doesn't matter here since all 1 table.
)
patients["subject_id"] = pd.to_numeric(patients["subject_id"], errors="raise").astype("int64")

admissions = s.materialize(
    columns={
        "subject_id" : "admissions - subject_id",
        "hadm_id"    : "admissions - hadm_id",
        "admittime"  : "admissions - admittime",
        "dischtime"  : "admissions - dischtime",
        "died"       : "admissions - hospital_expire_flag",
        "race"       : "admissions - race", # includes race and ethnicity data
    },
    join_type="inner",
)
admissions["subject_id"] = pd.to_numeric(admissions["subject_id"], errors="raise").astype("int64")
admissions["hadm_id"]    = pd.to_numeric(admissions["hadm_id"],    errors="raise").astype("int64")
# Coerce to numeric before the bool cast: a text flag such as "0" is a
# non-empty string and would otherwise become True.
admissions["died"] = (
    pd.to_numeric(admissions["died"], errors="coerce").fillna(0).astype(bool)
)
# Parse timestamps explicitly; a backend that returns them as text would make
# the length-of-stay subtraction below fail. Unparseable values become NaT.
admissions["admittime"] = pd.to_datetime(admissions["admittime"], errors="coerce")
admissions["dischtime"] = pd.to_datetime(admissions["dischtime"], errors="coerce")

icu_stays = (
    s.materialize(
        columns={"hadm_id": "icustays - hadm_id"},
        join_type="inner",
    )
    # One row per admission: an admission with several ICU stays would
    # otherwise duplicate cohort rows in the merge below.
    .drop_duplicates()
    .assign(icu_admit=True)
)
icu_stays["hadm_id"] = pd.to_numeric(icu_stays["hadm_id"], errors="raise").astype("int64")


hypercap_df = (
    cohort_ids
        .merge(patients,  on="subject_id", how="left")
        .merge(admissions,on=["subject_id","hadm_id"], how="left")
        .merge(icu_stays, on="hadm_id", how="left")
)
# Rows without a matching ICU stay have a missing flag after the left merge;
# comparing against True maps them to False.
hypercap_df["icu_admit"] = hypercap_df["icu_admit"].eq(True)
# Hospital length of stay in hours (NaN when either timestamp is missing).
hypercap_df["los_hrs"]   = (hypercap_df["dischtime"] - hypercap_df["admittime"]).dt.total_seconds() / 3600



In [None]:
# ───────────────────────────────────────────────
# 3. ED triage
# ───────────────────────────────────────────────
triage = s.materialize(
    columns={
        "subject_id"     : "triage - subject_id",
        "stay_id"        : "triage - stay_id",
        "chief_complaint": "triage - chiefcomplaint",
        "triage_acuity"  : "triage - acuity",
        "hr"             : "triage - heartrate",
        "sbp"            : "triage - sbp",
        "dbp"            : "triage - dbp",
        "rr"             : "triage - resprate",
        "spo2"           : "triage - o2sat",
        "temp"           : "triage - temperature",
    },
    join_type="inner",
)
triage["subject_id"] = pd.to_numeric(triage["subject_id"], errors="raise").astype("int64")
triage["stay_id"]    = pd.to_numeric(triage["stay_id"],    errors="coerce").astype("Int64")
# pandas matches null merge keys with each other, so a triage row without a
# stay_id could attach to a hospital-only cohort row of the same patient.
triage = triage.dropna(subset=["stay_id"]).copy()

# Missing or unparseable values stay missing (pd.NA / NaN) instead of a -1
# sentinel, which would distort every summary statistic computed later.
int_vitals = ["triage_acuity","hr","sbp","dbp","rr","spo2"]
float_vitals = ["temp"]  # keep decimals: an int cast would truncate e.g. 98.6 to 98
for v in int_vitals:
    triage[v] = (
        pd.to_numeric(triage[v], errors="coerce")  # NaN for bad/missing
          .round()                                 # Int64 cast rejects fractional values
          .astype("Int64")
    )
for v in float_vitals:
    triage[v] = pd.to_numeric(triage[v], errors="coerce").astype("float64")


hypercap_df = hypercap_df.merge(triage, on=["subject_id","stay_id"], how="left")

Perhaps only 97 of the patients who ultimately had hypercapnic respiratory failure ICD codes came through the ED (~7%)? That seems low.

Reportedly - 39% of MIMIC admissions are associated with an ED stay 

### Output 

**Rationale:** Persist the legacy ICD-only cohort outputs.


In [None]:
from ydata_profiling import ProfileReport

# Only the last expression of a cell is rendered automatically, so the summary
# table is passed to display() explicitly (display is provided by the IPython
# kernel without an import).
display(hypercap_df.describe(include='all').T)
hypercap_df.info(memory_usage='deep', show_counts=True)
ProfileReport(hypercap_df, title='MIMIC hypercapnia summary', minimal=True).to_notebook_iframe()