
# MIMIC-IV — First ABG & VBG per Episode (BigQuery, admission-centric)

This notebook builds an **admission-level** table (one line per `hadm_id`) and attaches the **chronologically first** ABG and VBG pairs (pH & PaCO₂ from the *same draw*) across ED/Hospital/ICU.  
Key features:

- Cohort defined by **hypercapnic respiratory failure ICD** codes (ED + Hospital), aggregated at **hadm_id**.
- **ED→Hospital reconciliation**: the ED stay that led to the admission is attached.
- **LAB (HOSP)** blood gases paired by **`specimen_id`**; arterial/venous from the **Specimen/Sample Type** row.
- **POC (ICU)** blood gases paired by **timestamp**; site inferred from label.
- **Unit normalization**: PaCO₂ in **mmHg** (kPa→mmHg via 7.50062).
- Guards prevent pH/PaCO₂ swaps: pH ∈ [6.3, 7.8], PaCO₂ ∈ [5, 200] mmHg.
- Final output includes legacy-compatible columns (`lab_*`, `poc_*`, `poc_vbg_*`).

> **Prereqs**: You must have Google Cloud auth set up (`gcloud auth application-default login`) and BigQuery access to PhysioNet.


In [1]:

import os
import pandas as pd
from google.cloud import bigquery

# ---- Environment (override via your .env or shell) ----
WORK_PROJECT = os.getenv("WORK_PROJECT")  # <-- REQUIRED (your billing project, NOT physionet-data)
PHYS = os.getenv("BQ_PHYSIONET_PROJECT", "physionet-data")
HOSP = os.getenv("BQ_DATASET_HOSP", "mimiciv_3_1_hosp")
ICU  = os.getenv("BQ_DATASET_ICU",  "mimiciv_3_1_icu")
ED   = os.getenv("BQ_DATASET_ED",   "mimiciv_ed")  # adjust to mimiciv_3_1_ed if needed

assert WORK_PROJECT, "WORK_PROJECT environment variable must be set to your billing project (e.g., 'my-gcp-project')."

print("Project:", WORK_PROJECT)
print("Datasets -> PHYS:", PHYS, "| HOSP:", HOSP, "| ICU:", ICU, "| ED:", ED)

client = bigquery.Client(project=WORK_PROJECT)

def run_sql_bq(sql: str, params: dict | None = None) -> pd.DataFrame:
    """Run a parameterized BigQuery SQL query and return a pandas DataFrame."""
    job_config = bigquery.QueryJobConfig()
    if params:
        bq_params = []
        for k, v in params.items():
            if isinstance(v, (list, tuple)):
                # infer type from first non-null
                non_null = next((x for x in v if x is not None), None)
                if isinstance(non_null, int):
                    bq_params.append(bigquery.ArrayQueryParameter(k, "INT64", v))
                elif isinstance(non_null, float):
                    bq_params.append(bigquery.ArrayQueryParameter(k, "FLOAT64", v))
                else:
                    bq_params.append(bigquery.ArrayQueryParameter(k, "STRING", v))
            else:
                if isinstance(v, int):
                    bq_params.append(bigquery.ScalarQueryParameter(k, "INT64", v))
                elif isinstance(v, float):
                    bq_params.append(bigquery.ScalarQueryParameter(k, "FLOAT64", v))
                else:
                    bq_params.append(bigquery.ScalarQueryParameter(k, "STRING", v))
        job_config.query_parameters = bq_params

    job = client.query(sql, job_config=job_config)
    try:
        return job.result().to_dataframe(create_bqstorage_client=True)
    except TypeError:
        # environments without the BQ Storage extra
        return job.result().to_dataframe()

def require_hadm(df: pd.DataFrame, name="table"):
    if "hadm_id" in df.columns:
        return df
    if df.index.name == "hadm_id":
        return df.reset_index()
    raise KeyError(f"{name} has no 'hadm_id'. Ensure you're operating on the admission-level table.")


Project: mimic-hypercapnia
Datasets -> PHYS: physionet-data | HOSP: mimiciv_3_1_hosp | ICU: mimiciv_3_1_icu | ED: mimiciv_ed


In [2]:

# Smoke test (do not reuse 'df' to avoid clobbering the master table)
smoke = client.query("SELECT 1 AS ok").result().to_dataframe()
smoke


E0000 00:00:1760371533.672742 13170318 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Unnamed: 0,ok
0,1


In [3]:

ICD10_CODES = ['J9602','J9612','J9622','J9692','E662']
ICD9_CODES  = ['27803']

cohort_sql = f"""
WITH target_codes AS (
  SELECT 'J9602' AS code, 10 AS ver UNION ALL
  SELECT 'J9612', 10 UNION ALL
  SELECT 'J9622', 10 UNION ALL
  SELECT 'J9692', 10 UNION ALL
  SELECT 'E662',  10 UNION ALL
  SELECT '27803', 9
),

-- Hospital diagnoses → flags per admission
hosp_dx AS (
  SELECT d.subject_id, d.hadm_id,
         UPPER(REPLACE(d.icd_code, '.', '')) AS code_norm, d.icd_version
  FROM `{PHYS}.{HOSP}.diagnoses_icd` d
  JOIN target_codes t
    ON t.ver = d.icd_version AND t.code = UPPER(REPLACE(d.icd_code, '.', ''))
  WHERE d.hadm_id IS NOT NULL
),
hosp_flags AS (
  SELECT
    subject_id, hadm_id,
    MAX(IF(icd_version=10 AND code_norm='J9602',1,0)) AS ICD10_J9602,
    MAX(IF(icd_version=10 AND code_norm='J9612',1,0)) AS ICD10_J9612,
    MAX(IF(icd_version=10 AND code_norm='J9622',1,0)) AS ICD10_J9622,
    MAX(IF(icd_version=10 AND code_norm='J9692',1,0)) AS ICD10_J9692,
    MAX(IF(icd_version=10 AND code_norm='E662', 1,0)) AS ICD10_E662,
    MAX(IF(icd_version=9  AND code_norm='27803',1,0)) AS ICD9_27803
  FROM hosp_dx GROUP BY subject_id, hadm_id
),

-- Earliest ED stay per admission (even if ED had no hypercapnia code)
ed_link AS (
  SELECT
    subject_id,
    hadm_id,
    (ARRAY_AGG(STRUCT(stay_id, intime) ORDER BY intime LIMIT 1))[OFFSET(0)].stay_id AS stay_id
  FROM `{PHYS}.{ED}.edstays`
  WHERE hadm_id IS NOT NULL
  GROUP BY subject_id, hadm_id
),

-- ED diagnoses restricted to target codes → flags per admission
ed_dx AS (
  SELECT s.subject_id, s.hadm_id, s.stay_id,
         UPPER(REPLACE(d.icd_code, '.', '')) AS code_norm, d.icd_version
  FROM `{PHYS}.{ED}.diagnosis` d
  JOIN `{PHYS}.{ED}.edstays` s
    ON s.subject_id = d.subject_id AND s.stay_id = d.stay_id
  JOIN target_codes t
    ON t.ver = d.icd_version AND t.code = UPPER(REPLACE(d.icd_code, '.', ''))
  WHERE s.hadm_id IS NOT NULL
),
ed_flags_by_hadm AS (
  SELECT
    subject_id, hadm_id,
    MAX(IF(icd_version=10 AND code_norm='J9602',1,0)) AS ICD10_J9602,
    MAX(IF(icd_version=10 AND code_norm='J9612',1,0)) AS ICD10_J9612,
    MAX(IF(icd_version=10 AND code_norm='J9622',1,0)) AS ICD10_J9622,
    MAX(IF(icd_version=10 AND code_norm='J9692',1,0)) AS ICD10_J9692,
    MAX(IF(icd_version=10 AND code_norm='E662', 1,0)) AS ICD10_E662,
    MAX(IF(icd_version=9  AND code_norm='27803',1,0)) AS ICD9_27803
  FROM ed_dx
  GROUP BY subject_id, hadm_id
),

-- Combine: attach ED stay if it exists, OR flags across ED + hospital
combined AS (
  SELECT
    COALESCE(h.subject_id, l.subject_id) AS subject_id,
    l.stay_id,
    COALESCE(h.hadm_id, l.hadm_id) AS hadm_id,
    GREATEST(IFNULL(h.ICD10_J9602,0), IFNULL(e.ICD10_J9602,0)) AS ICD10_J9602,
    GREATEST(IFNULL(h.ICD10_J9612,0), IFNULL(e.ICD10_J9612,0)) AS ICD10_J9612,
    GREATEST(IFNULL(h.ICD10_J9622,0), IFNULL(e.ICD10_J9622,0)) AS ICD10_J9622,
    GREATEST(IFNULL(h.ICD10_J9692,0), IFNULL(e.ICD10_J9692,0)) AS ICD10_J9692,
    GREATEST(IFNULL(h.ICD10_E662 ,0), IFNULL(e.ICD10_E662 ,0)) AS ICD10_E662,
    GREATEST(IFNULL(h.ICD9_27803,0), IFNULL(e.ICD9_27803,0)) AS ICD9_27803
  FROM hosp_flags h
  FULL OUTER JOIN ed_link l
    ON l.hadm_id = h.hadm_id
  LEFT JOIN ed_flags_by_hadm e
    ON e.hadm_id = COALESCE(h.hadm_id, l.hadm_id)
)

SELECT
  subject_id, stay_id, hadm_id,
  ICD10_J9602, ICD10_J9612, ICD10_J9622, ICD10_J9692, ICD10_E662, ICD9_27803,
  IF((ICD10_J9602+ICD10_J9612+ICD10_J9622+ICD10_J9692+ICD10_E662+ICD9_27803) > 0, 1, 0) AS any_hypercap_icd
FROM combined
WHERE (ICD10_J9602+ICD10_J9612+ICD10_J9622+ICD10_J9692+ICD10_E662+ICD9_27803) > 0
"""

cohort = run_sql_bq(cohort_sql)
cohort.head()


E0000 00:00:1760371535.690697 13170318 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Unnamed: 0,subject_id,stay_id,hadm_id,ICD10_J9602,ICD10_J9612,ICD10_J9622,ICD10_J9692,ICD10_E662,ICD9_27803,any_hypercap_icd
0,15874847,,22638629,0,0,1,0,0,0,1
1,18544264,,22660840,0,0,1,0,0,0,1
2,19650945,32448154.0,28820543,0,0,0,0,0,1,1
3,12458464,,25160678,1,0,0,0,0,0,1
4,12720595,,22136377,0,0,1,0,0,0,1


In [4]:
# Guardrails
flags = ["ICD10_J9602","ICD10_J9612","ICD10_J9622","ICD10_J9692","ICD10_E662","ICD9_27803"]
bad = cohort[(cohort["any_hypercap_icd"] == 1) & (cohort[flags].sum(axis=1) == 0)]
assert bad.empty, f"Inconsistent flags in cohort: {len(bad)} rows have any=1 but all component flags=0"

assert cohort["hadm_id"].isna().sum() == 0, "Cohort produced rows without hadm_id"
assert cohort["hadm_id"].nunique() == len(cohort), "Cohort has duplicate hadm_id rows"

hadm_list = sorted(cohort["hadm_id"].dropna().unique().tolist())

# Admission-level base table
episodes = cohort.copy()
episodes = episodes.sort_values(["hadm_id"]).drop_duplicates(["hadm_id"], keep="first").reset_index(drop=True)
episodes.shape


(4237, 10)

In [5]:
params = {"hadms": hadm_list}

bg_pairs_sql = f"""
WITH hadms AS (SELECT hadm_id FROM UNNEST(@hadms) AS hadm_id),

/* ---------------- LAB (HOSP) ---------------- */
hosp_cand AS (
  SELECT
    le.subject_id, le.hadm_id, le.charttime, le.specimen_id,
    CAST(le.valuenum AS FLOAT64) AS val,
    LOWER(COALESCE(le.valueuom,'')) AS uom,
    LOWER(di.label) AS lbl,
    LOWER(COALESCE(di.fluid,'')) AS fl
  FROM `{PHYS}.{HOSP}.labevents`  le
  JOIN `{PHYS}.{HOSP}.d_labitems` di ON di.itemid = le.itemid
  JOIN hadms h ON h.hadm_id = le.hadm_id
  WHERE le.valuenum IS NOT NULL
    AND (LOWER(COALESCE(di.category,'')) LIKE '%blood gas%'
         OR LOWER(di.label) LIKE '%pco2%'
         OR REGEXP_CONTAINS(LOWER(di.label), r'\\bph\\b'))
    AND (
         REGEXP_CONTAINS(LOWER(di.label), r'\\bph\\b')
      OR REGEXP_CONTAINS(LOWER(di.label), r'\\bpa?\\s*co(?:2|₂)\\b')
      OR LOWER(di.label) LIKE '%pco2%' OR LOWER(di.label) LIKE '%paco2%'
        )
    AND NOT REGEXP_CONTAINS(LOWER(di.label), r'(t\\s*co2|tco2|total\\s*co2|content|bicar|hco3|et\\s*co2|end[- ]?tidal)')
),
hosp_spec AS (
  SELECT le.specimen_id, LOWER(COALESCE(le.value,'')) AS spec_val
  FROM `{PHYS}.{HOSP}.labevents` le
  JOIN `{PHYS}.{HOSP}.d_labitems` di ON di.itemid = le.itemid
  WHERE le.specimen_id IS NOT NULL
    AND REGEXP_CONTAINS(LOWER(di.label), r'(specimen|sample)')
),
hosp_class AS (
  SELECT
    c.hadm_id, c.charttime, c.specimen_id, c.val, c.uom, c.lbl, c.fl,
    CASE
      WHEN REGEXP_CONTAINS(c.lbl, r'\\b(?:blood\\s*)?ph\\b') THEN 'ph'
      WHEN (c.lbl LIKE '%pco2%' OR REGEXP_CONTAINS(c.lbl, r'\\bpa?\\s*co(?:2|₂)\\b')) THEN 'pco2'
      ELSE NULL
    END AS analyte,
    CASE
      WHEN REGEXP_CONTAINS(s.spec_val, r'arter') OR REGEXP_CONTAINS(s.spec_val, r'\\bart\\b') THEN 'arterial'
      WHEN REGEXP_CONTAINS(s.spec_val, r'ven|mixed|central') THEN 'venous'
      WHEN c.fl LIKE '%arterial%' OR REGEXP_CONTAINS(c.lbl, r'\\b(abg|art|arterial|a[- ]?line)\\b') THEN 'arterial'
      WHEN c.fl LIKE '%ven%'      OR REGEXP_CONTAINS(c.lbl, r'\\b(vbg|ven|venous|mixed|central)\\b') THEN 'venous'
      ELSE NULL
    END AS site
  FROM hosp_cand c
  LEFT JOIN hosp_spec s USING (specimen_id)
),
hosp_pairs AS (
  SELECT
    hadm_id, specimen_id,
    MIN(charttime) AS sample_time,
    MAX(IF(analyte='ph',   val, NULL)) AS ph,
    MAX(IF(analyte='pco2', val, NULL)) AS pco2_raw,
    (ARRAY_AGG(IF(analyte='pco2', uom, NULL) IGNORE NULLS LIMIT 1))[OFFSET(0)] AS pco2_uom,
    (ARRAY_AGG(IF(analyte='ph',   uom, NULL) IGNORE NULLS LIMIT 1))[OFFSET(0)] AS ph_uom,
    (ARRAY_AGG(site IGNORE NULLS LIMIT 1))[OFFSET(0)] AS site
  FROM hosp_class
  GROUP BY hadm_id, specimen_id
  HAVING (ph IS NOT NULL OR pco2_raw IS NOT NULL) AND site IN ('arterial','venous')
),
hosp_pairs_std AS (
  SELECT
    hadm_id, specimen_id, sample_time, site,
    ph, ph_uom,
    CASE WHEN pco2_uom = 'kpa' THEN pco2_raw * 7.50062 ELSE pco2_raw END AS pco2_mmHg,
    COALESCE(NULLIF(pco2_uom,''),'mmhg') AS pco2_uom_norm
  FROM hosp_pairs
  WHERE (ph IS NULL OR (ph BETWEEN 6.3 AND 7.8))
    AND (pco2_raw IS NULL OR (CASE WHEN pco2_uom='kpa' THEN pco2_raw*7.50062 ELSE pco2_raw END) BETWEEN 5 AND 200)
),
lab_abg AS (
  SELECT hadm_id,
         ph            AS lab_abg_ph,
         ph_uom        AS lab_abg_ph_uom,
         pco2_mmHg     AS lab_abg_paco2,
         pco2_uom_norm AS lab_abg_paco2_uom,
         sample_time   AS lab_abg_time
  FROM (
     SELECT *, ROW_NUMBER() OVER (PARTITION BY hadm_id ORDER BY sample_time) rn
     FROM hosp_pairs_std WHERE site='arterial'
  ) WHERE rn=1
),
lab_vbg AS (
  SELECT hadm_id,
         ph            AS lab_vbg_ph,
         ph_uom        AS lab_vbg_ph_uom,
         pco2_mmHg     AS lab_vbg_paco2,
         pco2_uom_norm AS lab_vbg_paco2_uom,
         sample_time   AS lab_vbg_time
  FROM (
     SELECT *, ROW_NUMBER() OVER (PARTITION BY hadm_id ORDER BY sample_time) rn
     FROM hosp_pairs_std WHERE site='venous'
  ) WHERE rn=1
),

/* ---------------- POC (ICU) ---------------- */
icu_cand AS (
  SELECT
    ie.hadm_id, ce.stay_id, ce.charttime,
    CAST(ce.valuenum AS FLOAT64) AS val,
    LOWER(COALESCE(ce.valueuom,'')) AS uom,
    LOWER(di.label) AS lbl
  FROM `{PHYS}.{ICU}.chartevents` ce
  JOIN `{PHYS}.{ICU}.d_items`  di ON di.itemid = ce.itemid
  JOIN `{PHYS}.{ICU}.icustays` ie ON ie.stay_id = ce.stay_id
  JOIN hadms h ON h.hadm_id = ie.hadm_id
  WHERE ce.valuenum IS NOT NULL
    AND (
          REGEXP_CONTAINS(LOWER(di.label), r'\\bph\\b')
       OR REGEXP_CONTAINS(LOWER(di.label), r'\\bpa?\\s*co(?:2|₂)\\b')
       OR LOWER(di.label) LIKE '%pco2%' OR LOWER(di.label) LIKE '%paco2%'
        )
    AND NOT REGEXP_CONTAINS(LOWER(di.label), r'(t\\s*co2|tco2|total\\s*co2|content|bicar|hco3|et\\s*co2|end[- ]?tidal)')
),
icu_class AS (
  SELECT
    hadm_id, stay_id, charttime, val, uom, lbl,
    CASE
      WHEN REGEXP_CONTAINS(lbl, r'\\b(?:blood\\s*)?ph\\b') THEN 'ph'
      WHEN (lbl LIKE '%pco2%' OR REGEXP_CONTAINS(lbl, r'\\bpa?\\s*co(?:2|₂)\\b')) THEN 'pco2'
      ELSE NULL
    END AS analyte,
    CASE
      WHEN REGEXP_CONTAINS(lbl, r'\\b(abg|art|arterial|a[- ]?line)\\b') THEN 'arterial'
      WHEN REGEXP_CONTAINS(lbl, r'\\b(vbg|ven|venous|mixed|central)\\b') THEN 'venous'
      ELSE NULL
    END AS site
  FROM icu_cand
),
icu_pairs AS (
  SELECT
    hadm_id, stay_id, charttime AS sample_time,
    MAX(IF(analyte='ph',   val, NULL)) AS ph,
    MAX(IF(analyte='pco2', val, NULL)) AS pco2_raw,
    (ARRAY_AGG(IF(analyte='pco2', uom, NULL) IGNORE NULLS LIMIT 1))[OFFSET(0)] AS pco2_uom,
    (ARRAY_AGG(IF(analyte='ph',   uom, NULL) IGNORE NULLS LIMIT 1))[OFFSET(0)] AS ph_uom,
    (ARRAY_AGG(site IGNORE NULLS LIMIT 1))[OFFSET(0)] AS site
  FROM icu_class
  GROUP BY hadm_id, stay_id, sample_time
  HAVING (ph IS NOT NULL OR pco2_raw IS NOT NULL) AND site IN ('arterial','venous')
),
icu_pairs_std AS (
  SELECT
    hadm_id, stay_id, sample_time, site,
    ph, ph_uom,
    CASE WHEN pco2_uom='kpa' THEN pco2_raw*7.50062 ELSE pco2_raw END AS pco2_mmHg,
    COALESCE(NULLIF(pco2_uom,''),'mmhg') AS pco2_uom_norm
  FROM icu_pairs
  WHERE (ph IS NULL OR (ph BETWEEN 6.3 AND 7.8))
    AND (pco2_raw IS NULL OR (CASE WHEN pco2_uom='kpa' THEN pco2_raw*7.50062 ELSE pco2_raw END) BETWEEN 5 AND 200)
),
poc_abg AS (
  SELECT hadm_id,
         ph            AS poc_abg_ph,
         ph_uom        AS poc_abg_ph_uom,
         pco2_mmHg     AS poc_abg_paco2,
         pco2_uom_norm AS poc_abg_paco2_uom,
         sample_time   AS poc_abg_time
  FROM (
     SELECT *, ROW_NUMBER() OVER (PARTITION BY hadm_id ORDER BY sample_time) rn
     FROM icu_pairs_std WHERE site='arterial'
  ) WHERE rn=1
),
poc_vbg AS (
  SELECT hadm_id,
         ph            AS poc_vbg_ph,
         ph_uom        AS poc_vbg_ph_uom,
         pco2_mmHg     AS poc_vbg_paco2,
         pco2_uom_norm AS poc_vbg_paco2_uom,
         sample_time   AS poc_vbg_time
  FROM (
     SELECT *, ROW_NUMBER() OVER (PARTITION BY hadm_id ORDER BY sample_time) rn
     FROM icu_pairs_std WHERE site='venous'
  ) WHERE rn=1
),

/* -------- First ABG/VBG across LAB + POC (earliest per admission) -------- */
abg_union AS (
  SELECT hadm_id, 'LAB' AS src, lab_abg_time AS t, lab_abg_ph AS ph, lab_abg_paco2 AS pco2
  FROM lab_abg WHERE lab_abg_time IS NOT NULL
  UNION ALL
  SELECT hadm_id, 'POC' AS src, poc_abg_time, poc_abg_ph, poc_abg_paco2
  FROM poc_abg WHERE poc_abg_time IS NOT NULL
),
first_abg AS (
  SELECT
    hadm_id,
    (ARRAY_AGG(STRUCT(src, t, ph, pco2) ORDER BY t LIMIT 1))[OFFSET(0)] AS pick
  FROM abg_union
  GROUP BY hadm_id
),
vbg_union AS (
  SELECT hadm_id, 'LAB' AS src, lab_vbg_time AS t, lab_vbg_ph AS ph, lab_vbg_paco2 AS pco2
  FROM lab_vbg WHERE lab_vbg_time IS NOT NULL
  UNION ALL
  SELECT hadm_id, 'POC' AS src, poc_vbg_time, poc_vbg_ph, poc_vbg_paco2
  FROM poc_vbg WHERE poc_vbg_time IS NOT NULL
),
first_vbg AS (
  SELECT
    hadm_id,
    (ARRAY_AGG(STRUCT(src, t, ph, pco2) ORDER BY t LIMIT 1))[OFFSET(0)] AS pick
  FROM vbg_union
  GROUP BY hadm_id
)

/* ---------------- Final one row per hadm ---------------- */
SELECT
  h.hadm_id,

  -- LAB-ABG / LAB-VBG
  la.lab_abg_ph, la.lab_abg_ph_uom, la.lab_abg_paco2, la.lab_abg_paco2_uom, la.lab_abg_time,
  lv.lab_vbg_ph, lv.lab_vbg_ph_uom, lv.lab_vbg_paco2, lv.lab_vbg_paco2_uom, lv.lab_vbg_time,

  -- POC-ABG / POC-VBG
  pa.poc_abg_ph, pa.poc_abg_ph_uom, pa.poc_abg_paco2, pa.poc_abg_paco2_uom, pa.poc_abg_time,
  pv.poc_vbg_ph, pv.poc_vbg_ph_uom, pv.poc_vbg_paco2, pv.poc_vbg_paco2_uom, pv.poc_vbg_time,

  -- First ABG across LAB+POC (may have only one analyte)
  fa.pick.src AS first_abg_src,
  fa.pick.t   AS first_abg_time,
  fa.pick.ph  AS first_abg_ph,
  fa.pick.pco2 AS first_abg_paco2,

  -- First VBG across LAB+POC (may have only one analyte)
  fv.pick.src AS first_vbg_src,
  fv.pick.t   AS first_vbg_time,
  fv.pick.ph  AS first_vbg_ph,
  fv.pick.pco2 AS first_vbg_paco2

FROM hadms h
LEFT JOIN lab_abg la USING (hadm_id)
LEFT JOIN lab_vbg lv USING (hadm_id)
LEFT JOIN poc_abg pa USING (hadm_id)
LEFT JOIN poc_vbg pv USING (hadm_id)
LEFT JOIN first_abg fa USING (hadm_id)
LEFT JOIN first_vbg fv USING (hadm_id)
"""
bg_pairs = run_sql_bq(bg_pairs_sql, params)
bg_pairs.head()


E0000 00:00:1760371542.953917 13170318 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Unnamed: 0,hadm_id,lab_abg_ph,lab_abg_ph_uom,lab_abg_paco2,lab_abg_paco2_uom,lab_abg_time,lab_vbg_ph,lab_vbg_ph_uom,lab_vbg_paco2,lab_vbg_paco2_uom,...,poc_vbg_paco2_uom,poc_vbg_time,first_abg_src,first_abg_time,first_abg_ph,first_abg_paco2,first_vbg_src,first_vbg_time,first_vbg_ph,first_vbg_paco2
0,20001305,,,,,NaT,7.29,units,77.0,mm hg,...,mmhg,2178-03-25 04:28:00,,NaT,,,POC,2178-03-25 04:28:00,7.29,
1,20002497,7.49,units,44.0,mm hg,2121-08-23 17:53:00,7.42,units,58.0,mm hg,...,,NaT,LAB,2121-08-23 17:53:00,7.49,44.0,LAB,2121-08-19 20:20:00,7.42,58.0
2,20005666,,,,,NaT,,,,,...,,NaT,,NaT,,,,NaT,,
3,20006409,7.4,units,36.0,mm hg,2126-10-28 05:45:00,7.4,units,38.0,mm hg,...,mmhg,2126-10-28 12:39:00,POC,2126-10-28 05:45:00,7.4,,POC,2126-10-28 12:39:00,7.4,
4,20010211,,,,,NaT,,,,,...,,NaT,,NaT,,,,NaT,,


In [6]:
# Ensure pairing produced sensible ranges
def in_range(s, lo, hi):
    s = pd.to_numeric(s, errors="coerce")
    return s.notna() & (s >= lo) & (s <= hi)

for c in ["lab_abg_ph","lab_vbg_ph","poc_abg_ph","poc_vbg_ph"]:
    if c in bg_pairs.columns:
        print(c, (~in_range(bg_pairs[c], 6.3, 7.8)).sum(), "out of range")

for c in ["lab_abg_paco2","lab_vbg_paco2","poc_abg_paco2","poc_vbg_paco2"]:
    if c in bg_pairs.columns:
        print(c, (~in_range(bg_pairs[c], 5, 200)).sum(), "out of range")

lab_abg_ph 1884 out of range
lab_vbg_ph 1063 out of range
poc_abg_ph 2223 out of range
poc_vbg_ph 1716 out of range
lab_abg_paco2 1910 out of range
lab_vbg_paco2 1205 out of range
poc_abg_paco2 4237 out of range
poc_vbg_paco2 4237 out of range


In [7]:

# Rename to legacy-compatible column names
rename_map = {
    # LAB ABG -> legacy "lab_*"
    "lab_abg_paco2":      "lab_paco2",
    "lab_abg_paco2_uom":  "lab_paco2_uom",
    "lab_abg_time":       "abg_time",
    "lab_abg_ph":         "lab_ph",
    "lab_abg_ph_uom":     "lab_ph_uom",

    # POC ABG -> legacy "poc_*"
    "poc_abg_paco2":      "poc_paco2",
    "poc_abg_paco2_uom":  "poc_paco2_uom",
    "poc_abg_time":       "poc_paco2_time",
    "poc_abg_ph":         "poc_ph",
    "poc_abg_ph_uom":     "poc_ph_uom",
}
bg_pairs_legacy = bg_pairs.rename(columns=rename_map).copy()

# Convenience: POC ABG analytes share the same time
if "poc_paco2_time" in bg_pairs_legacy.columns:
    bg_pairs_legacy["poc_ph_time"] = bg_pairs_legacy["poc_paco2_time"]

# Provide POC VBG times with the same timestamp field
if "poc_vbg_time" in bg_pairs.columns:
    bg_pairs_legacy["poc_vbg_paco2_time"] = bg_pairs["poc_vbg_time"]
    bg_pairs_legacy["poc_vbg_ph_time"]    = bg_pairs["poc_vbg_time"]

# Ensure hadm_id is present for merge
episodes = require_hadm(episodes, "episodes")
bg_pairs_legacy = require_hadm(bg_pairs_legacy, "bg_pairs_legacy")

# Merge
episodes = episodes.merge(bg_pairs_legacy, on="hadm_id", how="left")

# Back-compatibility (if downstream cells expect 'df')
df = episodes.copy()
df.head(10)


Unnamed: 0,subject_id,stay_id,hadm_id,ICD10_J9602,ICD10_J9612,ICD10_J9622,ICD10_J9692,ICD10_E662,ICD9_27803,any_hypercap_icd,...,first_abg_time,first_abg_ph,first_abg_paco2,first_vbg_src,first_vbg_time,first_vbg_ph,first_vbg_paco2,poc_ph_time,poc_vbg_paco2_time,poc_vbg_ph_time
0,16003661,33328878.0,20001305,1,0,0,0,0,0,1,...,NaT,,,POC,2178-03-25 04:28:00,7.29,,NaT,2178-03-25 04:28:00,2178-03-25 04:28:00
1,13390157,,20002497,0,0,0,0,1,0,1,...,2121-08-23 17:53:00,7.49,44.0,LAB,2121-08-19 20:20:00,7.42,58.0,NaT,NaT,NaT
2,17055745,34302455.0,20005666,1,0,0,0,0,0,1,...,NaT,,,,NaT,,,NaT,NaT,NaT
3,11281568,30744215.0,20006409,0,0,1,0,0,0,1,...,2126-10-28 05:45:00,7.4,,POC,2126-10-28 12:39:00,7.4,,2126-10-28 05:45:00,2126-10-28 12:39:00,2126-10-28 12:39:00
4,17598948,,20010211,0,0,0,0,0,1,1,...,NaT,,,,NaT,,,NaT,NaT,NaT
5,11613862,,20010312,0,0,0,0,1,0,1,...,NaT,,,,NaT,,,NaT,NaT,NaT
6,11068569,,20014075,0,1,0,0,1,0,1,...,2152-02-26 08:29:00,7.48,47.0,LAB,2152-01-23 10:46:00,7.37,59.0,NaT,NaT,NaT
7,17396346,37000201.0,20014583,0,0,0,0,0,1,1,...,NaT,,,,NaT,,,NaT,NaT,NaT
8,16186978,,20015802,0,0,1,0,0,0,1,...,2150-01-25 05:44:00,7.35,,POC,2150-01-24 20:47:00,7.27,,2150-01-25 05:44:00,2150-01-24 20:47:00,2150-01-24 20:47:00
9,17199808,,20016167,1,0,0,0,0,0,1,...,2183-06-22 09:38:00,7.37,,,NaT,,,2183-06-22 09:38:00,NaT,NaT


In [8]:
# First ABG across LAB+POC
for tcol in ["abg_time", "poc_paco2_time"]:
    if tcol in df.columns:
        df[tcol] = pd.to_datetime(df[tcol], errors="coerce")

df["first_abg_time"] = df[["abg_time","poc_paco2_time"]].min(axis=1)

pick_poc_for_abg = df["first_abg_time"].eq(df["poc_paco2_time"])
if {"lab_paco2","poc_paco2"}.issubset(df.columns):
    df["first_abg_paco2"] = df["lab_paco2"].where(~pick_poc_for_abg, df["poc_paco2"])
if {"lab_ph","poc_ph"}.issubset(df.columns):
    df["first_abg_ph"]    = df["lab_ph"].where(~pick_poc_for_abg, df["poc_ph"])

# First VBG across LAB+POC
for tcol in ["lab_vbg_time", "poc_vbg_paco2_time"]:
    if tcol in df.columns:
        df[tcol] = pd.to_datetime(df[tcol], errors="coerce")
df["first_vbg_time"] = df[["lab_vbg_time","poc_vbg_paco2_time"]].min(axis=1)

pick_poc_for_vbg = df["first_vbg_time"].eq(df["poc_vbg_paco2_time"])
if {"lab_vbg_paco2","poc_vbg_paco2"}.issubset(df.columns):
    df["first_vbg_paco2"] = df["lab_vbg_paco2"].where(~pick_poc_for_vbg, df["poc_vbg_paco2"])
if {"lab_vbg_ph","poc_vbg_ph"}.issubset(df.columns):
    df["first_vbg_ph"]    = df["lab_vbg_ph"].where(~pick_poc_for_vbg, df["poc_vbg_ph"])

df.head(10)


Unnamed: 0,subject_id,stay_id,hadm_id,ICD10_J9602,ICD10_J9612,ICD10_J9622,ICD10_J9692,ICD10_E662,ICD9_27803,any_hypercap_icd,...,first_abg_time,first_abg_ph,first_abg_paco2,first_vbg_src,first_vbg_time,first_vbg_ph,first_vbg_paco2,poc_ph_time,poc_vbg_paco2_time,poc_vbg_ph_time
0,16003661,33328878.0,20001305,1,0,0,0,0,0,1,...,NaT,,,POC,2178-03-25 04:28:00,7.29,,NaT,2178-03-25 04:28:00,2178-03-25 04:28:00
1,13390157,,20002497,0,0,0,0,1,0,1,...,2121-08-23 17:53:00,7.49,44.0,LAB,2121-08-19 20:20:00,7.42,58.0,NaT,NaT,NaT
2,17055745,34302455.0,20005666,1,0,0,0,0,0,1,...,NaT,,,,NaT,,,NaT,NaT,NaT
3,11281568,30744215.0,20006409,0,0,1,0,0,0,1,...,2126-10-28 05:45:00,7.4,,POC,2126-10-28 12:39:00,7.4,,2126-10-28 05:45:00,2126-10-28 12:39:00,2126-10-28 12:39:00
4,17598948,,20010211,0,0,0,0,0,1,1,...,NaT,,,,NaT,,,NaT,NaT,NaT
5,11613862,,20010312,0,0,0,0,1,0,1,...,NaT,,,,NaT,,,NaT,NaT,NaT
6,11068569,,20014075,0,1,0,0,1,0,1,...,2152-02-26 08:29:00,7.48,47.0,LAB,2152-01-23 10:46:00,7.37,59.0,NaT,NaT,NaT
7,17396346,37000201.0,20014583,0,0,0,0,0,1,1,...,NaT,,,,NaT,,,NaT,NaT,NaT
8,16186978,,20015802,0,0,1,0,0,0,1,...,2150-01-25 05:44:00,7.35,,POC,2150-01-24 20:47:00,7.27,,2150-01-25 05:44:00,2150-01-24 20:47:00,2150-01-24 20:47:00
9,17199808,,20016167,1,0,0,0,0,0,1,...,2183-06-22 09:38:00,7.37,,,NaT,,,2183-06-22 09:38:00,NaT,NaT


In [10]:
import numpy as np
import pandas as pd

def _col_or_nan(df: pd.DataFrame, name: str) -> pd.Series:
    if name in df.columns:
        return pd.to_numeric(df[name], errors="coerce")
    # column absent → Series of NaN matching df length
    return pd.Series(np.nan, index=df.index, dtype="float64")

def qc_pair_any(df: pd.DataFrame, ph_col: str, co2_col: str, label: str,
                ph_lo=6.3, ph_hi=7.8, co2_lo=5, co2_hi=200):
    ph  = _col_or_nan(df, ph_col)
    co2 = _col_or_nan(df, co2_col)

    # present_any = at least one analyte exists; present_both = both exist
    present_any  = (ph.notna() | co2.notna())
    present_both = (ph.notna() & co2.notna())
    missing_both = int((~present_any).sum())

    # IMPORTANT: sum first, then cast to int
    ph_oob  = int((((ph < ph_lo) | (ph > ph_hi)) & ph.notna()).sum())
    co2_oob = int((((co2 < co2_lo) | (co2 > co2_hi)) & co2.notna()).sum())

    return {
        "pair": label,
        "n_rows": len(df),
        "present_any":  int(present_any.sum()),
        "present_both": int(present_both.sum()),
        "missing_both": missing_both,
        "ph_oob_among_present":  ph_oob,
        "pco2_oob_among_present": co2_oob,
    }

rows = []
rows.append(qc_pair_any(df, "lab_abg_ph", "lab_paco2",     "LAB ABG"))
rows.append(qc_pair_any(df, "lab_vbg_ph", "lab_vbg_paco2", "LAB VBG"))
rows.append(qc_pair_any(df, "poc_ph",     "poc_paco2",     "POC ABG"))
rows.append(qc_pair_any(df, "poc_vbg_ph", "poc_vbg_paco2", "POC VBG"))

pd.DataFrame(rows)

Unnamed: 0,pair,n_rows,present_any,present_both,missing_both,ph_oob_among_present,pco2_oob_among_present
0,LAB ABG,4237,2327,0,1910,0,0
1,LAB VBG,4237,3174,3032,1063,0,0
2,POC ABG,4237,2014,0,2223,0,0
3,POC VBG,4237,2521,0,1716,0,0


In [None]:

ts = pd.Timestamp.utcnow().strftime("%Y%m%d_%H%M%S")
out_path = f"/mnt/data/mimic_hypercap_bq_abg_vbg_{ts}.xlsx"
with pd.ExcelWriter(out_path) as xw:
    df.to_excel(xw, sheet_name="episodes", index=False)

print("Wrote:", out_path)
out_path
