### 3. Demographics / Admissions / ICU flag / LOS

In [None]:
# Demographics: patients LEFT JOINed to admissions, so there is one row per
# admission plus one row (with NULL admission fields) per patient who has none.
dem_sql = f"""
SELECT p.subject_id,
       CAST(p.anchor_age AS INT64) AS age,
       p.gender AS sex,
       a.hadm_id, a.admittime, a.dischtime,
       CAST(a.hospital_expire_flag AS INT64) AS died,
       a.race
FROM `{PHYS}.{HOSP}.patients` p
LEFT JOIN `{PHYS}.{HOSP}.admissions` a USING (subject_id)
"""
dem = run_sql_bq(dem_sql)

# ICU flag: one row per admission that has at least one ICU stay.
# NULL hadm_id rows are excluded in SQL, so they cannot pair with missing keys in the merge below.
icu_sql = f"""
SELECT DISTINCT hadm_id, 1 AS icu_admit
FROM `{PHYS}.{ICU}.icustays`
WHERE hadm_id IS NOT NULL
"""
icu = run_sql_bq(icu_sql)

# Attach demographics first, then the ICU flag; both are left joins so every cohort row is kept.
# NOTE(review): pandas matches null join keys to each other, and `dem` can contain
# NULL hadm_id rows (patients without admissions) - confirm that is intended.
base = cohort.merge(dem, on=["subject_id","hadm_id"], how="left")
base = base.merge(icu, on="hadm_id", how="left")

# Admissions absent from `icu` have no ICU stay -> flag 0.
base["icu_admit"] = base["icu_admit"].fillna(0).astype("int64")

# Hospital length of stay in hours (discharge minus admission).
discharge_ts = pd.to_datetime(base["dischtime"])
admit_ts = pd.to_datetime(base["admittime"])
base["los_hrs"] = (discharge_ts - admit_ts).dt.total_seconds()/3600
base.head()


### 4. ED triage vitals and chief complaint

In [None]:
# ED triage: chief complaint, acuity and the triage vital signs, keyed by (subject_id, stay_id).
triage_sql = f"""
SELECT subject_id, stay_id,
       chiefcomplaint AS chief_complaint,
       acuity AS triage_acuity,
       heartrate AS hr, sbp, dbp, resprate AS rr, o2sat AS spo2, temp
FROM `{PHYS}.{ED}.triage`
""".replace("o2sat AS spo2, temp\n", "o2sat AS spo2, temperature AS temp\n")
triage = run_sql_bq(triage_sql)

# Force the numeric fields to numbers; anything unparseable becomes NaN.
numeric_triage_cols = ["triage_acuity","hr","sbp","dbp","rr","spo2","temp"]
triage = triage.assign(
    **{name: pd.to_numeric(triage[name], errors="coerce") for name in numeric_triage_cols}
)

# Left join keeps every cohort row even when no triage record exists.
base = base.merge(triage, on=["subject_id","stay_id"], how="left")
base.head()


### 5. First ABG & VBG PaCO₂ and pH per admission (inline join, no itemid lists)

Notes:
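- Selects `labevents` rows by matching the lower-cased `d_labitems.label` against pH / pCO2 patterns (there is no filter on `d_labitems.category`); ICU `chartevents` rows matched the same way against `d_items.label` are added as a second source.
- Separates arterial vs venous using the specimen-type text recorded under the same `specimen_id` first; if that text has no arterial/venous marker, it falls back to `d_labitems.fluid` and to label keywords such as 'art' or 'ven'.
- Filters explicitly for **pCO2** (and pH) and excludes labels mentioning TCO2, total/content CO2, bicarbonate, or end-tidal CO2.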

In [None]:
# Named query parameters: `hadms` holds hadm_list (presumably the cohort's admission IDs),
# which the blood-gas SQL reads through UNNEST(@hadms).
params = dict(hadms=hadm_list)

def first_bg_union_specimen_sql(site: str, analyte: str, value_alias: str, time_alias: str) -> str:
    """Build the SQL that returns the earliest blood-gas value per admission.

    Candidate rows come from hospital ``labevents`` (site taken from the
    specimen-type text for the same ``specimen_id``, else from fluid/label
    heuristics) unioned with ICU ``chartevents`` (site from the label only).
    The query expects a ``@hadms`` array parameter listing the admissions to
    search, and interpolates the module-level ``PHYS``, ``HOSP`` and ``ICU``
    names into the table references.

    Args:
        site: ``'arterial'`` or ``'venous'`` (case and surrounding whitespace
            are ignored).
        analyte: ``'pco2'`` or ``'ph'`` (case and surrounding whitespace are
            ignored).
        value_alias: Output column name for the value; the unit column is
            named ``<value_alias>_uom``.
        time_alias: Output column name for the chart time of that value.

    Returns:
        A SQL string selecting ``subject_id, hadm_id, <time_alias>,
        <value_alias>, <value_alias>_uom``.

    Raises:
        ValueError: If ``site`` or ``analyte`` is not one of the supported
            values. Previously a typo silently produced a query that matched
            no rows.
    """
    site = site.lower().strip()
    analyte = analyte.lower().strip()
    if site not in ("arterial", "venous"):
        raise ValueError(f"site must be 'arterial' or 'venous', got {site!r}")
    if analyte not in ("pco2", "ph"):
        raise ValueError(f"analyte must be 'pco2' or 'ph', got {analyte!r}")
    return f"""
    WITH hadms AS (
      SELECT hadm_id FROM UNNEST(@hadms) AS hadm_id
    ),

    -- -----------------------------
    -- HOSPITAL LABEVENTS CANDIDATES
    -- -----------------------------
    hosp_items AS (
      SELECT
        di.itemid,
        LOWER(di.label) AS lbl,
        LOWER(COALESCE(di.fluid,'')) AS fl
      FROM `{PHYS}.{HOSP}.d_labitems` di
    ),
    hosp_vals AS (
      SELECT
        le.subject_id, le.hadm_id, le.charttime, le.itemid, le.specimen_id,
        CAST(le.valuenum AS FLOAT64) AS val, le.valueuom,
        hi.lbl, hi.fl
      FROM `{PHYS}.{HOSP}.labevents` le
      JOIN hosp_items hi ON hi.itemid = le.itemid
      JOIN hadms h       ON h.hadm_id = le.hadm_id
      WHERE le.valuenum IS NOT NULL
        -- analyte candidate: pH or Pa/pCO2 (ASCII or Unicode subscript), exclude TCO2/bicarb/total and EtCO2
        AND (
              REGEXP_CONTAINS(hi.lbl, r'\\bph\\b')
           OR REGEXP_CONTAINS(hi.lbl, r'\\bpa?\\s*co(?:2|₂)\\b')  -- matches pco2, paco2, p co2
           OR hi.lbl LIKE '%pco2%' OR hi.lbl LIKE '%paco2%'
        )
        AND NOT REGEXP_CONTAINS(hi.lbl, r'(tco2|total|content|bicar|etco2|end[- ]?tidal)')
    ),

    -- Specimen rows that carry site info for the same specimen_id
    hosp_spec AS (
      SELECT
        le.specimen_id,
        LOWER(COALESCE(le.value,'')) AS spec_val
      FROM `{PHYS}.{HOSP}.labevents` le
      JOIN `{PHYS}.{HOSP}.d_labitems` di
        ON di.itemid = le.itemid
      WHERE le.specimen_id IS NOT NULL
        AND (
              REGEXP_CONTAINS(LOWER(di.label), r'(specimen|sample)')
              OR di.itemid IN (52033)  -- 'Specimen Type' appears in demo d_labitems
        )
    ),

    hosp_classed AS (
      SELECT
        v.subject_id, v.hadm_id, v.charttime, v.itemid, v.specimen_id, v.val, v.valueuom, v.lbl, v.fl,
        CASE
          WHEN LOWER(v.lbl) LIKE '%pco2%' OR REGEXP_CONTAINS(v.lbl, r'\\bpa?\\s*co(?:2|₂)\\b') THEN 'pco2'
          WHEN REGEXP_CONTAINS(v.lbl, r'\\bph\\b') THEN 'ph'
          ELSE NULL
        END AS analyte,
        CASE
          -- First preference: specimen text
          WHEN REGEXP_CONTAINS(s.spec_val, r'arter') OR REGEXP_CONTAINS(s.spec_val, r'\\bart\\b') THEN 'arterial'
          WHEN REGEXP_CONTAINS(s.spec_val, r'ven|central|mixed') THEN 'venous'
          -- Fallbacks: label/fluid heuristics if specimen text missing
          WHEN v.fl LIKE '%arterial%' OR REGEXP_CONTAINS(v.lbl, r'\\b(abg|art|arterial|a[- ]?line)\\b') THEN 'arterial'
          WHEN v.fl LIKE '%ven%'      OR REGEXP_CONTAINS(v.lbl, r'\\b(vbg|ven|venous|mixed|central)\\b') THEN 'venous'
          ELSE NULL
        END AS site
      FROM hosp_vals v
      LEFT JOIN hosp_spec s
        ON s.specimen_id = v.specimen_id
    ),

    -- -----------------------------
    -- ICU CHARTEVENTS (fallback only)
    -- -----------------------------
    icu_items AS (
      SELECT di.itemid, LOWER(di.label) AS lbl
      FROM `{PHYS}.{ICU}.d_items` di
    ),
    icu_vals AS (
      SELECT
        ie.subject_id, ie.hadm_id, ce.charttime, ce.itemid,
        CAST(ce.valuenum AS FLOAT64) AS val, ce.valueuom,
        ii.lbl
      FROM `{PHYS}.{ICU}.chartevents` ce
      JOIN icu_items ii         ON ii.itemid = ce.itemid
      JOIN `{PHYS}.{ICU}.icustays` ie ON ie.stay_id = ce.stay_id
      JOIN hadms h              ON h.hadm_id = ie.hadm_id
      WHERE ce.valuenum IS NOT NULL
        AND (
              REGEXP_CONTAINS(ii.lbl, r'\\bph\\b')
           OR REGEXP_CONTAINS(ii.lbl, r'\\bpa?\\s*co(?:2|₂)\\b') OR ii.lbl LIKE '%pco2%' OR ii.lbl LIKE '%paco2%'
        )
        AND NOT REGEXP_CONTAINS(ii.lbl, r'(tco2|total|content|bicar|etco2|end[- ]?tidal)')
    ),
    icu_classed AS (
      SELECT
        subject_id, hadm_id, charttime, itemid, val, valueuom, lbl,
        CASE
          WHEN REGEXP_CONTAINS(lbl, r'\\b(abg|art|arterial|a[- ]?line)\\b') THEN 'arterial'
          WHEN REGEXP_CONTAINS(lbl, r'\\b(vbg|ven|venous|mixed|central)\\b') THEN 'venous'
          ELSE NULL
        END AS site,
        CASE
          WHEN lbl LIKE '%pco2%' OR REGEXP_CONTAINS(lbl, r'\\bpa?\\s*co(?:2|₂)\\b') THEN 'pco2'
          WHEN REGEXP_CONTAINS(lbl, r'\\bph\\b') THEN 'ph'
          ELSE NULL
        END AS analyte
      FROM icu_vals
    ),

    -- -----------------------------
    -- UNION & FIRST VALUE PER HADM
    -- -----------------------------
    all_vals AS (
      SELECT subject_id, hadm_id, charttime, itemid, val, valueuom, analyte, site
      FROM hosp_classed
      UNION ALL
      SELECT subject_id, hadm_id, charttime, itemid, val, valueuom, analyte, site
      FROM icu_classed
    ),
    firsts AS (
      SELECT subject_id, hadm_id, MIN(charttime) AS first_time
      FROM all_vals
      WHERE site = '{site}' AND analyte = '{analyte}'
      GROUP BY subject_id, hadm_id
    )
    -- DISTINCT collapses identical rows fanned out by the specimen join.
    SELECT DISTINCT
      f.subject_id,
      f.hadm_id,
      f.first_time AS {time_alias},
      v.val        AS {value_alias},
      v.valueuom   AS {value_alias}_uom
    FROM firsts f
    JOIN all_vals v
      ON v.subject_id=f.subject_id AND v.hadm_id=f.hadm_id AND v.charttime=f.first_time
      -- Re-apply the site/analyte filter: all_vals also holds the other analyte and
      -- other sites charted at the same time, which must not leak into this column.
     AND v.site = '{site}' AND v.analyte = '{analyte}'
    """

def _fetch_first_bg(site, analyte, value_alias, time_alias):
    """Run the first-blood-gas query for one site/analyte pair and return its result."""
    sql = first_bg_union_specimen_sql(site, analyte, value_alias, time_alias)
    return run_sql_bq(sql, {"hadms": hadm_list})


# One query per (site, analyte); the aliases become the output column names.
abg_paco2 = _fetch_first_bg("arterial", "pco2", "lab_paco2", "abg_time")
abg_ph = _fetch_first_bg("arterial", "ph", "lab_ph", "lab_ph_time")
vbg_paco2 = _fetch_first_bg("venous", "pco2", "poc_vbg_paco2", "poc_vbg_paco2_time")
vbg_ph = _fetch_first_bg("venous", "ph", "poc_vbg_ph", "poc_vbg_ph_time")

# Quick sanity check: how many rows each extract returned.
bg_labels = ("ABG PaCO2", "ABG pH", "VBG PaCO2", "VBG pH")
bg_frames = (abg_paco2, abg_ph, vbg_paco2, vbg_ph)
for label, frame in zip(bg_labels, bg_frames):
    print(f"{label} rows: {len(frame)}")


### Merge and Standardize Units

In [None]:
# Left-join each first-blood-gas extract onto the base cohort, in a fixed order
# (ABG pCO2, ABG pH, VBG pCO2, VBG pH), keyed by patient and admission.
df = base
for bg_frame in (abg_paco2, abg_ph, vbg_paco2, vbg_ph):
    df = df.merge(bg_frame, on=["subject_id","hadm_id"], how="left")

# Conversion factor used to express kPa pressures in mmHg.
MMHG_PER_KPA = 7.50062
def _to_mmhg(val, uom):
    """Return ``(value, unit)`` with kPa values converted to mmHg.

    A missing value, or a unit that is not the string "kpa" (compared
    case-insensitively), is returned unchanged together with its unit.
    """
    unit_is_kpa = isinstance(uom, str) and uom.lower() == "kpa"
    if unit_is_kpa and not pd.isna(val):
        return float(val) * MMHG_PER_KPA, "mmHg"
    return val, uom

# Standardise pCO2 units: values reported in kPa are converted to mmHg and their
# unit relabelled; every other row keeps its value (coerced to numeric) and unit.
# Done with boolean masks rather than a row-wise apply, which returned the
# untouched input frame (and then failed on out[0]) when df had no rows.
for prefix in ["lab", "poc_vbg"]:
    vcol = f"{prefix}_paco2"
    ucol = f"{prefix}_paco2_uom"
    if vcol not in df.columns or ucol not in df.columns:
        continue
    # Only string units equal to "kpa" (any case) count; missing units never match.
    unit_is_kpa = df[ucol].map(lambda u: isinstance(u, str) and u.lower() == "kpa").astype(bool)
    # Missing values are left alone, unit included.
    needs_conversion = unit_is_kpa & df[vcol].notna()
    numeric_vals = pd.to_numeric(df[vcol], errors="coerce")
    df[vcol] = numeric_vals.where(~needs_conversion, numeric_vals * MMHG_PER_KPA)
    df[ucol] = df[ucol].where(~needs_conversion, "mmHg")

# Exact 44-column legacy schema, in output order. This single list replaces the
# earlier two-pass reorder (a 38-column pass followed by the full 44-column pass).
order = [
    "subject_id","stay_id",
    "ICD10_J9602","ICD10_J9612","ICD10_J9622","ICD10_J9692","ICD10_E662","ICD9_27803",
    "hadm_id","any_hypercap_icd",
    "age","sex","admittime","dischtime","died","race","icu_admit","los_hrs",
    "chief_complaint","triage_acuity","hr","sbp","dbp","rr","spo2","temp",
    "lab_paco2","lab_paco2_uom","abg_time",
    "lab_ph","lab_ph_uom","lab_ph_time",
    "poc_paco2","poc_paco2_uom","poc_paco2_time",
    "poc_ph","poc_ph_uom","poc_ph_time",
    # VBG columns keep their legacy names for compatibility
    "poc_vbg_paco2","poc_vbg_paco2_uom","poc_vbg_paco2_time",
    "poc_vbg_ph","poc_vbg_ph_uom","poc_vbg_ph_time"
]

# POC ABG columns are omitted by design in this focused build: they are always
# emitted as empty placeholders so the output still has the legacy shape.
for placeholder in ["poc_paco2","poc_paco2_uom","poc_paco2_time",
                    "poc_ph","poc_ph_uom","poc_ph_time"]:
    df[placeholder] = pd.NA

# Any other schema column that upstream did not produce is added as all-missing.
for col in order:
    if col not in df.columns:
        df[col] = pd.NA
df = df[order]

# Flag columns: missing or unparseable values count as 0.
int_cols = ["ICD10_J9602","ICD10_J9612","ICD10_J9622","ICD10_J9692","ICD10_E662","ICD9_27803","any_hypercap_icd","icu_admit"]
for c in int_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype("int64")

# Timestamp columns: unparseable values become NaT.
time_cols = ["admittime","dischtime","abg_time","lab_ph_time","poc_vbg_paco2_time","poc_vbg_ph_time"]
for c in time_cols:
    df[c] = pd.to_datetime(df[c], errors="coerce")

# Keep one row per admission, preferring the earliest abg_time (missing times sort last).
df = df.sort_values(["hadm_id", "abg_time"], na_position="last")
# Rows with a missing hadm_id are not duplicates of one another: pandas would
# otherwise treat every NaN key as equal and collapse them into a single row.
# NOTE(review): confirm whether rows without an admission should be kept at all.
is_repeat_admission = df.duplicated(["hadm_id"], keep="first") & df["hadm_id"].notna()
df = df[~is_repeat_admission]

df.head()
