# 01 - Extract Current vs Legacy vs Permissive Envelope

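Debug-only notebook. Builds an admission-level (`hadm_id`) table comparing the current, legacy, and permissive-envelope extractions, plus long-format audit tables for the permissive envelope.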

In [1]:
import os
import json
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from google.cloud import bigquery

# Pull settings from a local .env file (if any) before reading the environment.
load_dotenv()

# Project root; falls back to the current working directory when WORK_DIR is unset.
WORK_DIR = Path(os.getenv('WORK_DIR', Path.cwd())).expanduser().resolve()
DATA_DIR = WORK_DIR / 'MIMIC tabular data'
# Every artifact of this debug notebook is written below this directory.
ART_DIR = WORK_DIR / 'debug' / 'abg_vbg_capture' / 'artifacts'
ART_DIR.mkdir(parents=True, exist_ok=True)

# BigQuery coordinates: the billing/work project plus the source project and datasets.
WORK_PROJECT = os.getenv('WORK_PROJECT', '').strip()
PHYS = os.getenv('BQ_PHYSIONET_PROJECT', 'physionet-data').strip()
HOSP = os.getenv('BQ_DATASET_HOSP', 'mimiciv_3_1_hosp').strip()
ICU = os.getenv('BQ_DATASET_ICU', 'mimiciv_3_1_icu').strip()

# An unset WORK_PROJECT is read as ''. Passing '' would pin the client to an
# empty project ID; passing None lets the client fall back to the default
# project inferred from the environment.
client = bigquery.Client(project=WORK_PROJECT or None)

print('WORK_DIR:', WORK_DIR)
print('DATA_DIR:', DATA_DIR)
print('ART_DIR:', ART_DIR)
print('WORK_PROJECT:', WORK_PROJECT)
print('PHYS/HOSP/ICU:', PHYS, HOSP, ICU)


WORK_DIR: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP
DATA_DIR: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/MIMIC tabular data
ART_DIR: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts
WORK_PROJECT: mimic-hypercapnia
PHYS/HOSP/ICU: physionet-data mimiciv_3_1_hosp mimiciv_3_1_icu


In [2]:
def run_sql_bq(sql: str, params: dict | None = None) -> pd.DataFrame:
    params = params or {}
    bq_params = []
    for key, val in params.items():
        if isinstance(val, list):
            bq_params.append(bigquery.ArrayQueryParameter(key, 'INT64', [int(x) for x in val]))
        else:
            bq_params.append(bigquery.ScalarQueryParameter(key, 'STRING', str(val)))
    cfg = bigquery.QueryJobConfig(query_parameters=bq_params)
    job = client.query(sql, job_config=cfg)
    return job.result().to_dataframe(create_bqstorage_client=True)


def timed_query(name: str, sql: str, params: dict | None = None):
    t0 = time.time()
    out = run_sql_bq(sql, params)
    dt = time.time() - t0
    print(f"{name}: {len(out):,} rows in {dt:,.1f}s")
    return out, dt


In [3]:
# Comparator workbooks: the dated export and the current export.
old_path = DATA_DIR / '2025-10-14 MIMICIV all with CC.xlsx'
current_path = DATA_DIR / 'MIMICIV all with CC.xlsx'

# Check both files up front so a missing workbook fails before any Excel parsing.
if not old_path.exists():
    raise FileNotFoundError(f'Missing old comparator: {old_path}')
if not current_path.exists():
    raise FileNotFoundError(f'Missing current comparator: {current_path}')


def _read_unique_hadm_ids(xlsx_path):
    """Return the distinct, non-null hadm_id values of a workbook as an integer Series."""
    hadm_column = pd.read_excel(xlsx_path, usecols=['hadm_id'])['hadm_id']
    return hadm_column.dropna().astype(int).drop_duplicates()


old_h = _read_unique_hadm_ids(old_path)
cur_h = _read_unique_hadm_ids(current_path)

# Set views for membership tests; the union is sorted so it has a stable order.
H_old = set(old_h.tolist())
H_current = set(cur_h.tolist())
H_union = sorted(H_old.union(H_current))

for caption, hadm_ids in (('H_old:', H_old), ('H_current:', H_current), ('H_union:', H_union)):
    print(caption, len(hadm_ids))


H_old: 27459
H_current: 17424
H_union: 27459


In [4]:
import sys

# The registry module is imported from the project tree, so the project root
# has to be on sys.path before the `debug` package can be resolved.
work_dir_entry = str(WORK_DIR)
if work_dir_entry not in sys.path:
    sys.path.insert(0, work_dir_entry)

from debug.abg_vbg_capture.sql_registry import build_sql_registry

# Build the registry against the configured source project and datasets.
SQL = build_sql_registry(PHYS=PHYS, HOSP=HOSP, ICU=ICU)
print('SQL keys:', sorted(SQL.keys()))


SQL keys: ['current_strict', 'legacy_replay', 'permissive_envelope_hadm', 'permissive_envelope_long']


In [5]:
# The union cohort is passed to every registry query as the `hadms` parameter.
params = {'hadms': H_union}
query_times = {}

# (registry key, timing key) pairs, executed in this order.
query_plan = [
    ('current_strict', 'current_strict_s'),
    ('legacy_replay', 'legacy_replay_s'),
    ('permissive_envelope_hadm', 'permissive_hadm_s'),
    ('permissive_envelope_long', 'permissive_long_s'),
]

query_frames = {}
for sql_key, timing_key in query_plan:
    query_frames[sql_key], query_times[timing_key] = timed_query(sql_key, SQL[sql_key], params)

# Expose each result under the name the later cells use.
current_df = query_frames['current_strict']
legacy_df = query_frames['legacy_replay']
env_hadm_df = query_frames['permissive_envelope_hadm']
env_long_df = query_frames['permissive_envelope_long']


current_strict: 22,644 rows in 7.0s


legacy_replay: 16,552 rows in 6.8s


permissive_envelope_hadm: 25,527 rows in 5.9s


permissive_envelope_long: 333,278 rows in 7.5s


In [6]:
# One row per admission in the union cohort, flagged by comparator membership.
base = pd.DataFrame({'hadm_id': H_union})
base['in_old'] = base['hadm_id'].isin(H_old).astype(int)
base['in_current'] = base['hadm_id'].isin(H_current).astype(int)
base['dropped_old_only'] = ((base['in_old'] == 1) & (base['in_current'] == 0)).astype(int)

# Left-join each hadm-level extract onto the cohort. `validate` makes the merge
# raise a MergeError if an extract carries more than one row per hadm_id;
# without it, duplicates would silently fan out rows and inflate every count
# later summed from `comparison`.
comparison = (
    base
    .merge(current_df, on='hadm_id', how='left', validate='one_to_one')
    .merge(legacy_df, on='hadm_id', how='left', validate='one_to_one')
    .merge(env_hadm_df, on='hadm_id', how='left', validate='one_to_one')
)

# Flag/count columns: admissions absent from an extract come out of the left
# join as NaN and are treated as 0. A column missing altogether is created as 0.
fill_zero_cols = [
    'current_abg_hypercap', 'current_vbg_hypercap', 'current_candidate_n',
    'legacy_abg_hypercap', 'legacy_vbg_hypercap', 'legacy_candidate_n',
    'env_candidate_n', 'env_in_range_n', 'env_known_site_n', 'env_unknown_site_n',
    'env_abg_ge45_n', 'env_vbg_ge50_n', 'env_any_ge45_n', 'env_kpa_n',
]
for col in fill_zero_cols:
    if col not in comparison.columns:
        comparison[col] = 0
    comparison[col] = pd.to_numeric(comparison[col], errors='coerce').fillna(0).astype(int)

# Per-method "any hypercapnia" indicators (0/1).
comparison['current_any_hypercap'] = ((comparison['current_abg_hypercap'] == 1) | (comparison['current_vbg_hypercap'] == 1)).astype(int)
comparison['legacy_any_hypercap'] = ((comparison['legacy_abg_hypercap'] == 1) | (comparison['legacy_vbg_hypercap'] == 1)).astype(int)
comparison['envelope_any_hypercap'] = (comparison['env_any_ge45_n'] > 0).astype(int)

comparison[['hadm_id','in_old','in_current','dropped_old_only','current_any_hypercap','legacy_any_hypercap','envelope_any_hypercap']].head()


Unnamed: 0,hadm_id,in_old,in_current,dropped_old_only,current_any_hypercap,legacy_any_hypercap,envelope_any_hypercap
0,20000694,1,1,0,0,0,0
1,20000808,1,0,1,0,0,0
2,20001305,1,1,0,1,1,1
3,20001395,1,1,0,0,0,0
4,20001770,1,1,0,0,0,0


In [7]:
# A single timestamp tags every artifact written by this run.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')

comparison_path_parquet = ART_DIR / f'capture_comparison_{ts}.parquet'
comparison_path_csv = ART_DIR / f'capture_comparison_{ts}.csv'
env_long_path = ART_DIR / f'permissive_long_{ts}.parquet'
label_audit_path = ART_DIR / f'permissive_label_audit_{ts}.csv'
unit_audit_path = ART_DIR / f'permissive_unit_audit_{ts}.csv'
meta_path = ART_DIR / f'run_metadata_{ts}.json'

# Persist the comparison table (both formats) and the long envelope table.
comparison.to_parquet(comparison_path_parquet, index=False)
comparison.to_csv(comparison_path_csv, index=False)
env_long_df.to_parquet(env_long_path, index=False)


def _row_counts(frame, keys):
    """Count rows per key combination (missing keys kept), largest group first."""
    counts = frame.groupby(keys, dropna=False).size().reset_index(name='n')
    return counts.sort_values('n', ascending=False)


# Frequency audits of the long envelope table: by item label and by unit.
label_audit = _row_counts(env_long_df, ['source_system', 'itemid', 'label'])
unit_audit = _row_counts(env_long_df, ['source_system', 'unit_norm'])

label_audit.to_csv(label_audit_path, index=False)
unit_audit.to_csv(unit_audit_path, index=False)

# Run metadata: cohort sizes, query timings, and where each artifact was written.
output_files = {
    'comparison_parquet': str(comparison_path_parquet),
    'comparison_csv': str(comparison_path_csv),
    'permissive_long_parquet': str(env_long_path),
    'label_audit_csv': str(label_audit_path),
    'unit_audit_csv': str(unit_audit_path),
}
meta = {
    'timestamp': ts,
    'h_old_n': len(H_old),
    'h_current_n': len(H_current),
    'h_union_n': len(H_union),
    'query_times_seconds': query_times,
    'output_files': output_files,
}
meta_path.write_text(json.dumps(meta, indent=2))

written_paths = (
    comparison_path_parquet,
    comparison_path_csv,
    env_long_path,
    label_audit_path,
    unit_audit_path,
    meta_path,
)
for written_path in written_paths:
    print('Wrote:', written_path)


Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/capture_comparison_20260205_161205.parquet
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/capture_comparison_20260205_161205.csv
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/permissive_long_20260205_161205.parquet
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/permissive_label_audit_20260205_161205.csv
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/permissive_unit_audit_20260205_161205.csv
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/run_metadata_20260205_161205.json

In [8]:
# Admissions the current strict extraction does not flag but the permissive envelope does.
strict_missed_envelope_hit = (
    (comparison['current_any_hypercap'] == 0) & (comparison['envelope_any_hypercap'] == 1)
)

# Headline counts as (metric, value) pairs, in display order.
summary_rows = [
    ('union_hadm_n', len(H_union)),
    ('dropped_old_only_n', int(comparison['dropped_old_only'].sum())),
    ('current_any_hypercap_n', int(comparison['current_any_hypercap'].sum())),
    ('legacy_any_hypercap_n', int(comparison['legacy_any_hypercap'].sum())),
    ('envelope_any_hypercap_n', int(comparison['envelope_any_hypercap'].sum())),
    ('strict_missed_envelope_hit_n', int(strict_missed_envelope_hit.sum())),
]
summary = pd.DataFrame(summary_rows, columns=['metric', 'value'])
summary


Unnamed: 0,metric,value
0,union_hadm_n,27459
1,dropped_old_only_n,10035
2,current_any_hypercap_n,10201
3,legacy_any_hypercap_n,6244
4,envelope_any_hypercap_n,14476
5,strict_missed_envelope_hit_n,4275
