# 01 · Data Intake & Cleaning

Load the MIMIC-IV-Ext-BHC dataset from Drive, normalize it into the canonical schema, and persist to Parquet.

In [1]:
# Persistent Drive + run mode setup
#
# What this cell does, in order:
#   1. Best-effort mount of Google Drive when running inside Colab.
#   2. Picks DRIVE_ROOT: the mounted MyDrive folder if present, otherwise the
#      user's home directory (local / non-Colab runs).
#   3. Locates the project checkout under DRIVE_ROOT, puts it on sys.path,
#      exports SECURE_LLM_MIA_ROOT and makes it the working directory.
#   4. Resolves the active run mode and creates the data/artifact/checkpoint
#      directories plus the folder expected to hold the BHC CSV.
#
# Raises FileNotFoundError when the project checkout is missing.
import os
import sys
from pathlib import Path

# Defined outside the try-block so both names are bound even when the Colab
# import fails (e.g. when running locally).
DRIVE_MOUNT = Path('/content/drive')
DRIVE_HOME = DRIVE_MOUNT / 'MyDrive'

try:
    from google.colab import drive  # type: ignore
    # Probe MyDrive rather than the bare mount point: an existing but unmounted
    # /content/drive directory would otherwise skip the mount and leave
    # DRIVE_ROOT pointing at a folder that does not exist.
    if not DRIVE_HOME.exists():
        drive.mount(str(DRIVE_MOUNT))
except Exception as exc:  # pragma: no cover
    # Deliberate best-effort: outside Colab the import fails and we fall back
    # to the home directory below.
    print(f'Colab drive mount skipped: {exc}')

if DRIVE_HOME.exists():
    DRIVE_ROOT = DRIVE_HOME.resolve()
else:
    DRIVE_ROOT = Path.home().resolve()

PROJECT_ROOT = DRIVE_ROOT / 'secure-llm-mia'
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Run 00_colab_setup.ipynb first to clone the repo on Drive.')

# Make the `src` package importable from the project checkout.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

os.environ['SECURE_LLM_MIA_ROOT'] = str(PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

# Must come after the sys.path update above, hence not at the top of the cell.
from src.utils.runtime import current_run_mode

RUN_MODE = current_run_mode()
print('PROJECT_ROOT:', PROJECT_ROOT)
print('Active run mode:', RUN_MODE.name, '-', RUN_MODE.description)

DATA_ROOT = PROJECT_ROOT / 'data'
ARTIFACTS_DIR = PROJECT_ROOT / 'artifacts'
CHECKPOINT_ROOT = PROJECT_ROOT / 'checkpoints'
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

# The BHC CSV lives next to (not inside) the project checkout on Drive.
BHC_DATA_DIR = DRIVE_ROOT / 'mimic-iv-bhc'
BHC_DATA_DIR.mkdir(parents=True, exist_ok=True)
BHC_CSV_PATH = BHC_DATA_DIR / 'mimic-iv-bhc.csv'
print('BHC CSV path:', BHC_CSV_PATH)


Mounted at /content/drive
PROJECT_ROOT: /content/drive/MyDrive/secure-llm-mia
Active run mode: subset - Quick debugging subset (<=2k rows) for lightweight Colab smoke tests.
BHC CSV path: /content/drive/MyDrive/mimic-iv-bhc/mimic-iv-bhc.csv


In [2]:
# Intake pipeline: raw BHC CSV -> canonical frame -> split-tagged frame -> Parquet.
from src.data.bhc import BHCDataConfig, bhc_to_canonical, load_bhc_dataframe
from src.data.loaders import export_canonical, tag_split

# Output file name embeds the run mode name, so each mode writes its own Parquet file.
CANONICAL_PATH = ARTIFACTS_DIR / f'canonical_bhc_{RUN_MODE.name}.parquet'

# --- Load -------------------------------------------------------------------
# The active run mode is passed to the loader via the config object.
bhc_config = BHCDataConfig(
    csv_path=BHC_CSV_PATH,
    run_mode=RUN_MODE,
)
df_raw = load_bhc_dataframe(bhc_config)
print('Loaded rows:', len(df_raw))

# --- Transform --------------------------------------------------------------
# Each stage keeps its own name so intermediate frames stay inspectable.
df_canonical = bhc_to_canonical(df_raw)
df_split = tag_split(df_canonical)

# --- Persist ----------------------------------------------------------------
export_canonical(df_split, CANONICAL_PATH)
print('Canonical dataset saved to', CANONICAL_PATH)


Loaded rows: 2000
Canonical dataset saved to /content/drive/MyDrive/secure-llm-mia/artifacts/canonical_bhc_subset.parquet


Update `BHC_CSV_PATH` if you store the CSV elsewhere on Drive. Replace synthetic timestamps with true discharge times when available.