# 01 · Data Intake & Cleaning

Ingest clinical datasets, sanitize patient identifiers, and export a canonical Parquet file stored on Drive.

In [None]:
# Persistent project setup on Drive
import os
import sys
from pathlib import Path

# Location where Google Drive appears once it has been mounted in Colab.
DRIVE_ROOT = Path('/content/drive')

# Best-effort mount: any failure (including the import failing outside Colab)
# is reported and execution continues.
try:
    from google.colab import drive
    if not DRIVE_ROOT.exists():
        drive.mount('/content/drive')
except Exception as exc:
    print(f'Colab drive mount skipped: {exc}')

# Use the Drive folder when the mount point is present, otherwise fall back to
# the user's home directory.
BASE_ROOT = (DRIVE_ROOT / 'MyDrive' if DRIVE_ROOT.exists() else Path.home()).resolve()

# The repository checkout must already exist; this cell never creates it.
PROJECT_ROOT = BASE_ROOT.joinpath('secure-llm-mia')
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Clone the repo via 00_colab_setup.ipynb before running this notebook.')

# Make the project importable and publish its location through the environment.
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.append(_project_root_str)
os.environ['SECURE_LLM_MIA_ROOT'] = _project_root_str

# Working directories used by the rest of the notebook; created when missing.
DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT = (
    PROJECT_ROOT / name for name in ('data', 'artifacts', 'checkpoints')
)
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

# Run from the project root and echo it for the notebook log.
os.chdir(PROJECT_ROOT)
print('PROJECT_ROOT:', PROJECT_ROOT)


In [None]:
from pathlib import Path
import pandas as pd

from src.data.loaders import LoaderConfig, load_notes, estimate_token_counts, tag_split, export_canonical

# Input directory for the raw notes; created up front so it always exists.
RAW_NOTES_DIR = DATA_ROOT.joinpath('mimic_iv_notes')
RAW_NOTES_DIR.mkdir(exist_ok=True, parents=True)

# Destination of the canonical Parquet export.
CANONICAL_PATH = ARTIFACTS_DIR.joinpath('canonical_demo.parquet')

# Loader settings: read the 'notes' table from RAW_NOTES_DIR with limit=2000.
CONFIG = LoaderConfig(
    root=RAW_NOTES_DIR,
    table='notes',
    limit=2000,
)
print(CONFIG)


In [None]:
# Load the raw notes using the loader settings in CONFIG.
df_raw = load_notes(CONFIG)
# Bare final expression: the notebook renders the preview as the cell output.
df_raw.head()

In [None]:
# Pass the raw notes through token estimation, then through split tagging.
df_tokens = estimate_token_counts(df_raw)
df_split = tag_split(df_tokens)

# Print a short preview limited to the identifying and derived columns.
_preview_columns = ['subject_id', 'discharge_time', 'tokens_estimate', 'split_tag']
print(df_split[_preview_columns].head())

In [None]:
# Columns taken from df_split for the canonical export, named as they appear
# in df_split (before renaming).
required_columns = ['subject_id', 'discharge_time', 'note_text', 'tokens_estimate', 'split_tag']

# Select first, then rename. Renaming 'note_text' to 'text' before selecting
# would leave no 'note_text' column to select, and the lookup would raise
# KeyError. The exported frame carries the note body under 'text'.
df_canonical = df_split[required_columns].rename(columns={'note_text': 'text'})

# Hand the canonical frame to the project exporter and report the destination.
export_canonical(df_canonical, CANONICAL_PATH)
print(f'Canonical parquet stored at {CANONICAL_PATH}')

Update `LoaderConfig` with secure PhysioNet paths before processing PHI.