# 01 · Data Intake & Cleaning

Ingest clinical datasets, sanitize identifiers, and export a canonical Parquet file with chronological metadata.

> **TODO:** replace synthetic data generation with credentialed MIMIC loaders once paths are configured.

In [None]:
import os
import sys
from pathlib import Path

# Treat the parent of the current working directory as the project root.
PROJECT_ROOT = Path.cwd().resolve().parent

# Expose the project root on the import path, adding it only when it is
# not already listed so re-running this cell does not create duplicates.
_root_str = str(PROJECT_ROOT)
if _root_str not in sys.path:
    sys.path.append(_root_str)

print(f"Project root: {PROJECT_ROOT}")


In [None]:
from pathlib import Path
import pandas as pd

from src.data.loaders import LoaderConfig, load_notes, estimate_token_counts, tag_split, export_canonical

# Destination of the canonical export, inside the project's artifacts folder.
CANONICAL_PATH = PROJECT_ROOT.joinpath('artifacts', 'canonical_demo.parquet')

# Loader settings. The root below is a placeholder name, not a real data
# location; it has to be swapped for the actual secured path before use.
CONFIG = LoaderConfig(
    root=PROJECT_ROOT.joinpath('TODO_replace_with_secure_path'),
    table='notes',
    limit=2000,
)
print(CONFIG)

In [None]:
# Load the raw notes using the settings held in CONFIG.
# NOTE(review): load_notes lives in src.data.loaders; presumably it returns a
# pandas DataFrame (the later cells call DataFrame methods on it) — confirm.
df_raw = load_notes(CONFIG)
# Trailing expression: the notebook renders this preview as the cell output.
df_raw.head()

In [None]:
# Two-stage enrichment: token estimates first, then split tagging on top.
# Both helpers come from src.data.loaders; each intermediate frame is kept
# under its own name so it can be inspected separately.
df_tokens = estimate_token_counts(df_raw)
df_split = tag_split(df_tokens)

# Quick sanity preview: leading rows, restricted to the key columns.
print(
    df_split.head()[
        ['subject_id', 'discharge_time', 'tokens_estimate', 'split_tag']
    ]
)

In [None]:
# Columns of the canonical export, in output order. The note body is exposed
# as 'text', so this list must use the post-rename name: selecting
# 'note_text' after the rename below always raised KeyError, because that
# column no longer exists once it has been renamed.
required_columns = ['subject_id', 'discharge_time', 'text', 'tokens_estimate', 'split_tag']

# Rename first, then project onto the canonical schema. A missing column
# still fails loudly here (KeyError) rather than producing a partial export.
df_canonical = df_split.rename(columns={'note_text': 'text'})[required_columns]

In [None]:
# Hand the canonical frame to the project exporter, targeting CANONICAL_PATH.
# NOTE(review): export_canonical is defined in src.data.loaders; presumably it
# writes a Parquet file at the given path — confirm format and overwrite rules.
# NOTE(review): CANONICAL_PATH sits under PROJECT_ROOT/artifacts; verify that
# this directory is excluded from version control before exporting real data.
export_canonical(df_canonical, CANONICAL_PATH)
print(f'Canonical parquet written to {CANONICAL_PATH}')

> **Reminder:** Store PHI only on mounted drives with restricted permissions. The exported Parquet file should reside on encrypted storage, not in Git.