# 02 · Make Chronological Slices

Build temporal windows and persist member / non-member panels on Drive.

In [1]:
# Setup: find the persistent project checkout (Google Drive when available,
# the home directory otherwise), make it importable, and create the working
# directories used by the rest of this notebook.
import os
import sys
from pathlib import Path

# Best-effort Drive mount. Any failure here -- including google.colab not being
# importable -- is reported and the cell carries on.
try:
    from google.colab import drive  # type: ignore
    DRIVE_MOUNT = Path('/content/drive')
    if not DRIVE_MOUNT.exists():
        drive.mount(str(DRIVE_MOUNT))
except Exception as exc:  # pragma: no cover
    print(f'Colab drive mount skipped: {exc}')

# DRIVE_MOUNT is unset when the import above fails, so the mount point is
# re-checked through a fresh Path rather than through that name.
DRIVE_ROOT = (
    Path('/content/drive/MyDrive') if Path('/content/drive').exists() else Path.home()
).resolve()

# The repository must already be present under the chosen root.
PROJECT_ROOT = DRIVE_ROOT.joinpath('secure-llm-mia')
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Run 00_colab_setup.ipynb first to clone the repo on Drive.')

# Make the `src` package importable without duplicating the sys.path entry
# when this cell is re-run.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Publish the root through the environment and work from inside the project.
os.environ['SECURE_LLM_MIA_ROOT'] = str(PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

# Project import: only resolvable once PROJECT_ROOT is on sys.path.
from src.utils.runtime import current_run_mode

RUN_MODE = current_run_mode()
print('PROJECT_ROOT:', PROJECT_ROOT)
print('Active run mode:', RUN_MODE.name, '-', RUN_MODE.description)

# Working directories inside the project; created on demand.
DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT = (
    PROJECT_ROOT / folder for folder in ('data', 'artifacts', 'checkpoints')
)
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

# Location of the BHC CSV: a sibling folder of the project under DRIVE_ROOT.
BHC_DATA_DIR = DRIVE_ROOT.joinpath('mimic-iv-bhc')
BHC_DATA_DIR.mkdir(parents=True, exist_ok=True)
BHC_CSV_PATH = BHC_DATA_DIR.joinpath('mimic-iv-bhc.csv')
print('BHC CSV path:', BHC_CSV_PATH)


PROJECT_ROOT: /content/drive/MyDrive/secure-llm-mia
Active run mode: subset - 30k-example subset powering the 4-slice, 3M-token continual fine-tuning regime.
BHC CSV path: /content/drive/MyDrive/mimic-iv-bhc/mimic-iv-bhc.csv


In [2]:
# Load the canonical dataset for the active run mode.
from src.data.loaders import load_canonical
from src.data.slicing import (
    SliceConfig,
    assign_temporal_slices,
    enforce_token_budget,
    build_member_panels,
)

# Slicing parameters used by the cells below.
TOTAL_SLICES = 4
TOKENS_PER_SLICE = 3_000_000

# One canonical parquet per run mode, stored under the artifacts directory.
CANONICAL_PATH = ARTIFACTS_DIR.joinpath(f'canonical_bhc_{RUN_MODE.name}.parquet')

# Fail early with a pointer to the prerequisite notebook.
if not CANONICAL_PATH.exists():
    raise FileNotFoundError('Run notebook 01 to create the canonical dataset first.')

df = load_canonical(CANONICAL_PATH)
print('Loaded canonical rows:', len(df))

Loaded canonical rows: 30000


In [3]:
# Tag every row with a temporal slice id.
df = assign_temporal_slices(df, total_slices=TOTAL_SLICES)

# Cast the ids to int and shift them up by one
# (presumably 0-based -> 1-based; confirm against assign_temporal_slices).
df['slice_id'] = df['slice_id'].astype(int).add(1)

# Apply the per-slice token budget.
df = enforce_token_budget(df, tokens_per_slice=TOKENS_PER_SLICE)

# Quick look at the columns that matter for slicing.
print(df.loc[:, ['subject_id', 'slice_id', 'tokens_estimate']].head())

       subject_id  slice_id  tokens_estimate
0  10000032-DS-21         1             2177
1  10000032-DS-22         1             2993
2  10000117-DS-21         1             1232
3  10000117-DS-22         1             1525
4  10000248-DS-10         1             2191


In [4]:
# Panel sizes requested from the slicing helper.
slice_config = SliceConfig(
    total_slices=TOTAL_SLICES,
    # panels of 1000
    members=1000,
    non_members=1000,
    # panels of 500
    past_members=500,
    future_non_members=500,
)

# The helper receives the artifacts directory as a plain string path.
panels = build_member_panels(
    df,
    config=slice_config,
    artifact_dir=str(ARTIFACTS_DIR),
)
print(f'Persisted panels for {len(panels)} slices in {ARTIFACTS_DIR / "slice_*"}')

Persisted panels for 4 slices in /content/drive/MyDrive/secure-llm-mia/artifacts/slice_*


In [5]:
# Write the sliced frame (without its index) next to the other artifacts,
# in a file named after the active run mode.
SLICED_DATASET_PATH = ARTIFACTS_DIR.joinpath(f'sliced_dataset_{RUN_MODE.name}.parquet')
df.to_parquet(SLICED_DATASET_PATH, index=False)
print('Saved sliced dataset to', SLICED_DATASET_PATH)

Saved sliced dataset to /content/drive/MyDrive/secure-llm-mia/artifacts/sliced_dataset_subset.parquet


**TODO:** Replace synthetic discharge times with actual timestamps before relying on temporal leakage metrics.