# 06 · Core Attacks

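Aggregate the Drive-backed per-token features from notebook 05 into per-example loss/confidence, Win-k, and label-free scores, then report per-slice AUC and TPR@1%FPR for the loss-based attack.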

In [3]:
# Persistent Drive + run mode setup
import os
import sys
from pathlib import Path

# Best-effort Google Drive mount. Any failure in this block -- including
# `google.colab` not being importable outside Colab -- is reported and skipped
# so the cell can continue with the home-directory fallback below.
try:
    from google.colab import drive  # type: ignore
    DRIVE_MOUNT = Path('/content/drive')
    # Only mount when the mount point does not exist yet.
    if not DRIVE_MOUNT.exists():
        drive.mount('/content/drive')
except Exception as exc:  # pragma: no cover
    print(f'Colab drive mount skipped: {exc}')

# Use MyDrive as the storage root when the mount point exists; otherwise fall
# back to the current user's home directory.
if Path('/content/drive').exists():
    DRIVE_ROOT = Path('/content/drive/MyDrive').resolve()
else:
    DRIVE_ROOT = Path.home().resolve()

# The repository checkout must already exist under the storage root.
PROJECT_ROOT = DRIVE_ROOT / 'secure-llm-mia'
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Run 00_colab_setup.ipynb first to clone the repo on Drive.')

# Make the project's `src` package importable (added once, at the end of sys.path).
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Export the project root and make it the working directory.
# NOTE(review): SECURE_LLM_MIA_ROOT is presumably read by project modules -- confirm.
os.environ['SECURE_LLM_MIA_ROOT'] = str(PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

# Project-local import: kept below the sys.path / chdir setup above, which is
# what makes `src` resolvable from this notebook.
from src.utils.runtime import current_run_mode

RUN_MODE = current_run_mode()
print('PROJECT_ROOT:', PROJECT_ROOT)
print('Active run mode:', RUN_MODE.name, '-', RUN_MODE.description)

# Standard project folders; created if missing so later code can write to them.
DATA_ROOT = PROJECT_ROOT / 'data'
ARTIFACTS_DIR = PROJECT_ROOT / 'artifacts'
CHECKPOINT_ROOT = PROJECT_ROOT / 'checkpoints'
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

# The BHC dataset folder sits next to (not inside) the project checkout.
# Only the folder is created here; the CSV path is printed, not checked.
BHC_DATA_DIR = DRIVE_ROOT / 'mimic-iv-bhc'
BHC_DATA_DIR.mkdir(parents=True, exist_ok=True)
BHC_CSV_PATH = BHC_DATA_DIR / 'mimic-iv-bhc.csv'
print('BHC CSV path:', BHC_CSV_PATH)


PROJECT_ROOT: /content/drive/MyDrive/secure-llm-mia
Active run mode: subset - Quick debugging subset (<=2k rows) for lightweight Colab smoke tests.
BHC CSV path: /content/drive/MyDrive/mimic-iv-bhc/mimic-iv-bhc.csv


In [4]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import Dataset

from src.eval.metrics import auc_metrics, tpr_at_fpr

# Feature parquet files are read from <project>/reports/features; fail fast
# when that directory has not been created yet.
FEATURES_DIR = PROJECT_ROOT.joinpath('reports', 'features')
if FEATURES_DIR.exists():
    # The run-mode name is embedded in every feature/metric file name used below.
    RUN_MODE_NAME = RUN_MODE.name
else:
    raise FileNotFoundError('Feature directory missing. Run notebook 05 before launching this notebook.')


def _feature_files() -> list[Path]:
    """Return the feature parquet files for the active run mode, sorted by path.

    Only files directly inside ``FEATURES_DIR`` whose name matches
    ``features_slice_*_{RUN_MODE_NAME}.parquet`` are returned.
    """
    matches = list(FEATURES_DIR.glob(f"features_slice_*_{RUN_MODE_NAME}.parquet"))
    matches.sort()
    return matches


def _parse_slice_track(path: Path) -> tuple[int, str]:
    stem = path.stem  # features_slice_{slice}_{track}_{run_mode}
    prefix = 'features_slice_'
    suffix = f'_{RUN_MODE_NAME}'
    if not stem.startswith(prefix) or not stem.endswith(suffix):
        raise ValueError(f'Unexpected feature file name: {path.name}')
    core = stem[len(prefix):-len(suffix)]
    slice_str, track = core.split('_', 1)
    return int(slice_str), track


def _ragged_mean(values: list[np.ndarray]) -> np.ndarray:
    return np.array([seq.mean() if seq.size else np.nan for seq in values])


def _worst_percent(values: list[np.ndarray], percent: float) -> np.ndarray:
    stats: list[float] = []
    for seq in values:
        if not seq.size:
            stats.append(np.nan)
            continue
        k = max(1, int(np.ceil(seq.size * percent)))
        sorted_seq = np.sort(seq)
        stats.append(sorted_seq[-k:].mean())
    return np.array(stats)


# Discover the per-slice feature files for the active run mode.
feature_paths = _feature_files()
if not feature_paths:
    raise FileNotFoundError(f'No feature parquet files found in {FEATURES_DIR} for run mode `{RUN_MODE_NAME}`.')

# One metrics dict per evaluated (slice, track) pair.
metrics_records: list[dict] = []

for feature_path in feature_paths:
    slice_id, track = _parse_slice_track(feature_path)
    ds = Dataset.from_parquet(str(feature_path))
    if len(ds) == 0:
        print(f'Skipping slice {slice_id} ({track}) - empty feature dataset.')
        continue

    # Per-example token-level sequences; lengths may differ between examples.
    token_nll = [np.asarray(seq, dtype=float) for seq in ds['token_nll']]
    token_entropy = [np.asarray(seq, dtype=float) for seq in ds['token_entropy']]
    token_max_prob = [np.asarray(seq, dtype=float) for seq in ds['token_max_prob']]

    # Every column named `win@<k>` is collected and averaged per example below.
    win_keys = [name for name in ds.column_names if name.startswith('win@')]
    win_stats = {key: [np.asarray(seq, dtype=float) for seq in ds[key]] for key in win_keys}

    # Collapse each token sequence to one scalar per example.
    feature_df = pd.DataFrame(
        {
            'example_id': ds['example_id'],
            'label': np.array(ds['label'], dtype=int),
            'mean_nll': _ragged_mean(token_nll),
            'entropy': _ragged_mean(token_entropy),
            'max_prob': _ragged_mean(token_max_prob),
            # Mean NLL over the highest-loss 5% / 10% of tokens in each example.
            'min_loss_top_5pct': _worst_percent(token_nll, 0.05),
            'min_loss_top_10pct': _worst_percent(token_nll, 0.10),
        }
    )

    for key, seqs in win_stats.items():
        feature_df[f"win_frac_{key.split('@')[1]}"] = _ragged_mean(seqs)

    # Persist the per-example features next to the source feature file.
    aggregated_path = feature_path.with_name(f'features_core_slice_{slice_id}_{track}_{RUN_MODE_NAME}.parquet')
    feature_df.to_parquet(aggregated_path, index=False)
    print(f'Saved aggregated features to {aggregated_path}')

    # Attack score: negated mean NLL, so a lower loss gives a higher score.
    # NOTE(review): presumably label 1 marks members -- confirm against notebook 05.
    scores = -feature_df['mean_nll'].to_numpy()
    labels = feature_df['label'].to_numpy()

    # Examples with an empty token sequence get a NaN mean NLL; a ROC computed
    # on NaN scores is either an error or meaningless, so evaluate only the
    # examples that have a finite score and report how many were left out.
    finite_mask = np.isfinite(scores)
    if not finite_mask.all():
        dropped = int((~finite_mask).sum())
        print(f'Slice {slice_id} ({track}) - ignoring {dropped} example(s) with non-finite scores.')
        scores = scores[finite_mask]
        labels = labels[finite_mask]
    if scores.size == 0:
        print(f'Skipping slice {slice_id} ({track}) - no finite scores to evaluate.')
        continue

    auc_info = auc_metrics(labels, scores)
    tpr = tpr_at_fpr(labels, scores, target_fpr=0.01)

    metrics_path = PROJECT_ROOT / 'reports' / f'metrics_core_slice_{slice_id}_{track}_{RUN_MODE_NAME}.json'
    metrics = {
        'slice_id': slice_id,
        'track': track,
        'run_mode': RUN_MODE_NAME,
        'auc': float(auc_info['auc']),
        'tpr_at_0.01': float(tpr),
        # Row count of the aggregated feature table for this slice.
        'num_examples': int(len(feature_df)),
    }
    metrics_path.write_text(json.dumps(metrics, indent=2))
    print(f'Saved metrics to {metrics_path}')
    metrics_records.append(metrics)

if not metrics_records:
    raise RuntimeError('No metrics computed. Ensure feature parquet files are populated.')

# Wide table: one row per (track, slice) with all metrics as columns.
metrics_df = pd.DataFrame(metrics_records).sort_values(['track', 'slice_id'])
metrics_csv = PROJECT_ROOT / 'reports' / f'metrics_core_{RUN_MODE_NAME}.csv'
metrics_df.to_csv(metrics_csv, index=False)
print(f'Wrote consolidated metrics to {metrics_csv}')

# Tidy (long) table: one row per (slice, track, run_mode, metric).
results_long = metrics_df.melt(
    id_vars=['slice_id', 'track', 'run_mode'],
    value_vars=['auc', 'tpr_at_0.01'],
    var_name='metric',
    value_name='value',
)
results_path = PROJECT_ROOT / 'reports' / f'results_core_{RUN_MODE_NAME}.csv'
results_long.to_csv(results_path, index=False)
print(f'Saved tidy metrics table to {results_path}')


Saved aggregated features to /content/drive/MyDrive/secure-llm-mia/reports/features/features_core_slice_1_noreplay_subset.parquet
AUC: 0.36
TPR@1%FPR: 0.2
Saved metrics to /content/drive/MyDrive/secure-llm-mia/reports/metrics_core_slice_1_noreplay_subset.json
