# 06 · Core Attacks

Compute loss/confidence, Win-k, and label-free scores using Drive-backed features.

In [None]:
# Persistent Drive + run mode setup
import os
import sys
from pathlib import Path

# Colab mount point for Google Drive. Defined unconditionally so the name is
# always bound and the path literal lives in exactly one place.
DRIVE_MOUNT = Path('/content/drive')

# Best-effort mount: outside Colab the import fails, and a failed mount should
# not abort the notebook, so any error is reported and execution continues.
try:
    from google.colab import drive  # type: ignore
    if not DRIVE_MOUNT.exists():
        drive.mount(str(DRIVE_MOUNT))
except Exception as exc:  # pragma: no cover
    print(f'Colab drive mount skipped: {exc}')

# Use the Drive folder when the mount point exists; otherwise fall back to the
# current user's home directory.
if DRIVE_MOUNT.exists():
    DRIVE_ROOT = (DRIVE_MOUNT / 'MyDrive').resolve()
else:
    DRIVE_ROOT = Path.home().resolve()

# The repository checkout is expected directly under the Drive root; stop early
# with a pointer to the setup notebook when it is missing.
PROJECT_ROOT = DRIVE_ROOT.joinpath('secure-llm-mia')
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Run 00_colab_setup.ipynb first to clone the repo on Drive.')

# Publish the project root through the environment.
os.environ['SECURE_LLM_MIA_ROOT'] = str(PROJECT_ROOT)

# Put the project root on the import path, skipping the append when the entry
# is already present (e.g. when the cell is re-run).
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.append(str(PROJECT_ROOT))

# Make the project root the working directory.
os.chdir(PROJECT_ROOT)

from src.utils.runtime import current_run_mode

# Resolve the active run mode and report where the notebook is operating.
RUN_MODE = current_run_mode()
print('PROJECT_ROOT:', PROJECT_ROOT)
print('Active run mode:', RUN_MODE.name, '-', RUN_MODE.description)

# Working directories inside the project checkout, created on demand.
DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT = (
    PROJECT_ROOT / subdir for subdir in ('data', 'artifacts', 'checkpoints')
)
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

# The BHC data directory sits next to the project under the Drive root, not
# inside the checkout; only the directory is created, the CSV path is just reported.
BHC_DATA_DIR = DRIVE_ROOT.joinpath('mimic-iv-bhc')
BHC_DATA_DIR.mkdir(parents=True, exist_ok=True)
BHC_CSV_PATH = BHC_DATA_DIR.joinpath('mimic-iv-bhc.csv')
print('BHC CSV path:', BHC_CSV_PATH)


In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import Dataset

from src.attacks.loss_confidence import score_examples
from src.attacks.win_k_min_k import aggregate_features
from src.attacks.label_free import CalibrationPrompt, summarize_outputs
from src.eval.metrics import auc_metrics, tpr_at_fpr, expected_calibration_error

# Which slice / track of the experiment this notebook evaluates.
SLICE_ID = 1
TRACK = 'noreplay'

# Input feature table and the per-slice ID directory under the artifacts folder.
FEATURES_PATH = PROJECT_ROOT.joinpath(
    'reports',
    'features',
    f'features_slice_{SLICE_ID}_{TRACK}_{RUN_MODE.name}.parquet',
)
IDS_DIR = ARTIFACTS_DIR.joinpath(f'slice_{SLICE_ID}', 'ids')

# Fail fast with a pointer to the producing notebook when the table is missing.
if not FEATURES_PATH.exists():
    raise FileNotFoundError('Feature parquet missing. Run 05_eval_generation_and_logprobs.ipynb first.')

features = Dataset.from_parquet(str(FEATURES_PATH))
print('Loaded features:', len(features))

# The `labels` column is required; without it the metrics below cannot be computed.
if 'labels' not in features.column_names:
    raise ValueError('Feature dataset lacks `labels`. Ensure notebook 05 saved member/non-member tags.')
labels = features['labels']

# Pull the three token-level columns out of the feature table as float arrays.
nll, entropy, max_prob = (
    np.array(features[column], dtype=float)
    for column in ('token_nll', 'token_entropy', 'token_max_prob')
)

# Every column whose name starts with 'win@', keyed by its column name.
win_dict = {
    name: np.array(features[name], dtype=float)
    for name in features.column_names
    if name.startswith('win@')
}

# Attack features from the loss/confidence and Win-k helpers.
loss_feats = score_examples(nll, entropy, max_prob)
win_feats = aggregate_features(win_dict, nll, worst_percents=[0.05, 0.10])

# One table: labels first, then the loss-based features, then the window-based
# features (later keys override earlier ones on a name clash).
feature_df = pd.DataFrame({'labels': labels, **dict(loss_feats.items()), **win_feats})
print(feature_df.head())

# Ranking metrics use the negated mean NLL as the score
# (presumably so that lower loss ranks as more member-like; confirm against src.eval.metrics).
auc_info = auc_metrics(
    feature_df['labels'],
    -feature_df['mean_nll'],
)
tpr = tpr_at_fpr(
    feature_df['labels'],
    -feature_df['mean_nll'],
    target_fpr=0.01,
)

# NOTE(review): the calibration score is 1 / mean_nll, which is not bounded to
# [0, 1] and is infinite when mean_nll is 0. Confirm that
# expected_calibration_error accepts unnormalised scores.
calib = expected_calibration_error(
    feature_df['labels'],
    1.0 / feature_df['mean_nll'],
)

print('AUC:', auc_info['auc'])
print('TPR@1%FPR:', tpr)
print('ECE:', calib.ece)

def _json_default(value):
    """Convert NumPy scalars and arrays to built-in values for ``json.dumps``.

    ``json`` calls this hook only for objects it cannot serialise itself, so
    values that already serialise are written exactly as before.

    Raises:
        TypeError: if ``value`` is not a NumPy scalar or array.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


# Destination of the aggregated metrics for this slice / track / run mode.
metrics_path = PROJECT_ROOT / 'reports' / f'metrics_core_slice_{SLICE_ID}_{TRACK}_{RUN_MODE.name}.json'
metrics = {
    'slice_id': SLICE_ID,
    'track': TRACK,
    'run_mode': RUN_MODE.name,
    'auc': auc_info['auc'],
    'tpr@1%fpr': tpr,
    'ece': calib.ece,
}
# The default hook keeps serialisation from failing on NumPy scalar types such
# as np.float32 / np.int64; the encoding is explicit so the file does not
# depend on the platform default.
metrics_path.write_text(
    json.dumps(metrics, indent=2, default=_json_default),
    encoding='utf-8',
)
print('Saved metrics to', metrics_path)


📝 Aggregated metrics are stored under `reports/metrics_core_slice_<slice_id>_<track>_<run_mode>.json`; keep member/non-member IDs synced from Drive artifacts.