# 06 · Core Attacks

Compute loss/confidence, Win-k, and label-free scores using Drive-backed features.

In [None]:
# Persistent project setup on Drive
import os
import sys
from pathlib import Path

# Location where Google Drive is expected to appear once mounted.
DRIVE_ROOT = Path('/content/drive')

# Best-effort mount: any exception raised by the import or by the mount call
# is reported and execution continues with the fallback root chosen below.
try:
    from google.colab import drive  # type: ignore
    if not DRIVE_ROOT.exists():
        drive.mount('/content/drive')
except Exception as exc:  # pragma: no cover
    print(f'Colab drive mount skipped: {exc}')

# Use Drive's "MyDrive" folder when the mount point exists, otherwise the
# current user's home directory.
_base_candidate = (DRIVE_ROOT / 'MyDrive') if DRIVE_ROOT.exists() else Path.home()
BASE_ROOT = _base_candidate.resolve()

# The repository must already be present under the chosen root.
PROJECT_ROOT = BASE_ROOT / 'secure-llm-mia'
if not PROJECT_ROOT.exists():
    raise FileNotFoundError('Clone the repo via 00_colab_setup.ipynb before running this notebook.')

# Make the project importable and publish its location through the environment.
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.append(_project_root_str)
os.environ['SECURE_LLM_MIA_ROOT'] = _project_root_str

# Working directories used by the project; created on demand.
DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT = (
    PROJECT_ROOT / _subdir for _subdir in ('data', 'artifacts', 'checkpoints')
)
for path in (DATA_ROOT, ARTIFACTS_DIR, CHECKPOINT_ROOT):
    path.mkdir(parents=True, exist_ok=True)

# Run the rest of the notebook from the project root.
os.chdir(PROJECT_ROOT)
print('PROJECT_ROOT:', PROJECT_ROOT)


In [None]:
import numpy as np
import pandas as pd

from src.attacks.loss_confidence import score_examples
from src.attacks.win_k_min_k import aggregate_features
from src.attacks.label_free import CalibrationPrompt, summarize_outputs

# Synthetic stand-in inputs so the cell runs end to end; swap these for the
# outputs produced by notebook 05.
rng = np.random.default_rng(1)
BATCH = 32
TOKENS = 16
_shape = (BATCH, TOKENS)

# Draw order matters for reproducibility: nll, entropy, max_prob, then one
# boolean matrix per win@k key.
nll = rng.random(_shape)
entropy = rng.random(_shape)
max_prob = rng.random(_shape)
win_dict = {
    f'win@{_k}': (rng.random(_shape) > 0.5)
    for _k in (1, 5, 10, 20)
}

# Loss/confidence attack features and Win-k aggregates.
loss_features = score_examples(nll, entropy, max_prob)
win_features = aggregate_features(
    win_dict,
    nll,
    worst_percents=[0.05, 0.10],
)

# Label-free scoring: one formatted prompt repeated BATCH times, paired with
# BATCH identical placeholder responses.
prompt = CalibrationPrompt()
responses = ['Synthetic response'] * BATCH
_prompts = [prompt.format('context')] * BATCH
label_free_scores = summarize_outputs(_prompts, responses)

# Quick look at what each attack produced.
print('Loss features keys:', loss_features.keys())
_win_preview = {k: v[:3] for k, v in win_features.items()}
print('Win-k feature sample:', _win_preview)
print('Label-free scores sample:', label_free_scores[:5])


Store aggregated metrics under `reports/metrics_core_slice_t.json` and keep member/non-member IDs synced from Drive artifacts.