# PU Classifier Playground (Rewritten)

This notebook follows one fixed PU workflow on five datasets and compares `latest_v1` vs `latest_v2`.

Protocol:
1. For each dataset, pick an inlier class, sample positives up to `MAX_POSITIVE` (or all if `None`), then split sampled positives into labeled-positive vs unlabeled-positive by `LP_UP_RATIO`.
2. Build the unlabeled set by mixing the unlabeled-positive rows with sampled outliers so that outliers make up roughly `UNLABELED_OUTLIER_FRACTION` of it (fewer if the outlier pool is too small).
3. Fit each checkpoint on labeled-positive only, then score unlabeled rows and report metrics.
4. Repeat PU construction `NUM_REPEATS=10` times and average metrics per dataset/model.

Outlier score uses raw logits: `outlier_logit - inlier_logit` (higher = more outlier-like).


In [64]:
from __future__ import annotations

from pathlib import Path
import io
import re
import zipfile
from typing import Dict, List, Optional, Tuple
from urllib.request import urlopen
import warnings
import sys

import numpy as np
import pandas as pd
import torch
from IPython.display import display
from sklearn.datasets import fetch_openml, load_breast_cancer
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    balanced_accuracy_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')

In [65]:
# Walk upward from the current working directory until a directory that
# contains `simplified_prior/` is found; that directory is used as the repo root.
repo_root = Path.cwd().resolve()
while repo_root != repo_root.parent and not (repo_root / 'simplified_prior').exists():
    repo_root = repo_root.parent
# The loop also stops at the filesystem root, so re-check before trusting the result.
if not (repo_root / 'simplified_prior').exists():
    raise RuntimeError('Could not find repo root containing simplified_prior/.')

# The import below addresses the repo as the package `slim_pretrain`, so it is the
# *parent* of the repo root that has to be importable.
# NOTE(review): this presumes the repo directory itself is named `slim_pretrain` — confirm.
if str(repo_root.parent) not in sys.path:
    sys.path.insert(0, str(repo_root.parent))

from slim_pretrain.pretrain.model import NanoTabPFNPUClassifier

print('Repo root:', repo_root)

Repo root: /Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain


In [66]:
# Model/runtime config
# Checkpoints compared side by side; the dict key is the model name used in result tables.
CHECKPOINT_PATHS = {
    'latest_v1': repo_root / 'checkpoints' / 'latest_v1.pt',
    'latest_v2': repo_root / 'checkpoints' / 'latest_v2.pt',
}
DEVICE = 'auto'  # auto | cpu | cuda | mps

# PU protocol config
LP_UP_RATIO = (4, 6)  # 40% labeled-positive, 60% unlabeled-positive
# Target share of outliers inside the unlabeled set (must be in [0, 1)).
UNLABELED_OUTLIER_FRACTION = 0.3
# Inlier class used for any dataset that has no entry in INLIER_CLASS_BY_DATASET.
DEFAULT_INLIER_CLASS = 0
INLIER_CLASS_BY_DATASET: Dict[str, int] = {
    # Override per dataset if needed (values must be 0 or 1 after binary coercion).
    # NOTE(review): scikit-learn documents breast_cancer target 0 as "malignant" —
    # confirm that is the class intended to play the inlier role here.
    'breast_cancer': 0,
    'diabetes': 0,
    'spambase_local': 0,
    'banknote_authentication': 0,
    'rice_cammeo_osmancik': 0,
}

# Number of independent PU constructions per dataset; each uses a seed derived from BASE_SEED.
NUM_REPEATS = 10
BASE_SEED = 20260219
# Outlier evaluation uses logit scores (threshold-free ranking metrics).
# When True, features are standardized before fitting/scoring (see evaluate_on_pu).
USE_STANDARD_SCALER = False
# When set to a positive int, only the first MAX_FEATURES encoded columns are kept.
MAX_FEATURES: Optional[int] = None
MAX_POSITIVE: Optional[int] = 800  # e.g., 300; sample positives first, then split by LP_UP_RATIO

# Download cache handed to fetch_openml; created eagerly so the loader can write to it.
OPENML_CACHE_DIR = repo_root / '.cache' / 'openml'
OPENML_CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Local copies of the UCI files referenced by DATASET_SPECS.
RICE_LOCAL_PATH = repo_root / 'notebooks' / 'benchmark' / 'rice+cammeo+and+osmancik.zip'
SPAMBASE_LOCAL_PATH = repo_root / 'notebooks' / 'benchmark' / 'spambase.zip'
BANKNOTE_LOCAL_PATH = repo_root / 'notebooks' / 'benchmark' / 'data_banknote_authentication.txt'

# Five datasets used in this notebook.
# Each spec is consumed by load_dataset_from_spec:
#   - 'source' selects the loader branch ('sklearn', 'openml' or 'uci_url').
#   - For 'uci_url', 'url' is tried first and 'url_fallbacks' afterwards; despite the
#     name, 'url' may be a local file path (as it is for all three entries below).
#   - 'archive_member_hint' narrows which file is read from a .zip archive.
#   - 'target_col' is a column name (str) or a positional index (int).
DATASET_SPECS = [
    {
        'source': 'sklearn',
        'name': 'breast_cancer',
    },
    {
        'source': 'openml',
        'name': 'diabetes',
        'version': 1,
    },
    {
        'source': 'uci_url',
        'name': 'spambase_local',
        'url': str(SPAMBASE_LOCAL_PATH),
        'url_fallbacks': [],
        'archive_member_hint': 'spambase.data',
        'sep': ',',
        'header': None,
        'target_col': 57,
        'drop_cols': [],
    },
    {
        'source': 'uci_url',
        'name': 'banknote_authentication',
        'url': str(BANKNOTE_LOCAL_PATH),
        # Only this dataset has a remote fallback if the local file is missing.
        'url_fallbacks': [
            'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt',
        ],
        'archive_member_hint': None,
        'sep': ',',
        'header': None,
        'target_col': 4,
        'drop_cols': [],
    },
    {
        'source': 'uci_url',
        'name': 'rice_cammeo_osmancik',
        'url': str(RICE_LOCAL_PATH),
        'url_fallbacks': [],
        'archive_member_hint': 'Rice_Cammeo_Osmancik',
        'sep': ',',
        'header': 0,
        'target_col': 'Class',
        'drop_cols': [],
    },
]

In [67]:
def resolve_device(device_arg: str) -> str:
    """Turn a device request into a concrete torch device string.

    ``'auto'`` prefers CUDA, then MPS, then CPU. An explicit ``'cuda'`` or
    ``'mps'`` request is validated and raises ``RuntimeError`` when that
    backend is unavailable. Any other value is returned unchanged.
    """

    def _mps_ready() -> bool:
        # Older torch builds have no `torch.backends.mps` attribute at all.
        backend = getattr(torch.backends, 'mps', None)
        return backend is not None and backend.is_available()

    if device_arg == 'auto':
        if torch.cuda.is_available():
            return 'cuda'
        return 'mps' if _mps_ready() else 'cpu'

    if device_arg == 'cuda':
        if not torch.cuda.is_available():
            raise RuntimeError('CUDA requested but not available.')
    elif device_arg == 'mps':
        if not _mps_ready():
            raise RuntimeError('MPS requested but not available.')

    return device_arg


def ratio_to_labeled_fraction(lp_up_ratio: Tuple[int, int]) -> float:
    """Convert a (labeled, unlabeled) positive ratio into the labeled share.

    Both parts are cast to ``int`` and must be strictly positive, otherwise a
    ``ValueError`` is raised. ``(4, 6)`` yields ``0.4``.
    """
    labeled_part = int(lp_up_ratio[0])
    unlabeled_part = int(lp_up_ratio[1])
    if min(labeled_part, unlabeled_part) <= 0:
        raise ValueError(f'LP_UP_RATIO must be positive integers, got {lp_up_ratio}.')
    total_parts = labeled_part + unlabeled_part
    return float(labeled_part / total_parts)


def _decode_bytes_in_df(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    for c in out.columns:
        if out[c].dtype == object:
            out[c] = out[c].map(lambda v: v.decode('utf-8') if isinstance(v, (bytes, bytearray)) else v)
    return out


def _read_arff_bytes(raw_bytes: bytes) -> pd.DataFrame:
    try:
        from scipy.io import arff as scipy_arff

        records, _ = scipy_arff.loadarff(io.BytesIO(raw_bytes))
        return _decode_bytes_in_df(pd.DataFrame(records))
    except Exception:
        text = raw_bytes.decode('utf-8', errors='replace')
        attr_names: List[str] = []
        data_lines: List[str] = []
        in_data = False

        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line or line.startswith('%'):
                continue
            low = line.lower()
            if low.startswith('@data'):
                in_data = True
                continue
            if not in_data:
                if low.startswith('@attribute'):
                    parts = line.split()
                    if len(parts) >= 2:
                        attr_names.append(parts[1].strip("'\""))
                continue
            data_lines.append(raw_line)

        if not attr_names or not data_lines:
            raise RuntimeError('Could not parse ARFF content.')

        csv_text = '\n'.join(data_lines)
        df = pd.read_csv(io.StringIO(csv_text), header=None)
        if df.shape[1] != len(attr_names):
            raise RuntimeError(
                f'ARFF parser column mismatch: data cols={df.shape[1]} attributes={len(attr_names)}'
            )
        df.columns = attr_names
        return _decode_bytes_in_df(df)


def _read_bytes_from_source(source: str) -> bytes:
    source_l = source.lower()
    if source_l.startswith('http://') or source_l.startswith('https://'):
        with urlopen(source, timeout=60) as resp:
            return resp.read()
    return Path(source).expanduser().read_bytes()


def _load_uci_table_with_fallbacks(
    url_candidates: List[str],
    sep: str,
    header: Optional[int],
    archive_member_hint: Optional[str],
) -> pd.DataFrame:
    last_exc = None

    for u in url_candidates:
        try:
            u_l = u.lower()

            if u_l.endswith('.zip'):
                archive_bytes = _read_bytes_from_source(u)
                with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zf:
                    members = [m for m in zf.namelist() if not m.endswith('/')]
                    if not members:
                        raise RuntimeError(f'No files found in archive: {u}')

                    preferred = members
                    if archive_member_hint:
                        hinted = [m for m in members if archive_member_hint.lower() in m.lower()]
                        if hinted:
                            preferred = hinted

                    selected = None
                    for ext in ('.csv', '.data', '.txt', '.arff'):
                        for m in preferred:
                            if m.lower().endswith(ext):
                                selected = m
                                break
                        if selected is not None:
                            break
                    if selected is None:
                        selected = preferred[0]

                    payload = zf.read(selected)
                    if selected.lower().endswith('.arff'):
                        return _read_arff_bytes(payload)

                    return pd.read_csv(io.BytesIO(payload), sep=sep, header=header, skipinitialspace=True)

            if u_l.endswith('.arff'):
                return _read_arff_bytes(_read_bytes_from_source(u))

            return pd.read_csv(u, sep=sep, header=header, skipinitialspace=True)

        except Exception as exc:
            last_exc = exc

    raise RuntimeError(f'Failed loading UCI source(s): {url_candidates}. Last error: {last_exc}')


def _coerce_binary_target(y_raw: pd.Series) -> Tuple[np.ndarray, Dict[str, int], np.ndarray]:
    y_series = pd.Series(y_raw).copy()
    y_series = y_series.replace(['?', 'nan', 'None'], np.nan)
    valid = y_series.notna().to_numpy()
    y_series = y_series[valid]

    labels = sorted(list(pd.unique(y_series)), key=lambda x: str(x))
    if len(labels) != 2:
        raise ValueError(f'Expected binary labels, got {len(labels)} labels: {labels[:10]}')

    mapping = {labels[0]: 0, labels[1]: 1}
    y = y_series.map(mapping).astype(np.int64).to_numpy()
    mapping_printable = {str(k): int(v) for k, v in mapping.items()}
    return y, mapping_printable, valid


def _coerce_numeric_features(X_raw: pd.DataFrame) -> np.ndarray:
    X_df = pd.DataFrame(X_raw).copy()

    for c in X_df.columns:
        if not pd.api.types.is_numeric_dtype(X_df[c]):
            X_df[c] = X_df[c].astype(str)

    X_df = pd.get_dummies(X_df, dummy_na=True, drop_first=False)
    X_df = X_df.replace([np.inf, -np.inf], np.nan)

    numeric_medians = X_df.median(numeric_only=True)
    X_df = X_df.fillna(numeric_medians)
    X_df = X_df.fillna(0.0)
    return X_df.to_numpy(dtype=np.float32)


def load_dataset_from_spec(spec: Dict[str, object]) -> Dict[str, object]:
    """Load one dataset described by ``spec`` and normalize it for the PU pipeline.

    Supported sources are scikit-learn's ``breast_cancer``, any OpenML dataset
    by name/version, and ``uci_url`` tables (URL or local path, optionally
    zipped). Features are converted to a float32 matrix and the target to 0/1
    codes; rows with a missing target are dropped.

    Returns:
        A dict with ``source``, ``name``, ``X``, ``y``, ``n_rows``,
        ``n_features``, ``label_mapping`` (raw label text -> code) and
        ``inverse_label_mapping`` (code -> raw label text).

    Raises:
        ValueError: For an unsupported source/name combination, or when a
            string ``target_col`` is not a column of the loaded table.
    """
    source = str(spec['source'])
    name = str(spec['name'])

    if source == 'sklearn' and name == 'breast_cancer':
        bunch = load_breast_cancer(as_frame=True)
        X_raw, y_raw = bunch.data, bunch.target

    elif source == 'openml':
        bunch = fetch_openml(
            name=name,
            version=int(spec.get('version', 1)),
            as_frame=True,
            data_home=str(OPENML_CACHE_DIR),
        )
        X_raw, y_raw = bunch.data, bunch.target

    elif source == 'uci_url':
        sep = spec.get('sep', ',')
        header = spec.get('header', None)
        target_col = spec['target_col']
        drop_cols = set(spec.get('drop_cols', []))
        hint = spec.get('archive_member_hint')

        candidates = [str(spec['url'])]
        candidates.extend(str(u) for u in spec.get('url_fallbacks', []))
        raw = _load_uci_table_with_fallbacks(
            url_candidates=candidates,
            sep=sep,
            header=header,
            archive_member_hint=None if hint is None else str(hint),
        )

        # A string target is looked up by name, anything else by position.
        if isinstance(target_col, str):
            if target_col not in raw.columns:
                raise ValueError(f"target_col='{target_col}' not found in columns: {list(raw.columns)[:20]}")
            target_key = target_col
        else:
            target_key = raw.columns[int(target_col)]

        y_raw = raw[target_key]
        kept_columns = [c for c in raw.columns if c != target_key and c not in drop_cols]
        X_raw = raw.loc[:, kept_columns]

    else:
        raise ValueError(f'Unsupported dataset spec: {spec}')

    X = _coerce_numeric_features(pd.DataFrame(X_raw))
    y, label_mapping, valid_mask = _coerce_binary_target(pd.Series(y_raw))

    # Target coercion may have dropped rows; realign the features when it did.
    if X.shape[0] != len(y):
        X = X[valid_mask]

    feature_cap = MAX_FEATURES
    if feature_cap is not None and 0 < feature_cap < X.shape[1]:
        X = X[:, :feature_cap]

    inverse_mapping = {int(code): label for label, code in label_mapping.items()}
    n_rows, n_features = int(X.shape[0]), int(X.shape[1])

    return {
        'source': source,
        'name': name,
        'X': X.astype(np.float32),
        'y': y.astype(np.int64),
        'n_rows': n_rows,
        'n_features': n_features,
        'label_mapping': label_mapping,
        'inverse_label_mapping': inverse_mapping,
    }

In [68]:
def build_pu_data(
    X: np.ndarray,
    y: np.ndarray,
    inlier_class: int,
    labeled_positive_fraction: float,
    unlabeled_outlier_fraction: float,
    seed: int,
    max_positive: Optional[int] = None,
) -> Dict[str, object]:
    """Build one positive-unlabeled split from a binary dataset.

    Rows of ``inlier_class`` are the positives. Up to ``max_positive`` of them
    are sampled and divided into labeled and unlabeled positives according to
    ``labeled_positive_fraction`` (at least one row on each side). Outliers are
    then added to the unlabeled set so that they make up about
    ``unlabeled_outlier_fraction`` of it, capped by the number of outliers
    available. The unlabeled set is shuffled.

    Returns:
        A dict with ``X_labeled_pos``, ``X_unlabeled``, ``y_unlabeled_true``
        (0 = inlier, 1 = outlier) and the bookkeeping counts/fractions.

    Raises:
        ValueError: On an invalid ``inlier_class``, fraction or ``max_positive``.
        RuntimeError: If fewer than two positive rows exist.
    """
    if inlier_class not in {0, 1}:
        raise ValueError(f'inlier_class must be 0/1, got {inlier_class}')
    if not (0.0 < labeled_positive_fraction < 1.0):
        raise ValueError(f'labeled_positive_fraction must be in (0, 1), got {labeled_positive_fraction}')
    if not (0.0 <= unlabeled_outlier_fraction < 1.0):
        raise ValueError(f'unlabeled_outlier_fraction must be in [0, 1), got {unlabeled_outlier_fraction}')

    positive_rows = np.where(y == inlier_class)[0]
    outlier_rows = np.where(y != inlier_class)[0]
    n_positive_total = len(positive_rows)
    n_outlier_pool = len(outlier_rows)

    if n_positive_total < 2:
        raise RuntimeError('Need at least 2 positives to split labeled/unlabeled positives.')

    n_positive_sampled = n_positive_total
    if max_positive is not None:
        if int(max_positive) < 2:
            raise ValueError(f'max_positive must be >= 2 when set, got {max_positive}')
        n_positive_sampled = min(n_positive_total, int(max_positive))

    # RNG draw order (positives, outliers, final shuffle) determines the split for a seed.
    rng = np.random.default_rng(seed)
    shuffled_positives = rng.permutation(positive_rows)
    shuffled_outliers = rng.permutation(outlier_rows)

    sampled_positives = shuffled_positives[:n_positive_sampled]

    # Keep at least one positive on each side of the labeled/unlabeled split.
    n_labeled_pos = int(round(n_positive_sampled * labeled_positive_fraction))
    n_labeled_pos = max(1, min(n_labeled_pos, n_positive_sampled - 1))
    n_unlabeled_pos = n_positive_sampled - n_labeled_pos

    # Outlier count that yields the requested share given the unlabeled positives.
    outlier_odds = unlabeled_outlier_fraction / max(1e-12, 1.0 - unlabeled_outlier_fraction)
    n_outliers_wanted = int(round(outlier_odds * n_unlabeled_pos))
    if unlabeled_outlier_fraction > 0.0 and n_outliers_wanted == 0:
        n_outliers_wanted = 1
    n_unlabeled_out = min(n_outliers_wanted, n_outlier_pool)

    labeled_rows = sampled_positives[:n_labeled_pos]
    unlabeled_positive_rows = sampled_positives[n_labeled_pos:]
    unlabeled_outlier_rows = shuffled_outliers[:n_unlabeled_out]

    unlabeled_rows = np.concatenate([unlabeled_positive_rows, unlabeled_outlier_rows], axis=0)
    y_unlabeled_true = np.concatenate([
        np.zeros(len(unlabeled_positive_rows), dtype=np.int64),
        np.ones(len(unlabeled_outlier_rows), dtype=np.int64),
    ])

    shuffle_order = rng.permutation(len(unlabeled_rows))
    unlabeled_rows = unlabeled_rows[shuffle_order]
    y_unlabeled_true = y_unlabeled_true[shuffle_order]

    if len(y_unlabeled_true) > 0:
        actual_fraction = float(y_unlabeled_true.mean())
    else:
        actual_fraction = 0.0

    return {
        'X_labeled_pos': X[labeled_rows],
        'X_unlabeled': X[unlabeled_rows],
        'y_unlabeled_true': y_unlabeled_true,
        'n_total': int(len(y)),
        'n_positive_total': int(n_positive_total),
        'n_positive_sampled': int(n_positive_sampled),
        'n_outlier_pool': int(n_outlier_pool),
        'n_labeled_pos': int(n_labeled_pos),
        'n_unlabeled_pos': int(len(unlabeled_positive_rows)),
        'n_unlabeled_out': int(len(unlabeled_outlier_rows)),
        'n_unlabeled_total': int(len(unlabeled_rows)),
        'requested_unlabeled_outlier_fraction': float(unlabeled_outlier_fraction),
        'actual_unlabeled_outlier_fraction': float(actual_fraction),
    }


def _compute_fpr_at_target_tpr(
    y_true: np.ndarray,
    score: np.ndarray,
    target_tpr: float = 0.95,
) -> float:
    if len(np.unique(y_true)) < 2:
        return np.nan
    fpr, tpr, _ = roc_curve(y_true, score)
    meet = tpr >= target_tpr
    if not np.any(meet):
        return 1.0
    return float(np.min(fpr[meet]))


def evaluate_on_pu(
    clf: NanoTabPFNPUClassifier,
    X_labeled_pos: np.ndarray,
    X_unlabeled: np.ndarray,
    y_unlabeled_true: np.ndarray,
    use_standard_scaler: bool,
) -> Dict[str, float]:
    """Fit ``clf`` on labeled positives, score the unlabeled rows and compute metrics.

    The outlier score is ``logit[:, 1] - logit[:, 0]``: column 0 is treated as
    the inlier logit and column 1 as the outlier logit, so higher means more
    outlier-like. Rows with a score ``>= 0`` are predicted as outliers.

    Args:
        clf: Classifier whose ``fit`` stores the training rows in ``X_train``.
        X_labeled_pos: Labeled-positive rows used for fitting.
        X_unlabeled: Rows to score.
        y_unlabeled_true: Ground truth for ``X_unlabeled`` (0 = inlier, 1 = outlier).
        use_standard_scaler: When True, standardize both sets with a scaler
            fit on their concatenation.

    Returns:
        Accuracy, balanced accuracy, ROC-AUC, average precision, FPR at 95% TPR,
        precision/recall at k (k = number of true outliers) and mean-score
        statistics. Ranking metrics are ``nan`` unless both classes are present.

    Raises:
        RuntimeError: If ``clf.X_train`` is still ``None`` after fitting.
    """
    train_features, test_features = X_labeled_pos, X_unlabeled

    if use_standard_scaler:
        # The scaler sees labeled and unlabeled rows together; no labels are involved.
        scaler = StandardScaler().fit(np.concatenate([train_features, test_features], axis=0))
        train_features = scaler.transform(train_features).astype(np.float32)
        test_features = scaler.transform(test_features).astype(np.float32)

    clf.fit(train_features)
    if clf.X_train is None:
        raise RuntimeError('Classifier is not fitted.')

    # Training rows first, then the rows to score; every training label is 0.
    stacked = np.concatenate((clf.X_train, np.asarray(test_features, dtype=np.float32)), axis=0)
    context_labels = np.zeros((clf.X_train.shape[0],), dtype=np.float32)

    with torch.no_grad():
        x_batch = torch.from_numpy(stacked).unsqueeze(0).to(torch.float).to(clf.device)
        y_batch = torch.from_numpy(context_labels).unsqueeze(0).to(torch.float).to(clf.device)
        raw_logits = clf.model((x_batch, y_batch), train_test_split_index=len(clf.X_train)).squeeze(0)
        logits = raw_logits[:, : clf.num_classes].to('cpu').numpy()

    # Outlier score from raw logits (higher means more outlier-like).
    outlier_score = logits[:, 1] - logits[:, 0]

    has_both_classes = len(np.unique(y_unlabeled_true)) == 2
    if not has_both_classes:
        roc_auc = average_precision = fpr_at_95_tpr = np.nan
    else:
        roc_auc = float(roc_auc_score(y_unlabeled_true, outlier_score))
        average_precision = float(average_precision_score(y_unlabeled_true, outlier_score))
        fpr_at_95_tpr = _compute_fpr_at_target_tpr(y_unlabeled_true, outlier_score, target_tpr=0.95)

    # Precision/recall at k with k equal to the number of true outliers.
    n_outliers_true = int((y_unlabeled_true == 1).sum())
    if n_outliers_true <= 0:
        k, tp_at_k = 0, 0
        precision_at_k = recall_at_k = np.nan
    else:
        k = n_outliers_true
        top_rows = np.argsort(outlier_score)[-k:]
        tp_at_k = int(y_unlabeled_true[top_rows].sum())
        precision_at_k = float(tp_at_k / k)
        recall_at_k = float(tp_at_k / n_outliers_true)

    # Logit decision boundary at 0 corresponds to outlier_logit >= inlier_logit.
    y_pred = (outlier_score >= 0.0).astype(np.int64)
    accuracy = float(accuracy_score(y_unlabeled_true, y_pred))
    balanced_accuracy = float(balanced_accuracy_score(y_unlabeled_true, y_pred))

    def _mean_score(mask: np.ndarray) -> float:
        return float(outlier_score[mask].mean()) if np.any(mask) else np.nan

    mean_inlier = _mean_score(y_unlabeled_true == 0)
    mean_outlier = _mean_score(y_unlabeled_true == 1)

    return {
        'accuracy': accuracy,
        'balanced_accuracy': balanced_accuracy,
        'roc_auc': roc_auc,
        'average_precision': average_precision,
        'fpr_at_95_tpr': fpr_at_95_tpr,
        'precision_at_k': precision_at_k,
        'recall_at_k': recall_at_k,
        'k_for_precision_recall': int(k),
        'tp_at_k': int(tp_at_k),
        'outlier_score_mean_inlier': mean_inlier,
        'outlier_score_mean_outlier': mean_outlier,
        'outlier_score_gap': float(mean_outlier - mean_inlier),
    }


In [69]:
# Resolve runtime settings once; later cells reuse `device`, `labeled_positive_fraction`,
# `models` and `loaded_datasets`.
device = resolve_device(DEVICE)
labeled_positive_fraction = ratio_to_labeled_fraction(LP_UP_RATIO)

# Fail early, before any model is loaded, if a checkpoint file is absent.
missing_ckpts = [name for name, path in CHECKPOINT_PATHS.items() if not path.exists()]
if missing_ckpts:
    raise RuntimeError(f'Missing checkpoints: {missing_ckpts}. Expected: {CHECKPOINT_PATHS}')

models: Dict[str, NanoTabPFNPUClassifier] = {}
for model_name, ckpt_path in CHECKPOINT_PATHS.items():
    models[model_name] = NanoTabPFNPUClassifier.from_checkpoint(ckpt_path, device=device)

print('Device:', device)
print('Models:', {k: str(v) for k, v in CHECKPOINT_PATHS.items()})
print('LP_UP_RATIO:', LP_UP_RATIO, f'-> labeled_positive_fraction={labeled_positive_fraction:.4f}')
print('UNLABELED_OUTLIER_FRACTION:', UNLABELED_OUTLIER_FRACTION)
print('MAX_POSITIVE:', MAX_POSITIVE)
print('NUM_REPEATS:', NUM_REPEATS)

loaded_datasets: List[Dict[str, object]] = []
failed_datasets: List[Dict[str, object]] = []

# Attempt every dataset so that all failures are reported together below.
for spec in DATASET_SPECS:
    try:
        ds = load_dataset_from_spec(spec)
        loaded_datasets.append(ds)
        print(
            f"[OK] {ds['source']}::{ds['name']} rows={ds['n_rows']} features={ds['n_features']} labels={ds['label_mapping']}"
        )
    except Exception as exc:
        failed_datasets.append({'spec': spec, 'error': str(exc)})
        print(f"[FAIL] {spec['source']}::{spec['name']} -> {exc}")

# Every configured dataset must load; the expected count follows DATASET_SPECS
# instead of a hard-coded number so editing the spec list keeps this check valid.
expected_n_datasets = len(DATASET_SPECS)
if len(loaded_datasets) != expected_n_datasets:
    raise RuntimeError(
        f'Expected {expected_n_datasets} datasets loaded, got {len(loaded_datasets)}. Failed: {failed_datasets}'
    )

# Overview table: one row per dataset, including which raw label plays the inlier role.
display(
    pd.DataFrame(
        [
            {
                'source': ds['source'],
                'dataset': ds['name'],
                'rows': ds['n_rows'],
                'features': ds['n_features'],
                'label_mapping': str(ds['label_mapping']),
                'default_inlier_class': INLIER_CLASS_BY_DATASET.get(ds['name'], DEFAULT_INLIER_CLASS),
                'default_inlier_raw_label': str(
                    ds['inverse_label_mapping'].get(
                        INLIER_CLASS_BY_DATASET.get(ds['name'], DEFAULT_INLIER_CLASS),
                        'unknown',
                    )
                ),
            }
            for ds in loaded_datasets
        ]
    )
)

Device: mps
Models: {'latest_v1': '/Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain/checkpoints/latest_v1.pt', 'latest_v2': '/Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain/checkpoints/latest_v2.pt'}
LP_UP_RATIO: (4, 6) -> labeled_positive_fraction=0.4000
UNLABELED_OUTLIER_FRACTION: 0.3
MAX_POSITIVE: 800
NUM_REPEATS: 10
[OK] sklearn::breast_cancer rows=569 features=30 labels={'0': 0, '1': 1}
[OK] openml::diabetes rows=768 features=8 labels={'tested_negative': 0, 'tested_positive': 1}
[OK] uci_url::spambase_local rows=4601 features=57 labels={'0': 0, '1': 1}
[OK] uci_url::banknote_authentication rows=1372 features=4 labels={'0': 0, '1': 1}
[OK] uci_url::rice_cammeo_osmancik rows=3810 features=7 labels={'Cammeo': 0, 'Osmancik': 1}


Unnamed: 0,source,dataset,rows,features,label_mapping,default_inlier_class,default_inlier_raw_label
0,sklearn,breast_cancer,569,30,"{'0': 0, '1': 1}",0,0
1,openml,diabetes,768,8,"{'tested_negative': 0, 'tested_positive': 1}",0,tested_negative
2,uci_url,spambase_local,4601,57,"{'0': 0, '1': 1}",0,0
3,uci_url,banknote_authentication,1372,4,"{'0': 0, '1': 1}",0,0
4,uci_url,rice_cammeo_osmancik,3810,7,"{'Cammeo': 0, 'Osmancik': 1}",0,Cammeo


In [70]:
# One result row per (repeat, dataset, model); collected as dicts and turned into a
# DataFrame once at the end.
repeat_rows: List[Dict[str, object]] = []

for repeat_idx in range(NUM_REPEATS):
    print(f'Running repeat {repeat_idx + 1}/{NUM_REPEATS} ...')
    for dataset_idx, ds in enumerate(loaded_datasets):
        ds_name = str(ds['name'])
        inlier_class = int(INLIER_CLASS_BY_DATASET.get(ds_name, DEFAULT_INLIER_CLASS))
        # Seed depends on both the repeat and the dataset position, so each
        # (repeat, dataset) pair gets its own reproducible split.
        split_seed = BASE_SEED + repeat_idx * 1000 + dataset_idx

        # The PU split is built once and shared by all models, so they are
        # compared on identical data.
        pu = build_pu_data(
            X=ds['X'],
            y=ds['y'],
            inlier_class=inlier_class,
            labeled_positive_fraction=labeled_positive_fraction,
            unlabeled_outlier_fraction=UNLABELED_OUTLIER_FRACTION,
            seed=split_seed,
            max_positive=MAX_POSITIVE,
        )

        for model_name, clf in models.items():
            metrics = evaluate_on_pu(
                clf=clf,
                X_labeled_pos=pu['X_labeled_pos'],
                X_unlabeled=pu['X_unlabeled'],
                y_unlabeled_true=pu['y_unlabeled_true'],
                use_standard_scaler=USE_STANDARD_SCALER,
            )

            repeat_rows.append(
                {
                    'repeat': int(repeat_idx),
                    'split_seed': int(split_seed),
                    'dataset': ds_name,
                    'source': ds['source'],
                    'model': model_name,
                    'inlier_class': int(inlier_class),
                    'inlier_raw_label': str(ds['inverse_label_mapping'].get(inlier_class, 'unknown')),
                    # Targets are binary 0/1 codes, so the other class is 1 - inlier_class.
                    'outlier_raw_label': str(ds['inverse_label_mapping'].get(1 - inlier_class, 'unknown')),
                    'n_rows': int(ds['n_rows']),
                    'n_features': int(ds['n_features']),
                    'n_positive_total': int(pu['n_positive_total']),
                    'n_positive_sampled': int(pu['n_positive_sampled']),
                    'n_labeled_pos': int(pu['n_labeled_pos']),
                    'n_unlabeled_pos': int(pu['n_unlabeled_pos']),
                    'n_unlabeled_out': int(pu['n_unlabeled_out']),
                    'n_unlabeled_total': int(pu['n_unlabeled_total']),
                    'requested_unlabeled_outlier_fraction': float(pu['requested_unlabeled_outlier_fraction']),
                    'actual_unlabeled_outlier_fraction': float(pu['actual_unlabeled_outlier_fraction']),
                    **metrics,
                }
            )

repeat_results_df = pd.DataFrame(repeat_rows)
if repeat_results_df.empty:
    raise RuntimeError('No repeat results were produced.')

# Two narrow previews instead of one very wide table: first the split composition,
# then the metrics.
print('\nPer-repeat snapshot (first 20 rows): identifiers + composition')
repeat_view_cols = [
    'repeat', 'dataset', 'model', 'inlier_raw_label', 'outlier_raw_label',
    'n_positive_total', 'n_positive_sampled', 'n_labeled_pos', 'n_unlabeled_pos',
    'n_unlabeled_out', 'n_unlabeled_total', 'actual_unlabeled_outlier_fraction',
]
display(repeat_results_df[repeat_view_cols].head(20))

print('Per-repeat snapshot (first 20 rows): metrics')
repeat_metric_cols = [
    'repeat', 'dataset', 'model',
    'accuracy', 'balanced_accuracy', 'roc_auc', 'average_precision', 'fpr_at_95_tpr',
    'precision_at_k', 'recall_at_k', 'outlier_score_gap',
]
display(repeat_results_df[repeat_metric_cols].head(20))

Running repeat 1/10 ...
Running repeat 2/10 ...
Running repeat 3/10 ...
Running repeat 4/10 ...
Running repeat 5/10 ...
Running repeat 6/10 ...
Running repeat 7/10 ...
Running repeat 8/10 ...
Running repeat 9/10 ...
Running repeat 10/10 ...

Per-repeat snapshot (first 20 rows): identifiers + composition


Unnamed: 0,repeat,dataset,model,inlier_raw_label,outlier_raw_label,n_positive_total,n_positive_sampled,n_labeled_pos,n_unlabeled_pos,n_unlabeled_out,n_unlabeled_total,actual_unlabeled_outlier_fraction
0,0,breast_cancer,latest_v1,0,1,212,212,85,127,54,181,0.298343
1,0,breast_cancer,latest_v2,0,1,212,212,85,127,54,181,0.298343
2,0,diabetes,latest_v1,tested_negative,tested_positive,500,500,200,300,129,429,0.300699
3,0,diabetes,latest_v2,tested_negative,tested_positive,500,500,200,300,129,429,0.300699
4,0,spambase_local,latest_v1,0,1,2788,800,320,480,206,686,0.300292
5,0,spambase_local,latest_v2,0,1,2788,800,320,480,206,686,0.300292
6,0,banknote_authentication,latest_v1,0,1,762,762,305,457,196,653,0.300153
7,0,banknote_authentication,latest_v2,0,1,762,762,305,457,196,653,0.300153
8,0,rice_cammeo_osmancik,latest_v1,Cammeo,Osmancik,1630,800,320,480,206,686,0.300292
9,0,rice_cammeo_osmancik,latest_v2,Cammeo,Osmancik,1630,800,320,480,206,686,0.300292


Per-repeat snapshot (first 20 rows): metrics


Unnamed: 0,repeat,dataset,model,accuracy,balanced_accuracy,roc_auc,average_precision,fpr_at_95_tpr,precision_at_k,recall_at_k,outlier_score_gap
0,0,breast_cancer,latest_v1,0.922652,0.944882,0.988043,0.969284,0.086614,0.925926,0.925926,6.639315
1,0,breast_cancer,latest_v2,0.779006,0.837197,0.984252,0.969158,0.110236,0.925926,0.925926,5.090364
2,0,diabetes,latest_v1,0.715618,0.728178,0.777364,0.583392,0.75,0.604651,0.604651,0.75022
3,0,diabetes,latest_v2,0.731935,0.702287,0.770336,0.582231,0.726667,0.589147,0.589147,0.856048
4,0,spambase_local,latest_v1,0.827988,0.792567,0.862824,0.753052,0.595833,0.713592,0.713592,3.118863
5,0,spambase_local,latest_v2,0.858601,0.818598,0.890645,0.850146,0.658333,0.762136,0.762136,2.476312
6,0,banknote_authentication,latest_v1,0.981623,0.986871,1.0,1.0,0.0,1.0,1.0,7.858041
7,0,banknote_authentication,latest_v2,0.990812,0.993435,1.0,1.0,0.0,1.0,1.0,12.197139
8,0,rice_cammeo_osmancik,latest_v1,0.873178,0.901062,0.978803,0.959582,0.091667,0.878641,0.878641,3.289147
9,0,rice_cammeo_osmancik,latest_v2,0.932945,0.90359,0.980229,0.965179,0.091667,0.902913,0.902913,7.260108


In [71]:
# Metrics that are averaged over repeats and reported with a standard deviation.
metric_cols = [
    'accuracy',
    'balanced_accuracy',
    'roc_auc',
    'average_precision',
    'fpr_at_95_tpr',
    'precision_at_k',
    'recall_at_k',
    'outlier_score_gap',
]

# Split-composition columns; averaged as well so the summary shows the typical split sizes.
composition_cols = [
    'n_positive_total',
    'n_positive_sampled',
    'n_labeled_pos',
    'n_unlabeled_pos',
    'n_unlabeled_out',
    'n_unlabeled_total',
    'requested_unlabeled_outlier_fraction',
    'actual_unlabeled_outlier_fraction',
]

# Mean over repeats for every (dataset, model) pair.
avg_metrics_df = (
    repeat_results_df
    .groupby(['dataset', 'model'], as_index=False)[metric_cols + composition_cols]
    .mean()
    .sort_values(['dataset', 'model'])
)

# Spread over repeats; columns get a `_std` suffix so they can be merged next to the means.
# NOTE(review): with pandas' default settings this is presumably the sample standard
# deviation, which would be NaN when NUM_REPEATS == 1 — confirm if that case matters.
std_metrics_df = (
    repeat_results_df
    .groupby(['dataset', 'model'], as_index=False)[metric_cols]
    .std()
    .rename(columns={c: f'{c}_std' for c in metric_cols})
)

summary_df = avg_metrics_df.merge(std_metrics_df, on=['dataset', 'model'], how='left')

print('Average over repeats: compact metric view')
summary_metric_view_cols = [
    'dataset', 'model',
    'accuracy', 'balanced_accuracy', 'roc_auc', 'average_precision', 'fpr_at_95_tpr',
    'precision_at_k', 'recall_at_k', 'outlier_score_gap',
]
display(summary_df[summary_metric_view_cols].sort_values(['dataset', 'model']))

print('Average over repeats: PU composition view')
summary_composition_view_cols = [
    'dataset', 'model',
    'n_positive_total', 'n_positive_sampled', 'n_labeled_pos', 'n_unlabeled_pos',
    'n_unlabeled_out', 'n_unlabeled_total',
    'requested_unlabeled_outlier_fraction', 'actual_unlabeled_outlier_fraction',
]
display(summary_df[summary_composition_view_cols].sort_values(['dataset', 'model']))

print('Average over repeats: metric mean/std view')
# Interleave each metric with its `_std` column so mean and spread sit side by side.
summary_metric_std_cols = ['dataset', 'model']
for c in metric_cols:
    summary_metric_std_cols.extend([c, f'{c}_std'])
display(summary_df[summary_metric_std_cols].sort_values(['dataset', 'model']))

# Datasets as rows, models as columns, for a direct side-by-side ROC-AUC comparison.
roc_auc_pivot = summary_df.pivot_table(index='dataset', columns='model', values='roc_auc')
print('Mean ROC-AUC by dataset/model:')
display(roc_auc_pivot)

# Unweighted mean of the per-dataset means: every dataset counts equally.
avg_by_model = summary_df.groupby('model', as_index=False)[metric_cols].mean().sort_values('roc_auc', ascending=False)
print('Dataset-averaged metrics by model:')
display(avg_by_model)

Average over repeats: compact metric view


Unnamed: 0,dataset,model,accuracy,balanced_accuracy,roc_auc,average_precision,fpr_at_95_tpr,precision_at_k,recall_at_k,outlier_score_gap
0,banknote_authentication,latest_v1,0.978867,0.984902,1.0,1.0,0.0,1.0,1.0,7.745146
1,banknote_authentication,latest_v2,0.990658,0.993326,0.999997,0.999992,0.0,0.99949,0.99949,11.78755
2,breast_cancer,latest_v1,0.872376,0.891492,0.964188,0.931693,0.188976,0.844444,0.844444,4.592378
3,breast_cancer,latest_v2,0.858564,0.88431,0.964304,0.938917,0.18189,0.862963,0.862963,5.003398
4,diabetes,latest_v1,0.696737,0.704516,0.775297,0.593958,0.701667,0.575969,0.575969,0.782298
5,diabetes,latest_v2,0.702331,0.683109,0.775005,0.600159,0.707,0.572868,0.572868,0.838635
6,rice_cammeo_osmancik,latest_v1,0.873907,0.895902,0.970331,0.944952,0.144375,0.871845,0.871845,3.641087
7,rice_cammeo_osmancik,latest_v2,0.916618,0.879731,0.973162,0.948842,0.128542,0.878155,0.878155,8.904526
8,spambase_local,latest_v1,0.836443,0.805813,0.887539,0.782507,0.482292,0.731068,0.731068,3.681258
9,spambase_local,latest_v2,0.882362,0.856499,0.921032,0.880956,0.490417,0.809223,0.809223,2.874966


Average over repeats: PU composition view


Unnamed: 0,dataset,model,n_positive_total,n_positive_sampled,n_labeled_pos,n_unlabeled_pos,n_unlabeled_out,n_unlabeled_total,requested_unlabeled_outlier_fraction,actual_unlabeled_outlier_fraction
0,banknote_authentication,latest_v1,762.0,762.0,305.0,457.0,196.0,653.0,0.3,0.300153
1,banknote_authentication,latest_v2,762.0,762.0,305.0,457.0,196.0,653.0,0.3,0.300153
2,breast_cancer,latest_v1,212.0,212.0,85.0,127.0,54.0,181.0,0.3,0.298343
3,breast_cancer,latest_v2,212.0,212.0,85.0,127.0,54.0,181.0,0.3,0.298343
4,diabetes,latest_v1,500.0,500.0,200.0,300.0,129.0,429.0,0.3,0.300699
5,diabetes,latest_v2,500.0,500.0,200.0,300.0,129.0,429.0,0.3,0.300699
6,rice_cammeo_osmancik,latest_v1,1630.0,800.0,320.0,480.0,206.0,686.0,0.3,0.300292
7,rice_cammeo_osmancik,latest_v2,1630.0,800.0,320.0,480.0,206.0,686.0,0.3,0.300292
8,spambase_local,latest_v1,2788.0,800.0,320.0,480.0,206.0,686.0,0.3,0.300292
9,spambase_local,latest_v2,2788.0,800.0,320.0,480.0,206.0,686.0,0.3,0.300292


Average over repeats: metric mean/std view


Unnamed: 0,dataset,model,accuracy,accuracy_std,balanced_accuracy,balanced_accuracy_std,roc_auc,roc_auc_std,average_precision,average_precision_std,fpr_at_95_tpr,fpr_at_95_tpr_std,precision_at_k,precision_at_k_std,recall_at_k,recall_at_k_std,outlier_score_gap,outlier_score_gap_std
0,banknote_authentication,latest_v1,0.978867,0.010102,0.984902,0.007217,1.0,3.700743e-17,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,7.745146,0.408228
1,banknote_authentication,latest_v2,0.990658,0.008494,0.993326,0.006069,0.999997,1.059129e-05,0.999992,2.4e-05,0.0,0.0,0.99949,0.001613,0.99949,0.001613,11.78755,0.768687
2,breast_cancer,latest_v1,0.872376,0.045518,0.891492,0.045072,0.964188,0.02164428,0.931693,0.032523,0.188976,0.113439,0.844444,0.053244,0.844444,0.053244,4.592378,1.054922
3,breast_cancer,latest_v2,0.858564,0.07427,0.88431,0.059238,0.964304,0.03143102,0.938917,0.044152,0.18189,0.156535,0.862963,0.064859,0.862963,0.064859,5.003398,0.858359
4,diabetes,latest_v1,0.696737,0.023834,0.704516,0.026888,0.775297,0.03355324,0.593958,0.051988,0.701667,0.081744,0.575969,0.044011,0.575969,0.044011,0.782298,0.153643
5,diabetes,latest_v2,0.702331,0.071855,0.683109,0.037765,0.775005,0.03004024,0.600159,0.05278,0.707,0.085713,0.572868,0.045415,0.572868,0.045415,0.838635,0.14927
6,rice_cammeo_osmancik,latest_v1,0.873907,0.016878,0.895902,0.01306,0.970331,0.006834458,0.944952,0.012501,0.144375,0.038365,0.871845,0.020106,0.871845,0.020106,3.641087,0.490406
7,rice_cammeo_osmancik,latest_v2,0.916618,0.012815,0.879731,0.01707,0.973162,0.00757946,0.948842,0.015528,0.128542,0.04041,0.878155,0.029167,0.878155,0.029167,8.904526,1.491251
8,spambase_local,latest_v1,0.836443,0.017917,0.805813,0.021839,0.887539,0.01455109,0.782507,0.034781,0.482292,0.065161,0.731068,0.03357,0.731068,0.03357,3.681258,0.681205
9,spambase_local,latest_v2,0.882362,0.013075,0.856499,0.018634,0.921032,0.02068193,0.880956,0.023944,0.490417,0.162192,0.809223,0.024223,0.809223,0.024223,2.874966,0.40378


Mean ROC-AUC by dataset/model:


model,latest_v1,latest_v2
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
banknote_authentication,1.0,0.999997
breast_cancer,0.964188,0.964304
diabetes,0.775297,0.775005
rice_cammeo_osmancik,0.970331,0.973162
spambase_local,0.887539,0.921032


Dataset-averaged metrics by model:


Unnamed: 0,model,accuracy,balanced_accuracy,roc_auc,average_precision,fpr_at_95_tpr,precision_at_k,recall_at_k,outlier_score_gap
1,latest_v2,0.870107,0.859395,0.9267,0.873773,0.30157,0.82454,0.82454,5.881815
0,latest_v1,0.851666,0.856525,0.919471,0.850622,0.303462,0.804665,0.804665,4.088433


In [72]:
# Optional drill-down: list every repeat for a single dataset/model pair.
# The pair inspected is whatever sits in the first row of summary_df.
example_dataset = summary_df['dataset'].iat[0]
example_model = summary_df['model'].iat[0]

print(f'Detailed per-repeat rows for dataset={example_dataset}, model={example_model}: composition')

# Boolean mask selecting only the rows of the chosen dataset/model pair.
is_example_row = (
    repeat_results_df['dataset'].eq(example_dataset)
    & repeat_results_df['model'].eq(example_model)
)
detailed_df = repeat_results_df.loc[is_example_row].sort_values('repeat')

# Split composition per repeat (seed, counts, realised outlier fraction).
composition_cols = [
    'repeat', 'split_seed', 'n_positive_total', 'n_positive_sampled',
    'n_labeled_pos', 'n_unlabeled_pos', 'n_unlabeled_out',
    'n_unlabeled_total', 'actual_unlabeled_outlier_fraction',
]
display(detailed_df.loc[:, composition_cols])

print('Detailed per-repeat rows: metrics')

# Metric values per repeat for the same rows.
metric_detail_cols = [
    'repeat', 'accuracy', 'balanced_accuracy', 'roc_auc', 'average_precision', 'fpr_at_95_tpr',
    'precision_at_k', 'recall_at_k', 'outlier_score_gap',
]
display(detailed_df.loc[:, metric_detail_cols])


Detailed per-repeat rows for dataset=banknote_authentication, model=latest_v1: composition


Unnamed: 0,repeat,split_seed,n_positive_total,n_positive_sampled,n_labeled_pos,n_unlabeled_pos,n_unlabeled_out,n_unlabeled_total,actual_unlabeled_outlier_fraction
6,0,20260222,762,762,305,457,196,653,0.300153
16,1,20261222,762,762,305,457,196,653,0.300153
26,2,20262222,762,762,305,457,196,653,0.300153
36,3,20263222,762,762,305,457,196,653,0.300153
46,4,20264222,762,762,305,457,196,653,0.300153
56,5,20265222,762,762,305,457,196,653,0.300153
66,6,20266222,762,762,305,457,196,653,0.300153
76,7,20267222,762,762,305,457,196,653,0.300153
86,8,20268222,762,762,305,457,196,653,0.300153
96,9,20269222,762,762,305,457,196,653,0.300153


Detailed per-repeat rows: metrics


Unnamed: 0,repeat,accuracy,balanced_accuracy,roc_auc,average_precision,fpr_at_95_tpr,precision_at_k,recall_at_k,outlier_score_gap
6,0,0.981623,0.986871,1.0,1.0,0.0,1.0,1.0,7.858041
16,1,0.987749,0.991247,1.0,1.0,0.0,1.0,1.0,7.656234
26,2,0.977029,0.983589,1.0,1.0,0.0,1.0,1.0,7.606638
36,3,0.990812,0.993435,1.0,1.0,0.0,1.0,1.0,7.71559
46,4,0.964778,0.974836,1.0,1.0,0.0,1.0,1.0,8.438809
56,5,0.966309,0.97593,1.0,1.0,0.0,1.0,1.0,7.610722
66,6,0.972435,0.980306,1.0,1.0,0.0,1.0,1.0,6.911646
76,7,0.975498,0.982495,1.0,1.0,0.0,1.0,1.0,8.195551
86,8,0.995406,0.996718,1.0,1.0,0.0,1.0,1.0,7.553327
96,9,0.977029,0.983589,1.0,1.0,0.0,1.0,1.0,7.904904
