# PU Classifier Playground (Rewritten)

This notebook follows one fixed PU workflow on five datasets and compares `latest_v1` vs `latest_v2`.

Protocol:
1. For each dataset, pick an inlier class, and split inliers into labeled-positive vs unlabeled-positive by `LP_UP_RATIO`.
2. Build unlabeled data by mixing unlabeled-positive with sampled outliers based on `UNLABELED_OUTLIER_FRACTION`.
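3. Fit each checkpoint on the labeled-positive rows only, then score the unlabeled rows and report metrics. Note that the metrics treat **outliers** as the positive class (label 1), so precision/recall/F1 describe outlier detection, whereas "positive" in the PU sense refers to inliers.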
4. Repeat PU construction `NUM_REPEATS=10` times and average metrics per dataset/model.


In [10]:
from __future__ import annotations

from pathlib import Path
import io
import re
import zipfile
from typing import Dict, List, Optional, Tuple
from urllib.request import urlopen
import warnings
import sys

import numpy as np
import pandas as pd
import torch
from IPython.display import display
from sklearn.datasets import fetch_openml, load_breast_cancer
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')

In [11]:
# Locate the repository root: the closest directory, starting at the current
# working directory and walking upwards, that contains `simplified_prior/`.
_cwd = Path.cwd().resolve()
repo_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / 'simplified_prior').exists()),
    None,
)
if repo_root is None:
    raise RuntimeError('Could not find repo root containing simplified_prior/.')

# The import below is rooted at `slim_pretrain`, so the *parent* of the repo root
# is what gets put on sys.path (presumably the repo directory itself is the
# `slim_pretrain` package — confirm if the checkout is ever renamed).
_import_root = str(repo_root.parent)
if _import_root not in sys.path:
    sys.path.insert(0, _import_root)

from slim_pretrain.pretrain.model import NanoTabPFNPUClassifier

print('Repo root:', repo_root)

Repo root: /Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain


In [12]:
# Model/runtime config
# Checkpoints to compare; the keys become the `model` column of the result tables.
CHECKPOINT_PATHS = {
    'latest_v1': repo_root / 'checkpoints' / 'latest_v1.pt',
    'latest_v2': repo_root / 'checkpoints' / 'latest_v2.pt',
}
DEVICE = 'auto'  # auto | cpu | cuda

# PU protocol config
LP_UP_RATIO = (4, 6)  # 40% labeled-positive, 60% unlabeled-positive
# Target share of outliers inside the unlabeled set (the achieved share is
# reported per split as `actual_unlabeled_outlier_fraction`).
UNLABELED_OUTLIER_FRACTION = 0.20
# Inlier class used for any dataset that has no entry in INLIER_CLASS_BY_DATASET.
DEFAULT_INLIER_CLASS = 0
INLIER_CLASS_BY_DATASET: Dict[str, int] = {
    # Override per dataset if needed (values must be 0 or 1 after binary coercion).
    'breast_cancer': 0,
    'diabetes': 0,
    'spambase_local': 0,
    'banknote_authentication': 0,
    'rice_cammeo_osmancik': 0,
}

NUM_REPEATS = 10
# Per-split seeds are derived from this value plus the repeat and dataset indices.
BASE_SEED = 20260219
# A row is predicted as an outlier when its outlier probability is >= THRESHOLD.
THRESHOLD = 0.50
# When True, features are standardized before fitting/scoring.
USE_STANDARD_SCALER = False
# When set to a positive int, only the first MAX_FEATURES feature columns are kept.
MAX_FEATURES: Optional[int] = None

# Cache directory handed to fetch_openml; created on cell execution if missing.
OPENML_CACHE_DIR = repo_root / '.cache' / 'openml'
OPENML_CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Local copies of the UCI datasets, used as the primary source in the specs below.
RICE_LOCAL_PATH = repo_root / 'notebooks' / 'benchmark' / 'rice+cammeo+and+osmancik.zip'
SPAMBASE_LOCAL_PATH = repo_root / 'notebooks' / 'benchmark' / 'spambase.zip'
BANKNOTE_LOCAL_PATH = repo_root / 'notebooks' / 'benchmark' / 'data_banknote_authentication.txt'

# Five datasets used in this notebook.
# Spec fields for 'uci_url' entries:
#   url / url_fallbacks  - sources tried in order; a local path or an http(s) URL.
#   archive_member_hint  - case-insensitive substring used to pick a file inside a .zip.
#   sep / header         - forwarded to pandas.read_csv.
#   target_col           - str: column name; int: positional column index.
#   drop_cols            - columns excluded from the feature matrix.
DATASET_SPECS = [
    {
        'source': 'sklearn',
        'name': 'breast_cancer',
    },
    {
        'source': 'openml',
        'name': 'diabetes',
        'version': 1,
    },
    {
        'source': 'uci_url',
        'name': 'spambase_local',
        'url': str(SPAMBASE_LOCAL_PATH),
        'url_fallbacks': [],
        'archive_member_hint': 'spambase.data',
        'sep': ',',
        'header': None,
        'target_col': 57,
        'drop_cols': [],
    },
    {
        'source': 'uci_url',
        'name': 'banknote_authentication',
        'url': str(BANKNOTE_LOCAL_PATH),
        'url_fallbacks': [
            'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt',
        ],
        'archive_member_hint': None,
        'sep': ',',
        'header': None,
        'target_col': 4,
        'drop_cols': [],
    },
    {
        'source': 'uci_url',
        'name': 'rice_cammeo_osmancik',
        'url': str(RICE_LOCAL_PATH),
        'url_fallbacks': [],
        'archive_member_hint': 'Rice_Cammeo_Osmancik',
        'sep': ',',
        'header': 0,
        'target_col': 'Class',
        'drop_cols': [],
    },
]

In [13]:
def resolve_device(device_arg: str) -> str:
    """Turn the requested device name into a concrete device string.

    Args:
        device_arg: 'auto' picks 'cuda' when available and 'cpu' otherwise;
            any other value is returned as given.

    Returns:
        The device string to use.

    Raises:
        RuntimeError: 'cuda' was requested explicitly but is not available.
    """
    # Anything other than 'auto'/'cuda' needs no CUDA probe at all.
    if device_arg not in ('auto', 'cuda'):
        return device_arg

    has_cuda = torch.cuda.is_available()
    if device_arg == 'auto':
        return 'cuda' if has_cuda else 'cpu'
    if not has_cuda:
        raise RuntimeError('CUDA requested but not available.')
    return device_arg


def ratio_to_labeled_fraction(lp_up_ratio: Tuple[int, int]) -> float:
    """Convert a (labeled, unlabeled) positive ratio into the labeled share.

    Args:
        lp_up_ratio: Two-element ratio, e.g. (4, 6); both parts are cast to int.

    Returns:
        labeled / (labeled + unlabeled) as a float.

    Raises:
        ValueError: Either part is zero or negative after the int cast.
    """
    labeled_part = int(lp_up_ratio[0])
    unlabeled_part = int(lp_up_ratio[1])
    if min(labeled_part, unlabeled_part) <= 0:
        raise ValueError(f'LP_UP_RATIO must be positive integers, got {lp_up_ratio}.')
    total = labeled_part + unlabeled_part
    return float(labeled_part / total)


def _decode_bytes_in_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with bytes cells in object columns decoded as UTF-8.

    Only columns whose dtype is `object` are touched; inside them, values that
    are `bytes`/`bytearray` are decoded and every other value is kept as is.
    The input frame is not modified.
    """

    def _as_text(value):
        if isinstance(value, (bytes, bytearray)):
            return value.decode('utf-8')
        return value

    decoded = df.copy()
    object_columns = [col for col in decoded.columns if decoded[col].dtype == object]
    for col in object_columns:
        decoded[col] = decoded[col].map(_as_text)
    return decoded


def _read_arff_bytes(raw_bytes: bytes) -> pd.DataFrame:
    """Parse the raw bytes of an ARFF file into a DataFrame.

    The content is decoded as UTF-8 (undecodable bytes are replaced) and handed
    to ``scipy.io.arff.loadarff``. If scipy is unavailable or rejects the file,
    a minimal built-in parser takes over: attribute names are read from the
    ``@attribute`` header lines and the rows after ``@data`` are read as CSV,
    with ARFF's ``?`` missing-value marker mapped to NaN.

    Args:
        raw_bytes: Complete ARFF file content.

    Returns:
        One column per declared attribute; bytes cells are decoded to ``str``.

    Raises:
        RuntimeError: The fallback parser found no attributes or no data rows,
            or the number of data columns differs from the number of declared
            attributes.
    """
    text = raw_bytes.decode('utf-8', errors='replace')

    try:
        from scipy.io import arff as scipy_arff

        # loadarff matches str patterns against every line it reads, so it needs
        # a text stream. Feeding it a BytesIO makes it raise on the first line,
        # which silently sent every file to the fallback parser.
        records, _ = scipy_arff.loadarff(io.StringIO(text))
        return _decode_bytes_in_df(pd.DataFrame(records))
    except Exception:
        # Deliberate best-effort: scipy may be missing or may not support the
        # file's ARFF features, so fall back to a permissive hand-rolled parser.
        # Name is either single-quoted, double-quoted (both may contain spaces)
        # or a bare token.
        attr_pattern = re.compile(
            r"""@attribute\s+(?:'([^']*)'|"([^"]*)"|(\S+))""",
            re.IGNORECASE,
        )
        attr_names: List[str] = []
        data_lines: List[str] = []
        in_data = False

        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line or line.startswith('%'):
                continue  # blank line or ARFF comment
            if line.lower().startswith('@data'):
                in_data = True
                continue
            if in_data:
                data_lines.append(raw_line)
                continue
            match = attr_pattern.match(line)
            if match:
                name = next(group for group in match.groups() if group is not None)
                attr_names.append(name.strip("'\""))

        if not attr_names or not data_lines:
            raise RuntimeError('Could not parse ARFF content.')

        # '?' is ARFF's missing-value marker; without na_values a numeric column
        # containing it would be read as strings.
        df = pd.read_csv(io.StringIO('\n'.join(data_lines)), header=None, na_values=['?'])
        if df.shape[1] != len(attr_names):
            raise RuntimeError(
                f'ARFF parser column mismatch: data cols={df.shape[1]} attributes={len(attr_names)}'
            )
        df.columns = attr_names
        return _decode_bytes_in_df(df)


def _read_bytes_from_source(source: str) -> bytes:
    """Return the full content of `source` as bytes.

    A source starting with ``http://`` or ``https://`` (case-insensitive) is
    downloaded with a 60 second timeout; anything else is treated as a local
    path, with ``~`` expanded.
    """
    is_remote = source.lower().startswith(('http://', 'https://'))
    if not is_remote:
        return Path(source).expanduser().read_bytes()
    with urlopen(source, timeout=60) as response:
        return response.read()


def _load_uci_table_with_fallbacks(
    url_candidates: List[str],
    sep: str,
    header: Optional[int],
    archive_member_hint: Optional[str],
) -> pd.DataFrame:
    """Load a table from the first source in `url_candidates` that can be read.

    Each candidate (local path or URL) is handled by its file extension:
    ``.zip`` archives are opened and one member is parsed, ``.arff`` files go
    through the ARFF reader, and everything else is read with
    ``pandas.read_csv``. A failing candidate is skipped and the next one tried.

    Args:
        url_candidates: Sources to try, in order.
        sep: Column separator for CSV-like content.
        header: Header row index for CSV-like content, or None for no header.
        archive_member_hint: Optional case-insensitive substring used to prefer
            matching members inside a zip archive.

    Returns:
        The parsed table.

    Raises:
        RuntimeError: No candidate could be loaded. The last underlying error
            is attached as ``__cause__`` so its traceback is not lost.
    """
    last_exc: Optional[Exception] = None

    for u in url_candidates:
        try:
            u_l = u.lower()

            if u_l.endswith('.zip'):
                archive_bytes = _read_bytes_from_source(u)
                with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zf:
                    # Directory entries end with '/', they are not readable members.
                    members = [m for m in zf.namelist() if not m.endswith('/')]
                    if not members:
                        raise RuntimeError(f'No files found in archive: {u}')

                    selected = _select_archive_member(members, archive_member_hint)
                    payload = zf.read(selected)

                if selected.lower().endswith('.arff'):
                    return _read_arff_bytes(payload)
                return pd.read_csv(io.BytesIO(payload), sep=sep, header=header, skipinitialspace=True)

            if u_l.endswith('.arff'):
                return _read_arff_bytes(_read_bytes_from_source(u))

            return pd.read_csv(u, sep=sep, header=header, skipinitialspace=True)

        except Exception as exc:
            # Best-effort by design: remember the failure and try the next source.
            last_exc = exc

    raise RuntimeError(
        f'Failed loading UCI source(s): {url_candidates}. Last error: {last_exc}'
    ) from last_exc


def _select_archive_member(members: List[str], archive_member_hint: Optional[str]) -> str:
    """Pick the archive member to parse: hinted names first, then by extension priority."""
    candidates = members
    if archive_member_hint:
        needle = archive_member_hint.lower()
        hinted = [m for m in members if needle in m.lower()]
        if hinted:
            candidates = hinted

    # Extensions are checked in priority order; the first member with the
    # highest-priority extension wins.
    for ext in ('.csv', '.data', '.txt', '.arff'):
        for member in candidates:
            if member.lower().endswith(ext):
                return member
    return candidates[0]


def _coerce_binary_target(y_raw: pd.Series) -> Tuple[np.ndarray, Dict[str, int], np.ndarray]:
    """Encode a two-valued target as 0/1 integers.

    The strings '?', 'nan' and 'None' are treated as missing. Missing rows are
    dropped and the two remaining labels are ordered by their string form: the
    first becomes 0, the second 1.

    Args:
        y_raw: Raw target values.

    Returns:
        A tuple ``(y, mapping, valid)``: the int64 codes of the non-missing
        rows, the label-to-code mapping with labels rendered as strings, and a
        boolean mask over the input marking the rows that were kept.

    Raises:
        ValueError: The non-missing values do not contain exactly two labels.
    """
    cleaned = pd.Series(y_raw).copy().replace(['?', 'nan', 'None'], np.nan)
    valid = cleaned.notna().to_numpy()
    kept = cleaned[valid]

    labels = list(pd.unique(kept))
    labels.sort(key=str)
    if len(labels) != 2:
        raise ValueError(f'Expected binary labels, got {len(labels)} labels: {labels[:10]}')

    first_label, second_label = labels
    mapping = {first_label: 0, second_label: 1}
    y = kept.map(mapping).astype(np.int64).to_numpy()
    mapping_printable = {str(label): int(code) for label, code in mapping.items()}
    return y, mapping_printable, valid


def _coerce_numeric_features(X_raw: pd.DataFrame) -> np.ndarray:
    """Turn a raw feature table into a dense float32 matrix.

    Non-numeric columns are cast to str and one-hot encoded (including a
    missing-value indicator column), infinities are treated as missing, and
    missing values are filled with the column median, then with 0.0 for
    anything still unfilled.

    Args:
        X_raw: Raw feature table; it is copied, not modified.

    Returns:
        A 2-D float32 array with one row per input row.
    """
    frame = pd.DataFrame(X_raw).copy()

    # Stringify every non-numeric column so get_dummies will encode it.
    for col in frame.columns:
        column = frame[col]
        if pd.api.types.is_numeric_dtype(column):
            continue
        frame[col] = column.astype(str)

    encoded = pd.get_dummies(frame, dummy_na=True, drop_first=False)
    encoded = encoded.replace([np.inf, -np.inf], np.nan)

    # Median-impute where a median exists, then zero-fill whatever is left.
    imputed = encoded.fillna(encoded.median(numeric_only=True)).fillna(0.0)
    return imputed.to_numpy(dtype=np.float32)


def load_dataset_from_spec(spec: Dict[str, object]) -> Dict[str, object]:
    """Load one dataset described by `spec` and coerce it to numeric arrays.

    Supported sources:
      * 'sklearn' (only name 'breast_cancer')
      * 'openml'  (fetched by name and optional 'version', default 1)
      * 'uci_url' (table read from 'url', then from each of 'url_fallbacks')

    Args:
        spec: Dataset description; 'source' and 'name' are always required, the
            'uci_url' source additionally requires 'url' and 'target_col'.

    Returns:
        A dict with the float32 feature matrix 'X', the int64 target 'y', the
        row/feature counts, and the label mapping in both directions.

    Raises:
        ValueError: The source is unsupported, or a named target column is not
            present in the loaded table.
    """
    source = str(spec['source'])
    name = str(spec['name'])

    if source == 'sklearn' and name == 'breast_cancer':
        bunch = load_breast_cancer(as_frame=True)
        X_raw, y_raw = bunch.data, bunch.target

    elif source == 'openml':
        bunch = fetch_openml(
            name=name,
            version=int(spec.get('version', 1)),
            as_frame=True,
            data_home=str(OPENML_CACHE_DIR),
        )
        X_raw, y_raw = bunch.data, bunch.target

    elif source == 'uci_url':
        target_col = spec['target_col']
        excluded = set(spec.get('drop_cols', []))
        hint = spec.get('archive_member_hint')

        table = _load_uci_table_with_fallbacks(
            url_candidates=[str(spec['url']), *(str(u) for u in spec.get('url_fallbacks', []))],
            sep=spec.get('sep', ','),
            header=spec.get('header', None),
            archive_member_hint=None if hint is None else str(hint),
        )

        # A non-string target is a positional index; a string must name a column.
        if not isinstance(target_col, str):
            target_key = table.columns[int(target_col)]
        elif target_col in table.columns:
            target_key = target_col
        else:
            raise ValueError(f"target_col='{target_col}' not found in columns: {list(table.columns)[:20]}")

        y_raw = table[target_key]
        kept_columns = [col for col in table.columns if col != target_key and col not in excluded]
        X_raw = table.loc[:, kept_columns]

    else:
        raise ValueError(f'Unsupported dataset spec: {spec}')

    X = _coerce_numeric_features(pd.DataFrame(X_raw))
    y, label_mapping, valid_mask = _coerce_binary_target(pd.Series(y_raw))

    # Rows whose target was missing were dropped from y; drop them from X too.
    if X.shape[0] != len(y):
        X = X[valid_mask]

    truncate = MAX_FEATURES is not None and MAX_FEATURES > 0 and X.shape[1] > MAX_FEATURES
    if truncate:
        X = X[:, :MAX_FEATURES]

    n_rows, n_features = X.shape[0], X.shape[1]
    inverse_mapping = {int(code): label for label, code in label_mapping.items()}

    return {
        'source': source,
        'name': name,
        'X': X.astype(np.float32),
        'y': y.astype(np.int64),
        'n_rows': int(n_rows),
        'n_features': int(n_features),
        'label_mapping': label_mapping,
        'inverse_label_mapping': inverse_mapping,
    }

In [14]:
def build_pu_data(
    X: np.ndarray,
    y: np.ndarray,
    inlier_class: int,
    labeled_positive_fraction: float,
    unlabeled_outlier_fraction: float,
    seed: int,
) -> Dict[str, object]:
    """Build one positive-unlabeled split from a binary dataset.

    Rows with ``y == inlier_class`` are inliers. A share of them becomes the
    labeled-positive set; the remaining inliers plus a sample of outliers form
    the shuffled unlabeled set, in which 0 marks an inlier and 1 an outlier.

    Args:
        X: Feature matrix, indexed by row.
        y: Binary target aligned with the rows of `X`.
        inlier_class: Which class (0 or 1) is the inlier class.
        labeled_positive_fraction: Share of inliers that is labeled, in (0, 1).
        unlabeled_outlier_fraction: Target share of outliers in the unlabeled
            set, in [0, 1). Capped by the number of available outliers.
        seed: Seed of the random generator that drives all shuffling.

    Returns:
        A dict with 'X_labeled_pos', 'X_unlabeled', 'y_unlabeled_true', the
        split sizes, and the requested vs. actual unlabeled outlier fraction.

    Raises:
        ValueError: An argument is outside its allowed range.
        RuntimeError: Fewer than two inliers are available.
    """
    if inlier_class not in {0, 1}:
        raise ValueError(f'inlier_class must be 0/1, got {inlier_class}')
    if not (0.0 < labeled_positive_fraction < 1.0):
        raise ValueError(f'labeled_positive_fraction must be in (0, 1), got {labeled_positive_fraction}')
    if not (0.0 <= unlabeled_outlier_fraction < 1.0):
        raise ValueError(f'unlabeled_outlier_fraction must be in [0, 1), got {unlabeled_outlier_fraction}')

    inlier_idx = np.nonzero(y == inlier_class)[0]
    outlier_idx = np.nonzero(y != inlier_class)[0]
    n_inlier, n_outlier_pool = len(inlier_idx), len(outlier_idx)
    if n_inlier < 2:
        raise RuntimeError('Need at least 2 inliers to split labeled/unlabeled positives.')

    # Keep at least one inlier on each side of the labeled/unlabeled split.
    n_labeled_pos = int(round(n_inlier * labeled_positive_fraction))
    n_labeled_pos = max(1, min(n_labeled_pos, n_inlier - 1))
    n_unlabeled_pos = n_inlier - n_labeled_pos

    # Outliers needed so that they make up the requested share of the unlabeled
    # set: n_out / (n_out + n_pos) = f  =>  n_out = f / (1 - f) * n_pos.
    outlier_odds = unlabeled_outlier_fraction / max(1e-12, 1.0 - unlabeled_outlier_fraction)
    desired_n_unlabeled_out = int(round(outlier_odds * n_unlabeled_pos))
    if desired_n_unlabeled_out == 0 and unlabeled_outlier_fraction > 0.0:
        desired_n_unlabeled_out = 1
    n_unlabeled_out = min(desired_n_unlabeled_out, n_outlier_pool)

    # The order of the generator calls below determines the split for a given seed.
    rng = np.random.default_rng(seed)
    shuffled_inliers = rng.permutation(inlier_idx)
    shuffled_outliers = rng.permutation(outlier_idx)

    labeled_idx = shuffled_inliers[:n_labeled_pos]
    unlabeled_pos_idx = shuffled_inliers[n_labeled_pos:]
    unlabeled_out_idx = shuffled_outliers[:n_unlabeled_out]

    unlabeled_idx = np.concatenate([unlabeled_pos_idx, unlabeled_out_idx], axis=0)
    # Inliers come first in the concatenation, so positions past them are outliers.
    y_unlabeled_true = (np.arange(len(unlabeled_idx)) >= len(unlabeled_pos_idx)).astype(np.int64)

    mix_order = rng.permutation(len(unlabeled_idx))
    unlabeled_idx = unlabeled_idx[mix_order]
    y_unlabeled_true = y_unlabeled_true[mix_order]

    if len(y_unlabeled_true) > 0:
        actual_unlabeled_outlier_fraction = float(y_unlabeled_true.mean())
    else:
        actual_unlabeled_outlier_fraction = 0.0

    return {
        'X_labeled_pos': X[labeled_idx],
        'X_unlabeled': X[unlabeled_idx],
        'y_unlabeled_true': y_unlabeled_true,
        'n_total': int(len(y)),
        'n_inlier_total': int(n_inlier),
        'n_outlier_pool': int(n_outlier_pool),
        'n_labeled_pos': int(n_labeled_pos),
        'n_unlabeled_pos': int(len(unlabeled_pos_idx)),
        'n_unlabeled_out': int(len(unlabeled_out_idx)),
        'n_unlabeled_total': int(len(unlabeled_idx)),
        'requested_unlabeled_outlier_fraction': float(unlabeled_outlier_fraction),
        'actual_unlabeled_outlier_fraction': float(actual_unlabeled_outlier_fraction),
    }


def evaluate_on_pu(
    clf: NanoTabPFNPUClassifier,
    X_labeled_pos: np.ndarray,
    X_unlabeled: np.ndarray,
    y_unlabeled_true: np.ndarray,
    threshold: float,
    use_standard_scaler: bool,
) -> Dict[str, float]:
    """Fit `clf` on the labeled positives and score it on the unlabeled rows.

    Column 1 of ``clf.predict_proba`` is used as the outlier probability, and a
    row is predicted as an outlier (label 1) when that probability is at least
    `threshold`.

    Args:
        clf: Classifier exposing ``fit(X)`` and ``predict_proba(X)``.
        X_labeled_pos: Labeled-positive rows used for fitting.
        X_unlabeled: Unlabeled rows to score.
        y_unlabeled_true: Ground truth for the unlabeled rows (1 = outlier).
        threshold: Decision threshold on the outlier probability.
        use_standard_scaler: When True, a StandardScaler fitted on the labeled
            and unlabeled rows together is applied to both before fitting.

    Returns:
        Threshold metrics, ranking metrics (NaN when the ground truth contains
        a single class) and the four confusion-matrix cells.
    """
    if use_standard_scaler:
        scaler = StandardScaler().fit(np.concatenate([X_labeled_pos, X_unlabeled], axis=0))
        X_train = scaler.transform(X_labeled_pos).astype(np.float32)
        X_test = scaler.transform(X_unlabeled).astype(np.float32)
    else:
        X_train, X_test = X_labeled_pos, X_unlabeled

    clf.fit(X_train)
    outlier_proba = clf.predict_proba(X_test)[:, 1]
    y_pred = (outlier_proba >= threshold).astype(np.int64)

    # Rows are the true class, columns the predicted class, both ordered [0, 1].
    (cm_00, cm_01), (cm_10, cm_11) = confusion_matrix(y_unlabeled_true, y_pred, labels=[0, 1])

    # Ranking metrics are undefined when only one class is present.
    has_both_classes = len(np.unique(y_unlabeled_true)) == 2
    if has_both_classes:
        roc_auc = float(roc_auc_score(y_unlabeled_true, outlier_proba))
        average_precision = float(average_precision_score(y_unlabeled_true, outlier_proba))
    else:
        roc_auc = average_precision = np.nan

    return {
        'accuracy': float(accuracy_score(y_unlabeled_true, y_pred)),
        'balanced_accuracy': float(balanced_accuracy_score(y_unlabeled_true, y_pred)),
        'precision': float(precision_score(y_unlabeled_true, y_pred, zero_division=0)),
        'recall': float(recall_score(y_unlabeled_true, y_pred, zero_division=0)),
        'f1': float(f1_score(y_unlabeled_true, y_pred, zero_division=0)),
        'roc_auc': roc_auc,
        'average_precision': average_precision,
        'cm_00': int(cm_00),
        'cm_01': int(cm_01),
        'cm_10': int(cm_10),
        'cm_11': int(cm_11),
    }

In [15]:
device = resolve_device(DEVICE)
labeled_positive_fraction = ratio_to_labeled_fraction(LP_UP_RATIO)

# Fail fast, before any model is loaded, if a checkpoint file is absent.
missing_ckpts = [name for name, path in CHECKPOINT_PATHS.items() if not path.exists()]
if missing_ckpts:
    raise RuntimeError(f'Missing checkpoints: {missing_ckpts}. Expected: {CHECKPOINT_PATHS}')

models: Dict[str, NanoTabPFNPUClassifier] = {}
for model_name, ckpt_path in CHECKPOINT_PATHS.items():
    models[model_name] = NanoTabPFNPUClassifier.from_checkpoint(ckpt_path, device=device)

print('Device:', device)
print('Models:', {k: str(v) for k, v in CHECKPOINT_PATHS.items()})
print('LP_UP_RATIO:', LP_UP_RATIO, f'-> labeled_positive_fraction={labeled_positive_fraction:.4f}')
print('UNLABELED_OUTLIER_FRACTION:', UNLABELED_OUTLIER_FRACTION)
print('NUM_REPEATS:', NUM_REPEATS)

loaded_datasets: List[Dict[str, object]] = []
failed_datasets: List[Dict[str, object]] = []

# Try every spec so that all failures are reported together, not just the first.
for spec in DATASET_SPECS:
    try:
        ds = load_dataset_from_spec(spec)
        loaded_datasets.append(ds)
        print(
            f"[OK] {ds['source']}::{ds['name']} rows={ds['n_rows']} features={ds['n_features']} labels={ds['label_mapping']}"
        )
    except Exception as exc:
        failed_datasets.append({'spec': spec, 'error': str(exc)})
        print(f"[FAIL] {spec['source']}::{spec['name']} -> {exc}")

# Every configured dataset must load. The expected count is derived from
# DATASET_SPECS so that editing the spec list does not break this check.
expected_n_datasets = len(DATASET_SPECS)
if len(loaded_datasets) != expected_n_datasets:
    raise RuntimeError(
        f'Expected {expected_n_datasets} datasets loaded, got {len(loaded_datasets)}. Failed: {failed_datasets}'
    )

# Overview table: one row per dataset, including which raw label is the inlier.
dataset_overview_rows = []
for ds in loaded_datasets:
    default_inlier = INLIER_CLASS_BY_DATASET.get(ds['name'], DEFAULT_INLIER_CLASS)
    dataset_overview_rows.append(
        {
            'source': ds['source'],
            'dataset': ds['name'],
            'rows': ds['n_rows'],
            'features': ds['n_features'],
            'label_mapping': str(ds['label_mapping']),
            'default_inlier_class': default_inlier,
            'default_inlier_raw_label': str(ds['inverse_label_mapping'].get(default_inlier, 'unknown')),
        }
    )

display(pd.DataFrame(dataset_overview_rows))

Device: cpu
Models: {'latest_v1': '/Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain/checkpoints/latest_v1.pt', 'latest_v2': '/Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain/checkpoints/latest_v2.pt'}
LP_UP_RATIO: (4, 6) -> labeled_positive_fraction=0.4000
UNLABELED_OUTLIER_FRACTION: 0.2
NUM_REPEATS: 10
[OK] sklearn::breast_cancer rows=569 features=30 labels={'0': 0, '1': 1}
[OK] openml::diabetes rows=768 features=8 labels={'tested_negative': 0, 'tested_positive': 1}
[OK] uci_url::spambase_local rows=4601 features=57 labels={'0': 0, '1': 1}
[OK] uci_url::banknote_authentication rows=1372 features=4 labels={'0': 0, '1': 1}
[OK] uci_url::rice_cammeo_osmancik rows=3810 features=7 labels={'Cammeo': 0, 'Osmancik': 1}


Unnamed: 0,source,dataset,rows,features,label_mapping,default_inlier_class,default_inlier_raw_label
0,sklearn,breast_cancer,569,30,"{'0': 0, '1': 1}",0,0
1,openml,diabetes,768,8,"{'tested_negative': 0, 'tested_positive': 1}",0,tested_negative
2,uci_url,spambase_local,4601,57,"{'0': 0, '1': 1}",0,0
3,uci_url,banknote_authentication,1372,4,"{'0': 0, '1': 1}",0,0
4,uci_url,rice_cammeo_osmancik,3810,7,"{'Cammeo': 0, 'Osmancik': 1}",0,Cammeo


In [16]:
repeat_rows: List[Dict[str, object]] = []

for repeat_idx in range(NUM_REPEATS):
    print(f'Running repeat {repeat_idx + 1}/{NUM_REPEATS} ...')
    for dataset_idx, ds in enumerate(loaded_datasets):
        ds_name = str(ds['name'])
        inlier_class = int(INLIER_CLASS_BY_DATASET.get(ds_name, DEFAULT_INLIER_CLASS))
        raw_label_of = ds['inverse_label_mapping']
        # One reproducible seed per (repeat, dataset) pair; all models share the split.
        split_seed = BASE_SEED + repeat_idx * 1000 + dataset_idx

        pu = build_pu_data(
            X=ds['X'],
            y=ds['y'],
            inlier_class=inlier_class,
            labeled_positive_fraction=labeled_positive_fraction,
            unlabeled_outlier_fraction=UNLABELED_OUTLIER_FRACTION,
            seed=split_seed,
        )

        # Everything except the model name and its metrics is identical for all
        # models evaluated on this split. 'model' is a placeholder that fixes
        # the column position in the resulting frame.
        row_template = {
            'repeat': int(repeat_idx),
            'split_seed': int(split_seed),
            'dataset': ds_name,
            'source': ds['source'],
            'model': None,
            'inlier_class': int(inlier_class),
            'inlier_raw_label': str(raw_label_of.get(inlier_class, 'unknown')),
            'outlier_raw_label': str(raw_label_of.get(1 - inlier_class, 'unknown')),
            'n_rows': int(ds['n_rows']),
            'n_features': int(ds['n_features']),
            'n_labeled_pos': int(pu['n_labeled_pos']),
            'n_unlabeled_pos': int(pu['n_unlabeled_pos']),
            'n_unlabeled_out': int(pu['n_unlabeled_out']),
            'n_unlabeled_total': int(pu['n_unlabeled_total']),
            'requested_unlabeled_outlier_fraction': float(pu['requested_unlabeled_outlier_fraction']),
            'actual_unlabeled_outlier_fraction': float(pu['actual_unlabeled_outlier_fraction']),
        }

        for model_name, clf in models.items():
            metrics = evaluate_on_pu(
                clf=clf,
                X_labeled_pos=pu['X_labeled_pos'],
                X_unlabeled=pu['X_unlabeled'],
                y_unlabeled_true=pu['y_unlabeled_true'],
                threshold=THRESHOLD,
                use_standard_scaler=USE_STANDARD_SCALER,
            )
            repeat_rows.append({**row_template, 'model': model_name, **metrics})

repeat_results_df = pd.DataFrame(repeat_rows)
if repeat_results_df.empty:
    raise RuntimeError('No repeat results were produced.')

# Two narrow views of the same rows: split composition first, metrics second.
repeat_view_cols = [
    'repeat', 'dataset', 'model', 'inlier_raw_label', 'outlier_raw_label',
    'n_labeled_pos', 'n_unlabeled_pos', 'n_unlabeled_out', 'n_unlabeled_total',
    'actual_unlabeled_outlier_fraction',
]
repeat_metric_cols = [
    'repeat', 'dataset', 'model',
    'accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'average_precision',
]

print('\nPer-repeat snapshot (first 20 rows): identifiers + composition')
display(repeat_results_df[repeat_view_cols].head(20))

print('Per-repeat snapshot (first 20 rows): metrics')
display(repeat_results_df[repeat_metric_cols].head(20))

Running repeat 1/10 ...
Running repeat 2/10 ...
Running repeat 3/10 ...
Running repeat 4/10 ...
Running repeat 5/10 ...
Running repeat 6/10 ...
Running repeat 7/10 ...
Running repeat 8/10 ...
Running repeat 9/10 ...
Running repeat 10/10 ...

Per-repeat snapshot (first 20 rows): identifiers + composition


Unnamed: 0,repeat,dataset,model,inlier_raw_label,outlier_raw_label,n_labeled_pos,n_unlabeled_pos,n_unlabeled_out,n_unlabeled_total,actual_unlabeled_outlier_fraction
0,0,breast_cancer,latest_v1,0,1,85,127,32,159,0.201258
1,0,breast_cancer,latest_v2,0,1,85,127,32,159,0.201258
2,0,diabetes,latest_v1,tested_negative,tested_positive,200,300,75,375,0.2
3,0,diabetes,latest_v2,tested_negative,tested_positive,200,300,75,375,0.2
4,0,spambase_local,latest_v1,0,1,1115,1673,418,2091,0.199904
5,0,spambase_local,latest_v2,0,1,1115,1673,418,2091,0.199904
6,0,banknote_authentication,latest_v1,0,1,305,457,114,571,0.19965
7,0,banknote_authentication,latest_v2,0,1,305,457,114,571,0.19965
8,0,rice_cammeo_osmancik,latest_v1,Cammeo,Osmancik,652,978,244,1222,0.199673
9,0,rice_cammeo_osmancik,latest_v2,Cammeo,Osmancik,652,978,244,1222,0.199673


Per-repeat snapshot (first 20 rows): metrics


Unnamed: 0,repeat,dataset,model,accuracy,balanced_accuracy,precision,recall,f1,roc_auc,average_precision
0,0,breast_cancer,latest_v1,0.836478,0.897638,0.551724,1.0,0.711111,0.982037,0.930622
1,0,breast_cancer,latest_v2,0.622642,0.76378,0.347826,1.0,0.516129,0.979823,0.929842
2,0,diabetes,latest_v1,0.658667,0.716667,0.348571,0.813333,0.488,0.777911,0.441941
3,0,diabetes,latest_v2,0.706667,0.656667,0.355372,0.573333,0.438776,0.694933,0.360762
4,0,spambase_local,latest_v1,0.856528,0.786512,0.633484,0.669856,0.651163,0.877234,0.68233
5,0,spambase_local,latest_v2,0.911047,0.810712,0.879085,0.643541,0.743094,0.913106,0.811214
6,0,banknote_authentication,latest_v1,0.935201,0.959519,0.754967,1.0,0.860377,1.0,1.0
7,0,banknote_authentication,latest_v2,0.992995,0.995624,0.966102,1.0,0.982759,1.0,1.0
8,0,rice_cammeo_osmancik,latest_v1,0.816694,0.874715,0.522026,0.971311,0.679083,0.970901,0.914168
9,0,rice_cammeo_osmancik,latest_v2,0.935352,0.881177,0.873303,0.790984,0.830108,0.976487,0.926353


In [17]:
metric_cols = [
    'accuracy',
    'balanced_accuracy',
    'precision',
    'recall',
    'f1',
    'roc_auc',
    'average_precision',
]

composition_cols = [
    'n_labeled_pos',
    'n_unlabeled_pos',
    'n_unlabeled_out',
    'n_unlabeled_total',
    'requested_unlabeled_outlier_fraction',
    'actual_unlabeled_outlier_fraction',
]

# Aggregate the per-repeat rows into one row per (dataset, model).
group_keys = ['dataset', 'model']
per_dataset_model = repeat_results_df.groupby(group_keys, as_index=False)

avg_metrics_df = per_dataset_model[metric_cols + composition_cols].mean().sort_values(group_keys)

std_column_names = {metric: f'{metric}_std' for metric in metric_cols}
std_metrics_df = per_dataset_model[metric_cols].std().rename(columns=std_column_names)

summary_df = avg_metrics_df.merge(std_metrics_df, on=group_keys, how='left')

summary_metric_view_cols = [
    'dataset', 'model',
    'roc_auc', 'average_precision', 'f1', 'balanced_accuracy', 'accuracy', 'precision', 'recall',
]
summary_composition_view_cols = [
    'dataset', 'model',
    'n_labeled_pos', 'n_unlabeled_pos', 'n_unlabeled_out', 'n_unlabeled_total',
    'requested_unlabeled_outlier_fraction', 'actual_unlabeled_outlier_fraction',
]
# Each metric is immediately followed by its standard deviation column.
summary_metric_std_cols = ['dataset', 'model'] + [
    column for metric in metric_cols for column in (metric, f'{metric}_std')
]

print('Average over repeats: compact metric view')
display(summary_df[summary_metric_view_cols].sort_values(['dataset', 'model']))

print('Average over repeats: PU composition view')
display(summary_df[summary_composition_view_cols].sort_values(['dataset', 'model']))

print('Average over repeats: metric mean/std view')
display(summary_df[summary_metric_std_cols].sort_values(['dataset', 'model']))

roc_auc_pivot = summary_df.pivot_table(index='dataset', columns='model', values='roc_auc')
print('Mean ROC-AUC by dataset/model:')
display(roc_auc_pivot)

# Unweighted mean over datasets: every dataset counts equally regardless of size.
avg_by_model = (
    summary_df
    .groupby('model', as_index=False)[metric_cols]
    .mean()
    .sort_values('roc_auc', ascending=False)
)
print('Dataset-averaged metrics by model:')
display(avg_by_model)

Average over repeats: compact metric view


Unnamed: 0,dataset,model,roc_auc,average_precision,f1,balanced_accuracy,accuracy,precision,recall
0,banknote_authentication,latest_v1,0.999981,0.999925,0.800546,0.936761,0.898774,0.669828,1.0
1,banknote_authentication,latest_v2,0.999954,0.999817,0.967504,0.991466,0.98634,0.938065,1.0
2,breast_cancer,latest_v1,0.928888,0.821946,0.632252,0.83029,0.784906,0.486974,0.90625
3,breast_cancer,latest_v2,0.950049,0.880342,0.669192,0.84984,0.795597,0.533702,0.940625
4,diabetes,latest_v1,0.742129,0.420379,0.455637,0.682167,0.641067,0.327285,0.750667
5,diabetes,latest_v2,0.7138,0.386098,0.406418,0.635,0.7072,0.369699,0.514667
6,rice_cammeo_osmancik,latest_v1,0.971604,0.918961,0.684296,0.876604,0.821195,0.529193,0.968852
7,rice_cammeo_osmancik,latest_v2,0.971386,0.921585,0.830209,0.877433,0.936498,0.889929,0.779098
8,spambase_local,latest_v1,0.874552,0.664982,0.649607,0.795999,0.846294,0.597941,0.712201
9,spambase_local,latest_v2,0.903254,0.80562,0.733138,0.814126,0.903013,0.817524,0.666029


Average over repeats: PU composition view


Unnamed: 0,dataset,model,n_labeled_pos,n_unlabeled_pos,n_unlabeled_out,n_unlabeled_total,requested_unlabeled_outlier_fraction,actual_unlabeled_outlier_fraction
0,banknote_authentication,latest_v1,305.0,457.0,114.0,571.0,0.2,0.19965
1,banknote_authentication,latest_v2,305.0,457.0,114.0,571.0,0.2,0.19965
2,breast_cancer,latest_v1,85.0,127.0,32.0,159.0,0.2,0.201258
3,breast_cancer,latest_v2,85.0,127.0,32.0,159.0,0.2,0.201258
4,diabetes,latest_v1,200.0,300.0,75.0,375.0,0.2,0.2
5,diabetes,latest_v2,200.0,300.0,75.0,375.0,0.2,0.2
6,rice_cammeo_osmancik,latest_v1,652.0,978.0,244.0,1222.0,0.2,0.199673
7,rice_cammeo_osmancik,latest_v2,652.0,978.0,244.0,1222.0,0.2,0.199673
8,spambase_local,latest_v1,1115.0,1673.0,418.0,2091.0,0.2,0.199904
9,spambase_local,latest_v2,1115.0,1673.0,418.0,2091.0,0.2,0.199904


Average over repeats: metric mean/std view


Unnamed: 0,dataset,model,accuracy,accuracy_std,balanced_accuracy,balanced_accuracy_std,precision,precision_std,recall,recall_std,f1,f1_std,roc_auc,roc_auc_std,average_precision,average_precision_std
0,banknote_authentication,latest_v1,0.898774,0.031962,0.936761,0.019967,0.669828,0.065954,1.0,0.0,0.800546,0.048587,0.999981,3.8e-05,0.999925,0.000151
1,banknote_authentication,latest_v2,0.98634,0.010726,0.991466,0.006701,0.938065,0.046537,1.0,0.0,0.967504,0.024939,0.999954,0.000139,0.999817,0.000553
2,breast_cancer,latest_v1,0.784906,0.053842,0.83029,0.054494,0.486974,0.064483,0.90625,0.069096,0.632252,0.068349,0.928888,0.033156,0.821946,0.069891
3,breast_cancer,latest_v2,0.795597,0.108337,0.84984,0.07561,0.533702,0.159372,0.940625,0.061538,0.669192,0.12943,0.950049,0.040534,0.880342,0.078021
4,diabetes,latest_v1,0.641067,0.026104,0.682167,0.034012,0.327285,0.025431,0.750667,0.056934,0.455637,0.033636,0.742129,0.046958,0.420379,0.073907
5,diabetes,latest_v2,0.7072,0.097079,0.635,0.053943,0.369699,0.078523,0.514667,0.205043,0.406418,0.075558,0.7138,0.048031,0.386098,0.078355
6,rice_cammeo_osmancik,latest_v1,0.821195,0.014487,0.876604,0.011089,0.529193,0.020943,0.968852,0.011784,0.684296,0.018375,0.971604,0.005316,0.918961,0.010967
7,rice_cammeo_osmancik,latest_v2,0.936498,0.00525,0.877433,0.015215,0.889929,0.020007,0.779098,0.033432,0.830209,0.017024,0.971386,0.005145,0.921585,0.010091
8,spambase_local,latest_v1,0.846294,0.010739,0.795999,0.011947,0.597941,0.026552,0.712201,0.024268,0.649607,0.018356,0.874552,0.010362,0.664982,0.021142
9,spambase_local,latest_v2,0.903013,0.010523,0.814126,0.015341,0.817524,0.04793,0.666029,0.030097,0.733138,0.026084,0.903254,0.014547,0.80562,0.020698


Mean ROC-AUC by dataset/model:


model,latest_v1,latest_v2
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
banknote_authentication,0.999981,0.999954
breast_cancer,0.928888,0.950049
diabetes,0.742129,0.7138
rice_cammeo_osmancik,0.971604,0.971386
spambase_local,0.874552,0.903254


Dataset-averaged metrics by model:


Unnamed: 0,model,accuracy,balanced_accuracy,precision,recall,f1,roc_auc,average_precision
1,latest_v2,0.86573,0.833573,0.709784,0.780084,0.721292,0.907689,0.798692
0,latest_v1,0.798447,0.824364,0.522244,0.867594,0.644468,0.903431,0.765238


In [18]:
# Optional: inspect one dataset/model in detail
# The example is simply the first row of the summary table.
first_summary_row = summary_df.iloc[0]
example_dataset = first_summary_row['dataset']
example_model = first_summary_row['model']

print(f'Detailed per-repeat rows for dataset={example_dataset}, model={example_model}: composition')
is_example_row = (
    repeat_results_df['dataset'].eq(example_dataset)
    & repeat_results_df['model'].eq(example_model)
)
detailed_df = repeat_results_df.loc[is_example_row].sort_values('repeat')

detail_composition_cols = [
    'repeat', 'split_seed', 'n_labeled_pos', 'n_unlabeled_pos', 'n_unlabeled_out',
    'n_unlabeled_total', 'actual_unlabeled_outlier_fraction',
]
display(detailed_df[detail_composition_cols])

print('Detailed per-repeat rows: metrics')
detail_metric_cols = [
    'repeat', 'accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'average_precision',
]
display(detailed_df[detail_metric_cols])


Detailed per-repeat rows for dataset=banknote_authentication, model=latest_v1: composition


Unnamed: 0,repeat,split_seed,n_labeled_pos,n_unlabeled_pos,n_unlabeled_out,n_unlabeled_total,actual_unlabeled_outlier_fraction
6,0,20260222,305,457,114,571,0.19965
16,1,20261222,305,457,114,571,0.19965
26,2,20262222,305,457,114,571,0.19965
36,3,20263222,305,457,114,571,0.19965
46,4,20264222,305,457,114,571,0.19965
56,5,20265222,305,457,114,571,0.19965
66,6,20266222,305,457,114,571,0.19965
76,7,20267222,305,457,114,571,0.19965
86,8,20268222,305,457,114,571,0.19965
96,9,20269222,305,457,114,571,0.19965


Detailed per-repeat rows: metrics


Unnamed: 0,repeat,accuracy,balanced_accuracy,precision,recall,f1,roc_auc,average_precision
6,0,0.935201,0.959519,0.754967,1.0,0.860377,1.0,1.0
16,1,0.922942,0.95186,0.721519,1.0,0.838235,1.0,1.0
26,2,0.921191,0.950766,0.716981,1.0,0.835165,0.999942,0.999775
36,3,0.870403,0.919037,0.606383,1.0,0.754967,1.0,1.0
46,4,0.887916,0.929978,0.640449,1.0,0.780822,0.999885,0.999548
56,5,0.900175,0.937637,0.666667,1.0,0.8,1.0,1.0
66,6,0.828371,0.892779,0.537736,1.0,0.699387,1.0,1.0
76,7,0.89317,0.93326,0.651429,1.0,0.788927,1.0,1.0
86,8,0.928196,0.955142,0.735484,1.0,0.847584,1.0,1.0
96,9,0.900175,0.937637,0.666667,1.0,0.8,0.999981,0.999924
