# Evaluation of `pretrain_v2` Pretrained Model

This notebook evaluates `pretrain_v2/pretrained_model/latest.pt` on a small set of binary classification benchmarks.

- Includes dataset types: mostly/all continuous, mixed continuous+categorical, mostly/all categorical.
- Builds feature metadata required by the model:
  - `feature_is_categorical`
  - `feature_cardinalities`
- Converts each dataset to PU format for 10 independent replicates.
- Reports average classification + outlier-detection metrics.

In [22]:
from __future__ import annotations

import json
from io import BytesIO, TextIOWrapper
import math
import sys
import zipfile
from pathlib import Path
from typing import Dict, Optional

import numpy as np
import pandas as pd
import torch
from scipy.io import arff
from urllib.request import urlretrieve
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    balanced_accuracy_score,
    roc_auc_score,
    roc_curve,
)


In [23]:
# Path resolution for both full-repo and standalone pretrain_v2 usage.
cwd = Path.cwd().resolve()
if (cwd / "pretrain_v2").exists():
    repo_root = cwd
    pretrain_root = cwd / "pretrain_v2"
elif (cwd / "model.py").exists() and (cwd / "__init__.py").exists() and cwd.name == "pretrain_v2":
    pretrain_root = cwd
    repo_root = cwd.parent
else:
    raise RuntimeError(
        "Run this notebook either from the repo root (containing pretrain_v2/) "
        "or from inside the pretrain_v2 folder."
    )

if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

from pretrain_v2.model import NanoTabPFNPUModel

print(f"repo_root={repo_root}")
print(f"pretrain_root={pretrain_root}")

repo_root=/Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain
pretrain_root=/Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain/pretrain_v2


In [24]:
# ===== User-configurable evaluation settings =====
if torch.cuda.is_available():
    DEVICE = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and torch.backends.mps.is_built():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

CHECKPOINT_PATH = pretrain_root / "pretrained_model" / "latest.pt"
LEGACY_CHECKPOINT_PATH = pretrain_root / "saved_models" / "legacy_model.pt"
LEGACY_MODEL_COMMIT = "bfa65b8"

OUTPUT_DIR = pretrain_root / "evaluation_outputs"
CACHE_DIR = pretrain_root / ".cache"

# Download UCI Repository datasets (required for this benchmark set).
ALLOW_UCI_DOWNLOAD = True

N_REPLICATES = 10
MAX_ATTEMPTS_PER_DATASET = 200

MAX_POSITIVE_SIZE = 900
UNLABELED_LABELED_POSITIVE_RATIO = (2, 1)  # unlabeled:labeled among selected positives
OUTLIER_RATE = 0.13  # fraction of outliers in unlabeled set

GLOBAL_SEED = 42

if not CHECKPOINT_PATH.exists():
    raise FileNotFoundError(f"Checkpoint not found: {CHECKPOINT_PATH}")
if not LEGACY_CHECKPOINT_PATH.exists():
    raise FileNotFoundError(f"Legacy checkpoint not found: {LEGACY_CHECKPOINT_PATH}")
if not (0.0 <= OUTLIER_RATE < 1.0):
    raise ValueError("OUTLIER_RATE must satisfy 0 <= OUTLIER_RATE < 1.")

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

print(f"DEVICE={DEVICE}")
print(f"CHECKPOINT_PATH={CHECKPOINT_PATH}")
print(f"LEGACY_CHECKPOINT_PATH={LEGACY_CHECKPOINT_PATH}")
print(f"LEGACY_MODEL_COMMIT={LEGACY_MODEL_COMMIT}")


DEVICE=mps
CHECKPOINT_PATH=/Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain/pretrain_v2/pretrained_model/latest.pt
LEGACY_CHECKPOINT_PATH=/Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain/pretrain_v2/saved_models/legacy_model.pt
LEGACY_MODEL_COMMIT=bfa65b8


In [25]:
import torch.nn.functional as F
from torch import nn
from torch.nn.modules.transformer import LayerNorm, Linear, MultiheadAttention


class LegacyNanoTabPFNPUModel(nn.Module):
    """Legacy PU-adapted NanoTabPFN model from commit bfa65b8."""

    def __init__(
        self,
        embedding_size: int,
        num_attention_heads: int,
        mlp_hidden_size: int,
        num_layers: int,
        num_outputs: int = 2,
    ):
        super().__init__()
        self.feature_encoder = LegacyFeatureEncoder(embedding_size)
        self.target_encoder = LegacyTargetEncoderPU(embedding_size)
        self.transformer_blocks = nn.ModuleList(
            [
                LegacyTransformerEncoderLayerPU(
                    embedding_size=embedding_size,
                    nhead=num_attention_heads,
                    mlp_hidden_size=mlp_hidden_size,
                )
                for _ in range(num_layers)
            ]
        )
        self.decoder = LegacyDecoder(embedding_size, mlp_hidden_size, num_outputs)

    def forward(self, src: tuple[torch.Tensor, torch.Tensor], train_test_split_index: int) -> torch.Tensor:
        x_src, y_src = src
        if len(y_src.shape) < len(x_src.shape):
            y_src = y_src.unsqueeze(-1)

        x_src = self.feature_encoder(x_src, train_test_split_index)
        num_rows = x_src.shape[1]
        y_src = self.target_encoder(y_src, num_rows)
        src_table = torch.cat([x_src, y_src], dim=2)

        for block in self.transformer_blocks:
            src_table = block(src_table, train_test_split_index=train_test_split_index)

        output = src_table[:, train_test_split_index:, -1, :]
        return self.decoder(output)


class LegacyFeatureEncoder(nn.Module):
    def __init__(self, embedding_size: int):
        super().__init__()
        self.linear_layer = nn.Linear(1, embedding_size)

    def forward(self, x: torch.Tensor, train_test_split_index: int) -> torch.Tensor:
        x = x.unsqueeze(-1)
        train_rows = int(max(0, min(train_test_split_index, x.shape[1])))
        if train_rows >= 2:
            train_slice = x[:, :train_rows]
            mean = torch.mean(train_slice, dim=1, keepdim=True)
            std = torch.std(train_slice, dim=1, keepdim=True, unbiased=False).clamp_min(1e-20)
        elif train_rows == 1:
            train_slice = x[:, :1]
            mean = torch.mean(train_slice, dim=1, keepdim=True)
            std = torch.ones_like(mean)
        else:
            mean = torch.zeros_like(x[:, :1])
            std = torch.ones_like(x[:, :1])
        x = (x - mean) / std
        x = torch.clip(x, min=-100, max=100)
        return self.linear_layer(x)


class LegacyTargetEncoderPU(nn.Module):
    def __init__(self, embedding_size: int):
        super().__init__()
        self.linear_layer = nn.Linear(1, embedding_size)
        self.unlabeled_embedding = nn.Parameter(torch.zeros(1, 1, embedding_size))
        nn.init.normal_(self.unlabeled_embedding, std=0.02)

    def forward(self, y_train: torch.Tensor, num_rows: int) -> torch.Tensor:
        if y_train.dim() == 2:
            y_train = y_train.unsqueeze(-1)
        if y_train.shape[1] > num_rows:
            raise ValueError("y_train rows exceed total num_rows.")

        batch_size = y_train.shape[0]
        pad_rows = num_rows - y_train.shape[1]
        if pad_rows > 0:
            padding = torch.full(
                (batch_size, pad_rows, 1),
                -1.0,
                dtype=y_train.dtype,
                device=y_train.device,
            )
            y_full = torch.cat([y_train, padding], dim=1)
        else:
            y_full = y_train

        observed_mask = y_full >= 0
        y_for_linear = y_full.clone()
        y_for_linear[~observed_mask] = 0.0
        embedded = self.linear_layer(y_for_linear)

        unlabeled = self.unlabeled_embedding.expand(batch_size, num_rows, -1)
        embedded = torch.where(observed_mask.expand_as(embedded), embedded, unlabeled)
        return embedded.unsqueeze(2)


class LegacyTransformerEncoderLayerPU(nn.Module):
    def __init__(
        self,
        embedding_size: int,
        nhead: int,
        mlp_hidden_size: int,
        layer_norm_eps: float = 1e-5,
        batch_first: bool = True,
        device=None,
        dtype=None,
    ):
        super().__init__()
        self.self_attention_between_datapoints = MultiheadAttention(
            embedding_size, nhead, batch_first=batch_first, device=device, dtype=dtype
        )
        self.self_attention_between_features = MultiheadAttention(
            embedding_size, nhead, batch_first=batch_first, device=device, dtype=dtype
        )

        self.linear1 = Linear(embedding_size, mlp_hidden_size, device=device, dtype=dtype)
        self.linear2 = Linear(mlp_hidden_size, embedding_size, device=device, dtype=dtype)

        self.norm1 = LayerNorm(embedding_size, eps=layer_norm_eps, device=device, dtype=dtype)
        self.norm2 = LayerNorm(embedding_size, eps=layer_norm_eps, device=device, dtype=dtype)
        self.norm3 = LayerNorm(embedding_size, eps=layer_norm_eps, device=device, dtype=dtype)

    def forward(self, src: torch.Tensor, train_test_split_index: int) -> torch.Tensor:
        batch_size, rows_size, col_size, embedding_size = src.shape

        src_f = src.reshape(batch_size * rows_size, col_size, embedding_size)
        src_f = self.self_attention_between_features(src_f, src_f, src_f)[0] + src_f
        src = src_f.reshape(batch_size, rows_size, col_size, embedding_size)
        src = self.norm1(src)

        src = src.transpose(1, 2)
        src_d = src.reshape(batch_size * col_size, rows_size, embedding_size)

        src_left = self.self_attention_between_datapoints(
            src_d[:, :train_test_split_index],
            src_d[:, :train_test_split_index],
            src_d[:, :train_test_split_index],
        )[0]
        src_right = self.self_attention_between_datapoints(
            src_d[:, train_test_split_index:],
            src_d,
            src_d,
        )[0]
        src_d = torch.cat([src_left, src_right], dim=1) + src_d

        src = src_d.reshape(batch_size, col_size, rows_size, embedding_size)
        src = src.transpose(2, 1)
        src = self.norm2(src)

        src = self.linear2(F.gelu(self.linear1(src))) + src
        src = self.norm3(src)
        return src


class LegacyDecoder(nn.Module):
    def __init__(self, embedding_size: int, mlp_hidden_size: int, num_outputs: int):
        super().__init__()
        self.linear1 = nn.Linear(embedding_size, mlp_hidden_size)
        self.linear2 = nn.Linear(mlp_hidden_size, num_outputs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear2(F.gelu(self.linear1(x)))


def load_latest_model_from_checkpoint(checkpoint_path: Path, device: str = "cpu"):
    payload = torch.load(checkpoint_path, map_location=device)
    model_cfg = payload.get("config", {}).get("model", {})

    model = NanoTabPFNPUModel(
        embedding_size=int(model_cfg.get("embedding_size", 128)),
        num_attention_heads=int(model_cfg.get("num_attention_heads", 8)),
        mlp_hidden_size=int(model_cfg.get("mlp_hidden_size", 256)),
        num_layers=int(model_cfg.get("num_layers", 6)),
        num_outputs=int(model_cfg.get("num_outputs", 2)),
        max_categorical_classes=int(model_cfg.get("max_categorical_classes", 64)),
    ).to(device)

    state_dict = payload.get("model_state_dict", payload)
    load_result = model.load_state_dict(state_dict, strict=False)
    model.eval()
    return model, payload, load_result


def load_legacy_model_from_checkpoint(checkpoint_path: Path, device: str = "cpu"):
    payload = torch.load(checkpoint_path, map_location=device)
    model_cfg = payload.get("config", {}).get("model", {})

    model = LegacyNanoTabPFNPUModel(
        embedding_size=int(model_cfg.get("embedding_size", 128)),
        num_attention_heads=int(model_cfg.get("num_attention_heads", 8)),
        mlp_hidden_size=int(model_cfg.get("mlp_hidden_size", 256)),
        num_layers=int(model_cfg.get("num_layers", 6)),
        num_outputs=int(model_cfg.get("num_outputs", 2)),
    ).to(device)

    state_dict = payload.get("model_state_dict", payload)
    load_result = model.load_state_dict(state_dict, strict=False)
    model.eval()
    return model, payload, load_result


latest_model, latest_payload, latest_load_result = load_latest_model_from_checkpoint(CHECKPOINT_PATH, device=DEVICE)
legacy_model, legacy_payload, legacy_load_result = load_legacy_model_from_checkpoint(LEGACY_CHECKPOINT_PATH, device=DEVICE)

MAX_CATEGORICAL_CLASSES = int(latest_model.feature_encoder.categorical_embedding.num_embeddings - 1)

MODEL_SPECS = [
    {
        "model_name": "latest",
        "model": latest_model,
        "supports_categorical": True,
        "checkpoint_path": str(CHECKPOINT_PATH),
    },
    {
        "model_name": "legacy",
        "model": legacy_model,
        "supports_categorical": False,
        "checkpoint_path": str(LEGACY_CHECKPOINT_PATH),
    },
]

print("Loaded models for comparison.")
print(f"max_categorical_classes={MAX_CATEGORICAL_CLASSES}")
print(
    "latest model: "
    f"missing_keys={len(latest_load_result.missing_keys)}, unexpected_keys={len(latest_load_result.unexpected_keys)}"
)
print(
    "legacy model: "
    f"missing_keys={len(legacy_load_result.missing_keys)}, unexpected_keys={len(legacy_load_result.unexpected_keys)}"
)


Loaded models for comparison.
max_categorical_classes=64
latest model: missing_keys=0, unexpected_keys=0
legacy model: missing_keys=0, unexpected_keys=0


In [26]:
UCI_BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases"


def _download_with_cache(url: str, subdir: str, filename: str, allow_download: bool = True) -> Path:
    target_dir = CACHE_DIR / "uci" / subdir
    target_dir.mkdir(parents=True, exist_ok=True)
    local_path = target_dir / filename

    if local_path.exists():
        return local_path
    if not allow_download:
        raise FileNotFoundError(
            f"UCI cached file not found and downloads are disabled: {local_path}. "
            "Set ALLOW_UCI_DOWNLOAD=True to fetch it."
        )

    urlretrieve(url, local_path)
    return local_path


def _strip_object_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    object_cols = df.select_dtypes(include=["object", "string"]).columns
    for col in object_cols:
        df[col] = df[col].astype("string").str.strip()
    return df


def _read_uci_table_from_zip(zip_path: Path) -> pd.DataFrame:
    with zipfile.ZipFile(zip_path, "r") as zf:
        members = [
            name
            for name in zf.namelist()
            if not name.endswith("/") and not name.lower().startswith("__macosx/")
        ]
        if len(members) == 0:
            raise FileNotFoundError(f"No data files found in zip: {zip_path}")

        preferred_exts = (".csv", ".arff", ".data", ".txt", ".xlsx", ".xls")
        ordered_members = []
        for ext in preferred_exts:
            ordered_members.extend([name for name in members if name.lower().endswith(ext)])
        ordered_members.extend([name for name in members if name not in ordered_members])

        last_error = None
        for selected in ordered_members:
            try:
                with zf.open(selected) as f:
                    lower = selected.lower()
                    if lower.endswith(".csv"):
                        df = pd.read_csv(f)
                    elif lower.endswith(".arff"):
                        with TextIOWrapper(f, encoding="utf-8", errors="ignore") as txt_f:
                            data, _ = arff.loadarff(txt_f)
                        df = pd.DataFrame(data)
                        for col in df.columns:
                            if df[col].dtype == object:
                                df[col] = df[col].apply(
                                    lambda value: value.decode("utf-8") if isinstance(value, (bytes, bytearray)) else value
                                )
                    elif lower.endswith(".xlsx") or lower.endswith(".xls"):
                        df = pd.read_excel(BytesIO(f.read()))
                    else:
                        df = pd.read_csv(f)
                return _strip_object_columns(df)
            except Exception as exc:
                last_error = exc

        raise RuntimeError(f"Failed to parse any file from zip: {zip_path}") from last_error


def get_benchmark_datasets(allow_uci_download: bool = True, binary_seed: int = 42):
    datasets = []
    root_missing_drop_threshold = 0.20

    # 1) UCI Breast Cancer Wisconsin (Diagnostic)
    # - Remove ID column.
    # - All features are continuous.
    wdbc_feature_names = [
        "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
        "compactness_mean", "concavity_mean", "concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
        "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
        "compactness_se", "concavity_se", "concave_points_se", "symmetry_se", "fractal_dimension_se",
        "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
        "compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst", "fractal_dimension_worst",
    ]
    wdbc_cols = ["id", "target"] + wdbc_feature_names
    wdbc_path = _download_with_cache(
        f"{UCI_BASE_URL}/breast-cancer-wisconsin/wdbc.data",
        subdir="wdbc",
        filename="wdbc.data",
        allow_download=allow_uci_download,
    )
    wdbc_df = pd.read_csv(wdbc_path, header=None, names=wdbc_cols)
    datasets.append(
        {
            "name": "uci_wdbc_continuous",
            "source": "uci:wdbc",
            "X": wdbc_df[wdbc_feature_names].copy(),
            "y": wdbc_df["target"].copy(),
            "schema_hint": {"force_all_continuous": True},
        }
    )

    # 2) UCI Adult
    # - Drop rows with any missing values.
    # - Treat only Categorical/Binary variables as categorical.
    # - Treat Integer variables as continuous.
    adult_cols = [
        "age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
        "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
        "hours_per_week", "native_country", "target",
    ]
    adult_path = _download_with_cache(
        f"{UCI_BASE_URL}/adult/adult.data",
        subdir="adult",
        filename="adult.data",
        allow_download=allow_uci_download,
    )
    adult_df = pd.read_csv(adult_path, header=None, names=adult_cols, na_values=["?"], skipinitialspace=True)
    adult_df = _strip_object_columns(adult_df)
    adult_df = adult_df.dropna(axis=0).reset_index(drop=True)

    adult_categorical_cols = [
        "workclass", "education", "marital_status", "occupation", "relationship",
        "race", "sex", "native_country",
    ]
    adult_feature_cols = [col for col in adult_cols if col != "target"]
    adult_continuous_cols = [col for col in adult_feature_cols if col not in adult_categorical_cols]

    datasets.append(
        {
            "name": "uci_adult_mixed",
            "source": "uci:adult",
            "X": adult_df[adult_feature_cols].copy(),
            "y": adult_df["target"].copy(),
            "schema_hint": {
                "force_categorical_cols": adult_categorical_cols,
                "force_continuous_cols": adult_continuous_cols,
            },
        }
    )

    # 3) UCI Spambase - all continuous
    spambase_cols = [f"f{i}" for i in range(1, 58)] + ["target"]
    spambase_path = _download_with_cache(
        f"{UCI_BASE_URL}/spambase/spambase.data",
        subdir="spambase",
        filename="spambase.data",
        allow_download=allow_uci_download,
    )
    spambase_df = pd.read_csv(spambase_path, header=None, names=spambase_cols)
    datasets.append(
        {
            "name": "uci_spambase_continuous",
            "source": "uci:spambase",
            "X": spambase_df[[f"f{i}" for i in range(1, 58)]].copy(),
            "y": spambase_df["target"].copy(),
            "schema_hint": {"force_all_continuous": True},
        }
    )

    # 4) UCI Mushroom - all categorical
    # - Drop stalk-root if its missing rate is high; then drop remaining missing rows.
    mushroom_feature_cols = [
        "cap_shape", "cap_surface", "cap_color", "bruises", "odor",
        "gill_attachment", "gill_spacing", "gill_size", "gill_color", "stalk_shape",
        "stalk_root", "stalk_surface_above_ring", "stalk_surface_below_ring",
        "stalk_color_above_ring", "stalk_color_below_ring", "veil_type", "veil_color",
        "ring_number", "ring_type", "spore_print_color", "population", "habitat",
    ]
    mushroom_cols = ["target"] + mushroom_feature_cols
    mushroom_path = _download_with_cache(
        f"{UCI_BASE_URL}/mushroom/agaricus-lepiota.data",
        subdir="mushroom",
        filename="agaricus-lepiota.data",
        allow_download=allow_uci_download,
    )
    mushroom_df = pd.read_csv(mushroom_path, header=None, names=mushroom_cols, na_values=["?"], skipinitialspace=True)
    mushroom_df = _strip_object_columns(mushroom_df)

    stalk_root_missing_rate = float(mushroom_df["stalk_root"].isna().mean())
    if stalk_root_missing_rate > root_missing_drop_threshold:
        mushroom_df = mushroom_df.drop(columns=["stalk_root"])

    mushroom_df = mushroom_df.dropna(axis=0).reset_index(drop=True)
    mushroom_X_cols = [col for col in mushroom_df.columns if col != "target"]
    datasets.append(
        {
            "name": "uci_mushroom_categorical",
            "source": "uci:mushroom",
            "X": mushroom_df[mushroom_X_cols].copy(),
            "y": mushroom_df["target"].copy(),
            "schema_hint": {"force_all_categorical": True},
        }
    )

    # 5) UCI MAGIC Gamma Telescope - all continuous
    magic_feature_cols = [
        "fLength", "fWidth", "fSize", "fConc", "fConc1",
        "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    ]
    magic_cols = magic_feature_cols + ["target"]
    magic_path = _download_with_cache(
        f"{UCI_BASE_URL}/magic/magic04.data",
        subdir="magic_gamma_telescope",
        filename="magic04.data",
        allow_download=allow_uci_download,
    )
    magic_df = pd.read_csv(magic_path, header=None, names=magic_cols)
    datasets.append(
        {
            "name": "uci_magic_gamma_continuous",
            "source": "uci:magic-gamma-telescope",
            "X": magic_df[magic_feature_cols].copy(),
            "y": magic_df["target"].copy(),
            "schema_hint": {"force_all_continuous": True},
        }
    )

    # 6) UCI Car Evaluation - all categorical
    # Target mapping: {unacc, acc} vs {good, vgood}
    car_cols = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "target"]
    car_path = _download_with_cache(
        f"{UCI_BASE_URL}/car/car.data",
        subdir="car_evaluation",
        filename="car.data",
        allow_download=allow_uci_download,
    )
    car_df = pd.read_csv(car_path, header=None, names=car_cols)
    car_df = _strip_object_columns(car_df)

    car_binary_target = car_df["target"].map(
        {
            "unacc": "unacc_or_acc",
            "acc": "unacc_or_acc",
            "good": "good_or_vgood",
            "vgood": "good_or_vgood",
        }
    )
    car_valid = car_binary_target.notna()
    car_df = car_df.loc[car_valid].reset_index(drop=True)
    car_binary_target = car_binary_target.loc[car_valid].reset_index(drop=True)

    datasets.append(
        {
            "name": "uci_car_evaluation_categorical",
            "source": "uci:car-evaluation",
            "X": car_df[["buying", "maint", "doors", "persons", "lug_boot", "safety"]].copy(),
            "y": car_binary_target,
            "schema_hint": {"force_all_categorical": True},
        }
    )

    # 7) UCI Banknote Authentication - all continuous
    banknote_cols = ["variance", "skewness", "curtosis", "entropy", "target"]
    banknote_path = _download_with_cache(
        f"{UCI_BASE_URL}/data_banknote_authentication/data_banknote_authentication.txt",
        subdir="banknote_authentication",
        filename="data_banknote_authentication.txt",
        allow_download=allow_uci_download,
    )
    banknote_df = pd.read_csv(banknote_path, header=None, names=banknote_cols)
    datasets.append(
        {
            "name": "uci_banknote_authentication_continuous",
            "source": "uci:banknote-authentication",
            "X": banknote_df[["variance", "skewness", "curtosis", "entropy"]].copy(),
            "y": banknote_df["target"].copy(),
            "schema_hint": {"force_all_continuous": True},
        }
    )

    # 8) UCI Rice (Cammeo and Osmancik) - all continuous (including integer columns)
    rice_zip_path = _download_with_cache(
        "https://archive.ics.uci.edu/static/public/545/rice+cammeo+and+osmancik.zip",
        subdir="rice_cammeo_osmancik",
        filename="rice+cammeo+and+osmancik.zip",
        allow_download=allow_uci_download,
    )
    rice_df = _read_uci_table_from_zip(rice_zip_path)
    rice_target_col = next(
        (candidate for candidate in ["Class", "class", "target", "Target"] if candidate in rice_df.columns),
        rice_df.columns[-1],
    )
    rice_feature_cols = [col for col in rice_df.columns if col != rice_target_col]
    datasets.append(
        {
            "name": "uci_rice_cammeo_osmancik_continuous",
            "source": "uci:rice-cammeo-and-osmancik",
            "X": rice_df[rice_feature_cols].copy(),
            "y": rice_df[rice_target_col].copy(),
            "schema_hint": {"force_all_continuous": True},
        }
    )

    # 9) UCI Default of Credit Card Clients - all continuous, remove ID column
    default_credit_zip_path = _download_with_cache(
        "https://archive.ics.uci.edu/static/public/350/default+of+credit+card+clients.zip",
        subdir="default_credit_card_clients",
        filename="default+of+credit+card+clients.zip",
        allow_download=allow_uci_download,
    )
    default_credit_df = _read_uci_table_from_zip(default_credit_zip_path)
    default_credit_df = _strip_object_columns(default_credit_df)

    def _normalize_col_name(col_name: object) -> str:
        text = str(col_name).strip().lower()
        text = "".join(ch if ch.isalnum() else "_" for ch in text)
        while "__" in text:
            text = text.replace("__", "_")
        return text.strip("_")

    def _is_placeholder_col_name(col_name: object) -> bool:
        norm = _normalize_col_name(col_name)
        return norm.startswith("unnamed") or norm.isdigit()

    # If parser produced placeholder columns and first row looks like the true header, promote it.
    if default_credit_df.shape[0] > 0:
        first_row_norm = [_normalize_col_name(v) for v in default_credit_df.iloc[0].tolist()]
        header_keywords = {"id", "y", "limit_bal", "pay_0", "bill_amt1", "default_payment_next_month"}
        placeholder_cols = all(_is_placeholder_col_name(col) for col in default_credit_df.columns)
        if placeholder_cols or len(header_keywords & set(first_row_norm)) >= 2:
            default_credit_df.columns = default_credit_df.iloc[0].astype(str).tolist()
            default_credit_df = default_credit_df.iloc[1:].reset_index(drop=True)
            default_credit_df = _strip_object_columns(default_credit_df)

    # Remove duplicated header-like first row if present after parsing.
    if default_credit_df.shape[0] > 0:
        first_row = default_credit_df.iloc[0].astype("string").str.lower().tolist()
        col_tokens = [str(col).strip().lower() for col in default_credit_df.columns.tolist()]
        if sum(int(a == b) for a, b in zip(first_row, col_tokens)) >= max(3, len(col_tokens) // 2):
            default_credit_df = default_credit_df.iloc[1:].reset_index(drop=True)

    normalized_to_original = {_normalize_col_name(col): col for col in default_credit_df.columns}
    # Per dataset spec, prefer target variable Y.
    preferred_norm_names = [
        "y",
        "default_payment_next_month",
        "defaultpaymentnextmonth",
        "target",
    ]
    default_credit_target_col = next(
        (normalized_to_original[name] for name in preferred_norm_names if name in normalized_to_original),
        None,
    )

    if default_credit_target_col is None:
        contains_default_cols = [
            col
            for col in default_credit_df.columns
            if "default" in _normalize_col_name(col) and "month" in _normalize_col_name(col)
        ]
        if len(contains_default_cols) > 0:
            default_credit_target_col = contains_default_cols[0]

    if default_credit_target_col is None:
        binary_cols = [
            col
            for col in default_credit_df.columns
            if int(pd.Series(default_credit_df[col]).nunique(dropna=True)) == 2
        ]
        named_binary_cols = [
            col
            for col in binary_cols
            if _normalize_col_name(col) in {"y", "target"} or "default" in _normalize_col_name(col)
        ]
        if len(named_binary_cols) > 0:
            default_credit_target_col = named_binary_cols[0]
        elif len(binary_cols) > 0:
            default_credit_target_col = binary_cols[-1]

    if default_credit_target_col is None:
        raise ValueError("Could not infer target column for default credit dataset.")

    default_credit_target = pd.to_numeric(default_credit_df[default_credit_target_col], errors="coerce")
    valid_target = default_credit_target.notna()
    default_credit_df = default_credit_df.loc[valid_target].reset_index(drop=True)
    default_credit_target = default_credit_target.loc[valid_target].reset_index(drop=True)

    if int(default_credit_target.nunique(dropna=True)) != 2:
        unique_counts = {
            str(col): int(pd.Series(default_credit_df[col]).nunique(dropna=True))
            for col in default_credit_df.columns
        }
        raise ValueError(
            "Default credit target is not binary after parsing. "
            f"Selected target column={default_credit_target_col!r}, "
            f"num_classes={int(default_credit_target.nunique(dropna=True))}, "
            f"column_nunique={unique_counts}"
        )

    default_credit_feature_cols = [
        col
        for col in default_credit_df.columns
        if col != default_credit_target_col and _normalize_col_name(col) != "id"
    ]
    datasets.append(
        {
            "name": "uci_default_credit_card_clients_continuous",
            "source": "uci:default-of-credit-card-clients",
            "X": default_credit_df[default_credit_feature_cols].copy(),
            "y": default_credit_target.astype(np.int64).copy(),
            "schema_hint": {"force_all_continuous": True},
        }
    )

    # 10) UCI Abalone - convert Rings to binary using a data-driven cutoff (median Rings)
    abalone_cols = [
        "Sex", "Length", "Diameter", "Height", "Whole_weight", "Shucked_weight",
        "Viscera_weight", "Shell_weight", "Rings",
    ]
    abalone_path = _download_with_cache(
        f"{UCI_BASE_URL}/abalone/abalone.data",
        subdir="abalone",
        filename="abalone.data",
        allow_download=allow_uci_download,
    )
    abalone_df = pd.read_csv(abalone_path, header=None, names=abalone_cols)
    abalone_df = _strip_object_columns(abalone_df)

    rings_series = pd.to_numeric(abalone_df["Rings"], errors="coerce")
    rings_cutoff = int(np.nanmedian(rings_series.to_numpy()))
    abalone_binary_target = np.where(rings_series >= rings_cutoff, f"rings_ge_{rings_cutoff}", f"rings_lt_{rings_cutoff}")

    datasets.append(
        {
            "name": "uci_abalone_binary_rings_cutoff",
            "source": "uci:abalone",
            "X": abalone_df[[c for c in abalone_cols if c != "Rings"]].copy(),
            "y": pd.Series(abalone_binary_target),
            "schema_hint": {},
        }
    )

    # 11) UCI Letter Recognition - randomly choose two letter classes for binary classification
    letter_cols = ["target"] + [f"x{i}" for i in range(1, 17)]
    letter_path = _download_with_cache(
        f"{UCI_BASE_URL}/letter-recognition/letter-recognition.data",
        subdir="letter_recognition",
        filename="letter-recognition.data",
        allow_download=allow_uci_download,
    )
    letter_df = pd.read_csv(letter_path, header=None, names=letter_cols)
    letter_df = _strip_object_columns(letter_df)

    letter_classes = np.array(sorted(letter_df["target"].dropna().unique().tolist()), dtype=object)
    if letter_classes.shape[0] < 2:
        raise ValueError("Letter Recognition dataset must have at least two classes.")
    class_rng = np.random.default_rng(int(binary_seed))
    chosen_classes = class_rng.choice(letter_classes, size=2, replace=False)
    chosen_classes = np.sort(chosen_classes)

    letter_two_class_df = letter_df[letter_df["target"].isin(chosen_classes)].reset_index(drop=True)
    letter_binary_target = np.where(
        letter_two_class_df["target"].to_numpy() == chosen_classes[0],
        f"letter_{chosen_classes[0]}",
        f"letter_{chosen_classes[1]}",
    )

    datasets.append(
        {
            "name": f"uci_letter_recognition_{chosen_classes[0]}_vs_{chosen_classes[1]}",
            "source": "uci:letter-recognition",
            "X": letter_two_class_df[[f"x{i}" for i in range(1, 17)]].copy(),
            "y": pd.Series(letter_binary_target),
            "schema_hint": {"force_all_continuous": True},
        }
    )

    return datasets


benchmark_datasets = get_benchmark_datasets(allow_uci_download=ALLOW_UCI_DOWNLOAD, binary_seed=GLOBAL_SEED)
pd.DataFrame(
    [
        {
            "dataset": d["name"],
            "source": d["source"],
            "rows": int(d["X"].shape[0]),
            "features": int(d["X"].shape[1]),
        }
        for d in benchmark_datasets
    ]
)


Unnamed: 0,dataset,source,rows,features
0,uci_wdbc_continuous,uci:wdbc,569,30
1,uci_adult_mixed,uci:adult,30162,14
2,uci_spambase_continuous,uci:spambase,4601,57
3,uci_mushroom_categorical,uci:mushroom,8124,21
4,uci_magic_gamma_continuous,uci:magic-gamma-telescope,19020,10
5,uci_car_evaluation_categorical,uci:car-evaluation,1728,6
6,uci_banknote_authentication_continuous,uci:banknote-authentication,1372,4
7,uci_rice_cammeo_osmancik_continuous,uci:rice-cammeo-and-osmancik,3810,7
8,uci_default_credit_card_clients_continuous,uci:default-of-credit-card-clients,30000,23
9,uci_abalone_binary_rings_cutoff,uci:abalone,4177,8


In [27]:
def infer_feature_schema(df: pd.DataFrame, schema_hint: Optional[Dict] = None) -> Dict[str, str]:
    schema_hint = schema_hint or {}
    force_all_categorical = bool(schema_hint.get("force_all_categorical", False))
    force_all_continuous = bool(schema_hint.get("force_all_continuous", False))
    force_categorical_cols = set(schema_hint.get("force_categorical_cols", []))
    force_continuous_cols = set(schema_hint.get("force_continuous_cols", []))

    if force_all_categorical and force_all_continuous:
        raise ValueError("Cannot force all features to both categorical and continuous.")
    overlap = force_categorical_cols & force_continuous_cols
    if len(overlap) > 0:
        raise ValueError(f"Columns listed as both categorical and continuous: {sorted(overlap)}")

    schema: Dict[str, str] = {}
    for col in df.columns:
        s = df[col]
        if force_all_categorical:
            schema[col] = "categorical"
            continue
        if force_all_continuous:
            schema[col] = "continuous"
            continue
        if col in force_continuous_cols:
            schema[col] = "continuous"
            continue
        if col in force_categorical_cols:
            schema[col] = "categorical"
            continue

        if (
            pd.api.types.is_object_dtype(s)
            or isinstance(getattr(s, "dtype", None), pd.CategoricalDtype)
            or pd.api.types.is_bool_dtype(s)
        ):
            schema[col] = "categorical"
        elif pd.api.types.is_integer_dtype(s) and int(s.nunique(dropna=True)) <= 20:
            schema[col] = "categorical"
        else:
            schema[col] = "continuous"
    return schema

def encode_dataset_with_schema(
    df: pd.DataFrame,
    schema: Dict[str, str],
    max_categorical_classes: int,
):
    encoded = pd.DataFrame(index=df.index)
    metadata_rows = []

    for col in df.columns:
        kind = schema[col]
        s = df[col]
        raw_unique = int(pd.Series(s).nunique(dropna=True))

        if kind == "categorical":
            s_obj = pd.Series(s, copy=False)
            s_obj = s_obj.where(s_obj.notna(), "__MISSING__").astype("string")

            counts = s_obj.value_counts(dropna=False)
            if counts.shape[0] > max_categorical_classes:
                keep_n = max(1, max_categorical_classes - 1)
                keep_values = set(counts.index[:keep_n].tolist())
                s_obj = s_obj.where(s_obj.isin(keep_values), "__OTHER__")

            cat = pd.Categorical(s_obj.astype(str))
            codes = cat.codes.astype(np.int64)
            cardinality = int(len(cat.categories))

            encoded[col] = codes.astype(np.float32)
            metadata_rows.append(
                {
                    "feature": col,
                    "feature_type": "categorical",
                    "raw_unique_values": raw_unique,
                    "cardinality": cardinality,
                }
            )
        else:
            s_num = pd.to_numeric(s, errors="coerce")
            fill_value = float(s_num.median()) if s_num.notna().any() else 0.0
            s_num = s_num.fillna(fill_value).astype(np.float32)

            encoded[col] = s_num
            metadata_rows.append(
                {
                    "feature": col,
                    "feature_type": "continuous",
                    "raw_unique_values": raw_unique,
                    "cardinality": 1,
                }
            )

    feature_metadata = pd.DataFrame(metadata_rows)
    feature_is_categorical = (feature_metadata["feature_type"].to_numpy() == "categorical")
    feature_cardinalities = feature_metadata["cardinality"].to_numpy(dtype=np.int64)

    X_np = encoded.to_numpy(dtype=np.float32)
    return X_np, feature_is_categorical, feature_cardinalities, feature_metadata


def drop_high_cardinality_categorical_features(
    df: pd.DataFrame,
    schema: Dict[str, str],
    max_allowed_cardinality: int = 10,
):
    drop_cols = []
    for col in df.columns:
        if schema.get(col) != "categorical":
            continue
        n_unique = int(pd.Series(df[col]).nunique(dropna=True))
        if n_unique > max_allowed_cardinality:
            drop_cols.append(col)

    if len(drop_cols) > 0:
        df = df.drop(columns=drop_cols)

    schema = {col: schema[col] for col in df.columns if col in schema}
    return df, schema, drop_cols


def prepare_dataset(record: Dict, max_categorical_classes: int):
    X_raw = record["X"].reset_index(drop=True)
    y_raw = pd.Series(record["y"]).reset_index(drop=True)

    valid = y_raw.notna()
    X_raw = X_raw.loc[valid].reset_index(drop=True)
    y_raw = y_raw.loc[valid].reset_index(drop=True)

    if y_raw.nunique(dropna=True) != 2:
        raise ValueError(f"Dataset '{record['name']}' is not binary after cleaning.")

    schema = infer_feature_schema(X_raw, schema_hint=record.get("schema_hint"))
    X_filtered, schema, dropped_cols = drop_high_cardinality_categorical_features(
        X_raw,
        schema,
        max_allowed_cardinality=10,
    )
    if X_filtered.shape[1] == 0:
        raise ValueError(
            f"Dataset '{record['name']}' has no features after removing categorical columns with >10 classes."
        )

    X_np, feature_is_cat, feature_card, feature_meta = encode_dataset_with_schema(
        X_filtered,
        schema,
        max_categorical_classes=max_categorical_classes,
    )
    if len(dropped_cols) > 0:
        print(f"[info] {record['name']}: dropped high-cardinality categorical columns: {sorted(dropped_cols)}")

    return {
        "name": record["name"],
        "source": record["source"],
        "X": X_np,
        "y": y_raw.to_numpy(),
        "feature_is_categorical": feature_is_cat,
        "feature_cardinalities": feature_card,
        "feature_metadata": feature_meta,
    }

In [28]:
def build_pu_task(
    X: np.ndarray,
    y: np.ndarray,
    rng: np.random.Generator,
    max_positive_size: int,
    unlabeled_labeled_positive_ratio: tuple[int, int],
    outlier_rate: float,
):
    labels = np.unique(y)
    if labels.shape[0] != 2:
        return None

    positive_label = rng.choice(labels)
    pos_idx = np.where(y == positive_label)[0]
    neg_idx = np.where(y != positive_label)[0]

    if len(pos_idx) < 2 or len(neg_idx) < 1:
        return None

    selected_pos_n = int(min(max_positive_size, len(pos_idx)))
    selected_pos_idx = rng.choice(pos_idx, size=selected_pos_n, replace=False)

    u_ratio, l_ratio = unlabeled_labeled_positive_ratio
    if u_ratio < 0 or l_ratio <= 0 or (u_ratio + l_ratio) <= 0:
        raise ValueError("UNLABELED_LABELED_POSITIVE_RATIO must be (u, l) with u>=0, l>0.")

    unlabeled_pos_n = int(round(selected_pos_n * (float(u_ratio) / float(u_ratio + l_ratio))))
    unlabeled_pos_n = int(np.clip(unlabeled_pos_n, 1, selected_pos_n - 1))
    labeled_pos_n = selected_pos_n - unlabeled_pos_n
    if labeled_pos_n <= 0 or unlabeled_pos_n <= 0:
        return None

    labeled_pos_idx = rng.choice(selected_pos_idx, size=labeled_pos_n, replace=False)
    unlabeled_pos_idx = np.setdiff1d(selected_pos_idx, labeled_pos_idx, assume_unique=False)

    neg_needed = int(round(unlabeled_pos_n * outlier_rate / max(1e-8, 1.0 - outlier_rate)))
    if outlier_rate > 0.0 and unlabeled_pos_n > 0:
        neg_needed = max(1, neg_needed)
    neg_needed = min(neg_needed, len(neg_idx))
    if neg_needed <= 0:
        return None

    unlabeled_neg_idx = rng.choice(neg_idx, size=neg_needed, replace=False)

    unlabeled_idx = np.concatenate([unlabeled_pos_idx, unlabeled_neg_idx])
    unlabeled_y = np.concatenate(
        [
            np.zeros(unlabeled_pos_idx.shape[0], dtype=np.int64),  # inlier
            np.ones(unlabeled_neg_idx.shape[0], dtype=np.int64),   # outlier
        ]
    )
    perm = rng.permutation(unlabeled_idx.shape[0])
    unlabeled_idx = unlabeled_idx[perm]
    unlabeled_y = unlabeled_y[perm]

    labeled_perm = rng.permutation(labeled_pos_idx.shape[0])
    labeled_pos_idx = labeled_pos_idx[labeled_perm]

    X_task = np.concatenate([X[labeled_pos_idx], X[unlabeled_idx]], axis=0).astype(np.float32)
    y_train = np.zeros(labeled_pos_idx.shape[0], dtype=np.float32)

    return {
        "X": X_task,
        "y_train": y_train,
        "y_test": unlabeled_y,
        "train_size": int(labeled_pos_idx.shape[0]),
        "num_unlabeled_inliers": int(unlabeled_pos_idx.shape[0]),
        "num_unlabeled_outliers": int(unlabeled_neg_idx.shape[0]),
        "positive_label": str(positive_label),
    }


def fpr_at_fixed_tpr(y_true: np.ndarray, outlier_score: np.ndarray, target_tpr: float = 0.95) -> float:
    if np.unique(y_true).shape[0] < 2:
        return float("nan")
    fpr, tpr, _ = roc_curve(y_true, outlier_score)
    valid = np.where(tpr >= target_tpr)[0]
    if valid.size == 0:
        return 1.0
    return float(np.min(fpr[valid]))


@torch.no_grad()
def evaluate_single_pu_task(
    model,
    task: Dict,
    feature_is_categorical: np.ndarray,
    feature_cardinalities: np.ndarray,
    device: str,
    supports_categorical: bool,
) -> Dict[str, float]:
    x = torch.from_numpy(task["X"]).unsqueeze(0).to(device=device, dtype=torch.float32)
    y_train = torch.from_numpy(task["y_train"]).unsqueeze(0).to(device=device, dtype=torch.float32)

    if supports_categorical:
        feature_is_cat_t = torch.from_numpy(feature_is_categorical).unsqueeze(0).to(device=device, dtype=torch.bool)
        feature_card_t = torch.from_numpy(feature_cardinalities).unsqueeze(0).to(device=device, dtype=torch.long)
        logits = model(
            (x, y_train),
            train_test_split_index=task["train_size"],
            feature_is_categorical=feature_is_cat_t,
            feature_cardinalities=feature_card_t,
        ).squeeze(0)
    else:
        logits = model(
            (x, y_train),
            train_test_split_index=task["train_size"],
        ).squeeze(0)

    logits_np = logits.detach().cpu().numpy()
    y_true = task["y_test"].astype(np.int64)
    y_pred = np.argmax(logits_np, axis=1)

    outlier_score = logits_np[:, 1]

    binary_ready = np.unique(y_true).shape[0] == 2
    outlier_mean = float(np.mean(outlier_score[y_true == 1])) if np.any(y_true == 1) else float("nan")
    inlier_mean = float(np.mean(outlier_score[y_true == 0])) if np.any(y_true == 0) else float("nan")

    metrics = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "balanced_accuracy": float(balanced_accuracy_score(y_true, y_pred)),
        "roc_auc": float(roc_auc_score(y_true, outlier_score)) if binary_ready else float("nan"),
        "average_precision": float(average_precision_score(y_true, outlier_score)) if binary_ready else float("nan"),
        "fpr_at_tpr_0_80": float(fpr_at_fixed_tpr(y_true, outlier_score, target_tpr=0.80)),
        "fpr_at_tpr_0_90": float(fpr_at_fixed_tpr(y_true, outlier_score, target_tpr=0.90)),
        "fpr_at_tpr_0_95": float(fpr_at_fixed_tpr(y_true, outlier_score, target_tpr=0.95)),
        "outlier_score_gap": float(outlier_mean - inlier_mean)
        if (not np.isnan(outlier_mean) and not np.isnan(inlier_mean))
        else float("nan"),
    }
    return metrics


In [29]:
prepared_datasets = [prepare_dataset(d, max_categorical_classes=MAX_CATEGORICAL_CLASSES) for d in benchmark_datasets]

profile_rows = []
for d in prepared_datasets:
    feature_meta = d["feature_metadata"]
    num_cat = int((feature_meta["feature_type"] == "categorical").sum())
    num_cont = int((feature_meta["feature_type"] == "continuous").sum())
    max_card = int(feature_meta.loc[feature_meta["feature_type"] == "categorical", "cardinality"].max()) if num_cat > 0 else 1
    profile_rows.append(
        {
            "dataset": d["name"],
            "source": d["source"],
            "rows": int(d["X"].shape[0]),
            "features": int(d["X"].shape[1]),
            "continuous_features": num_cont,
            "categorical_features": num_cat,
            "max_categorical_cardinality": max_card,
        }
    )

profile_df = pd.DataFrame(profile_rows)
profile_df

[info] uci_adult_mixed: dropped high-cardinality categorical columns: ['education', 'native_country', 'occupation']
[info] uci_mushroom_categorical: dropped high-cardinality categorical columns: ['gill_color']


Unnamed: 0,dataset,source,rows,features,continuous_features,categorical_features,max_categorical_cardinality
0,uci_wdbc_continuous,uci:wdbc,569,30,30,0,1
1,uci_adult_mixed,uci:adult,30162,11,6,5,7
2,uci_spambase_continuous,uci:spambase,4601,57,57,0,1
3,uci_mushroom_categorical,uci:mushroom,8124,20,0,20,10
4,uci_magic_gamma_continuous,uci:magic-gamma-telescope,19020,10,10,0,1
5,uci_car_evaluation_categorical,uci:car-evaluation,1728,6,0,6,4
6,uci_banknote_authentication_continuous,uci:banknote-authentication,1372,4,4,0,1
7,uci_rice_cammeo_osmancik_continuous,uci:rice-cammeo-and-osmancik,3810,7,7,0,1
8,uci_default_credit_card_clients_continuous,uci:default-of-credit-card-clients,30000,23,23,0,1
9,uci_abalone_binary_rings_cutoff,uci:abalone,4177,8,8,0,1


In [30]:
rng_master = np.random.default_rng(GLOBAL_SEED)

replicate_frames = []

for dataset_idx, dataset in enumerate(prepared_datasets):
    ds_seed = int(rng_master.integers(0, 2**31 - 1))
    ds_rng = np.random.default_rng(ds_seed)

    rows = []
    attempts = 0
    collected_replicates = 0

    while collected_replicates < N_REPLICATES and attempts < MAX_ATTEMPTS_PER_DATASET:
        attempts += 1
        task = build_pu_task(
            X=dataset["X"],
            y=dataset["y"],
            rng=ds_rng,
            max_positive_size=MAX_POSITIVE_SIZE,
            unlabeled_labeled_positive_ratio=UNLABELED_LABELED_POSITIVE_RATIO,
            outlier_rate=OUTLIER_RATE,
        )
        if task is None:
            continue

        unlabeled_total = int(task["num_unlabeled_inliers"] + task["num_unlabeled_outliers"])
        real_outlier_proportion = (
            float(task["num_unlabeled_outliers"]) / float(unlabeled_total) if unlabeled_total > 0 else float("nan")
        )
        real_unlabeled_positive_to_labeled_positive_ratio = (
            float(task["num_unlabeled_inliers"]) / float(task["train_size"]) if int(task["train_size"]) > 0 else float("nan")
        )
        real_positive_only_sample_size = int(task["train_size"] + task["num_unlabeled_inliers"])

        for model_spec in MODEL_SPECS:
            metric = evaluate_single_pu_task(
                model=model_spec["model"],
                task=task,
                feature_is_categorical=dataset["feature_is_categorical"],
                feature_cardinalities=dataset["feature_cardinalities"],
                device=DEVICE,
                supports_categorical=bool(model_spec["supports_categorical"]),
            )
            metric.update(
                {
                    "model_name": model_spec["model_name"],
                    "dataset": dataset["name"],
                    "source": dataset["source"],
                    "replicate": collected_replicates + 1,
                    "attempt": attempts,
                    "positive_label": task["positive_label"],
                    "labeled_positive_size": int(task["train_size"]),
                    "real_labeled_positive_size": int(task["train_size"]),
                    "real_positive_only_sample_size": real_positive_only_sample_size,
                    "unlabeled_inlier_size": int(task["num_unlabeled_inliers"]),
                    "unlabeled_outlier_size": int(task["num_unlabeled_outliers"]),
                    "real_outlier_proportion": real_outlier_proportion,
                    "real_unlabeled_positive_to_labeled_positive_ratio": real_unlabeled_positive_to_labeled_positive_ratio,
                }
            )
            rows.append(metric)

        collected_replicates += 1

    if collected_replicates < N_REPLICATES:
        print(
            f"[warn] dataset={dataset['name']} collected {collected_replicates} replicates "
            f"within {MAX_ATTEMPTS_PER_DATASET} attempts."
        )

    rep_df = pd.DataFrame(rows)
    replicate_frames.append(rep_df)

replicate_results_df = pd.concat(replicate_frames, ignore_index=True) if len(replicate_frames) > 0 else pd.DataFrame()

metric_columns = [
    "accuracy",
    "balanced_accuracy",
    "roc_auc",
    "average_precision",
    "fpr_at_tpr_0_80",
    "fpr_at_tpr_0_90",
    "fpr_at_tpr_0_95",
    "outlier_score_gap",
]

if replicate_results_df.empty:
    metrics_by_model_df = pd.DataFrame(
        columns=["dataset", "source", "model_name", "replicates"] + metric_columns
    )
    metrics_latest_df = pd.DataFrame(columns=["dataset", "source", "replicates"] + metric_columns)
    metrics_legacy_df = pd.DataFrame(columns=["dataset", "source", "replicates"] + metric_columns)

    metrics_summary_df = pd.DataFrame(
        columns=["dataset", "source", "replicates_latest", "replicates_legacy"]
        + [
            col_name
            for metric in metric_columns
            for col_name in (
                f"{metric}_latest",
                f"{metric}_legacy",
                f"{metric}_delta_latest_minus_legacy",
            )
        ]
    )
    composition_summary_df = pd.DataFrame(
        columns=[
            "dataset",
            "source",
            "replicates",
            "true_positive_only_sample_size",
            "true_unlabeled_to_labeled_positive_ratio",
            "true_outlier_rate",
        ]
    )
else:
    metrics_by_model_df = (
        replicate_results_df.groupby(["dataset", "source", "model_name"], as_index=False)
        .agg(
            replicates=("replicate", "count"),
            accuracy=("accuracy", "mean"),
            balanced_accuracy=("balanced_accuracy", "mean"),
            roc_auc=("roc_auc", "mean"),
            average_precision=("average_precision", "mean"),
            fpr_at_tpr_0_80=("fpr_at_tpr_0_80", "mean"),
            fpr_at_tpr_0_90=("fpr_at_tpr_0_90", "mean"),
            fpr_at_tpr_0_95=("fpr_at_tpr_0_95", "mean"),
            outlier_score_gap=("outlier_score_gap", "mean"),
        )
        .sort_values(["dataset", "model_name"])
        .reset_index(drop=True)
    )

    metrics_latest_df = (
        metrics_by_model_df[metrics_by_model_df["model_name"] == "latest"]
        .drop(columns=["model_name"])
        .reset_index(drop=True)
    )
    metrics_legacy_df = (
        metrics_by_model_df[metrics_by_model_df["model_name"] == "legacy"]
        .drop(columns=["model_name"])
        .reset_index(drop=True)
    )

    latest_for_merge = metrics_latest_df.rename(
        columns={"replicates": "replicates_latest", **{metric: f"{metric}_latest" for metric in metric_columns}}
    )
    legacy_for_merge = metrics_legacy_df.rename(
        columns={"replicates": "replicates_legacy", **{metric: f"{metric}_legacy" for metric in metric_columns}}
    )

    metrics_summary_df = (
        latest_for_merge.merge(legacy_for_merge, on=["dataset", "source"], how="outer")
        .sort_values("dataset")
        .reset_index(drop=True)
    )
    for metric in metric_columns:
        latest_col = f"{metric}_latest"
        legacy_col = f"{metric}_legacy"
        delta_col = f"{metric}_delta_latest_minus_legacy"
        if latest_col in metrics_summary_df.columns and legacy_col in metrics_summary_df.columns:
            metrics_summary_df[delta_col] = metrics_summary_df[latest_col] - metrics_summary_df[legacy_col]
        else:
            metrics_summary_df[delta_col] = float("nan")

    # True PU composition is task-defined (independent of model); compute once from latest rows.
    composition_source_df = replicate_results_df[replicate_results_df["model_name"] == "latest"]
    composition_summary_df = (
        composition_source_df.groupby(["dataset", "source"], as_index=False)
        .agg(
            replicates=("replicate", "count"),
            true_positive_only_sample_size=("real_positive_only_sample_size", "mean"),
            true_unlabeled_to_labeled_positive_ratio=("real_unlabeled_positive_to_labeled_positive_ratio", "mean"),
            true_outlier_rate=("real_outlier_proportion", "mean"),
        )
        .sort_values("dataset")
        .reset_index(drop=True)
    )


In [31]:
print("Performance metrics over replicates: latest model")
print("outlier_score_gap = mean(outlier score for true outliers) - mean(outlier score for true inliers)")
metrics_latest_df


Performance metrics over replicates: latest model
outlier_score_gap = mean(outlier score for true outliers) - mean(outlier score for true inliers)


Unnamed: 0,dataset,source,replicates,accuracy,balanced_accuracy,roc_auc,average_precision,fpr_at_tpr_0_80,fpr_at_tpr_0_90,fpr_at_tpr_0_95,outlier_score_gap
0,uci_abalone_binary_rings_cutoff,uci:abalone,10,0.863043,0.732833,0.852922,0.57257,0.2595,0.433833,0.582,0.980327
1,uci_adult_mixed,uci:adult,10,0.131014,0.499861,0.707013,0.326188,0.4855,0.711333,0.870167,0.376068
2,uci_banknote_authentication_continuous,uci:banknote-authentication,10,0.976869,0.986702,0.999797,0.998902,0.0,0.0,0.0,4.65006
3,uci_car_evaluation_categorical,uci:car-evaluation,10,0.351194,0.55051,0.83979,0.552646,0.306612,0.433485,0.583785,0.750079
4,uci_default_credit_card_clients_continuous,uci:default-of-credit-card-clients,10,0.857681,0.521972,0.62815,0.236282,0.6755,0.832833,0.914167,0.146138
5,uci_letter_recognition_C_vs_U,uci:letter-recognition,10,0.970276,0.956098,0.994319,0.972273,0.00161,0.010157,0.034492,3.128218
6,uci_magic_gamma_continuous,uci:magic-gamma-telescope,10,0.882754,0.63225,0.817622,0.499203,0.338,0.487667,0.600833,0.775427
7,uci_mushroom_categorical,uci:mushroom,10,0.132029,0.500444,0.92493,0.764888,0.092,0.240667,0.377833,1.284515
8,uci_rice_cammeo_osmancik_continuous,uci:rice-cammeo-and-osmancik,10,0.942754,0.864139,0.962948,0.863908,0.036333,0.088,0.182833,4.952948
9,uci_spambase_continuous,uci:spambase,10,0.87942,0.777194,0.887322,0.664657,0.173833,0.356833,0.556667,0.886307


In [32]:
print("Performance metrics over replicates: legacy model")
print("outlier_score_gap = mean(outlier score for true outliers) - mean(outlier score for true inliers)")
metrics_legacy_df


Performance metrics over replicates: legacy model
outlier_score_gap = mean(outlier score for true outliers) - mean(outlier score for true inliers)


Unnamed: 0,dataset,source,replicates,accuracy,balanced_accuracy,roc_auc,average_precision,fpr_at_tpr_0_80,fpr_at_tpr_0_90,fpr_at_tpr_0_95,outlier_score_gap
0,uci_abalone_binary_rings_cutoff,uci:abalone,10,0.848841,0.743083,0.834152,0.542298,0.305,0.4645,0.6155,0.643791
1,uci_adult_mixed,uci:adult,10,0.849565,0.677389,0.759678,0.453299,0.456,0.686333,0.828,0.295297
2,uci_banknote_authentication_continuous,uci:banknote-authentication,10,0.958265,0.976007,0.999678,0.998091,0.0,0.0,0.000983,3.863725
3,uci_car_evaluation_categorical,uci:car-evaluation,10,0.674749,0.701178,0.85517,0.562884,0.281923,0.369346,0.492478,0.847423
4,uci_default_credit_card_clients_continuous,uci:default-of-credit-card-clients,10,0.856957,0.542806,0.61578,0.233514,0.719,0.8435,0.9195,0.110759
5,uci_letter_recognition_C_vs_U,uci:letter-recognition,10,0.959197,0.950668,0.993585,0.967343,0.00385,0.012174,0.041098,2.59732
6,uci_magic_gamma_continuous,uci:magic-gamma-telescope,10,0.867826,0.662861,0.795263,0.475472,0.388667,0.5495,0.688333,0.406703
7,uci_mushroom_categorical,uci:mushroom,10,0.950725,0.9405,0.982233,0.925381,0.007667,0.017667,0.097167,1.668629
8,uci_rice_cammeo_osmancik_continuous,uci:rice-cammeo-and-osmancik,10,0.940725,0.879972,0.963887,0.864864,0.034,0.093167,0.169,3.923024
9,uci_spambase_continuous,uci:spambase,10,0.891884,0.788139,0.872294,0.664551,0.204667,0.386333,0.636,0.607235


In [33]:
print("True PU composition over replicates (additional table):")
composition_summary_df

True PU composition over replicates (additional table):


Unnamed: 0,dataset,source,replicates,true_positive_only_sample_size,true_unlabeled_to_labeled_positive_ratio,true_outlier_rate
0,uci_abalone_binary_rings_cutoff,uci:abalone,10,900.0,2.0,0.130435
1,uci_adult_mixed,uci:adult,10,900.0,2.0,0.130435
2,uci_banknote_authentication_continuous,uci:banknote-authentication,10,701.2,2.00197,0.130219
3,uci_car_evaluation_categorical,uci:car-evaluation,10,440.4,1.986667,0.128645
4,uci_default_credit_card_clients_continuous,uci:default-of-credit-card-clients,10,900.0,2.0,0.130435
5,uci_letter_recognition_C_vs_U,uci:letter-recognition,10,759.1,2.002857,0.129608
6,uci_magic_gamma_continuous,uci:magic-gamma-telescope,10,900.0,2.0,0.130435
7,uci_mushroom_categorical,uci:mushroom,10,900.0,2.0,0.130435
8,uci_rice_cammeo_osmancik_continuous,uci:rice-cammeo-and-osmancik,10,900.0,2.0,0.130435
9,uci_spambase_continuous,uci:spambase,10,900.0,2.0,0.130435


In [34]:
run_id = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
run_dir = OUTPUT_DIR / f"eval_{run_id}"
run_dir.mkdir(parents=True, exist_ok=True)

summary_path = run_dir / "summary_metrics.csv"
summary_latest_path = run_dir / "summary_metrics_latest.csv"
summary_legacy_path = run_dir / "summary_metrics_legacy.csv"
metrics_by_model_path = run_dir / "metrics_by_model.csv"
replicate_path = run_dir / "replicate_metrics.csv"
profile_path = run_dir / "dataset_feature_profile.csv"
composition_path = run_dir / "pu_composition_summary.csv"
config_path = run_dir / "run_config.json"

metrics_summary_df.to_csv(summary_path, index=False)
metrics_latest_df.to_csv(summary_latest_path, index=False)
metrics_legacy_df.to_csv(summary_legacy_path, index=False)
metrics_by_model_df.to_csv(metrics_by_model_path, index=False)
replicate_results_df.to_csv(replicate_path, index=False)
profile_df.to_csv(profile_path, index=False)
composition_summary_df.to_csv(composition_path, index=False)

for dataset in prepared_datasets:
    safe_name = "".join(ch if ch.isalnum() or ch in {"_", "-"} else "_" for ch in dataset["name"])
    dataset["feature_metadata"].to_csv(run_dir / f"feature_metadata_{safe_name}.csv", index=False)

run_config = {
    "checkpoint_path": str(CHECKPOINT_PATH),
    "legacy_checkpoint_path": str(LEGACY_CHECKPOINT_PATH),
    "legacy_model_commit": LEGACY_MODEL_COMMIT,
    "device": DEVICE,
    "allow_uci_download": ALLOW_UCI_DOWNLOAD,
    "n_replicates": N_REPLICATES,
    "max_attempts_per_dataset": MAX_ATTEMPTS_PER_DATASET,
    "max_positive_size": MAX_POSITIVE_SIZE,
    "unlabeled_labeled_positive_ratio": list(UNLABELED_LABELED_POSITIVE_RATIO),
    "outlier_rate": OUTLIER_RATE,
    "max_categorical_classes": MAX_CATEGORICAL_CLASSES,
    "global_seed": GLOBAL_SEED,
}
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(run_config, f, indent=2)

print(f"Saved evaluation outputs to: {run_dir}")
print(f"- {summary_path.name}")
print(f"- {summary_latest_path.name}")
print(f"- {summary_legacy_path.name}")
print(f"- {metrics_by_model_path.name}")
print(f"- {replicate_path.name}")
print(f"- {profile_path.name}")
print(f"- {composition_path.name}")


Saved evaluation outputs to: /Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/slim_pretrain/pretrain_v2/evaluation_outputs/eval_20260228_023531
- summary_metrics.csv
- summary_metrics_latest.csv
- summary_metrics_legacy.csv
- metrics_by_model.csv
- replicate_metrics.csv
- dataset_feature_profile.csv
- pu_composition_summary.csv
