TABPFN

In [None]:
!pip install tabpfn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import (
  f1_score, accuracy_score, classification_report,
  roc_auc_score, roc_curve
)
from tabpfn import TabPFNClassifier

In [None]:
def _as_array(data, *, flatten=False):
    """Return *data* as a NumPy array, optionally flattened to 1-D.

    Works for pandas objects, NumPy arrays and plain sequences alike, so
    labels supplied as an ``(n, 1)`` column are normalised the same way
    regardless of their container type.
    """
    arr = np.asarray(data)
    return arr.ravel() if flatten else arr


def _best_f1_threshold(y_true, probs, thresholds):
    """Return the threshold from *thresholds* that maximises F1 on (*y_true*, *probs*).

    Ties are resolved in favour of the lowest threshold (first maximum).
    """
    # zero_division=0 yields the same score (0.0) as the default for
    # thresholds where nothing is predicted positive, minus the warning.
    scores = [
        f1_score(y_true, (probs >= t).astype(int), zero_division=0)
        for t in thresholds
    ]
    return thresholds[int(np.argmax(scores))]


def run_tabpfn_pipeline(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    feature_names
):
    """Fit TabPFN, tune a decision threshold on validation data, and evaluate on test data.

    The pipeline assumes a binary task whose labels are encoded as 0/1:
    column 1 of ``predict_proba`` is used as the positive-class score and
    thresholded predictions are cast to ``int`` before being compared with
    the labels.

    Args:
        X_train, X_val, X_test: Feature matrices (pandas or array-like).
        y_train, y_val, y_test: Label vectors (pandas or array-like); they
            are flattened to 1-D.
        feature_names: Accepted for interface compatibility; currently
            unused because no feature importance is computed here.

    Returns:
        A ``(model, results)`` tuple. ``results`` is a dict with keys
        ``val_probs``, ``test_probs``, ``test_preds``, ``accuracy``,
        ``auc``, ``best_threshold``, ``fpr``, ``tpr``, ``report_df`` and
        ``feature_importance`` (an empty placeholder DataFrame).

    Side effects:
        Prints the chosen threshold and test metrics, and shows a ROC
        curve figure via matplotlib.
    """
    # === Step 1: Prepare data (convert to NumPy; labels flattened to 1-D)
    X_train_np = _as_array(X_train)
    y_train_np = _as_array(y_train, flatten=True)
    X_val_np = _as_array(X_val)
    y_val_np = _as_array(y_val, flatten=True)
    X_test_np = _as_array(X_test)
    y_test_np = _as_array(y_test, flatten=True)

    # === Step 2: Initialize and fit TabPFN
    # NOTE(review): per the TabPFN documentation, fit() prepares the training
    # set as context for the pretrained model rather than running a
    # conventional training loop - confirm against the installed version.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_tabpfn = TabPFNClassifier(device=device)
    model_tabpfn.fit(X_train_np, y_train_np)

    # === Step 3: Predict positive-class probabilities
    val_probs = model_tabpfn.predict_proba(X_val_np)[:, 1]
    test_probs = model_tabpfn.predict_proba(X_test_np)[:, 1]

    # === Step 4: Find best threshold (F1 optimized on the validation split)
    thresholds = np.linspace(0, 1, 101)
    best_threshold = _best_f1_threshold(y_val_np, val_probs, thresholds)
    print(f"\n✅ Best threshold (F1) for TabPFN: {best_threshold:.2f}")

    # === Step 5: Predict labels using best threshold
    test_preds = (test_probs >= best_threshold).astype(int)

    # === Step 6: Metrics
    accuracy = accuracy_score(y_test_np, test_preds)
    auc = roc_auc_score(y_test_np, test_probs)
    # Two reports on purpose: the printed one uses friendly class names,
    # while the DataFrame keeps the raw label keys callers may index by.
    report_str = classification_report(
        y_test_np, test_preds,
        target_names=["Class 0", "Class 1"],
        zero_division=0,
    )
    report_df = pd.DataFrame(
        classification_report(
            y_test_np, test_preds, output_dict=True, zero_division=0
        )
    ).T

    print(f"\n🔎 Test Accuracy (TabPFN): {accuracy:.2f}")
    print("Classification Report (TabPFN):")
    print(report_str)
    print(f"AUC-ROC (TabPFN): {auc:.2f}")

    # === Step 7: ROC Curve
    fpr, tpr, _ = roc_curve(y_test_np, test_probs)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.title("ROC Curve - TabPFN")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # === Step 8: Placeholder Feature Importance
    # No importance is computed in this pipeline; the empty frame keeps the
    # result schema uniform for callers that expect this key.
    feature_importance = pd.DataFrame(columns=["Feature", "Importance"])

    # === Step 9: Package results
    results_tabpfn = {
        'val_probs': val_probs,
        'test_probs': test_probs,
        'test_preds': test_preds,
        'accuracy': accuracy,
        'auc': auc,
        'best_threshold': best_threshold,
        'fpr': fpr,
        'tpr': tpr,
        'report_df': report_df,
        'feature_importance': feature_importance
    }

    return model_tabpfn, results_tabpfn
