# Project 2 – Towards Foundation Models for Tabular Data

We:
1) Preprocess them **as in the papers** (cats vs. nums, standardization).
2) Train strong **tree baselines** (CatBoost, LightGBM, XGBoost).
3) Train **TabPFN** (classification) and **TabNet** (classification & regression).
4) Build an **ensemble** on one dataset (default: `credit-g`).
5) **Fine-tune the same model on another dataset** (we include a **TransTab transfer** example: pretrain on `mfeat-fourier` → fine-tune on `credit-g`), which supports differing schemas across tables.
6) **Perform transfer-learning** on two different pairs selected from TabPFN paper.

In [1]:
import os, warnings, logging, torch

# Prefer GPU when available
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Silence common warning/progress bars ("warning bars")
warnings.filterwarnings("ignore")
os.environ["TQDM_DISABLE"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
logging.getLogger("lightgbm").setLevel(logging.ERROR)

def to_device(x):
    try:
        import torch
        if isinstance(x, torch.Tensor):
            return x.to(DEVICE, non_blocking=True)
    except Exception:
        pass
    return x

print(f"Using DEVICE = {DEVICE}")


Using DEVICE = cuda


In [2]:

# --- Environment setup (run once) ---
%pip -q install --upgrade pip
%pip -q install scikit-learn pandas numpy matplotlib openml
%pip -q install xgboost lightgbm catboost
# Deep models
%pip -q install pytorch-tabnet
%pip -q install tabpfn  # classification
%pip -q install transtab  

import numpy as np, pandas as pd, sklearn, openml  
print('Versions:')
import importlib, sys
for m in ['numpy','pandas','sklearn','openml','xgboost','lightgbm','catboost']:
    try:
        print(m, importlib.import_module(m).__version__)
    except Exception as e:
        print(m, 'not found:', e)


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Versions:
numpy 2.2.6
pandas 2.3.3
sklearn 1.6.1
openml 0.15.1
xgboost 3.0.5
lightgbm 4.6.0
catboost 1.2.8


# Utilities

In [3]:

# --- Utilities: seed, metrics, helpers ---
import numpy as np, pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
import logging

logging.getLogger('lightgbm').setLevel(logging.ERROR)  
from pathlib import Path
RESULTS_DIR = Path("./results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

def set_seed(seed: int = 42):
    import random, os, numpy as np
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except Exception:
        pass

set_seed(42)

from sklearn.metrics import (
    roc_auc_score, f1_score, accuracy_score, mean_absolute_error, mean_squared_error
)

def cls_metrics(y_true, y_proba_or_pred, average='macro', is_multiclass=False):
    """
    If probabilities are provided, compute ROC-AUC (OVR if multiclass).
    Always compute macro-F1 from predicted labels.
    """
    import numpy as _np
    # decide if proba or labels
    if isinstance(y_proba_or_pred, _np.ndarray) and y_proba_or_pred.ndim >= 2:
        # proba given
        y_pred = y_proba_or_pred.argmax(1)
        try:
            if is_multiclass:
                auc = roc_auc_score(y_true, y_proba_or_pred, multi_class='ovr')
            else:
                # assume y_true is {0,1}
                auc = roc_auc_score(y_true, y_proba_or_pred[:,1])
        except Exception:
            auc = _np.nan
    else:
        # labels
        y_pred = y_proba_or_pred
        auc = _np.nan
    f1 = f1_score(y_true, y_pred, average=average)
    acc = accuracy_score(y_true, y_pred)
    return dict(roc_auc=auc, macro_f1=f1, acc=acc)

def reg_metrics(y_true, y_pred):
    import numpy as np
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    mae = mean_absolute_error(y_true, y_pred)
    try:
        rmse = mean_squared_error(y_true, y_pred, squared=False)  # newer sklearn
    except TypeError:
        rmse = mean_squared_error(y_true, y_pred) ** 0.5          # older sklearn
    return dict(MAE=float(mae), RMSE=float(rmse))


def cat_indices_from_df(df: pd.DataFrame, target: str) -> List[int]:
    """Return categorical column indices based on object/category dtype (excluding target)."""
    cats = [i for i, c in enumerate(df.columns) if c != target and (df[c].dtype == 'object' or str(df[c].dtype).startswith('category'))]
    return cats

def train_val_test_split_indices(n, stratify=None, test_size=0.2, val_size=0.2, seed=42):
    """Return train/val/test indices, with optional stratification (labels array)."""
    from sklearn.model_selection import train_test_split
    import numpy as _np
    idx = _np.arange(n)
    idx_train, idx_test = train_test_split(idx, test_size=test_size, random_state=seed, stratify=stratify)
    strat2 = stratify[idx_train] if stratify is not None else None
    val_rel = val_size / (1.0 - test_size)
    idx_train, idx_val = train_test_split(idx_train, test_size=val_rel, random_state=seed, stratify=strat2)
    return idx_train, idx_val, idx_test

def save_csv(df, filename):
    path = RESULTS_DIR / filename
    df.to_csv(path, index=False)
    print(f"Saved CSV -> {path.resolve()}")


# Datasets

This notebook has been **modified** to use exactly **three datasets**:
- **mfeat-fourier** (OpenML ID **14**) — **multi-class** (10 classes) classification — from the TabPFN paper's CC18 suite
- **credit-g (German Credit)** (OpenML ID **31**) — **binary** classification — from the TabPFN paper's CC18 suite
- **SARCOS** — **regression** (21 features, 7 targets y1..y7) - from TabNet paper

> **Notes**    
> • GPU is recommended for TabNet/TransTab; TabPFN runs on CPU reasonably fast for ≤10k rows.


In [4]:
# --- Dataset loaders (OpenML) + SARCOS (GPML) ---
from typing import Optional
import pandas as pd, numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, KFold
from pathlib import Path
import openml
import io, urllib.request

def load_openml_dataset(dataset_id: int, target: Optional[str]=None, as_frame=True):
    d = openml.datasets.get_dataset(dataset_id, download_all_files=True)
    if target is None:
        target = d.default_target_attribute
    X, y, categorical_indicator, attribute_names = d.get_data(
        dataset_format="dataframe", target=target
    )
    df = pd.concat([X, y.rename(target)], axis=1)
    return df, target, categorical_indicator, attribute_names

def load_sarcos_single(target: str = "y1", sample: Optional[int] = None, seed: int = 42):
    """
    SARCOS inverse dynamics (regression, 21 features, 7 targets y1..y7).
    Loads train+test from GPML, concatenates, and returns (df, target).
    Set `target` to one of 'y1'..'y7'. Optionally subsample with `sample`.
    """
    try:
        from scipy.io import loadmat
    except Exception:
        from scipy.io import loadmat  # will raise if not installed

    def _fetch_mat(url: str):
        with urllib.request.urlopen(url) as r:
            return loadmat(io.BytesIO(r.read()))

    tr = _fetch_mat("http://www.gaussianprocess.org/gpml/data/sarcos_inv.mat")
    te = _fetch_mat("http://www.gaussianprocess.org/gpml/data/sarcos_inv_test.mat")

    Xtr, Ytr = tr["sarcos_inv"][:, :21], tr["sarcos_inv"][:, 21:]
    Xte, Yte = te["sarcos_inv_test"][:, :21], te["sarcos_inv_test"][:, 21:]
    X = np.vstack([Xtr, Xte]); Y = np.vstack([Ytr, Yte])

    cols_X = [f"x{i}" for i in range(21)]
    cols_Y = [f"y{i+1}" for i in range(7)]
    df = pd.DataFrame(np.hstack([X, Y]), columns=cols_X + cols_Y)

    assert target in df.columns, f"target must be one of {cols_Y}"
    if sample is not None and len(df) > sample:
        df = df.sample(n=sample, random_state=seed).reset_index(drop=True)
    return df.astype(np.float32), target

# 1) mfeat-fourier (ID 14) — multi-class, numeric
mfeat_fourier_df, mfeat_fourier_y, _, _ = load_openml_dataset(14)  # target = 'class'
mf_X = mfeat_fourier_df.drop(columns=[mfeat_fourier_y]).astype(float)
mf_y = mfeat_fourier_df[mfeat_fourier_y].astype('category').cat.codes.values  # 0..9

# 2) credit-g (ID 31) — binary, cat-heavy
creditg_df, creditg_y, _, _ = load_openml_dataset(31)
cg_y = creditg_df[creditg_y].astype('category').cat.codes.values
cg_X = creditg_df.drop(columns=[creditg_y])
cg_cat_cols = [c for c in cg_X.columns if cg_X[c].dtype == 'object' or str(cg_X[c].dtype).startswith('category')]
cg_num_cols = [c for c in cg_X.columns if c not in cg_cat_cols]

# 3) SARCOS — regression; choose one torque target (y1..y7). No time-aware CV needed.
sarcos_df, sarcos_target = load_sarcos_single(target="y1", sample=None, seed=42)
bh_y = sarcos_df[sarcos_target].values.astype(np.float32)         # keep names used later
bh_X = sarcos_df.drop(columns=[sarcos_target]).astype(np.float32) # all numeric

print('Loaded datasets:')
print('mfeat-fourier:', mf_X.shape, 'classes:', len(np.unique(mf_y)))
print('credit-g    :', cg_X.shape, f'(cats={len(cg_cat_cols)}, nums={len(cg_num_cols)})')
print('sarcos(y1)  :', bh_X.shape)


Loaded datasets:
mfeat-fourier: (2000, 76) classes: 10
credit-g    : (1000, 20) (cats=13, nums=7)
sarcos(y1)  : (48933, 27)


## Classical ML Baselines (XGBoost / LightGBM / CatBoost)

**Goal.** Strong tree-based baselines across all three datasets with correct categorical handling.

**Configuration**

- **XGBoost**: tree-based booster (e.g., `gbtree`), early stopping via validation AUC/RMSE, tune `max_depth`, `eta`, `subsample`, `colsample_bytree`.
- **LightGBM**: use `categorical_feature` for credit-g; gradient-based one-side sampling is fine; tune `num_leaves`, `feature_fraction`, `bagging_fraction`, `min_data_in_leaf`.
- **CatBoost**: pass raw categoricals for credit-g; automatic target statistics; control `depth`, `learning_rate`, `l2_leaf_reg`.

**Reporting**

- **mfeat-fourier**: macro-F1, OvR/AUC (optionally accuracy).  
- **credit-g**: ROC-AUC (primary), macro-F1.  
- **SARCOS**: RMSE/MAE (per target or y1 primary).

> Ensure consistent CV (e.g., 5-fold stratified for classification) and log the mean ± std for each metric.

In [5]:

# --- Classical ML baselines: XGBoost, LightGBM, CatBoost ---
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np, pandas as pd

import warnings
warnings.filterwarnings(
    "ignore",
    message=r"X does not have valid feature names, but .* was fitted with feature names",
    category=UserWarning,
    module=r"sklearn\.utils\.validation",
)

# Helper to build pipelines
def build_cls_pipeline_xgb(cat_cols: List[str], num_cols: List[str], multiclass=False):
    from xgboost import XGBClassifier
    ohe = ColumnTransformer([('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)],
                            remainder='passthrough')
    obj = 'multi:softprob' if multiclass else 'binary:logistic'
    clf = XGBClassifier(
        objective=obj, n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, tree_method='hist', eval_metric='logloss', n_jobs=-1
    )
    return Pipeline([('prep', ohe), ('clf', clf)])

from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline

from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline

def build_cls_pipeline_lgbm(cat_cols, num_cols, multiclass=False):
    objective = 'multiclass' if multiclass else 'binary'

    if len(cat_cols) == 0:
        clf = LGBMClassifier(
            objective=objective,
            n_estimators=1200,
            learning_rate=0.03,
            num_leaves=255,
            min_data_in_leaf=5,
            min_gain_to_split=0.0,
            feature_fraction=1.0,
            bagging_fraction=1.0,
            max_depth=-1,
            force_col_wise=True,
            n_jobs=-1,
            verbosity=-1,      # <- silence LightGBM logs
        )
        return Pipeline([('clf', clf)])

    import pandas as pd
    class LGBMNativeCats:
        def __init__(self):
            self.cat_cols = list(cat_cols)
            self.num_cols = list(num_cols)
            self.columns = self.cat_cols + self.num_cols
            self.model = LGBMClassifier(
                objective=objective,
                n_estimators=1200,
                learning_rate=0.03,
                num_leaves=255,
                min_data_in_leaf=5,
                min_gain_to_split=0.0,
                feature_fraction=1.0,
                bagging_fraction=1.0,
                max_depth=-1,
                force_col_wise=True,
                n_jobs=-1,
                verbosity=-1,    # <- silence LightGBM logs
            )
        def _prep(self, X):
            Xdf = X[self.columns].copy()
            for c in self.cat_cols:
                Xdf[c] = Xdf[c].astype('category')
            return Xdf
        def fit(self, X, y):
            Xdf = self._prep(X)
            cat_idx = [Xdf.columns.get_loc(c) for c in self.cat_cols]
            self.model.fit(Xdf, y, categorical_feature=cat_idx)
            return self
        def predict_proba(self, X):
            Xdf = self._prep(X)
            return self.model.predict_proba(Xdf)
        def predict(self, X):
            Xdf = self._prep(X)
            return self.model.predict(Xdf)
    return LGBMNativeCats()

def build_cls_pipeline_catboost(cat_cols: List[str], num_cols: List[str], multiclass=False):
    from catboost import CatBoostClassifier
    # CatBoost can ingest raw categoricals; we pass indices at fit time
    class CatBoostWrapper:
        def __init__(self):
            self.cat_cols = cat_cols
            self.num_cols = num_cols
            self.columns = cat_cols + num_cols
            self.model = CatBoostClassifier(
                loss_function='MultiClass' if multiclass else 'Logloss',
                eval_metric='MultiClass' if multiclass else 'AUC',
                depth=8, learning_rate=0.05, iterations=1500, verbose=False
            )
        def fit(self, X, y):
            Xdf = X[self.columns].copy()
            cat_idx = [Xdf.columns.get_loc(c) for c in self.cat_cols]
            self.model.fit(Xdf, y, cat_features=cat_idx)
            return self
        def predict_proba(self, X):
            Xdf = X[self.columns].copy()
            return self.model.predict_proba(Xdf)
        def predict(self, X):
            Xdf = X[self.columns].copy()
            return self.model.predict(Xdf).astype(int)
    return CatBoostWrapper()

def build_reg_pipeline_xgb():
    from xgboost import XGBRegressor
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    reg = XGBRegressor(
        n_estimators=1200, learning_rate=0.03, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, tree_method='hist',
        objective='reg:squarederror', n_jobs=-1
    )
    return Pipeline([('scaler', scaler), ('reg', reg)])

def build_reg_pipeline_lgbm():
    from lightgbm import LGBMRegressor
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    reg = LGBMRegressor(
        n_estimators=1500, learning_rate=0.03, num_leaves=63,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1
    )
    return Pipeline([('scaler', scaler), ('reg', reg)])

def build_reg_pipeline_catboost():
    from catboost import CatBoostRegressor
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    reg = CatBoostRegressor(
        depth=8, learning_rate=0.05, iterations=2000, verbose=False,
        loss_function='RMSE'
    )
    return Pipeline([('scaler', scaler), ('reg', reg)])

print("Baselines ready.")


Baselines ready.


## Tabular Deep Learning (DL)

We evaluate **TabPFN** and **TabNet** on all datasets. Training prefers **GPU** when available.

### TabPFN
- Treat as a strong out-of-the-box DL baseline for classification.  
- Prepare inputs: numeric standardized, categoricals ordinal-encoded (embeddings on the DL side).  
- Metrics and splits mirror baselines for comparability.

### TabNet
- Use `TabNetClassifier` / `TabNetRegressor` with `device_name="cuda"` when available.  
- Typical hyperparams: `n_d`, `n_a`, `n_steps`, `gamma`, `lambda_sparse`, early stopping on validation metric.  
- For SARCOS, use multi-output regression or train per-target models.

In [34]:
import numpy as np

# ----- helpers: robust constructors (GPU + ignore caps) -----
def _make_tabpfn():
    from tabpfn import TabPFNClassifier
    # try common arg names across versions
    ensemble_keys = ("N_ensemble_configurations", "N_ensembles", "n_ensembles")
    base_kwargs_list = [
        # some versions accept 'devices', others 'device'
        dict(ignore_pretraining_limits=True, devices=["cuda"]),
        dict(ignore_pretraining_limits=True, device="cuda"),
        dict(ignore_pretraining_limits=True),  # fallback, will auto-pick device
    ]
    last_err = None
    for ens_key in ensemble_keys:
        for base_kwargs in base_kwargs_list:
            try:
                return TabPFNClassifier(**{ens_key: 32, **base_kwargs})
            except TypeError as e:
                last_err = e
                continue
    # final fallback: no ensemble kw, minimal args
    try:
        return TabPFNClassifier(ignore_pretraining_limits=True)
    except TypeError:
        # absolute minimal fallback
        return TabPFNClassifier()

def _make_tabpfn_reg():
    from tabpfn import TabPFNRegressor
    ensemble_keys = ("N_ensemble_configurations", "N_ensembles", "n_ensembles")
    base_kwargs_list = [
        dict(ignore_pretraining_limits=True, devices=["cuda"]),
        dict(ignore_pretraining_limits=True, device="cuda"),
        dict(ignore_pretraining_limits=True),
    ]
    for ens_key in ensemble_keys:
        for base_kwargs in base_kwargs_list:
            try:
                return TabPFNRegressor(**{ens_key: 16, **base_kwargs})
            except TypeError:
                continue
    try:
        return TabPFNRegressor(ignore_pretraining_limits=True)
    except TypeError:
        return TabPFNRegressor()

# ----- optional (recommended): cap only the TRAIN fold size to 10k, keep full validation -----
def _cap_train_fold(tr_idx, y, max_train=10_000, random_state=42):
    if tr_idx.size <= max_train:
        return tr_idx
    rng = np.random.default_rng(random_state)
    # stratified subsample for classification, uniform for regression (y can be float)
    if np.issubdtype(y.dtype, np.integer) and y.ndim == 1:
        # stratified: sample per class proportionally
        capped = []
        _, counts = np.unique(y[tr_idx], return_counts=True)
        target_per_class = np.floor(counts / counts.sum() * max_train).astype(int)
        # ensure at least 1 per present class
        target_per_class[target_per_class == 0] = 1
        # adjust rounding drift
        diff = max_train - target_per_class.sum()
        # add remainder to the largest classes
        if diff > 0:
            order = np.argsort(-counts)
            for k in order[:diff]:
                target_per_class[k] += 1
        classes = np.unique(y[tr_idx])
        for cls, take in zip(classes, target_per_class):
            cls_idx = tr_idx[y[tr_idx] == cls]
            sel = rng.choice(cls_idx, size=min(take, cls_idx.size), replace=False)
            capped.append(sel)
        return np.concatenate(capped)
    else:
        # regression: uniform sample
        return rng.choice(tr_idx, size=max_train, replace=False)

# ----- your CV loops (patched) -----
def run_tabpfn_cv(X, y, n_splits=5, is_multiclass=False, random_state=42, cap_train_to_10k=True):
    """Stratified CV for classification with GPU + cap handling."""
    if not TABPFN_OK:
        return None
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # ensure contiguous labels 0..C-1 and float32 features
    y = np.asarray(y)
    # map labels to 0..C-1 for safety
    _, y_mapped = np.unique(y, return_inverse=True)
    y = y_mapped.astype(int)
    Xnp = X.to_numpy(dtype=np.float32, copy=False) if hasattr(X, "to_numpy") else np.asarray(X, dtype=np.float32)

    metrics = []
    for tr, te in skf.split(Xnp, y):
        if cap_train_to_10k:
            tr_eff = _cap_train_fold(tr, y, max_train=10_000, random_state=random_state)
        else:
            tr_eff = tr
        clf = _make_tabpfn()
        clf.fit(Xnp[tr_eff], y[tr_eff])
        proba = clf.predict_proba(Xnp[te])
        m = cls_metrics(y[te], proba, is_multiclass=is_multiclass)
        metrics.append(m)

    out = {k: float(np.nanmean([m[k] for m in metrics])) for k in metrics[0]}
    out_std = {k + "_std": float(np.nanstd([m[k] for m in metrics])) for k in metrics[0]}
    out.update(out_std)
    return out

def run_tabpfn_reg_cv(X, y, n_splits=5, random_state=42, cap_train_to_10k=True):
    """KFold CV for regression with GPU + cap handling."""
    if not TABPFN_REG_OK:
        return None
    from sklearn.model_selection import KFold
    Xnp = X.to_numpy(dtype=np.float32, copy=False) if hasattr(X, "to_numpy") else np.asarray(X, dtype=np.float32)
    ynp = np.asarray(y, dtype=np.float32).ravel()

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    mets = []
    for tr, te in kf.split(Xnp):
        if cap_train_to_10k:
            tr_eff = _cap_train_fold(tr, ynp, max_train=10_000, random_state=random_state)
        else:
            tr_eff = tr
        reg = _make_tabpfn_reg()
        reg.fit(Xnp[tr_eff], ynp[tr_eff])
        yhat = np.asarray(reg.predict(Xnp[te]), dtype=np.float32).ravel()
        mets.append(reg_metrics(ynp[te], yhat))

    out = {k: float(np.mean([m[k] for m in mets])) for k in mets[0]}
    out_std = {k + "_std": float(np.std([m[k] for m in mets])) for k in mets[0]}
    out.update(out_std)
    return out
    
print("TabPFN block ready.")

TabPFN block ready.


In [39]:
import warnings
from tqdm.auto import tqdm  
warnings.filterwarnings(
    "ignore",
    message="Best weights from best epoch are automatically used!",
    module=r"pytorch_tabnet\.callbacks"
)

warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module=r"pytorch_tabnet\.callbacks"
)

mf_scaler = StandardScaler()
mf_X_std = mf_scaler.fit_transform(mf_X.values)

# --- TabNet: classification & regression ---
try:
    from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
    import torch
    TABNET_OK = True
except Exception as e:
    TABNET_OK = False
    print("TabNet not available:", e)

def _tabnet_cv_cls(X, y, n_splits=5, is_multiclass=False, seed=42):
    if not TABNET_OK: return None
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    metrics = []
    for tr, te in skf.split(X, y):
        model = TabNetClassifier(
            n_d=32, n_a=32, n_steps=5, gamma=1.3,
            lambda_sparse=1e-4, optimizer_params=dict(lr=2e-2),
            verbose=0
        )
        metric = ['auc'] if not is_multiclass else ['accuracy']
        model.fit(
            X_train=X[tr], y_train=y[tr],
            eval_set=[(X[te], y[te])], eval_name=['val'],
            eval_metric=metric,
            max_epochs=150, patience=20, batch_size=1024, virtual_batch_size=128
        )
        proba = model.predict_proba(X[te])
        m = cls_metrics(y[te], proba, is_multiclass=is_multiclass)
        metrics.append(m)
    import numpy as np
    out = {k: float(np.nanmean([m[k] for m in metrics])) for k in metrics[0]}
    out_std = {k+'_std': float(np.nanstd([m[k] for m in metrics])) for k in metrics[0]}
    out.update(out_std)
    return out

# --- tqdm helpers for notebook/text ---
import sys
try:
    from tqdm.notebook import tqdm as _tqdm
    _WIDGETS = True
except Exception:
    from tqdm import tqdm as _tqdm
    _WIDGETS = False

def ptqdm(disable=True, *args, **kwargs):
    kwargs.setdefault("dynamic_ncols", True)
    if not _WIDGETS:
        kwargs.setdefault("file", sys.stdout)  # avoid pink stderr spam
    return _tqdm(disable=True, *args, **kwargs)

from pytorch_tabnet.callbacks import Callback

class TqdmEpochCallback(Callback):
    def __init__(self, total_epochs:int, desc:str):
        super().__init__()
        self.total = total_epochs
        self.desc = desc
        self.pbar = None
    def on_train_begin(self, logs=None):
        self.pbar = ptqdm(disable=True, total=self.total, desc=self.desc, leave=False)
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        show = {}
        for k in ("loss","val_0_rmse","val_0_mae","lr"):
            if k in logs:
                try: show[k] = f"{logs[k]:.4f}"
                except Exception: show[k] = str(logs[k])
        self.pbar.update(1)
        if show: self.pbar.set_postfix(show)
    def on_train_end(self, logs=None):
        if self.pbar: self.pbar.close()

def _tabnet_cv_reg(X, y, n_splits=5, seed=42):
    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.preprocessing import StandardScaler
    try:
        from pytorch_tabnet.tab_model import TabNetRegressor
        import torch
    except Exception:
        return None

    cfg = dict(
        n_d=32, n_a=32, n_steps=5, gamma=1.5, lambda_sparse=1e-4,
        lr=1e-3, batch=512, vbatch=128, max_epochs=300, patience=50
    )

    X = np.asarray(X, dtype=np.float32)
    y = np.asarray(y, dtype=np.float32).reshape(-1, 1)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    mets = []

    splits = list(kf.split(X))
    assert isinstance(splits, list) and len(splits) > 0, "No CV splits produced."

    for fold_idx, (tr, te) in enumerate(
        tqdm(splits, desc="TabNet CV (folds)", leave=False, disable=False)
    ):
        y_scaler = StandardScaler().fit(y[tr])
        ytr_s = y_scaler.transform(y[tr])

        model = TabNetRegressor(
            n_d=cfg["n_d"], n_a=cfg["n_a"], n_steps=cfg["n_steps"],
            gamma=cfg["gamma"], lambda_sparse=cfg["lambda_sparse"],
            seed=seed, verbose=0,  # keep TabNet quiet; tqdm shows progress
            optimizer_fn=torch.optim.AdamW,
            optimizer_params=dict(lr=cfg["lr"], weight_decay=1e-5),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            scheduler_params=dict(mode="min", factor=0.5, patience=10, min_lr=1e-5),
        )

        # epoch bar callback
        cb = TqdmEpochCallback(total_epochs=cfg["max_epochs"], desc=f"fold {fold_idx+1} (epochs)")

        model.fit(
            X_train=X[tr], y_train=ytr_s,
            eval_set=[(X[te], y_scaler.transform(y[te]))], eval_name=['val'],
            eval_metric=['rmse'],
            max_epochs=cfg["max_epochs"], patience=cfg["patience"],
            batch_size=cfg["batch"], virtual_batch_size=cfg["vbatch"],
            callbacks=[cb],
        )

        y_hat = y_scaler.inverse_transform(model.predict(X[te])).ravel()
        mets.append(reg_metrics(y[te].ravel(), y_hat))

    out = {k: float(np.mean([m[k] for m in mets])) for k in mets[0]}
    out_std = {k+'_std': float(np.std([m[k] for m in mets], ddof=1)) for k in mets[0]}
    out.update(out_std)
    out["config"] = cfg
    return out

print("TabNet block ready.")

TabNet block ready.


In [40]:
def cv_eval_cls_df(model, X, y, multiclass: bool = False, n_splits: int = 5, random_state=None):
    """
    Cross-validated classification metrics for a given sklearn Pipeline/estimator.
    Returns a dict like {'roc_auc': ..., 'acc': ..., 'macro_f1': ...}.
    - Works for binary and multi-class (OVR) using predict_proba.
    - Encodes non-numeric targets once to stable integer codes.
    """
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

    if random_state is None:
        random_state = globals().get("SEED", 42)

    # Ensure X is a DataFrame for nice slicing; y as a 1D array of ints if not already
    X_df = pd.DataFrame(X) if not hasattr(X, "iloc") else X
    if not np.issubdtype(pd.Series(y).dtype, np.integer):
        y_series = pd.Series(y).astype("category")
        y_enc = y_series.cat.codes.to_numpy()
    else:
        y_enc = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    aucs, accs, f1s = [], [], []

    for tr_idx, va_idx in skf.split(np.zeros(len(y_enc)), y_enc):
        Xtr, Xva = X_df.iloc[tr_idx], X_df.iloc[va_idx]
        ytr, yva = y_enc[tr_idx], y_enc[va_idx]

        fitted = model.fit(Xtr, ytr)

        # probabilities
        proba = fitted.predict_proba(Xva)
        n_classes = proba.shape[1]

        if multiclass or n_classes > 2:
            roc = roc_auc_score(yva, proba, multi_class="ovr")
            pred = proba.argmax(axis=1)
        else:
            roc = roc_auc_score(yva, proba[:, 1])
            pred = (proba[:, 1] >= 0.5).astype(int)

        aucs.append(float(roc))
        accs.append(float(accuracy_score(yva, pred)))
        f1s.append(float(f1_score(yva, pred, average="macro")))

    return {
        "roc_auc": float(np.mean(aucs)),
        "acc": float(np.mean(accs)),
        "macro_f1": float(np.mean(f1s)),
    }

def cv_eval_reg_pipe(pipe_builder, X, y, folds: int = 5, random_state=None, **builder_kwargs):
    """
    Cross-validated regression metrics for a pipeline BUILDER function.
    Returns mean and std across folds: RMSE, RMSE_std, MAE, MAE_std.
    """
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_squared_error, mean_absolute_error

    if random_state is None:
        random_state = globals().get("SEED", 42)

    if hasattr(X, "iloc"):
        X_df = X.copy()
    else:
        X = np.asarray(X)
        X_df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

    cat_cols = [c for c in X_df.columns if X_df[c].dtype.name in ("object", "category")]
    num_cols = [c for c in X_df.columns if c not in cat_cols]

    try:
        pipe = pipe_builder(cat_cols=cat_cols, num_cols=num_cols, **builder_kwargs)
    except TypeError:
        pipe = pipe_builder()

    kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    rmses, maes = [], []
    y_arr = np.asarray(y)

    for tr_idx, va_idx in kf.split(X_df):
        Xtr, Xva = X_df.iloc[tr_idx], X_df.iloc[va_idx]
        ytr, yva = y_arr[tr_idx], y_arr[va_idx]

        fitted = pipe.fit(Xtr, ytr)
        pred = fitted.predict(Xva)

        rmses.append(float(np.sqrt(mean_squared_error(yva, pred))))
        maes.append(float(mean_absolute_error(yva, pred)))

    return {
        "RMSE": float(np.mean(rmses)),
        "RMSE_std": float(np.std(rmses, ddof=1)) if len(rmses) > 1 else 0.0,
        "MAE": float(np.mean(maes)),
        "MAE_std": float(np.std(maes, ddof=1)) if len(maes) > 1 else 0.0,
    }

if "results" not in globals() or not isinstance(globals().get("results"), dict):
    results = {}

## Results for mfeat-fourier

In [9]:
# === RUN + PRINT + SAVE: mfeat-fourier (multi-class) ===
import warnings, numpy as np, pandas as pd
dset_name = "mfeat-fourier"

# ---- run classical
res_mf = {}
print("[mfeat] Running XGBoost...")
res_mf['xgb'] = cv_eval_cls_df(build_cls_pipeline_xgb(cat_cols=[], num_cols=[], multiclass=True), mf_X, mf_y, multiclass=True)

print("[mfeat] Running LightGBM...")
res_mf['lgbm'] = cv_eval_cls_df(build_cls_pipeline_lgbm(cat_cols=[], num_cols=[], multiclass=True), mf_X, mf_y, multiclass=True)

print("[mfeat] Running CatBoost...")
res_mf['catboost'] = cv_eval_cls_df(build_cls_pipeline_catboost([], list(mf_X.columns), multiclass=True), mf_X, mf_y, multiclass=True)

# ---- print + save classical
def _print_save(df_rows, fname, title):
    if not df_rows:
        print(f"No results for {title}."); return None
    df = pd.DataFrame(df_rows)
    metric_cols = [c for c in df.columns if c not in ("dataset", "model")]
    df = df[["dataset", "model"] + sorted(metric_cols)]
    df_show = df.copy()
    for c in metric_cols:
        if pd.api.types.is_numeric_dtype(df_show[c]):
            df_show[c] = df_show[c].astype(float).round(4)
    print(f"\n=== {title} ===")
    print(df_show.to_string(index=False))

    # save here
    out = RESULTS_DIR / fname
    df.to_csv(out, index=False)
    print(f"Saved CSV -> {out.resolve()}")
    return df

rows_classical = []
for k in ["xgb","lgbm","catboost"]:
    if isinstance(res_mf.get(k), dict):
        rows_classical.append({"dataset": dset_name, "model": k, **res_mf[k]})
_ = _print_save(rows_classical, "classical_mfeat_fourier.csv", "Classical models on mfeat-fourier")

# ---- run DL
print("[mfeat] Running TabPFN...")
res_mf['tabpfn'] = run_tabpfn_cv(mf_X, mf_y, n_splits=5, is_multiclass=True)

print("[mfeat] Running TabNet...")
res_mf['tabnet'] = _tabnet_cv_cls(mf_X_std.astype(np.float32), mf_y, n_splits=5, is_multiclass=True)

# ---- print + save DL
rows_dl = []
for k in ["tabpfn","tabnet"]:
    if isinstance(res_mf.get(k), dict):
        rows_dl.append({"dataset": dset_name, "model": k, **res_mf[k]})
_ = _print_save(rows_dl, "dl_mfeat_fourier.csv", "DL models on mfeat-fourier (TabPFN, TabNet)")

# stash to global results
results['mfeat-fourier'] = res_mf
print("[mfeat] Done.")


[mfeat] Running XGBoost...
[mfeat] Running LightGBM...
[mfeat] Running CatBoost...

=== Classical models on mfeat-fourier ===
      dataset    model    acc  macro_f1  roc_auc
mfeat-fourier      xgb 0.8400    0.8396   0.9828
mfeat-fourier     lgbm 0.8290    0.8278   0.9808
mfeat-fourier catboost 0.8415    0.8406   0.9840
Saved CSV -> /home/kutaytire/RL_training/results/classical_mfeat_fourier.csv
[mfeat] Running TabPFN...
[mfeat] Running TabNet...

Early stopping occurred at epoch 105 with best_epoch = 85 and best_val_accuracy = 0.72

Early stopping occurred at epoch 91 with best_epoch = 71 and best_val_accuracy = 0.705

Early stopping occurred at epoch 127 with best_epoch = 107 and best_val_accuracy = 0.71

Early stopping occurred at epoch 115 with best_epoch = 95 and best_val_accuracy = 0.71
Stop training because you reached max_epochs = 150 with best_epoch = 134 and best_val_accuracy = 0.77

=== DL models on mfeat-fourier (TabPFN, TabNet) ===
      dataset  model   acc  acc_std  macr

## Results for credit-g

In [10]:
# === RUN + PRINT + SAVE: credit-g (binary) ===
import warnings, numpy as np, pandas as pd
warnings.filterwarnings("ignore", message="Best weights from best epoch are automatically used!", module=r"pytorch_tabnet\.callbacks")

dset_name = "credit-g"

# ---- run classical
res_cg = {}
print("[credit-g] Running XGBoost...")
res_cg['xgb'] = cv_eval_cls_df(build_cls_pipeline_xgb(cat_cols=cg_cat_cols, num_cols=cg_num_cols, multiclass=False), cg_X, cg_y, multiclass=False)

print("[credit-g] Running LightGBM...")
res_cg['lgbm'] = cv_eval_cls_df(build_cls_pipeline_lgbm(cat_cols=cg_cat_cols, num_cols=cg_num_cols, multiclass=False), cg_X, cg_y, multiclass=False)

print("[credit-g] Running CatBoost...")
res_cg['catboost'] = cv_eval_cls_df(build_cls_pipeline_catboost(cat_cols=cg_cat_cols, num_cols=cg_num_cols, multiclass=False), cg_X, cg_y, multiclass=False)

def _print_save(df_rows, fname, title):
    if not df_rows:
        print(f"No results for {title}."); return None
    df = pd.DataFrame(df_rows)
    metric_cols = [c for c in df.columns if c not in ("dataset", "model")]
    df = df[["dataset", "model"] + sorted(metric_cols)]
    df_show = df.copy()
    for c in metric_cols:
        if pd.api.types.is_numeric_dtype(df_show[c]):
            df_show[c] = df_show[c].astype(float).round(4)
    print(f"\n=== {title} ===")
    print(df_show.to_string(index=False))

    # save here
    out = RESULTS_DIR / fname
    df.to_csv(out, index=False)
    print(f"Saved CSV -> {out.resolve()}")
    return df

rows_classical = []
for k in ["xgb","lgbm","catboost"]:
    if isinstance(res_cg.get(k), dict):
        rows_classical.append({"dataset": dset_name, "model": k, **res_cg[k]})
_ = _print_save(rows_classical, "classical_credit_g.csv", "Classical models on credit-g")

# ---- prep for DL (ordinal cats + scale nums)
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
cg_X_ord = cg_X.copy()
if len(cg_cat_cols) > 0:
    cg_X_ord[cg_cat_cols] = enc.fit_transform(cg_X_ord[cg_cat_cols])
scaler_cg = StandardScaler()
cg_X_scaled = cg_X_ord.copy()
if len(cg_num_cols) > 0:
    cg_X_scaled[cg_num_cols] = scaler_cg.fit_transform(cg_X_scaled[cg_num_cols])

# ---- run DL
print("[credit-g] Running TabPFN...")
res_cg['tabpfn'] = run_tabpfn_cv(cg_X_ord, cg_y, n_splits=5, is_multiclass=False)

print("[credit-g] Running TabNet...")
res_cg['tabnet'] = _tabnet_cv_cls(cg_X_scaled.values.astype(np.float32), cg_y, n_splits=5, is_multiclass=False)

rows_dl = []
for k in ["tabpfn","tabnet"]:
    if isinstance(res_cg.get(k), dict):
        rows_dl.append({"dataset": dset_name, "model": k, **res_cg[k]})
_ = _print_save(rows_dl, "dl_credit_g.csv", "DL models on credit-g (TabPFN, TabNet)")

# stash to global results
results['credit-g'] = res_cg
print("[credit-g] Done.")


[credit-g] Running XGBoost...
[credit-g] Running LightGBM...
[credit-g] Running CatBoost...

=== Classical models on credit-g ===
 dataset    model   acc  macro_f1  roc_auc
credit-g      xgb 0.762    0.6973   0.7766
credit-g     lgbm 0.718    0.6294   0.6974
credit-g catboost 0.760    0.6860   0.7816
Saved CSV -> /home/kutaytire/RL_training/results/classical_credit_g.csv
[credit-g] Running TabPFN...
[credit-g] Running TabNet...

Early stopping occurred at epoch 20 with best_epoch = 0 and best_val_auc = 0.48982

Early stopping occurred at epoch 20 with best_epoch = 0 and best_val_auc = 0.46458

Early stopping occurred at epoch 20 with best_epoch = 0 and best_val_auc = 0.50054

Early stopping occurred at epoch 20 with best_epoch = 0 and best_val_auc = 0.40125

Early stopping occurred at epoch 20 with best_epoch = 0 and best_val_auc = 0.46387

=== DL models on credit-g (TabPFN, TabNet) ===
 dataset  model   acc  acc_std  macro_f1  macro_f1_std  roc_auc  roc_auc_std
credit-g tabpfn 0.753  

## Results for SARCOS

In [None]:
# === RUN + PRINT + SAVE: SARCOS (regression) ===
import warnings, numpy as np, pandas as pd
from pathlib import Path
warnings.filterwarnings("ignore", message="Best weights from best epoch are automatically used!", module=r"pytorch_tabnet\.callbacks")
warnings.filterwarnings("ignore", message="X does not have valid feature names")

# dataset name uses whichever torque you picked (y1..y7)
dset_name = f"sarcos_{globals().get('sarcos_target','y1')}"

# results dir
RESULTS_DIR = Path("./results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Standardize features for TabNet (helps stability on SARCOS)
from sklearn.preprocessing import StandardScaler
scaler_sar = StandardScaler()
bh_X_std = scaler_sar.fit_transform(bh_X.values)  # reuses bh_X from your SARCOS loader cell

# ---- Run classical (builders + cv_eval_reg_pipe assumed defined with stds) ----
res_bh = {}
print(f"[{dset_name}] Running XGBoost...")
res_bh['xgb'] = cv_eval_reg_pipe(build_reg_pipeline_xgb, bh_X.values, bh_y, folds=5)

print(f"[{dset_name}] Running LightGBM...")
res_bh['lgbm'] = cv_eval_reg_pipe(build_reg_pipeline_lgbm, bh_X.values, bh_y, folds=5)

print(f"[{dset_name}] Running CatBoost...")
res_bh['catboost'] = cv_eval_reg_pipe(build_reg_pipeline_catboost, bh_X.values, bh_y, folds=5)

# unified printer
def _print_save(df_rows, fname, title):
    if not df_rows:
        print(f"No results for {title}."); return None
    df = pd.DataFrame(df_rows)
    metric_cols = [c for c in df.columns if c not in ("dataset", "model")]
    df = df[["dataset", "model"] + sorted(metric_cols)]
    df_show = df.copy()
    for c in metric_cols:
        if pd.api.types.is_numeric_dtype(df_show[c]):
            df_show[c] = df_show[c].astype(float).round(4)
    print(f"\n=== {title} ===")
    print(df_show.to_string(index=False))
    out = RESULTS_DIR / fname
    df.to_csv(out, index=False)
    print(f"Saved CSV -> {out.resolve()}")
    return df

# ---- print & save classical ----
rows_classical = []
for k in ["xgb","lgbm","catboost"]:
    if isinstance(res_bh.get(k), dict):
        rows_classical.append({"dataset": dset_name, "model": k, **res_bh[k]})
_ = _print_save(rows_classical, f"classical_{dset_name}.csv", f"Classical models on {dset_name} (RMSE/MAE ± std)")

# ---- DL: TabNet (uses your stabilized CV helper) ----
print(f"[{dset_name}] Running TabNet...")
res_bh['tabnet'] = _tabnet_cv_reg(bh_X_std, bh_y, n_splits=5)
#print(f"[{dset_name}] Best TabNet config: {res_bh['tabnet'].get('config')}")

# ---- DL: TabPFN Regressor (if available) ----
print(f"[{dset_name}] Running TabPFN-Regressor...")
try:
    res_bh['tabpfn'] = run_tabpfn_reg_cv(bh_X.values, bh_y, n_splits=5)
except NameError:
    print("run_tabpfn_reg_cv not found. Make sure you ran the TabPFN CV cell.")
    res_bh['tabpfn'] = None

# ---- print & save DL ----
rows_dl = []
for k in ["tabnet","tabpfn"]:
    if isinstance(res_bh.get(k), dict):
        row = {"dataset": dset_name, "model": k, **{kk: vv for kk, vv in res_bh[k].items() if kk != "config"}}
        rows_dl.append(row)
_ = _print_save(rows_dl, f"dl_{dset_name}.csv", f"DL models on {dset_name} (RMSE/MAE ± std)")

# stash to global results
if "results" not in globals() or not isinstance(globals().get("results"), dict):
    results = {}
results[dset_name] = res_bh
print(f"[{dset_name}] Done.")


[sarcos_y1] Running XGBoost...
[sarcos_y1] Running LightGBM...
[sarcos_y1] Running CatBoost...

=== Classical models on sarcos_y1 (RMSE/MAE ± std) ===
  dataset    model    MAE  MAE_std   RMSE  RMSE_std
sarcos_y1      xgb 1.5322   0.0145 2.2181    0.0453
sarcos_y1     lgbm 1.3980   0.0154 2.0665    0.0290
sarcos_y1 catboost 1.3388   0.0079 1.9641    0.0279
Saved CSV -> /home/kutaytire/RL_training/results/classical_sarcos_y1.csv
[sarcos_y1] Running TabNet...


TabNet CV (folds):   0%|          | 0/5 [00:00<?, ?it/s]

## Ensembling (DL + Tree Baseline)

**Task.** For **one dataset** (**credit-g**), combine a DL model (e.g., **TabPFN**) with a tree model (e.g., **XGBoost**).

In [11]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV

# Expect these to exist: cg_X (DataFrame), cg_y (array-like 0/1),
# cg_cat_cols, cg_num_cols, build_cls_pipeline_xgb

# -- TabPFN availability
try:
    from tabpfn import TabPFNClassifier
    _TABPFN_OK = True
except Exception as e:
    _TABPFN_OK = False
    raise RuntimeError(f"TabPFN not available: {e}")

SEED = globals().get("SEED", 42)

# -- helper: encode DataFrame for TabPFN
def _encode_for_tabpfn(df: pd.DataFrame) -> np.ndarray:
    enc = pd.DataFrame(index=df.index)
    for c in df.columns:
        if df[c].dtype in ("object", "category"):
            col = df[c].astype("category")
            enc[c] = (col.cat.codes + 1).astype("int32")  # NaN -> -1 -> 0
        else:
            col = pd.to_numeric(df[c], errors="coerce").astype("float32")
            if col.isna().any():
                col = col.fillna(col.median())
            enc[c] = col
    return np.ascontiguousarray(enc.values, dtype=np.float32)

X_tab = _encode_for_tabpfn(cg_X)
y_int  = pd.Series(cg_y).astype("category").cat.codes.to_numpy()  # ensure 0/1 ints

# -- robust TabPFN constructor
def _make_tabpfn(n_ens=64):
    for k in ("N_ensemble_configurations", "N_ensembles", "n_ensembles"):
        try:
            return TabPFNClassifier(**{k: int(n_ens)})
        except TypeError:
            continue
    return TabPFNClassifier()

# -- base XGB pipeline
xgb_base = build_cls_pipeline_xgb(cat_cols=cg_cat_cols, num_cols=cg_num_cols, multiclass=False)

# -- metrics helper
def _metrics(y_true, proba):
    pred = proba.argmax(1)
    return dict(
        roc_auc=float(roc_auc_score(y_true, proba[:,1])),
        acc=float(accuracy_score(y_true, pred)),
        macro_f1=float(f1_score(y_true, pred, average="macro")),
    )

# Shared outer folds across all runs
outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# ===============================
# 1) XGBoost (CALIBRATED) — OOF
# ===============================
proba_xgbcal_oof = np.zeros((len(y_int), 2), dtype=np.float32)
for tr, te in outer.split(np.zeros(len(y_int)), y_int):
    # Calibrate XGB on the outer-train split (3-fold isotonic)
    cal_full = CalibratedClassifierCV(clone(xgb_base), method="isotonic", cv=3)
    cal_full.fit(cg_X.iloc[tr], y_int[tr])
    proba_xgbcal_oof[te] = cal_full.predict_proba(cg_X.iloc[te])
m_xgb_cal = _metrics(y_int, proba_xgbcal_oof)

# =========================
# 2) TabPFN only — OOF
# =========================
proba_tpfn_oof = np.zeros((len(y_int), 2), dtype=np.float32)
for tr, te in outer.split(np.zeros(len(y_int)), y_int):
    clf = _make_tabpfn(n_ens=64)
    clf.fit(X_tab[tr], y_int[tr])
    proba_tpfn_oof[te] = clf.predict_proba(X_tab[te])
m_tpfn = _metrics(y_int, proba_tpfn_oof)

# ===============================================
# 3) Stacking: TabPFN + CALIBRATED XGB — nested
# ===============================================
inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED+1)
proba_stack_oof = np.zeros((len(y_int), 2), dtype=np.float32)

for tr_idx, te_idx in outer.split(np.zeros(len(y_int)), y_int):
    X_tr_df, X_te_df = cg_X.iloc[tr_idx], cg_X.iloc[te_idx]
    y_tr, y_te       = y_int[tr_idx], y_int[te_idx]
    X_tr_tab, X_te_tab = X_tab[tr_idx], X_tab[te_idx]

    # inner OOF for meta features (CALIBRATED XGB)
    oof_tab = np.zeros((len(tr_idx), 2), dtype=np.float32)
    oof_xgb = np.zeros((len(tr_idx), 2), dtype=np.float32)

    for itr, iva in inner.split(np.zeros(len(tr_idx)), y_tr):
        # ---- calibrated XGB on inner training split
        cal = CalibratedClassifierCV(clone(xgb_base), method="isotonic", cv=3)
        cal.fit(X_tr_df.iloc[itr], y_tr[itr])
        oof_xgb[iva] = cal.predict_proba(X_tr_df.iloc[iva])

        # ---- TabPFN on encoded arrays
        mt = _make_tabpfn(n_ens=64)
        mt.fit(X_tr_tab[itr], y_tr[itr])
        oof_tab[iva] = mt.predict_proba(X_tr_tab[iva])

    # meta-learner on inner OOF probs (use raw probs; logits also work well)
    X_meta = np.c_[oof_tab[:,1], oof_xgb[:,1]]
    meta = LogisticRegression(max_iter=1000)
    meta.fit(X_meta, y_tr)

    # refit bases on full outer-train and predict outer-test
    cal_full = CalibratedClassifierCV(clone(xgb_base), method="isotonic", cv=3)
    cal_full.fit(X_tr_df, y_tr)
    p_xgb_te = cal_full.predict_proba(X_te_df)[:, 1]

    mt_full = _make_tabpfn(n_ens=64).fit(X_tr_tab, y_tr)
    p_tab_te = mt_full.predict_proba(X_te_tab)[:, 1]

    p_meta_te = meta.predict_proba(np.c_[p_tab_te, p_xgb_te])[:, 1]
    proba_stack_oof[te_idx] = np.c_[1 - p_meta_te, p_meta_te]

m_stack = _metrics(y_int, proba_stack_oof)

# =========================
# Print comparison + deltas
# =========================
def _fmt(d): return {k: round(v, 4) for k, v in d.items()}
print("XGBoost (OOF):   ", _fmt(m_xgb_cal))
print("TabPFN (OOF):                ", _fmt(m_tpfn))
print("Stacking [TabPFN + XGB]:  ", _fmt(m_stack))

def _delta(base, stacked):
    return {k: round(stacked[k] - base[k], 4) for k in base}

print("\nΔ vs XGB:", _delta(m_xgb_cal, m_stack))
print("Δ vs TabPFN:  ", _delta(m_tpfn, m_stack))


XGBoost (OOF):    {'roc_auc': 0.7798, 'acc': 0.769, 'macro_f1': 0.6773}
TabPFN (OOF):                 {'roc_auc': 0.7979, 'acc': 0.77, 'macro_f1': 0.7041}
Stacking [TabPFN + XGB]:   {'roc_auc': 0.799, 'acc': 0.772, 'macro_f1': 0.6927}

Δ vs XGB: {'roc_auc': 0.0192, 'acc': 0.003, 'macro_f1': 0.0154}
Δ vs TabPFN:   {'roc_auc': 0.0012, 'acc': 0.002, 'macro_f1': -0.0114}


## Fine-Tuning

Fine-tune the same TABNet model on different datasets:
  - Train on credit-g dataset.
  - Fine-tune on mfeat-fourier dataset.

To match the feature size, we use PCA to select the dominant ones.

In [12]:
## -- Importing essential libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [13]:
## -- PCA function to match input dimensionality of both datasets
# Expect these to exist: cg_X_scaled, mf_X_std
def pca_function(x1, x2):
  n_features_1 = x1.shape[1]
  n_features_2 = x2.shape[1]

  print(f"Input dimensions:")
  print(f"  X1: {x1.shape} ({n_features_1} features)")
  print(f"  X2: {x2.shape} ({n_features_2} features)")
  if n_features_1 == n_features_2:
      print(f"\nBoth datasets have same number of features ({n_features_1})")
      print("No PCA needed!")
      return x1, x2
  elif n_features_1 > n_features_2:
      X_high = x1
      X_low = x2
      n_high = n_features_1
      n_low = n_features_2
      print(f"\nX1 has MORE features ({n_features_1} > {n_features_2})")
      print(f"Applying PCA to X1...")
  else:
      X_high = x2
      X_low = x1
      n_high = n_features_2
      n_low = n_features_1
      print(f"\nX2 has MORE features ({n_features_2} > {n_features_1})")
      print(f"Applying PCA to X2...")

  pca = PCA(n_components=n_low, random_state=42)
  X_high_reduced = pca.fit_transform(X_high)

  print(f"\nPCA complete!")
  if n_features_1 > n_features_2:
    print(f"  X1: {x1.shape}->{X_high_reduced.shape}")
    print(f"  X2: {x2.shape}->{X_low.shape}")
  else:
    print(f"  X1: {x1.shape}->{X_low.shape}")
    print(f"  X2: {x2.shape}->{X_high_reduced.shape}")

  variance_explained = pca.explained_variance_ratio_.sum()
  print(f"Total variance explained: {variance_explained*100:.2f}%")
  return X_high_reduced, X_low

x1, x2 = pca_function(mf_X_std, cg_X_scaled)

Input dimensions:
  X1: (2000, 76) (76 features)
  X2: (1000, 20) (20 features)

X1 has MORE features (76 > 20)
Applying PCA to X1...

PCA complete!
  X1: (2000, 76)->(2000, 20)
  X2: (1000, 20)->(1000, 20)
Total variance explained: 66.65%


In [14]:
## -- Splitting data
def split_data(X, y, test_size=0.2, random_state=42):

  x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)

  return x_train, x_test, y_train, y_test

xtrain1, xtest1, ytrain1, ytest1 = split_data(x2.to_numpy(), cg_y, test_size=0.2, random_state=42)
xtrain2, xtest2, ytrain2, ytest2 = split_data(x1, mf_y, test_size=0.2, random_state=42)
print('Done split_data.')

Done split_data.


In [15]:
## -- Pre-training TABNet function
def _tabnet_singletrain_cls(X, y, xtest, ytest, val_size=0.125, ckpt_path="tabnet_A", is_multiclass=False, seed=42, save_path=False):
    if not TABNET_OK:
        return None

    sss = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=seed)
    tr_idx, va_idx = next(sss.split(X, y))
    Xtr, Xva = X[tr_idx], X[va_idx]
    ytr, yva = y[tr_idx], y[va_idx]

    metric = ['auc'] if not is_multiclass else ['accuracy']

    model = TabNetClassifier(
        n_d=32, n_a=32, n_steps=5, gamma=1.3,
        lambda_sparse=1e-4,
        optimizer_params=dict(lr=2e-2),
        seed=seed, verbose=0
    )
    model.fit(
        X_train=Xtr, y_train=ytr,
        eval_set=[(Xva, yva)], eval_name=['val'],
        eval_metric=metric,
        max_epochs=250, patience=20,
        batch_size=1024, virtual_batch_size=128
    )

    if save_path:
      model.save_model(ckpt_path)

    proba = model.predict_proba(xtest)
    m = cls_metrics(ytest, proba, is_multiclass=is_multiclass)
    return {k: float(m[k]) for k in m}

In [16]:
## -- Fine-tuning TABNet function
def _tabnet_finetune_cls(X, y, xtest, ytest, val_size=0.125, ckpt_path="tabnet_A", is_multiclass=False, seed=42):
    if not TABNET_OK:
        return None

    sss = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=seed)
    tr_idx, va_idx = next(sss.split(X, y))
    Xtr, Xva = X[tr_idx], X[va_idx]
    ytr, yva = y[tr_idx], y[va_idx]

    model = TabNetClassifier(
        n_d=32, n_a=32, n_steps=5, gamma=1.3,
        lambda_sparse=1e-4,
        optimizer_params=dict(lr=2e-3),
        seed=seed, verbose=0
    )

    load_path = ckpt_path if ckpt_path.endswith(".zip") else (ckpt_path + ".zip")
    print(f"Loading pre-trained model from {load_path}...")
    model.load_model(load_path)
    print("Done.")

    weights_tr = compute_sample_weight(class_weight="balanced", y=ytr)

    metric = ['auc'] if not is_multiclass else ['accuracy']
    model.fit(
        X_train=Xtr, y_train=ytr,
        eval_set=[(Xva, yva)], eval_name=['val'],
        eval_metric=metric,
        max_epochs=250, patience=20,
        batch_size=1024, virtual_batch_size=128,
        weights=weights_tr
    )

    proba = model.predict_proba(xtest)
    m = cls_metrics(ytest, proba, is_multiclass=is_multiclass)
    return {k: float(m[k]) for k in m}

In [17]:
## -- Running the pre-trained, fine-tuned, and baseline models
print(f"Pre-training on credit-g dataset...")
Pre_cg = _tabnet_singletrain_cls(xtrain1, ytrain1, xtest1, ytest1, val_size=0.2, is_multiclass=False, seed=42, save_path=True)
print(f"\nFine-tuning on mfeat-fourier dataset...")
FT_mf = _tabnet_finetune_cls(xtrain2, ytrain2, xtest2, ytest2, val_size=0.2, is_multiclass=True, seed=42)
print(f"\nBaseline on credit-g dataset...")
Baseline_mf = _tabnet_singletrain_cls(xtrain2, ytrain2, xtest2, ytest2, val_size=0.2, is_multiclass=True, seed=42)
print("\nDone.")

Pre-training on credit-g dataset...

Early stopping occurred at epoch 20 with best_epoch = 0 and best_val_auc = 0.53795
Successfully saved model at tabnet_A.zip

Fine-tuning on mfeat-fourier dataset...
Loading pre-trained model from tabnet_A.zip...
Done.

Early stopping occurred at epoch 39 with best_epoch = 19 and best_val_accuracy = 0.56875

Baseline on credit-g dataset...

Early stopping occurred at epoch 112 with best_epoch = 92 and best_val_accuracy = 0.6875

Done.


In [18]:
## -- Results of fine-tuning
results_df = pd.DataFrame({
    'Fine-tuned': FT_mf,
    'Baseline': Baseline_mf}).T
results_df.loc['Difference'] = results_df.loc['Fine-tuned'] - results_df.loc['Baseline']
print("\tResults of fine-tuned vs baseline:")
print(results_df)

	Results of fine-tuned vs baseline:
             roc_auc  macro_f1     acc
Fine-tuned  0.855368  0.482453  0.5000
Baseline    0.935868  0.669167  0.6725
Difference -0.080500 -0.186714 -0.1725


## Transfer Learning (TransTab)

**Goal.** Train on dataset **A** and transfer to **B** for at least **two** pairs. TransTab can handle **unaligned or partially aligned schemas**.

**Protocol**
1. **Pretrain** on A with checkpoint saving.  
2. **Build TransTab** model from checkpoint and update schema: `{'cat': ..., 'num': ..., 'bin': ...}`.  
3. **Transfer-train** on B using GPU

**Reporting**
- Compare **transfer vs. training-from-scratch** on B.  
- Include mean ± std over folds

Transfer learning is performed using the following two pair of datasets:
1. heart-statlog (ID: 49) and heart-c (ID: 53): datasets have partially aligned features.
2. monks-problems-1 (ID: 333) and monks-problems-2 (ID: 335): datasets have fully aligned features.

These datasets are selected from the TabPFN paper as they have matching features for transfer-learning.

In [11]:
## -- Importing essential libraries
!pip install -q transtab
import transtab
import os
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
if not hasattr(np, "Inf"):
    np.Inf = np.inf

In [12]:
## -- Divide columns into categorical, numerical, and binary
def column_cat(X):
  cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
  num_cols = X.select_dtypes(include=['number']).columns.tolist()
  bin_cols = [c for c in X.columns if X[c].nunique(dropna=False) == 2]

  cat_cols = [c for c in cat_cols if c not in bin_cols]
  num_cols = [c for c in num_cols if c not in bin_cols]
  return cat_cols, num_cols, bin_cols

## -- Loading datasets from opem_ml
def load_dataset(id):
  d_df, d_df_y, _, _ = load_openml_dataset(id)
  d_df = d_df.dropna()
  d_X = d_df.drop(columns=[d_df_y])
  d_y = d_df[d_df_y].astype('category').cat.codes
  d_y = d_y.squeeze() if hasattr(d_y, 'squeeze') else np.asarray(d_y).ravel()
  if id == 49:
    d_X["slope"] = (d_X["slope"].astype(str).str.lower().map({"up":1,"down":3, "flat":2}).astype("int8"))
    d_X["thal"] = (d_X["thal"].astype(str).str.lower().map({"normal":3,"fixed_defect":6,"reversable_defect":7}).astype("int8"))
    d_X["sex"] = (d_X["sex"].astype(str).str.lower().map({"male":1,"female":0}).astype("int8"))
    d_X["fbs"] = d_X['fbs'].astype(str).str.strip().str.lower().map({'t': 1, 'f': 0})
    d_X["exang"] = (d_X["exang"].astype(str).str.lower().map({"yes":1,"no":0}).astype("int8"))
  cat_cols, num_cols, bin_cols = column_cat(d_X)
  if id == 333 or id == 334 or id == 335:
    for col in cat_cols:
      d_X[col] = d_X[col].astype(str)
    for col in bin_cols:
      d_X[col] = d_X[col].astype(int)
  return d_X, d_y, cat_cols, num_cols, bin_cols

d_X1, d_y1, cat_cols1, num_cols1, bin_cols1 = load_dataset(53)
d_X2, d_y2, cat_cols2, num_cols2, bin_cols2 = load_dataset(49)
d_X3, d_y3, cat_cols3, num_cols3, bin_cols3 = load_dataset(333)
d_X4, d_y4, cat_cols4, num_cols4, bin_cols4 = load_dataset(335)

print('Loaded datasets:')
print('First pair of datasets:')
print('heart-statlog:', d_X1.shape, f'(cats={len(cat_cols1)}, nums={len(num_cols1)}, bin={len(bin_cols1)})')
print('heart-c    :', d_X2.shape, f'(cats={len(cat_cols2)}, nums={len(num_cols2)}, bin={len(bin_cols2)})')
print('common features  :', (d_X1.columns.intersection(d_X2.columns)).to_list())
print('Features of first pair of dataset are partially aligned.')
print('\nSecond pair of datasets:')
print('monks-problems-1:', d_X3.shape, f'(cats={len(cat_cols3)}, nums={len(num_cols3)}, bin={len(bin_cols3)})')
print('monks-problems-2:', d_X4.shape, f'(cats={len(cat_cols4)}, nums={len(num_cols4)}, bin={len(bin_cols4)})')
print('common features :', (d_X3.columns.intersection(d_X4.columns)).to_list())
print('Features of second pair of dataset are fully aligned.')

Loaded datasets:
First pair of datasets:
heart-statlog: (270, 13) (cats=0, nums=10, bin=3)
heart-c    : (296, 13) (cats=2, nums=8, bin=3)
common features  : ['age', 'sex', 'oldpeak', 'slope', 'thal']
Features of first pair of dataset are partially aligned.

Second pair of datasets:
monks-problems-1: (556, 6) (cats=4, nums=0, bin=2)
monks-problems-2: (554, 6) (cats=4, nums=0, bin=2)
common features : ['attr1', 'attr2', 'attr3', 'attr4', 'attr5', 'attr6']
Features of second pair of dataset are fully aligned.


In [17]:
## -- Transtab function
def cv_transtab_5fold(X, y, cat_cols, num_cols, bin_cols, file_name='',
                      num_epoch=50, batch_size=64, lr=5e-5,
                      patience=15, transfer=False, save_ckpt=False, device=DEVICE, pin_memory=True):

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accs, f1s, aucs = [], [], []

    for tr_idx, va_idx in skf.split(X, y):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
        if transfer == False and save_ckpt == True:
          model = transtab.build_classifier(cat_cols, num_cols, bin_cols, pin_memory=pin_memory, device=DEVICE)
          transtab.train(
              model,
              (X_tr, y_tr),
              (X_va, y_va),
              num_epoch=num_epoch,
              batch_size=batch_size,
              lr=lr,
              patience=patience,
              output_dir= "./" + file_name,
              verbose=0
          )
        elif transfer == False and save_ckpt == False:
          model = transtab.build_classifier(cat_cols, num_cols, bin_cols, pin_memory=pin_memory, device=DEVICE)
          transtab.train(
              model,
              (X_tr, y_tr),
              (X_va, y_va),
              num_epoch=num_epoch,
              batch_size=batch_size,
              lr=lr,
              patience=patience,
              pin_memory=True,
              verbose=0
          )
        else:
          ckpt_path = os.path.join(os.getcwd(), file_name)  # getcwd() takes no args
          model = transtab.build_classifier(
                checkpoint=ckpt_path,
                device=DEVICE,
                pin_memory=True
            )
          model.update({'cat':cat_cols,'num':num_cols,'bin':bin_cols})
          transtab.train(
              model,
              (X_tr, y_tr),
              (X_va, y_va),
              num_epoch=50,
              batch_size=64,
              lr=1e-4,
              patience=10, pin_memory=False,
              verbose=0
              )

        ypred = transtab.predict(model, X_va, y_va)
        proba = np.asarray(ypred).ravel()
        yhat = (proba.ravel() >= 0.5).astype(int)
        auc  = roc_auc_score(y_va, proba.ravel())
        f1   = f1_score(y_va, yhat)

        acc = accuracy_score(y_va, yhat)
        accs.append(acc); f1s.append(f1); aucs.append(auc)

    return {
        'acc_mean': float(np.mean(accs)), 'acc_std': float(np.std(accs, ddof=1)),
        'f1_mean':  float(np.mean(f1s)),  'f1_std':  float(np.std(f1s,  ddof=1)),
        'auc_mean': float(np.mean(aucs)), 'auc_std': float(np.std(aucs, ddof=1)),
        'folds': 5
    }

In [18]:
## -- Perform Transfer Learning on the first pai of datasets
PreTrain1 = cv_transtab_5fold(d_X1, d_y1, cat_cols1, num_cols1, bin_cols1,
                              file_name='pretrain_checkpoint1',save_ckpt=True)
TransferLearning1 = cv_transtab_5fold(d_X2, d_y2, cat_cols2, num_cols2, bin_cols2,
                                      transfer=True, file_name='pretrain_checkpoint1')
Baseline1 = cv_transtab_5fold(d_X2, d_y2, cat_cols2, num_cols2, bin_cols2)

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.817529
epoch: 0, train loss: 2.7279, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.811782
EarlyStopping counter: 1 out of 15
epoch: 1, train loss: 2.6094, lr: 0.000050, spent: 0.4 secs
epoch: 2, test auc: 0.807471
EarlyStopping counter: 2 out of 15
epoch: 2, train loss: 2.5407, lr: 0.000050, spent: 0.5 secs
epoch: 3, test auc: 0.803161
EarlyStopping counter: 3 out of 15
epoch: 3, train loss: 2.5210, lr: 0.000050, spent: 0.6 secs
epoch: 4, test auc: 0.804598
EarlyStopping counter: 4 out of 15
epoch: 4, train loss: 2.3840, lr: 0.000050, spent: 0.7 secs
epoch: 5, test auc: 0.810345
EarlyStopping counter: 5 out of 15
epoch: 5, train loss: 2.3702, lr: 0.000050, spent: 0.8 secs
epoch: 6, test auc: 0.811782
EarlyStopping counter: 6 out of 15
epoch: 6, train loss: 2.2940, lr: 0.000050, spent: 0.9 secs
epoch: 7, test auc: 0.811782
EarlyStopping counter: 7 out of 15
epoch: 7, train loss: 2.2251, lr: 0.000050, spent: 1.0 secs
epoch: 8, test auc: 0.811782
EarlyStopping 

[32m2025-10-16 20:21:48.362[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint1[0m
[32m2025-10-16 20:21:48.379[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint1[0m


epoch: 48, test auc: 0.889368
EarlyStopping counter: 12 out of 15
epoch: 48, train loss: 1.2915, lr: 0.000050, spent: 5.9 secs
epoch: 49, test auc: 0.885057
EarlyStopping counter: 13 out of 15
epoch: 49, train loss: 1.3999, lr: 0.000050, spent: 6.0 secs


[32m2025-10-16 20:21:48.614[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 6.2 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.808908
epoch: 0, train loss: 2.7217, lr: 0.000050, spent: 0.1 secs
epoch: 1, test auc: 0.827586
epoch: 1, train loss: 2.6936, lr: 0.000050, spent: 0.3 secs
epoch: 2, test auc: 0.827586
epoch: 2, train loss: 2.6019, lr: 0.000050, spent: 0.4 secs
epoch: 3, test auc: 0.821839
EarlyStopping counter: 1 out of 15
epoch: 3, train loss: 2.5609, lr: 0.000050, spent: 0.5 secs
epoch: 4, test auc: 0.818966
EarlyStopping counter: 2 out of 15
epoch: 4, train loss: 2.4978, lr: 0.000050, spent: 0.6 secs
epoch: 5, test auc: 0.820402
EarlyStopping counter: 3 out of 15
epoch: 5, train loss: 2.4455, lr: 0.000050, spent: 0.7 secs
epoch: 6, test auc: 0.824713
EarlyStopping counter: 4 out of 15
epoch: 6, train loss: 2.4415, lr: 0.000050, spent: 1.0 secs
epoch: 7, test auc: 0.826149
EarlyStopping counter: 5 out of 15
epoch: 7, train loss: 2.3717, lr: 0.000050, spent: 1.1 secs
epoch: 8, test auc: 0.826149
EarlyStopping counter: 6 out of 15
epoch: 8, train loss: 2.3049, lr: 0.000050, spent

[32m2025-10-16 20:21:54.719[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint1[0m
[32m2025-10-16 20:21:54.737[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint1[0m
[32m2025-10-16 20:21:54.888[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 6.1 secs.[0m


epoch: 49, test auc: 0.872126
EarlyStopping counter: 3 out of 15
epoch: 49, train loss: 1.4519, lr: 0.000050, spent: 6.0 secs


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.650862
epoch: 0, train loss: 2.8417, lr: 0.000050, spent: 0.1 secs
epoch: 1, test auc: 0.659483
epoch: 1, train loss: 2.7570, lr: 0.000050, spent: 0.3 secs
epoch: 2, test auc: 0.639368
EarlyStopping counter: 1 out of 15
epoch: 2, train loss: 2.6564, lr: 0.000050, spent: 0.4 secs
epoch: 3, test auc: 0.630747
EarlyStopping counter: 2 out of 15
epoch: 3, train loss: 2.6082, lr: 0.000050, spent: 0.5 secs
epoch: 4, test auc: 0.623563
EarlyStopping counter: 3 out of 15
epoch: 4, train loss: 2.5585, lr: 0.000050, spent: 0.6 secs
epoch: 5, test auc: 0.616379
EarlyStopping counter: 4 out of 15
epoch: 5, train loss: 2.4568, lr: 0.000050, spent: 0.6 secs
epoch: 6, test auc: 0.617816
EarlyStopping counter: 5 out of 15
epoch: 6, train loss: 2.4423, lr: 0.000050, spent: 0.7 secs
epoch: 7, test auc: 0.614943
EarlyStopping counter: 6 out of 15
epoch: 7, train loss: 2.3292, lr: 0.000050, spent: 0.8 secs
epoch: 8, test auc: 0.616379
EarlyStopping counter: 7 out of 15
epoch: 8, trai

[32m2025-10-16 20:21:56.884[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint1[0m


epoch: 14, test auc: 0.627874
EarlyStopping counter: 13 out of 15
epoch: 14, train loss: 1.9015, lr: 0.000050, spent: 1.7 secs
epoch: 15, test auc: 0.636494
EarlyStopping counter: 14 out of 15
epoch: 15, train loss: 1.7887, lr: 0.000050, spent: 1.8 secs
epoch: 16, test auc: 0.633621
EarlyStopping counter: 15 out of 15
early stopped


[32m2025-10-16 20:21:56.901[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint1[0m
[32m2025-10-16 20:21:57.080[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 2.1 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.862319
epoch: 0, train loss: 2.8986, lr: 0.000050, spent: 0.1 secs
epoch: 1, test auc: 0.860870
EarlyStopping counter: 1 out of 15
epoch: 1, train loss: 2.6041, lr: 0.000050, spent: 0.2 secs
epoch: 2, test auc: 0.866667
epoch: 2, train loss: 2.5971, lr: 0.000050, spent: 0.4 secs
epoch: 3, test auc: 0.868116
epoch: 3, train loss: 2.5381, lr: 0.000050, spent: 0.5 secs
epoch: 4, test auc: 0.868116
epoch: 4, train loss: 2.4853, lr: 0.000050, spent: 0.6 secs
epoch: 5, test auc: 0.868116
epoch: 5, train loss: 2.4853, lr: 0.000050, spent: 0.7 secs
epoch: 6, test auc: 0.872464
epoch: 6, train loss: 2.4222, lr: 0.000050, spent: 0.9 secs
epoch: 7, test auc: 0.872464
epoch: 7, train loss: 2.3709, lr: 0.000050, spent: 1.0 secs
epoch: 8, test auc: 0.875362
epoch: 8, train loss: 2.3206, lr: 0.000050, spent: 1.1 secs
epoch: 9, test auc: 0.878261
epoch: 9, train loss: 2.3255, lr: 0.000050, spent: 1.2 secs
epoch: 10, test auc: 0.876812
EarlyStopping counter: 1 out of 15
epoch: 10,

[32m2025-10-16 20:22:03.235[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint1[0m
[32m2025-10-16 20:22:03.252[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint1[0m


epoch: 49, test auc: 0.881159
EarlyStopping counter: 9 out of 15
epoch: 49, train loss: 1.4785, lr: 0.000050, spent: 6.0 secs


[32m2025-10-16 20:22:03.475[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 6.3 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.376812
epoch: 0, train loss: 2.8697, lr: 0.000050, spent: 0.1 secs
epoch: 1, test auc: 0.568116
epoch: 1, train loss: 2.7563, lr: 0.000050, spent: 0.3 secs
epoch: 2, test auc: 0.681159
epoch: 2, train loss: 2.7447, lr: 0.000050, spent: 0.4 secs
epoch: 3, test auc: 0.753623
epoch: 3, train loss: 2.6745, lr: 0.000050, spent: 0.5 secs
epoch: 4, test auc: 0.808696
epoch: 4, train loss: 2.6183, lr: 0.000050, spent: 0.6 secs
epoch: 5, test auc: 0.821739
epoch: 5, train loss: 2.6628, lr: 0.000050, spent: 0.8 secs
epoch: 6, test auc: 0.830435
epoch: 6, train loss: 2.6130, lr: 0.000050, spent: 0.9 secs
epoch: 7, test auc: 0.850725
epoch: 7, train loss: 2.5493, lr: 0.000050, spent: 1.0 secs
epoch: 8, test auc: 0.863768
epoch: 8, train loss: 2.4944, lr: 0.000050, spent: 1.2 secs
epoch: 9, test auc: 0.876812
epoch: 9, train loss: 2.4804, lr: 0.000050, spent: 1.3 secs
epoch: 10, test auc: 0.886957
epoch: 10, train loss: 2.4805, lr: 0.000050, spent: 1.4 secs
epoch: 11, test auc

[32m2025-10-16 20:22:08.094[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint1[0m
[32m2025-10-16 20:22:08.111[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint1[0m


epoch: 34, test auc: 0.913043
EarlyStopping counter: 14 out of 15
epoch: 34, train loss: 1.7057, lr: 0.000050, spent: 4.4 secs
epoch: 35, test auc: 0.917391
EarlyStopping counter: 15 out of 15
early stopped


[32m2025-10-16 20:22:08.294[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 4.7 secs.[0m
[32m2025-10-16 20:22:08.424[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:22:08.425[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:22:08.425[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint1[0m
[32m2025-10-16 20:22:08.456[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint1/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.936343
epoch: 0, train loss: 2.7731, lr: 0.000100, spent: 0.2 secs
epoch: 1, test auc: 0.944444
epoch: 1, train loss: 2.6060, lr: 0.000100, spent: 0.4 secs
epoch: 2, test auc: 0.939815
EarlyStopping counter: 1 out of 10
epoch: 2, train loss: 2.4853, lr: 0.000100, spent: 0.5 secs
epoch: 3, test auc: 0.940972
EarlyStopping counter: 2 out of 10
epoch: 3, train loss: 2.4416, lr: 0.000100, spent: 0.7 secs
epoch: 4, test auc: 0.943287
EarlyStopping counter: 3 out of 10
epoch: 4, train loss: 2.3743, lr: 0.000100, spent: 0.8 secs
epoch: 5, test auc: 0.944444
epoch: 5, train loss: 2.3037, lr: 0.000100, spent: 1.2 secs
epoch: 6, test auc: 0.945602
epoch: 6, train loss: 2.2453, lr: 0.000100, spent: 1.3 secs
epoch: 7, test auc: 0.949074
epoch: 7, train loss: 2.1772, lr: 0.000100, spent: 1.5 secs
epoch: 8, test auc: 0.950231
epoch: 8, train loss: 2.0909, lr: 0.000100, spent: 1.7 secs
epoch: 9, test auc: 0.952546
epoch: 9, train loss: 2.0464, lr: 0.000100, spent: 1.9 secs
epoch

[32m2025-10-16 20:22:15.299[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:22:15.317[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 33, test auc: 0.937500
EarlyStopping counter: 9 out of 10
epoch: 33, train loss: 1.3188, lr: 0.000100, spent: 6.7 secs
epoch: 34, test auc: 0.956019
EarlyStopping counter: 10 out of 10
early stopped


[32m2025-10-16 20:22:15.557[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 7.1 secs.[0m
[32m2025-10-16 20:22:15.686[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:22:15.687[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:22:15.688[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint1[0m
[32m2025-10-16 20:22:15.717[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint1/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.811298
epoch: 0, train loss: 2.6448, lr: 0.000100, spent: 0.2 secs
epoch: 1, test auc: 0.777644
EarlyStopping counter: 1 out of 10
epoch: 1, train loss: 2.4074, lr: 0.000100, spent: 0.3 secs
epoch: 2, test auc: 0.770433
EarlyStopping counter: 2 out of 10
epoch: 2, train loss: 2.3296, lr: 0.000100, spent: 0.5 secs
epoch: 3, test auc: 0.795673
EarlyStopping counter: 3 out of 10
epoch: 3, train loss: 2.2327, lr: 0.000100, spent: 0.6 secs
epoch: 4, test auc: 0.811298
EarlyStopping counter: 4 out of 10
epoch: 4, train loss: 2.1173, lr: 0.000100, spent: 0.8 secs
epoch: 5, test auc: 0.817308
epoch: 5, train loss: 2.0610, lr: 0.000100, spent: 1.2 secs
epoch: 6, test auc: 0.823317
epoch: 6, train loss: 1.9479, lr: 0.000100, spent: 1.3 secs
epoch: 7, test auc: 0.844952
epoch: 7, train loss: 1.9189, lr: 0.000100, spent: 1.5 secs
epoch: 8, test auc: 0.853365
epoch: 8, train loss: 1.8338, lr: 0.000100, spent: 1.7 secs
epoch: 9, test auc: 0.862981
epoch: 9, train loss: 1.7884, 

[32m2025-10-16 20:22:21.944[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:22:21.961[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 32, test auc: 0.896635
EarlyStopping counter: 10 out of 10
early stopped


[32m2025-10-16 20:22:22.206[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 6.5 secs.[0m
[32m2025-10-16 20:22:22.343[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:22:22.344[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:22:22.345[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint1[0m
[32m2025-10-16 20:22:22.376[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint1/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.789663
epoch: 0, train loss: 2.7441, lr: 0.000100, spent: 0.4 secs
epoch: 1, test auc: 0.772837
EarlyStopping counter: 1 out of 10
epoch: 1, train loss: 2.5255, lr: 0.000100, spent: 0.5 secs
epoch: 2, test auc: 0.750000
EarlyStopping counter: 2 out of 10
epoch: 2, train loss: 2.3610, lr: 0.000100, spent: 0.7 secs
epoch: 3, test auc: 0.754808
EarlyStopping counter: 3 out of 10
epoch: 3, train loss: 2.3534, lr: 0.000100, spent: 0.8 secs
epoch: 4, test auc: 0.771635
EarlyStopping counter: 4 out of 10
epoch: 4, train loss: 2.2446, lr: 0.000100, spent: 1.0 secs
epoch: 5, test auc: 0.765625
EarlyStopping counter: 5 out of 10
epoch: 5, train loss: 2.1252, lr: 0.000100, spent: 1.1 secs
epoch: 6, test auc: 0.774038
EarlyStopping counter: 6 out of 10
epoch: 6, train loss: 2.1316, lr: 0.000100, spent: 1.3 secs
epoch: 7, test auc: 0.789663
epoch: 7, train loss: 2.0185, lr: 0.000100, spent: 1.7 secs
epoch: 8, test auc: 0.788462
EarlyStopping counter: 1 out of 10
epoch: 8, trai

[32m2025-10-16 20:22:30.053[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:22:30.070[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 38, test auc: 0.852163
EarlyStopping counter: 9 out of 10
epoch: 38, train loss: 1.1947, lr: 0.000100, spent: 7.5 secs
epoch: 39, test auc: 0.849760
EarlyStopping counter: 10 out of 10
early stopped


[32m2025-10-16 20:22:30.251[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 7.8 secs.[0m
[32m2025-10-16 20:22:30.381[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:22:30.382[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:22:30.383[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint1[0m
[32m2025-10-16 20:22:30.412[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint1/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.784856
epoch: 0, train loss: 2.7788, lr: 0.000100, spent: 0.2 secs
epoch: 1, test auc: 0.796875
epoch: 1, train loss: 2.6024, lr: 0.000100, spent: 0.4 secs
epoch: 2, test auc: 0.801683
epoch: 2, train loss: 2.5204, lr: 0.000100, spent: 0.6 secs
epoch: 3, test auc: 0.800481
EarlyStopping counter: 1 out of 10
epoch: 3, train loss: 2.3583, lr: 0.000100, spent: 0.9 secs
epoch: 4, test auc: 0.788462
EarlyStopping counter: 2 out of 10
epoch: 4, train loss: 2.3013, lr: 0.000100, spent: 1.0 secs
epoch: 5, test auc: 0.799279
EarlyStopping counter: 3 out of 10
epoch: 5, train loss: 2.2120, lr: 0.000100, spent: 1.2 secs
epoch: 6, test auc: 0.800481
EarlyStopping counter: 4 out of 10
epoch: 6, train loss: 2.1406, lr: 0.000100, spent: 1.3 secs
epoch: 7, test auc: 0.800481
EarlyStopping counter: 5 out of 10
epoch: 7, train loss: 2.0683, lr: 0.000100, spent: 1.5 secs
epoch: 8, test auc: 0.800481
EarlyStopping counter: 6 out of 10
epoch: 8, train loss: 1.9517, lr: 0.000100, spent

[32m2025-10-16 20:22:39.854[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:22:39.872[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 47, test auc: 0.871394
EarlyStopping counter: 9 out of 10
epoch: 47, train loss: 1.0879, lr: 0.000100, spent: 9.3 secs
epoch: 48, test auc: 0.868990
EarlyStopping counter: 10 out of 10
early stopped


[32m2025-10-16 20:22:40.101[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 9.7 secs.[0m
[32m2025-10-16 20:22:40.230[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:22:40.231[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:22:40.232[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint1[0m
[32m2025-10-16 20:22:40.263[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint1/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.818510
epoch: 0, train loss: 2.7177, lr: 0.000100, spent: 0.2 secs
epoch: 1, test auc: 0.736779
EarlyStopping counter: 1 out of 10
epoch: 1, train loss: 2.5148, lr: 0.000100, spent: 0.3 secs
epoch: 2, test auc: 0.728365
EarlyStopping counter: 2 out of 10
epoch: 2, train loss: 2.3950, lr: 0.000100, spent: 0.5 secs
epoch: 3, test auc: 0.731971
EarlyStopping counter: 3 out of 10
epoch: 3, train loss: 2.2716, lr: 0.000100, spent: 0.6 secs
epoch: 4, test auc: 0.746394
EarlyStopping counter: 4 out of 10
epoch: 4, train loss: 2.1993, lr: 0.000100, spent: 0.8 secs
epoch: 5, test auc: 0.766827
EarlyStopping counter: 5 out of 10
epoch: 5, train loss: 2.0917, lr: 0.000100, spent: 1.1 secs
epoch: 6, test auc: 0.783654
EarlyStopping counter: 6 out of 10
epoch: 6, train loss: 2.0613, lr: 0.000100, spent: 1.3 secs
epoch: 7, test auc: 0.800481
EarlyStopping counter: 7 out of 10
epoch: 7, train loss: 2.0112, lr: 0.000100, spent: 1.4 secs
epoch: 8, test auc: 0.824519
epoch: 8, trai

[32m2025-10-16 20:22:46.656[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:22:46.677[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m
[32m2025-10-16 20:22:46.856[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 6.6 secs.[0m


epoch: 33, test auc: 0.890625
EarlyStopping counter: 10 out of 10
early stopped


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.842593
epoch: 0, train loss: 2.7671, lr: 0.000050, spent: 0.2 secs
epoch: 1, test auc: 0.901620
epoch: 1, train loss: 2.7315, lr: 0.000050, spent: 0.4 secs
epoch: 2, test auc: 0.912037
epoch: 2, train loss: 2.6961, lr: 0.000050, spent: 0.6 secs
epoch: 3, test auc: 0.916667
epoch: 3, train loss: 2.6444, lr: 0.000050, spent: 0.7 secs
epoch: 4, test auc: 0.924769
epoch: 4, train loss: 2.6178, lr: 0.000050, spent: 0.9 secs
epoch: 5, test auc: 0.929398
epoch: 5, train loss: 2.5650, lr: 0.000050, spent: 1.1 secs
epoch: 6, test auc: 0.913194
EarlyStopping counter: 1 out of 15
epoch: 6, train loss: 2.5272, lr: 0.000050, spent: 1.5 secs
epoch: 7, test auc: 0.910880
EarlyStopping counter: 2 out of 15
epoch: 7, train loss: 2.4713, lr: 0.000050, spent: 1.6 secs
epoch: 8, test auc: 0.917824
EarlyStopping counter: 3 out of 15
epoch: 8, train loss: 2.4676, lr: 0.000050, spent: 1.7 secs
epoch: 9, test auc: 0.916667
EarlyStopping counter: 4 out of 15
epoch: 9, train loss: 2.4277, 

[32m2025-10-16 20:22:56.360[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:22:56.377[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 49, test auc: 0.956019
EarlyStopping counter: 7 out of 15
epoch: 49, train loss: 1.4239, lr: 0.000050, spent: 9.2 secs


[32m2025-10-16 20:22:56.614[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 9.4 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.633413
epoch: 0, train loss: 3.0018, lr: 0.000050, spent: 0.2 secs
epoch: 1, test auc: 0.695913
epoch: 1, train loss: 2.7077, lr: 0.000050, spent: 0.4 secs
epoch: 2, test auc: 0.745192
epoch: 2, train loss: 2.6937, lr: 0.000050, spent: 0.5 secs
epoch: 3, test auc: 0.769231
epoch: 3, train loss: 2.6633, lr: 0.000050, spent: 0.7 secs
epoch: 4, test auc: 0.776442
epoch: 4, train loss: 2.5935, lr: 0.000050, spent: 0.9 secs
epoch: 5, test auc: 0.775240
EarlyStopping counter: 1 out of 15
epoch: 5, train loss: 2.5395, lr: 0.000050, spent: 1.1 secs
epoch: 6, test auc: 0.777644
epoch: 6, train loss: 2.5150, lr: 0.000050, spent: 1.4 secs
epoch: 7, test auc: 0.775240
EarlyStopping counter: 1 out of 15
epoch: 7, train loss: 2.4342, lr: 0.000050, spent: 1.6 secs
epoch: 8, test auc: 0.778846
epoch: 8, train loss: 2.3733, lr: 0.000050, spent: 1.8 secs
epoch: 9, test auc: 0.776442
EarlyStopping counter: 1 out of 15
epoch: 9, train loss: 2.3219, lr: 0.000050, spent: 1.9 secs
epoch

[32m2025-10-16 20:23:06.752[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:23:06.770[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m
[32m2025-10-16 20:23:06.935[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 10.0 secs.[0m


epoch: 49, test auc: 0.900240
EarlyStopping counter: 2 out of 15
epoch: 49, train loss: 1.2881, lr: 0.000050, spent: 9.8 secs


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.485577
epoch: 0, train loss: 2.8908, lr: 0.000050, spent: 0.2 secs
epoch: 1, test auc: 0.716346
epoch: 1, train loss: 2.7376, lr: 0.000050, spent: 0.4 secs
epoch: 2, test auc: 0.752404
epoch: 2, train loss: 2.6579, lr: 0.000050, spent: 0.6 secs
epoch: 3, test auc: 0.771635
epoch: 3, train loss: 2.5582, lr: 0.000050, spent: 0.7 secs
epoch: 4, test auc: 0.770433
EarlyStopping counter: 1 out of 15
epoch: 4, train loss: 2.4776, lr: 0.000050, spent: 0.9 secs
epoch: 5, test auc: 0.764423
EarlyStopping counter: 2 out of 15
epoch: 5, train loss: 2.4104, lr: 0.000050, spent: 1.0 secs
epoch: 6, test auc: 0.764423
EarlyStopping counter: 3 out of 15
epoch: 6, train loss: 2.3325, lr: 0.000050, spent: 1.4 secs
epoch: 7, test auc: 0.756010
EarlyStopping counter: 4 out of 15
epoch: 7, train loss: 2.2584, lr: 0.000050, spent: 1.5 secs
epoch: 8, test auc: 0.757212
EarlyStopping counter: 5 out of 15
epoch: 8, train loss: 2.1949, lr: 0.000050, spent: 1.7 secs
epoch: 9, test auc: 0.75

[32m2025-10-16 20:23:16.906[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m


epoch: 48, test auc: 0.873798
epoch: 48, train loss: 1.2100, lr: 0.000050, spent: 9.7 secs
epoch: 49, test auc: 0.870192
EarlyStopping counter: 1 out of 15
epoch: 49, train loss: 1.2440, lr: 0.000050, spent: 9.8 secs


[32m2025-10-16 20:23:16.924[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m
[32m2025-10-16 20:23:17.084[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 10.0 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.448317
epoch: 0, train loss: 2.9109, lr: 0.000050, spent: 0.2 secs
epoch: 1, test auc: 0.677885
epoch: 1, train loss: 2.7827, lr: 0.000050, spent: 0.4 secs
epoch: 2, test auc: 0.748798
epoch: 2, train loss: 2.7541, lr: 0.000050, spent: 0.5 secs
epoch: 3, test auc: 0.752404
epoch: 3, train loss: 2.7131, lr: 0.000050, spent: 0.7 secs
epoch: 4, test auc: 0.763221
epoch: 4, train loss: 2.7080, lr: 0.000050, spent: 0.9 secs
epoch: 5, test auc: 0.766827
epoch: 5, train loss: 2.6741, lr: 0.000050, spent: 1.3 secs
epoch: 6, test auc: 0.768029
epoch: 6, train loss: 2.6495, lr: 0.000050, spent: 1.5 secs
epoch: 7, test auc: 0.769231
epoch: 7, train loss: 2.6413, lr: 0.000050, spent: 1.7 secs
epoch: 8, test auc: 0.759615
EarlyStopping counter: 1 out of 15
epoch: 8, train loss: 2.6159, lr: 0.000050, spent: 1.8 secs
epoch: 9, test auc: 0.758413
EarlyStopping counter: 2 out of 15
epoch: 9, train loss: 2.5736, lr: 0.000050, spent: 1.9 secs
epoch: 10, test auc: 0.763221
EarlyStopp

[32m2025-10-16 20:23:26.668[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:23:26.685[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 49, test auc: 0.838942
EarlyStopping counter: 2 out of 15
epoch: 49, train loss: 1.3946, lr: 0.000050, spent: 9.4 secs


[32m2025-10-16 20:23:26.920[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 9.7 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.496394
epoch: 0, train loss: 2.7856, lr: 0.000050, spent: 0.2 secs
epoch: 1, test auc: 0.612981
epoch: 1, train loss: 2.7110, lr: 0.000050, spent: 0.4 secs
epoch: 2, test auc: 0.663462
epoch: 2, train loss: 2.7106, lr: 0.000050, spent: 0.6 secs
epoch: 3, test auc: 0.697115
epoch: 3, train loss: 2.6453, lr: 0.000050, spent: 0.7 secs
epoch: 4, test auc: 0.721154
epoch: 4, train loss: 2.6272, lr: 0.000050, spent: 1.1 secs
epoch: 5, test auc: 0.737981
epoch: 5, train loss: 2.5319, lr: 0.000050, spent: 1.3 secs
epoch: 6, test auc: 0.741587
epoch: 6, train loss: 2.4875, lr: 0.000050, spent: 1.5 secs
epoch: 7, test auc: 0.740385
EarlyStopping counter: 1 out of 15
epoch: 7, train loss: 2.3755, lr: 0.000050, spent: 1.6 secs
epoch: 8, test auc: 0.750000
epoch: 8, train loss: 2.3438, lr: 0.000050, spent: 1.8 secs
epoch: 9, test auc: 0.758413
epoch: 9, train loss: 2.2578, lr: 0.000050, spent: 2.0 secs
epoch: 10, test auc: 0.782452
epoch: 10, train loss: 2.2567, lr: 0.000050, 

[32m2025-10-16 20:23:36.857[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:23:36.874[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m
[32m2025-10-16 20:23:37.054[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 10.0 secs.[0m


epoch: 49, test auc: 0.884615
EarlyStopping counter: 11 out of 15
epoch: 49, train loss: 1.4370, lr: 0.000050, spent: 9.8 secs


In [19]:
## -- Perform Transfer Learning on the second pair of datasets
PreTrain2 = cv_transtab_5fold(d_X3, d_y3, cat_cols3, num_cols3, bin_cols3,
                              file_name='pretrain_checkpoint2',save_ckpt=True)
TransferLearning2 = cv_transtab_5fold(d_X4, d_y4, cat_cols4, num_cols4, bin_cols4,
                                      transfer=True, file_name='pretrain_checkpoint2')
Baseline2 = cv_transtab_5fold(d_X4, d_y4, cat_cols4, num_cols4, bin_cols4)

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.538149
epoch: 0, train loss: 5.1113, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.579221
epoch: 1, train loss: 4.8753, lr: 0.000050, spent: 0.5 secs
epoch: 2, test auc: 0.593506
epoch: 2, train loss: 4.8490, lr: 0.000050, spent: 0.7 secs
epoch: 3, test auc: 0.607792
epoch: 3, train loss: 4.7829, lr: 0.000050, spent: 1.0 secs
epoch: 4, test auc: 0.613474
epoch: 4, train loss: 4.7675, lr: 0.000050, spent: 1.4 secs
epoch: 5, test auc: 0.614448
epoch: 5, train loss: 4.7383, lr: 0.000050, spent: 1.7 secs
epoch: 6, test auc: 0.621266
epoch: 6, train loss: 4.7186, lr: 0.000050, spent: 1.9 secs
epoch: 7, test auc: 0.626461
epoch: 7, train loss: 4.6887, lr: 0.000050, spent: 2.2 secs
epoch: 8, test auc: 0.625162
EarlyStopping counter: 1 out of 15
epoch: 8, train loss: 4.6670, lr: 0.000050, spent: 2.4 secs
epoch: 9, test auc: 0.628571
epoch: 9, train loss: 4.6415, lr: 0.000050, spent: 2.7 secs
epoch: 10, test auc: 0.629708
epoch: 10, train loss: 4.6106, lr: 0.000050, 

[32m2025-10-16 20:23:50.333[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint2[0m
[32m2025-10-16 20:23:50.350[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint2[0m


epoch: 49, test auc: 0.682305
EarlyStopping counter: 2 out of 15
epoch: 49, train loss: 3.9282, lr: 0.000050, spent: 13.1 secs


[32m2025-10-16 20:23:50.566[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 13.4 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.663636
epoch: 0, train loss: 4.8272, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.653884
EarlyStopping counter: 1 out of 15
epoch: 1, train loss: 4.8037, lr: 0.000050, spent: 0.5 secs
epoch: 2, test auc: 0.650248
EarlyStopping counter: 2 out of 15
epoch: 2, train loss: 4.8059, lr: 0.000050, spent: 0.7 secs
epoch: 3, test auc: 0.649091
EarlyStopping counter: 3 out of 15
epoch: 3, train loss: 4.7206, lr: 0.000050, spent: 0.9 secs
epoch: 4, test auc: 0.660331
EarlyStopping counter: 4 out of 15
epoch: 4, train loss: 4.7093, lr: 0.000050, spent: 1.3 secs
epoch: 5, test auc: 0.659835
EarlyStopping counter: 5 out of 15
epoch: 5, train loss: 4.6753, lr: 0.000050, spent: 1.5 secs
epoch: 6, test auc: 0.661818
EarlyStopping counter: 6 out of 15
epoch: 6, train loss: 4.6693, lr: 0.000050, spent: 1.7 secs
epoch: 7, test auc: 0.661818
EarlyStopping counter: 7 out of 15
epoch: 7, train loss: 4.6474, lr: 0.000050, spent: 2.0 secs
epoch: 8, test auc: 0.666281
epoch: 8, trai

[32m2025-10-16 20:24:03.306[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint2[0m
[32m2025-10-16 20:24:03.323[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint2[0m


epoch: 49, test auc: 0.702810
EarlyStopping counter: 9 out of 15
epoch: 49, train loss: 3.9702, lr: 0.000050, spent: 12.6 secs


[32m2025-10-16 20:24:03.526[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 12.8 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.616033
epoch: 0, train loss: 4.8654, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.674380
epoch: 1, train loss: 4.8958, lr: 0.000050, spent: 0.5 secs
epoch: 2, test auc: 0.719008
epoch: 2, train loss: 4.7971, lr: 0.000050, spent: 0.8 secs
epoch: 3, test auc: 0.722975
epoch: 3, train loss: 4.8167, lr: 0.000050, spent: 1.0 secs
epoch: 4, test auc: 0.717521
EarlyStopping counter: 1 out of 15
epoch: 4, train loss: 4.7530, lr: 0.000050, spent: 1.4 secs
epoch: 5, test auc: 0.716364
EarlyStopping counter: 2 out of 15
epoch: 5, train loss: 4.7429, lr: 0.000050, spent: 1.6 secs
epoch: 6, test auc: 0.712893
EarlyStopping counter: 3 out of 15
epoch: 6, train loss: 4.7161, lr: 0.000050, spent: 1.8 secs
epoch: 7, test auc: 0.718843
EarlyStopping counter: 4 out of 15
epoch: 7, train loss: 4.6960, lr: 0.000050, spent: 2.0 secs
epoch: 8, test auc: 0.717355
EarlyStopping counter: 5 out of 15
epoch: 8, train loss: 4.6779, lr: 0.000050, spent: 2.3 secs
epoch: 9, test auc: 0.70

[32m2025-10-16 20:24:16.506[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint2[0m
[32m2025-10-16 20:24:16.524[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint2[0m


epoch: 49, test auc: 0.777521
EarlyStopping counter: 3 out of 15
epoch: 49, train loss: 4.0643, lr: 0.000050, spent: 12.8 secs


[32m2025-10-16 20:24:16.761[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 13.1 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.510417
epoch: 0, train loss: 5.1910, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.641700
epoch: 1, train loss: 4.9697, lr: 0.000050, spent: 0.5 secs
epoch: 2, test auc: 0.623016
EarlyStopping counter: 1 out of 15
epoch: 2, train loss: 4.8536, lr: 0.000050, spent: 0.7 secs
epoch: 3, test auc: 0.620370
EarlyStopping counter: 2 out of 15
epoch: 3, train loss: 4.8207, lr: 0.000050, spent: 1.1 secs
epoch: 4, test auc: 0.618882
EarlyStopping counter: 3 out of 15
epoch: 4, train loss: 4.7825, lr: 0.000050, spent: 1.3 secs
epoch: 5, test auc: 0.629795
EarlyStopping counter: 4 out of 15
epoch: 5, train loss: 4.7284, lr: 0.000050, spent: 1.5 secs
epoch: 6, test auc: 0.638062
EarlyStopping counter: 5 out of 15
epoch: 6, train loss: 4.6944, lr: 0.000050, spent: 1.8 secs
epoch: 7, test auc: 0.640046
EarlyStopping counter: 6 out of 15
epoch: 7, train loss: 4.6695, lr: 0.000050, spent: 2.0 secs
epoch: 8, test auc: 0.649967
epoch: 8, train loss: 4.6463, lr: 0.000050, spent

[32m2025-10-16 20:24:29.800[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint2[0m
[32m2025-10-16 20:24:29.818[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint2[0m


epoch: 49, test auc: 0.763558
EarlyStopping counter: 2 out of 15
epoch: 49, train loss: 4.0611, lr: 0.000050, spent: 12.9 secs


[32m2025-10-16 20:24:30.038[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 13.1 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.605820
epoch: 0, train loss: 4.8167, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.593750
EarlyStopping counter: 1 out of 15
epoch: 1, train loss: 4.7562, lr: 0.000050, spent: 0.5 secs
epoch: 2, test auc: 0.593915
EarlyStopping counter: 2 out of 15
epoch: 2, train loss: 4.7062, lr: 0.000050, spent: 0.7 secs
epoch: 3, test auc: 0.588459
EarlyStopping counter: 3 out of 15
epoch: 3, train loss: 4.7017, lr: 0.000050, spent: 1.1 secs
epoch: 4, test auc: 0.591435
EarlyStopping counter: 4 out of 15
epoch: 4, train loss: 4.6452, lr: 0.000050, spent: 1.3 secs
epoch: 5, test auc: 0.590443
EarlyStopping counter: 5 out of 15
epoch: 5, train loss: 4.6226, lr: 0.000050, spent: 1.5 secs
epoch: 6, test auc: 0.591931
EarlyStopping counter: 6 out of 15
epoch: 6, train loss: 4.5934, lr: 0.000050, spent: 1.7 secs
epoch: 7, test auc: 0.592758
EarlyStopping counter: 7 out of 15
epoch: 7, train loss: 4.5832, lr: 0.000050, spent: 2.0 secs
epoch: 8, test auc: 0.593750
EarlyStopping 

[32m2025-10-16 20:24:34.044[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./pretrain_checkpoint2[0m
[32m2025-10-16 20:24:34.061[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./pretrain_checkpoint2[0m


epoch: 15, test auc: 0.601521
EarlyStopping counter: 15 out of 15
early stopped


[32m2025-10-16 20:24:34.301[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 4.1 secs.[0m
[32m2025-10-16 20:24:34.446[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:24:34.447[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:24:34.447[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint2[0m
[32m2025-10-16 20:24:34.478[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint2/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.802885
epoch: 0, train loss: 4.7648, lr: 0.000100, spent: 0.3 secs
epoch: 1, test auc: 0.810842
epoch: 1, train loss: 4.5930, lr: 0.000100, spent: 0.7 secs
epoch: 2, test auc: 0.815153
epoch: 2, train loss: 4.3734, lr: 0.000100, spent: 1.0 secs
epoch: 3, test auc: 0.816479
epoch: 3, train loss: 4.2106, lr: 0.000100, spent: 1.2 secs
epoch: 4, test auc: 0.818634
epoch: 4, train loss: 3.9683, lr: 0.000100, spent: 1.5 secs
epoch: 5, test auc: 0.821950
epoch: 5, train loss: 3.7641, lr: 0.000100, spent: 1.7 secs
epoch: 6, test auc: 0.854775
epoch: 6, train loss: 3.5534, lr: 0.000100, spent: 2.0 secs
epoch: 7, test auc: 0.860908
epoch: 7, train loss: 3.3435, lr: 0.000100, spent: 2.2 secs
epoch: 8, test auc: 0.865219
epoch: 8, train loss: 3.0710, lr: 0.000100, spent: 2.6 secs
epoch: 9, test auc: 0.871353
epoch: 9, train loss: 3.0247, lr: 0.000100, spent: 2.9 secs
epoch: 10, test auc: 0.876658
epoch: 10, train loss: 2.8208, lr: 0.000100, spent: 3.1 secs
epoch: 11, test auc

[32m2025-10-16 20:24:39.970[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:24:39.987[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 20, test auc: 0.863727
EarlyStopping counter: 10 out of 10
early stopped


[32m2025-10-16 20:24:40.225[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 5.7 secs.[0m
[32m2025-10-16 20:24:40.363[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:24:40.364[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:24:40.365[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint2[0m
[32m2025-10-16 20:24:40.395[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint2/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.756963
epoch: 0, train loss: 4.7419, lr: 0.000100, spent: 0.3 secs
epoch: 1, test auc: 0.756466
EarlyStopping counter: 1 out of 10
epoch: 1, train loss: 4.4536, lr: 0.000100, spent: 0.7 secs
epoch: 2, test auc: 0.759781
epoch: 2, train loss: 4.2365, lr: 0.000100, spent: 0.9 secs
epoch: 3, test auc: 0.771718
epoch: 3, train loss: 4.0506, lr: 0.000100, spent: 1.2 secs
epoch: 4, test auc: 0.773707
epoch: 4, train loss: 3.8150, lr: 0.000100, spent: 1.4 secs
epoch: 5, test auc: 0.757626
EarlyStopping counter: 1 out of 10
epoch: 5, train loss: 3.6104, lr: 0.000100, spent: 1.7 secs
epoch: 6, test auc: 0.807029
epoch: 6, train loss: 3.3627, lr: 0.000100, spent: 1.9 secs
epoch: 7, test auc: 0.813992
epoch: 7, train loss: 3.1306, lr: 0.000100, spent: 2.1 secs
epoch: 8, test auc: 0.819131
epoch: 8, train loss: 2.9406, lr: 0.000100, spent: 2.6 secs
epoch: 9, test auc: 0.822944
epoch: 9, train loss: 2.8209, lr: 0.000100, spent: 2.8 secs
epoch: 10, test auc: 0.824934
epoch: 10,

[32m2025-10-16 20:24:51.505[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:24:51.522[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 42, test auc: 0.835710
EarlyStopping counter: 10 out of 10
early stopped


[32m2025-10-16 20:24:51.761[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 11.3 secs.[0m
[32m2025-10-16 20:24:51.904[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:24:51.905[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:24:51.906[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint2[0m
[32m2025-10-16 20:24:51.936[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint2/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.772878
epoch: 0, train loss: 4.7138, lr: 0.000100, spent: 0.3 secs
epoch: 1, test auc: 0.799403
epoch: 1, train loss: 4.4969, lr: 0.000100, spent: 0.7 secs
epoch: 2, test auc: 0.804708
epoch: 2, train loss: 4.3255, lr: 0.000100, spent: 1.0 secs
epoch: 3, test auc: 0.827752
epoch: 3, train loss: 4.1290, lr: 0.000100, spent: 1.2 secs
epoch: 4, test auc: 0.839191
epoch: 4, train loss: 3.9131, lr: 0.000100, spent: 1.5 secs
epoch: 5, test auc: 0.848806
epoch: 5, train loss: 3.8191, lr: 0.000100, spent: 1.7 secs
epoch: 6, test auc: 0.881134
epoch: 6, train loss: 3.5419, lr: 0.000100, spent: 2.0 secs
epoch: 7, test auc: 0.889423
epoch: 7, train loss: 3.3658, lr: 0.000100, spent: 2.2 secs
epoch: 8, test auc: 0.893070
epoch: 8, train loss: 3.1594, lr: 0.000100, spent: 2.5 secs
epoch: 9, test auc: 0.892407
EarlyStopping counter: 1 out of 10
epoch: 9, train loss: 3.0038, lr: 0.000100, spent: 2.9 secs
epoch: 10, test auc: 0.893568
epoch: 10, train loss: 2.8913, lr: 0.000100, 

[32m2025-10-16 20:24:59.911[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:24:59.929[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 30, test auc: 0.897546
EarlyStopping counter: 10 out of 10
early stopped


[32m2025-10-16 20:25:00.169[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 8.2 secs.[0m
[32m2025-10-16 20:25:00.301[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:25:00.302[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:25:00.302[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint2[0m
[32m2025-10-16 20:25:00.331[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint2/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.773585
epoch: 0, train loss: 4.6277, lr: 0.000100, spent: 0.5 secs
epoch: 1, test auc: 0.777888
epoch: 1, train loss: 4.5104, lr: 0.000100, spent: 0.7 secs
epoch: 2, test auc: 0.783515
epoch: 2, train loss: 4.3565, lr: 0.000100, spent: 0.9 secs
epoch: 3, test auc: 0.800397
epoch: 3, train loss: 4.0909, lr: 0.000100, spent: 1.2 secs
epoch: 4, test auc: 0.814465
epoch: 4, train loss: 3.8884, lr: 0.000100, spent: 1.4 secs
epoch: 5, test auc: 0.826216
epoch: 5, train loss: 3.7096, lr: 0.000100, spent: 1.7 secs
epoch: 6, test auc: 0.861470
epoch: 6, train loss: 3.4733, lr: 0.000100, spent: 2.2 secs
epoch: 7, test auc: 0.867097
epoch: 7, train loss: 3.2578, lr: 0.000100, spent: 2.4 secs
epoch: 8, test auc: 0.875041
epoch: 8, train loss: 2.9962, lr: 0.000100, spent: 2.7 secs
epoch: 9, test auc: 0.872724
EarlyStopping counter: 1 out of 10
epoch: 9, train loss: 3.0764, lr: 0.000100, spent: 2.9 secs
epoch: 10, test auc: 0.874379
EarlyStopping counter: 2 out of 10
epoch: 10,

[32m2025-10-16 20:25:08.674[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:25:08.691[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 31, test auc: 0.873717
EarlyStopping counter: 10 out of 10
early stopped


[32m2025-10-16 20:25:08.933[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 8.6 secs.[0m
[32m2025-10-16 20:25:09.071[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m774[0m - [1mmissing keys: [][0m
[32m2025-10-16 20:25:09.072[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m775[0m - [1munexpected keys: [][0m
[32m2025-10-16 20:25:09.073[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m776[0m - [1mload model from /home/kutaytire/RL_training/pretrain_checkpoint2[0m
[32m2025-10-16 20:25:09.103[0m | [1mINFO    [0m | [36mtranstab.modeling_transtab[0m:[36mload[0m:[36m222[0m - [1mload feature extractor from /home/kutaytire/RL_training/pretrain_checkpoint2/extractor/extractor.json[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.775641
epoch: 0, train loss: 4.7859, lr: 0.000100, spent: 0.3 secs
epoch: 1, test auc: 0.789642
epoch: 1, train loss: 4.5862, lr: 0.000100, spent: 0.5 secs
epoch: 2, test auc: 0.790317
epoch: 2, train loss: 4.4125, lr: 0.000100, spent: 0.8 secs
epoch: 3, test auc: 0.796559
epoch: 3, train loss: 4.2559, lr: 0.000100, spent: 1.2 secs
epoch: 4, test auc: 0.819332
epoch: 4, train loss: 4.0262, lr: 0.000100, spent: 1.5 secs
epoch: 5, test auc: 0.847503
epoch: 5, train loss: 3.8962, lr: 0.000100, spent: 1.7 secs
epoch: 6, test auc: 0.868084
epoch: 6, train loss: 3.6878, lr: 0.000100, spent: 2.0 secs
epoch: 7, test auc: 0.896255
epoch: 7, train loss: 3.4819, lr: 0.000100, spent: 2.2 secs
epoch: 8, test auc: 0.908738
epoch: 8, train loss: 3.3264, lr: 0.000100, spent: 2.5 secs
epoch: 9, test auc: 0.909413
epoch: 9, train loss: 3.0751, lr: 0.000100, spent: 2.7 secs
epoch: 10, test auc: 0.905196
EarlyStopping counter: 1 out of 10
epoch: 10, train loss: 2.9931, lr: 0.000100, 

[32m2025-10-16 20:25:16.374[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:25:16.391[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 27, test auc: 0.896930
EarlyStopping counter: 10 out of 10
early stopped


[32m2025-10-16 20:25:16.629[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 7.5 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.683190
epoch: 0, train loss: 4.7738, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.774038
epoch: 1, train loss: 4.6882, lr: 0.000050, spent: 0.5 secs
epoch: 2, test auc: 0.783156
epoch: 2, train loss: 4.6167, lr: 0.000050, spent: 0.8 secs
epoch: 3, test auc: 0.793435
epoch: 3, train loss: 4.5302, lr: 0.000050, spent: 1.0 secs
epoch: 4, test auc: 0.817308
epoch: 4, train loss: 4.4277, lr: 0.000050, spent: 1.5 secs
epoch: 5, test auc: 0.821618
epoch: 5, train loss: 4.3208, lr: 0.000050, spent: 1.7 secs
epoch: 6, test auc: 0.837367
epoch: 6, train loss: 4.1911, lr: 0.000050, spent: 2.0 secs
epoch: 7, test auc: 0.848972
epoch: 7, train loss: 4.0434, lr: 0.000050, spent: 2.2 secs
epoch: 8, test auc: 0.862401
epoch: 8, train loss: 3.8546, lr: 0.000050, spent: 2.4 secs
epoch: 9, test auc: 0.863893
epoch: 9, train loss: 3.6178, lr: 0.000050, spent: 2.7 secs
epoch: 10, test auc: 0.871187
epoch: 10, train loss: 3.3727, lr: 0.000050, spent: 2.9 secs
epoch: 11, test auc

[32m2025-10-16 20:25:23.739[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:25:23.755[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 26, test auc: 0.870027
EarlyStopping counter: 15 out of 15
early stopped


[32m2025-10-16 20:25:23.997[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 7.2 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.761439
epoch: 0, train loss: 4.8939, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.752653
EarlyStopping counter: 1 out of 15
epoch: 1, train loss: 4.7829, lr: 0.000050, spent: 0.5 secs
epoch: 2, test auc: 0.745524
EarlyStopping counter: 2 out of 15
epoch: 2, train loss: 4.6576, lr: 0.000050, spent: 0.7 secs
epoch: 3, test auc: 0.739887
EarlyStopping counter: 3 out of 15
epoch: 3, train loss: 4.5437, lr: 0.000050, spent: 0.9 secs
epoch: 4, test auc: 0.744529
EarlyStopping counter: 4 out of 15
epoch: 4, train loss: 4.4388, lr: 0.000050, spent: 1.1 secs
epoch: 5, test auc: 0.746187
EarlyStopping counter: 5 out of 15
epoch: 5, train loss: 4.3175, lr: 0.000050, spent: 1.3 secs
epoch: 6, test auc: 0.754310
EarlyStopping counter: 6 out of 15
epoch: 6, train loss: 4.2194, lr: 0.000050, spent: 1.7 secs
epoch: 7, test auc: 0.753813
EarlyStopping counter: 7 out of 15
epoch: 7, train loss: 4.0824, lr: 0.000050, spent: 1.9 secs
epoch: 8, test auc: 0.754973
EarlyStopping 

[32m2025-10-16 20:25:35.757[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:25:35.774[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 46, test auc: 0.831897
EarlyStopping counter: 15 out of 15
early stopped


[32m2025-10-16 20:25:36.013[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 11.9 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.702089
epoch: 0, train loss: 4.8416, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.806366
epoch: 1, train loss: 4.7431, lr: 0.000050, spent: 0.5 secs
epoch: 2, test auc: 0.820623
epoch: 2, train loss: 4.6606, lr: 0.000050, spent: 0.8 secs
epoch: 3, test auc: 0.836704
epoch: 3, train loss: 4.5751, lr: 0.000050, spent: 1.2 secs
epoch: 4, test auc: 0.836870
epoch: 4, train loss: 4.4711, lr: 0.000050, spent: 1.5 secs
epoch: 5, test auc: 0.842507
epoch: 5, train loss: 4.3606, lr: 0.000050, spent: 1.7 secs
epoch: 6, test auc: 0.852619
epoch: 6, train loss: 4.2036, lr: 0.000050, spent: 1.9 secs
epoch: 7, test auc: 0.877653
epoch: 7, train loss: 4.0036, lr: 0.000050, spent: 2.2 secs
epoch: 8, test auc: 0.888097
epoch: 8, train loss: 3.7528, lr: 0.000050, spent: 2.5 secs
epoch: 9, test auc: 0.893402
epoch: 9, train loss: 3.5275, lr: 0.000050, spent: 2.7 secs
epoch: 10, test auc: 0.893568
epoch: 10, train loss: 3.3194, lr: 0.000050, spent: 2.9 secs
epoch: 11, test auc

[32m2025-10-16 20:25:45.798[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:25:45.815[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 37, test auc: 0.895391
EarlyStopping counter: 15 out of 15
early stopped


[32m2025-10-16 20:25:46.057[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 9.9 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.872890
epoch: 0, train loss: 4.8115, lr: 0.000050, spent: 0.3 secs
epoch: 1, test auc: 0.863787
EarlyStopping counter: 1 out of 15
epoch: 1, train loss: 4.7619, lr: 0.000050, spent: 0.5 secs
epoch: 2, test auc: 0.822410
EarlyStopping counter: 2 out of 15
epoch: 2, train loss: 4.6174, lr: 0.000050, spent: 0.9 secs
epoch: 3, test auc: 0.822079
EarlyStopping counter: 3 out of 15
epoch: 3, train loss: 4.5146, lr: 0.000050, spent: 1.1 secs
epoch: 4, test auc: 0.819762
EarlyStopping counter: 4 out of 15
epoch: 4, train loss: 4.4108, lr: 0.000050, spent: 1.3 secs
epoch: 5, test auc: 0.819596
EarlyStopping counter: 5 out of 15
epoch: 5, train loss: 4.2894, lr: 0.000050, spent: 1.5 secs
epoch: 6, test auc: 0.823734
EarlyStopping counter: 6 out of 15
epoch: 6, train loss: 4.1289, lr: 0.000050, spent: 1.7 secs
epoch: 7, test auc: 0.830023
EarlyStopping counter: 7 out of 15
epoch: 7, train loss: 3.9752, lr: 0.000050, spent: 1.9 secs
epoch: 8, test auc: 0.840947
EarlyStopping 

[32m2025-10-16 20:25:50.037[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:25:50.054[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 15, test auc: 0.871897
EarlyStopping counter: 15 out of 15
early stopped


[32m2025-10-16 20:25:50.293[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 4.1 secs.[0m


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

epoch: 0, test auc: 0.562922
epoch: 0, train loss: 5.1729, lr: 0.000050, spent: 0.4 secs
epoch: 1, test auc: 0.763664
epoch: 1, train loss: 4.8613, lr: 0.000050, spent: 0.7 secs
epoch: 2, test auc: 0.825911
epoch: 2, train loss: 4.7430, lr: 0.000050, spent: 0.9 secs
epoch: 3, test auc: 0.838900
epoch: 3, train loss: 4.6310, lr: 0.000050, spent: 1.2 secs
epoch: 4, test auc: 0.839912
epoch: 4, train loss: 4.5405, lr: 0.000050, spent: 1.5 secs
epoch: 5, test auc: 0.851552
epoch: 5, train loss: 4.4316, lr: 0.000050, spent: 1.7 secs
epoch: 6, test auc: 0.854926
epoch: 6, train loss: 4.3167, lr: 0.000050, spent: 2.0 secs
epoch: 7, test auc: 0.869433
epoch: 7, train loss: 4.1655, lr: 0.000050, spent: 2.4 secs
epoch: 8, test auc: 0.884278
epoch: 8, train loss: 3.9868, lr: 0.000050, spent: 2.6 secs
epoch: 9, test auc: 0.894399
epoch: 9, train loss: 3.7628, lr: 0.000050, spent: 2.9 secs
epoch: 10, test auc: 0.902328
epoch: 10, train loss: 3.5238, lr: 0.000050, spent: 3.1 secs
epoch: 11, test auc

[32m2025-10-16 20:26:01.999[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m136[0m - [1mload best at last from ./ckpt[0m
[32m2025-10-16 20:26:02.016[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36msave_model[0m:[36m247[0m - [1msaving model checkpoint to ./ckpt[0m


epoch: 44, test auc: 0.893050
EarlyStopping counter: 15 out of 15
early stopped


[32m2025-10-16 20:26:02.257[0m | [1mINFO    [0m | [36mtranstab.trainer[0m:[36mtrain[0m:[36m141[0m - [1mtraining complete, cost 11.8 secs.[0m


In [20]:
## -- Preview results of TransTab
def compare_results(transfer, baseline, title="Comparison"):
    import pandas as pd

    mean_metrics = ["acc_mean", "f1_mean", "auc_mean"]
    std_metrics  = ["acc_std",  "f1_std",  "auc_std"]

    df = pd.DataFrame({
        "Metric": ["Accuracy", "F1-score", "AUC"],
        "Transfer_mean": [transfer[m] for m in mean_metrics],
        "Transfer_std":  [transfer[m] for m in std_metrics],
        "Baseline_mean": [baseline[m] for m in mean_metrics],
        "Baseline_std":  [baseline[m] for m in std_metrics],
    })

    df["Δmean"] = df["Transfer_mean"] - df["Baseline_mean"]
    df["Δstd"]  = df["Transfer_std"]  - df["Baseline_std"]

    df["Transfer (±)"] = df["Transfer_mean"].map("{:.4f}".format) + " ± " + df["Transfer_std"].map("{:.4f}".format)
    df["Baseline (±)"] = df["Baseline_mean"].map("{:.4f}".format) + " ± " + df["Baseline_std"].map("{:.4f}".format)
    df["Improvement"]  = df["Δmean"].map("{:+.4f}".format) + "  (Δstd " + df["Δstd"].map("{:+.4f}".format) + ")"

    with pd.option_context("display.precision", 4, "display.colheader_justify", "center"):
        print(f"\n{title}\n")
        print(df[["Metric", "Transfer (±)", "Baseline (±)", "Improvement"]].to_string(index=False, justify="center"))

compare_results(TransferLearning1, Baseline1, title="\tTransfer Learning vs Baseline for First Pair")
compare_results(TransferLearning2, Baseline2, title="\tTransfer Learning vs Baseline for Second Pair")


	Transfer Learning vs Baseline for First Pair

 Metric    Transfer (±)    Baseline (±)        Improvement      
Accuracy 0.8173 ± 0.0492 0.8039 ± 0.0317 +0.0134  (Δstd +0.0175)
F1-score 0.7901 ± 0.0633 0.7826 ± 0.0470 +0.0075  (Δstd +0.0163)
     AUC 0.9015 ± 0.0370 0.8931 ± 0.0453 +0.0083  (Δstd -0.0083)

	Transfer Learning vs Baseline for Second Pair

 Metric    Transfer (±)    Baseline (±)        Improvement      
Accuracy 0.8033 ± 0.0278 0.7366 ± 0.1324 +0.0667  (Δstd -0.1045)
F1-score 0.8273 ± 0.0242 0.7817 ± 0.0777 +0.0456  (Δstd -0.0536)
     AUC 0.8832 ± 0.0282 0.8809 ± 0.0278 +0.0022  (Δstd +0.0004)
