In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics.pairwise import pairwise_kernels
from xgboost import XGBRegressor
# NEW: TabPFN regressor
from tabpfn import TabPFNRegressor

# ============================================================
# Helper: training and evaluation for one split (existing methods)
# ============================================================
def train_and_evaluate(X_train, y_train, X_test, y_test, rho=0.5, beta=2):
    """
    Compare three rules for choosing the ridge penalty of an RBF kernel ridge
    regression (KRR) on a single train/test split.

    The training rows are split in order: the first ``n1 = int((1 - rho) * n)``
    rows fit one candidate KRR per penalty on a geometric grid; the remaining
    rows serve both as a hold-out set and as the training data of a lightly
    regularised KRR whose predictions on ``X_test`` act as pseudo-labels.

    Selection rules:
      * "naive"  - smallest MSE on the held-out second part of the training data
      * "pseudo" - smallest MSE against the pseudo-labels on ``X_test``
      * "real"   - smallest MSE against the true ``y_test`` (oracle)

    Parameters
    ----------
    X_train, y_train : array-like supporting ``len`` and row slicing
        Training features and targets.
    X_test, y_test : array-like
        Test features and targets.
    rho : float, default 0.5
        Fraction of the training rows assigned to the second part.
    beta : float, default 2
        Ratio between consecutive penalties in the grid; must be > 1.

    Returns
    -------
    dict
        Test MSE of the model picked by each rule ("naive", "pseudo", "real")
        and the penalty each rule picked ("lbd_naive", "lbd_pseudo", "lbd_real").

    Raises
    ------
    ValueError
        If ``rho`` leaves either part of the training split empty, or if
        ``beta <= 1`` (the geometric grid would be undefined or never reach 1).
    """
    n = len(X_train)
    n1 = int((1 - rho) * n)
    # Fail early with a clear message instead of an opaque error from the
    # estimator (empty split) or from int(inf) (beta == 1).
    if not 0 < n1 < n:
        raise ValueError(
            f"rho={rho!r} leaves an empty split for n={n} training rows "
            f"(first part would get {n1} rows); both parts must be non-empty"
        )
    if beta <= 1:
        raise ValueError(f"beta must be > 1 to build the lambda grid, got {beta!r}")

    X_train_1, y_train_1 = X_train[:n1], y_train[:n1]
    X_train_2, y_train_2 = X_train[n1:], y_train[n1:]

    # pseudo-labeling: KRR fitted on the second part with the smallest grid penalty
    lbd_tilde = 0.1 / n
    krr_tilde = KernelRidge(kernel="rbf", alpha=lbd_tilde)
    krr_tilde.fit(X_train_2, y_train_2)
    y_tilde = krr_tilde.predict(X_test)

    # lambda grid: lbd_min * beta**k, with enough terms to reach lbd_max
    lbd_min, lbd_max = 0.1 / n, 1
    m = int(np.ceil(np.log(lbd_max / lbd_min) / np.log(beta))) + 1
    Lambda = lbd_min * (beta ** np.arange(m))

    err_est_naive = np.zeros(m)
    err_est_pseudo = np.zeros(m)
    err_est_real = np.zeros(m)

    for j, lbd in enumerate(Lambda):
        krr = KernelRidge(kernel="rbf", alpha=lbd)
        krr.fit(X_train_1, y_train_1)

        err_est_naive[j] = np.mean((krr.predict(X_train_2) - y_train_2) ** 2)
        y_lbd = krr.predict(X_test)
        err_est_pseudo[j] = np.mean((y_lbd - y_tilde) ** 2)
        err_est_real[j] = np.mean((y_lbd - y_test) ** 2)

    j_naive, j_pseudo, j_real = map(np.argmin, [err_est_naive, err_est_pseudo, err_est_real])

    # err_est_real[j] already is the test MSE of candidate j, so the score of
    # each selected model is read from it rather than re-predicting through a
    # separately built kernel matrix (which would have to mirror the
    # estimator's kernel parameters exactly).
    return {
        "naive": err_est_real[j_naive],
        "pseudo": err_est_real[j_pseudo],
        "real": err_est_real[j_real],
        "lbd_naive": Lambda[j_naive],
        "lbd_pseudo": Lambda[j_pseudo],
        "lbd_real": Lambda[j_real],
    }

# ============================================================
# XGBoost with simple cross-validation (existing)
# ============================================================
def fit_xgboost(X_train, y_train, X_test, y_test, random_state=0):
    """
    Tune an XGBoost regressor with a small grid search and score it on the test set.

    A 3-fold shuffled KFold (seeded with ``random_state``) evaluates every
    combination in the grid using negative mean squared error; the best
    estimator returned by the search is then used to predict ``X_test``.

    Parameters
    ----------
    X_train, y_train : array-like
        Data used for the cross-validated search.
    X_test, y_test : array-like
        Data on which the selected model is evaluated.
    random_state : int, default 0
        Seed shared by the CV splitter and the XGBoost estimator.

    Returns
    -------
    (float, dict)
        Unweighted test MSE and the best parameter combination found.
    """
    param_grid = {
        "n_estimators": [200, 400],
        "max_depth": [3, 5],
        "learning_rate": [0.05, 0.1],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
    }
    base = XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        # Parallelism is delegated to GridSearchCV below. Asking every search
        # worker to also use all cores causes thread contention, which the
        # XGBoost docs warn slows both levels down.
        n_jobs=1,
        random_state=random_state,
        eval_metric="rmse",
    )
    cv = KFold(n_splits=3, shuffle=True, random_state=random_state)
    gs = GridSearchCV(
        estimator=base,
        param_grid=param_grid,
        scoring="neg_mean_squared_error",
        cv=cv,
        n_jobs=-1,
        verbose=0,
    )
    gs.fit(X_train, y_train)
    best_model = gs.best_estimator_
    y_pred = best_model.predict(X_test)
    mse = float(np.mean((y_test - y_pred) ** 2))
    return mse, gs.best_params_

# ============================================================
# XGBoost (importance-weighted evaluation) reusing tuned params (existing)
# ============================================================
def xgboost_importance_weighted(X_train, y_train, X_test, y_test, weights, best_params, random_state=0):
    """
    Fit XGBoost with already-tuned hyper-parameters and report a weighted test MSE.

    ``best_params`` is forwarded as keyword arguments to ``XGBRegressor`` on top
    of the fixed settings below. The squared test errors are combined with
    ``np.average(..., weights=weights)``, so ``weights`` needs one entry per
    test row.

    Returns the weighted MSE as a Python float.
    """
    regressor = XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        n_jobs=-1,
        random_state=random_state,
        eval_metric="rmse",
        **best_params
    )
    regressor.fit(X_train, y_train)

    predictions = regressor.predict(X_test)
    squared_errors = (y_test - predictions) ** 2
    return float(np.average(squared_errors, weights=weights))

# ============================================================
# NEW: TabPFN (regression) — train on all X_train, evaluate on shifted test
# ============================================================
def fit_tabpfn_regressor(X_train, y_train, X_test, y_test, random_state=0, device=None):
    """
    Fit a TabPFNRegressor on the training data and return its unweighted test MSE.

    device: passed to the TabPFNRegressor constructor only when it is not None
    (e.g. 'cpu' or 'cuda'); otherwise the library default is left in place.
    """
    # Only forward `device` when the caller chose one explicitly.
    device_kwargs = {} if device is None else {"device": device}

    regressor = TabPFNRegressor(random_state=random_state, **device_kwargs)
    regressor.fit(X_train, y_train)

    residuals = y_test - regressor.predict(X_test)
    return float(np.mean(residuals ** 2))

In [23]:
# ------------------------------------------------------------
# Airfoil Self-Noise data: download, then split into features / target
# ------------------------------------------------------------
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat"

# Column labels supplied to read_csv; the last one is the regression target.
colnames = [
    "Frequency",
    "Angle_of_attack",
    "Chord_length",
    "Free_stream_velocity",
    "Suction_side_displacement_thickness",
    "Sound_pressure_level",
]

df = pd.read_csv(url, sep="\t", names=colnames)

# Every column but the last is a feature; the last column is the target.
X = df[colnames[:-1]].to_numpy()
y = df[colnames[-1]].to_numpy()
print(f"Loaded Airfoil dataset: {X.shape[0]} samples, {X.shape[1]} features")

# ============================================================
# Repeated experiment (n_repeats random splits)
# ============================================================
rho, beta = 0.5, 2
n_repeats = 10
results = []

# Tilt direction of the covariate shift: one coefficient per (standardised) feature.
beta1 = np.array([-1, 1, -1, 1, -1])

for i in range(n_repeats):
    seed = 42 + i  # one seed per repeat, shared by every stochastic step below

    # split + scale (scaler is fitted on the training part only)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # covariate shift: resample the test rows with probability proportional to
    # exp(x . beta1). A seeded Generator keeps the shifted test set reproducible
    # across runs (the global np.random state is never seeded in this notebook).
    weights = np.exp(X_test @ beta1)
    weights /= np.sum(weights)
    rng = np.random.default_rng(seed)
    idx_shift = rng.choice(len(X_test), size=len(X_test), replace=True, p=weights)
    X_test_shift, y_test_shift = X_test[idx_shift], y_test[idx_shift]
    weights_shift = weights[idx_shift]  # align weights to the shifted test set

    # KRR with the three lambda-selection rules
    res = train_and_evaluate(X_train, y_train, X_test_shift, y_test_shift, rho=rho, beta=beta)

    # XGBoost tuned by CV, unweighted test MSE
    xgb_mse, xgb_best = fit_xgboost(X_train, y_train, X_test_shift, y_test_shift, random_state=seed)
    res["xgb"] = xgb_mse
    res["xgb_best_params"] = xgb_best

    # XGBoost refit with the tuned params, importance-weighted test MSE.
    # NOTE(review): X_test_shift was already drawn with probabilities `weights`,
    # so weighting its errors by weights_shift applies the tilt a second time -
    # confirm this is the intended estimand.
    xgb_iw_mse = xgboost_importance_weighted(
        X_train, y_train, X_test_shift, y_test_shift, weights_shift, best_params=xgb_best, random_state=seed
    )
    res["xgb_iw"] = xgb_iw_mse

    # TabPFN regressor (train on all X_train, evaluate on shifted test)
    tabpfn_mse = fit_tabpfn_regressor(
        X_train, y_train, X_test_shift, y_test_shift, random_state=seed, device=None  # set to 'cuda' if available
    )
    res["tabpfn"] = tabpfn_mse

    results.append(res)
    print(f"Iteration {i+1}/{n_repeats} done. XGB best params: {xgb_best}")

# ============================================================
# Aggregate results
# ============================================================
methods = ["real", "pseudo", "naive", "xgb", "xgb_iw", "tabpfn"]
avg_results = {}
for method in methods:
    vals = [r[method] for r in results]
    mean = np.mean(vals)
    # standard error of the mean (sample std with ddof=1)
    ste = np.std(vals, ddof=1) / np.sqrt(len(vals))
    avg_results[method] = (mean, ste)

# Report the number of runs actually collected rather than a hard-coded count.
print(f"\n=== Average Results over {len(results)} splits ===")
for method in methods:
    mean, ste = avg_results[method]
    print(f"{method.capitalize():<8}: {mean:.4f} ± {ste:.4f}")

# (Optional) average selected λ for KRR-based selectors
lbd_naive_mean = np.mean([r["lbd_naive"] for r in results])
lbd_pseudo_mean = np.mean([r["lbd_pseudo"] for r in results])
lbd_real_mean = np.mean([r["lbd_real"] for r in results])
print("\nAverage selected λ (KRR):")
print(f"λ* Naive : {lbd_naive_mean:.4g}")
print(f"λ* Pseudo: {lbd_pseudo_mean:.4g}")
print(f"λ* Real  : {lbd_real_mean:.4g}")

Loaded Airfoil dataset: 1503 samples, 5 features
Iteration 1/10 done. XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8}
Iteration 2/10 done. XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8}
Iteration 3/10 done. XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8}
Iteration 4/10 done. XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8}
Iteration 5/10 done. XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8}
Iteration 6/10 done. XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8}
Iteration 7/10 done. XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n

In [24]:
# Second dataset: Concrete Compressive Strength (read from a local spreadsheet)

path = "./Concrete_Data.xls"
df = pd.read_excel(path)

print("Original column names:")
print(df.columns.tolist())

# Swap the verbose spreadsheet headers for short identifiers.
# The assignment is positional, so the order must match the file's column order.
df = df.set_axis(
    [
        "cement",            # Cement (kg in m3)
        "slag",              # Blast Furnace Slag
        "fly_ash",           # Fly Ash
        "water",             # Water
        "superplasticizer",  # Superplasticizer
        "coarse_agg",        # Coarse Aggregate
        "fine_agg",          # Fine Aggregate
        "age",               # Age (days)
        "strength",          # Concrete compressive strength (MPa)
    ],
    axis=1,
)

# Features are every column except the target "strength".
X = df.loc[:, df.columns != "strength"]
y = df.loc[:, "strength"]

print(df.head())
print("\nData shape:", df.shape)

Original column names:
['Cement (component 1)(kg in a m^3 mixture)', 'Blast Furnace Slag (component 2)(kg in a m^3 mixture)', 'Fly Ash (component 3)(kg in a m^3 mixture)', 'Water  (component 4)(kg in a m^3 mixture)', 'Superplasticizer (component 5)(kg in a m^3 mixture)', 'Coarse Aggregate  (component 6)(kg in a m^3 mixture)', 'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)', 'Concrete compressive strength(MPa, megapascals) ']
   cement   slag  fly_ash  water  superplasticizer  coarse_agg  fine_agg  age  \
0   540.0    0.0      0.0  162.0               2.5      1040.0     676.0   28   
1   540.0    0.0      0.0  162.0               2.5      1055.0     676.0   28   
2   332.5  142.5      0.0  228.0               0.0       932.0     594.0  270   
3   332.5  142.5      0.0  228.0               0.0       932.0     594.0  365   
4   198.6  132.4      0.0  192.0               0.0       978.4     825.5  360   

    strength  
0  79.986111  
1  61.887366  
2  40.269535  
3  41.0

In [25]:
# Convert features/target to NumPy arrays. np.asarray accepts both the pandas
# objects from the loading cell and arrays from a previous run of this cell,
# so the cell can be re-executed safely (`.values` would fail on an ndarray).
X = np.asarray(X)
y = np.asarray(y)

print(f"Loaded Concrete Strength dataset: {X.shape[0]} samples, {X.shape[1]} features")

Loaded Concrete Strength dataset: 1030 samples, 8 features


In [27]:
rho, beta = 0.5, 2
n_repeats = 10
results = []

# Tilt direction of the covariate shift: one coefficient per (standardised) feature.
beta1 = np.array([-1, 1, -1, 1, -1, 1, -1, 1])

for i in range(n_repeats):
    seed = 42 + i  # one seed per repeat, shared by every stochastic step below

    # split + scale (scaler is fitted on the training part only)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # covariate shift: resample the test rows with probability proportional to
    # exp(x . beta1). A seeded Generator keeps the shifted test set reproducible
    # across runs (the global np.random state is never seeded in this notebook).
    weights = np.exp(X_test @ beta1)
    weights /= np.sum(weights)
    rng = np.random.default_rng(seed)
    idx_shift = rng.choice(len(X_test), size=len(X_test), replace=True, p=weights)
    X_test_shift, y_test_shift = X_test[idx_shift], y_test[idx_shift]
    weights_shift = weights[idx_shift]  # align weights to the shifted test set

    # KRR with the three lambda-selection rules
    res = train_and_evaluate(X_train, y_train, X_test_shift, y_test_shift, rho=rho, beta=beta)

    # XGBoost tuned by CV, unweighted test MSE
    xgb_mse, xgb_best = fit_xgboost(X_train, y_train, X_test_shift, y_test_shift, random_state=seed)
    res["xgb"] = xgb_mse
    res["xgb_best_params"] = xgb_best

    # XGBoost refit with the tuned params, importance-weighted test MSE.
    # NOTE(review): X_test_shift was already drawn with probabilities `weights`,
    # so weighting its errors by weights_shift applies the tilt a second time -
    # confirm this is the intended estimand.
    xgb_iw_mse = xgboost_importance_weighted(
        X_train, y_train, X_test_shift, y_test_shift, weights_shift, best_params=xgb_best, random_state=seed
    )
    res["xgb_iw"] = xgb_iw_mse

    # TabPFN regressor (train on all X_train, evaluate on shifted test)
    tabpfn_mse = fit_tabpfn_regressor(
        X_train, y_train, X_test_shift, y_test_shift, random_state=seed, device=None  # set to 'cuda' if available
    )
    res["tabpfn"] = tabpfn_mse

    results.append(res)
    print(f"Iteration {i+1}/{n_repeats} done. XGB best params: {xgb_best}")

# ============================================================
# Aggregate results
# ============================================================
methods = ["real", "pseudo", "naive", "xgb", "xgb_iw", "tabpfn"]
avg_results = {}
for method in methods:
    vals = [r[method] for r in results]
    mean = np.mean(vals)
    # standard error of the mean (sample std with ddof=1)
    ste = np.std(vals, ddof=1) / np.sqrt(len(vals))
    avg_results[method] = (mean, ste)

# Report the number of runs actually collected rather than a hard-coded count.
print(f"\n=== Average Results over {len(results)} splits ===")
for method in methods:
    mean, ste = avg_results[method]
    print(f"{method.capitalize():<8}: {mean:.4f} ± {ste:.4f}")

# (Optional) average selected λ for KRR-based selectors
lbd_naive_mean = np.mean([r["lbd_naive"] for r in results])
lbd_pseudo_mean = np.mean([r["lbd_pseudo"] for r in results])
lbd_real_mean = np.mean([r["lbd_real"] for r in results])
print("\nAverage selected λ (KRR):")
print(f"λ* Naive : {lbd_naive_mean:.4g}")
print(f"λ* Pseudo: {lbd_pseudo_mean:.4g}")
print(f"λ* Real  : {lbd_real_mean:.4g}")

Iteration 1/10 done. XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400, 'subsample': 0.8}
Iteration 2/10 done. XGB best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400, 'subsample': 0.8}
Iteration 3/10 done. XGB best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8}
Iteration 4/10 done. XGB best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400, 'subsample': 0.8}
Iteration 5/10 done. XGB best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8}


KeyboardInterrupt: 