In [11]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from tabpfn import TabPFNRegressor

def generate_polynomial_data(x, coeffs, noise_std=0.3):
    """
    Build a noisy polynomial dataset: y = poly(x) + Gaussian noise.

    Parameters:
    - x: array-like, input values (any sampling distribution); must support len()
    - coeffs: list, polynomial coefficients ordered from highest degree to constant
    - noise_std: float, standard deviation of the zero-mean Gaussian noise

    Returns:
    - df: pandas DataFrame with columns ['x', 'y'], one row per input value
    """
    # One noise draw per input, taken from NumPy's global RNG
    n_points = len(x)
    noise = np.random.normal(0, noise_std, size=n_points)

    # Noise-free polynomial value plus the perturbation
    noisy_y = np.polyval(coeffs, x) + noise

    return pd.DataFrame({'x': x, 'y': noisy_y})

def estimate_density_ratio(x_train, x_test):
    """
    Estimate importance weights p_test(x) / p_train(x) at the training points
    using Kernel Density Estimation (KDE).

    One KDE is fitted to each sample. Both share a single bandwidth, selected by
    5-fold cross-validated grid search on the training sample only.

    Parameters:
    - x_train: array-like, training points, shape (n_train,) or (n_train, n_features)
    - x_test: array-like, test points, shape (n_test,) or (n_test, n_features)

    Returns:
    - density_ratio: array of shape (n_train,), the estimated importance weight
      of each training point
    """
    x_train = np.asarray(x_train)
    x_test = np.asarray(x_test)

    # A 1D input is a single feature and becomes one column. 2D input is left
    # untouched: an unconditional reshape(-1, 1) would flatten multi-feature
    # rows into one long column and return the wrong number of weights.
    if x_train.ndim == 1:
        x_train = x_train.reshape(-1, 1)
    if x_test.ndim == 1:
        x_test = x_test.reshape(-1, 1)

    # Automatic bandwidth selection using cross-validation over 20
    # log-spaced candidates between 0.1 and 10
    bandwidths = np.logspace(-1, 1, 20)
    grid = GridSearchCV(KernelDensity(), {'bandwidth': bandwidths}, cv=5)
    grid.fit(x_train)
    best_bandwidth = grid.best_params_['bandwidth']

    # The bandwidth tuned on the training sample is reused for the test KDE
    kde_train = KernelDensity(bandwidth=best_bandwidth).fit(x_train)
    kde_test = KernelDensity(bandwidth=best_bandwidth).fit(x_test)

    # score_samples returns log-densities, so the ratio is formed in log space
    # and exponentiated once at the end
    log_density_ratio = kde_test.score_samples(x_train) - kde_train.score_samples(x_train)
    return np.exp(log_density_ratio)

# Seed NumPy's global RNG so repeated runs draw the same samples
np.random.seed(42)

# Covariate-shift simulation following the setting in Shimodaira (2000)

# -----------------------------
# Generate Data
# -----------------------------
n_train = n_test = 1000
mu_train, sigma_train = 0.5, 0.5
mu_test, sigma_test = 0, 0.3

# Training inputs are drawn before test inputs; the draw order fixes the samples
x_train_samples = np.random.normal(loc=mu_train, scale=sigma_train, size=n_train)
x_test_samples = np.random.normal(loc=mu_test, scale=sigma_test, size=n_test)

# Polynomial coefficients, highest degree first: y = x^3 - x
polynomial_coeffs = [1, 0, -1, 0]

df_train = generate_polynomial_data(x_train_samples, polynomial_coeffs)
df_test = generate_polynomial_data(x_test_samples, polynomial_coeffs)

# -----------------------------
# Data Preparation
# -----------------------------
# Double brackets keep the feature matrix 2D; the targets stay 1D
X_train, y_train = df_train[['x']].values, df_train['y'].values
X_test, y_test = df_test[['x']].values, df_test['y'].values

# -----------------------------
# Standard Linear Regression (Naive Model)
# -----------------------------
naive_model = LinearRegression().fit(X_train, y_train)

y_pred_naive = naive_model.predict(X_test)
mse_naive = mean_squared_error(y_test, y_pred_naive)

# -----------------------------
# Importance-Weighted (IW) Linear Regression
# -----------------------------
# With sigma_test < sigma_train, the ratio of the two Gaussian input densities
# is, up to a constant factor, a Gaussian-shaped bump with variance tau_squared
# centred at mu_weighted.
precision_train = 1 / sigma_train**2
precision_test = 1 / sigma_test**2
tau_squared = abs(1 / (precision_train - precision_test))
mu_weighted = tau_squared * (mu_test / sigma_test**2 - mu_train / sigma_train**2)

# Closed-form (true) importance weights, left unnormalised
squared_offset = (X_train - mu_weighted) ** 2
importance_weights_true = np.exp(-squared_offset / (2 * tau_squared)).flatten()

# Importance weights estimated from the samples via KDE
importance_weights_kde = estimate_density_ratio(X_train, X_test).flatten()

# Fit one weighted regression per weighting scheme
iw_model_true = LinearRegression().fit(
    X_train, y_train, sample_weight=importance_weights_true
)
iw_model_kde = LinearRegression().fit(
    X_train, y_train, sample_weight=importance_weights_kde
)

# Predict on the test inputs, then score each model
y_pred_iw_true = iw_model_true.predict(X_test)
y_pred_iw_kde = iw_model_kde.predict(X_test)

mse_iw_true = mean_squared_error(y_test, y_pred_iw_true)
mse_iw_kde = mean_squared_error(y_test, y_pred_iw_kde)

# -----------------------------
# TabPFN Regressor
# -----------------------------
# Trained on the unweighted training data, like the naive model
tabpfn_model = TabPFNRegressor()
tabpfn_model.fit(X_train, y_train)

y_pred_tabpfn = tabpfn_model.predict(X_test)
mse_tabpfn = mean_squared_error(y_test, y_pred_tabpfn)

# -----------------------------
# Display Results
# -----------------------------
print(f"Naive Model MSE: {mse_naive:.4f}")
print(f"IW Model (True Weights) MSE: {mse_iw_true:.4f}")
print(f"IW Model (KDE Weights) MSE: {mse_iw_kde:.4f}")
print(f"TabPFN Model MSE: {mse_tabpfn:.4f}")

Naive Model MSE: 0.3492
IW Model (True Weights) MSE: 0.1002
IW Model (KDE Weights) MSE: 0.1013
TabPFN Model MSE: 0.0956
