In [4]:
import os
# Set up R environment
# NOTE(review): these variables are assigned ahead of the rpy2 imports below,
# presumably because rpy2 looks up R_HOME when it is loaded -- keep this order.
# The path is a machine-specific Windows location (short 8.3 form); adjust it
# to the local R installation before running on another machine.
os.environ["R_HOME"] = r"C:/PROGRA~1/R/R-44~1.0"
# R_USER is simply pointed at the same directory as R_HOME.
os.environ["R_USER"] = os.environ["R_HOME"]

import numpy as np
import pandas as pd
# numpy2ri supplies the NumPy -> R conversion (py2rpy) used in the simulation loop.
from rpy2.robjects import numpy2ri
import rpy2.robjects as ro
from tabpfn import TabPFNRegressor
from sklearn.linear_model import LinearRegression

def calculate_and_print_metrics(results_dict, target_parameter):
    """
    Calculates and prints MSE, bias, and variance for multiple methods.

    Every estimate is flattened to a 1-D coefficient vector before comparison,
    so estimates supplied as (k,), (k, 1) or (1, k) arrays are all compared
    element-wise with the target instead of being silently broadcast into a
    (k, k) grid of differences.

    Args:
        results_dict (dict): A dictionary where keys are method names (strings)
                             and values are lists of result arrays, one array
                             per replication.
        target_parameter (np.array): The true parameter values.

    Raises:
        ValueError: If a method has no recorded results, or if the flattened
                    estimates and the flattened target have incompatible
                    lengths.
    """

    print("Performance Comparison:")
    print(f"Target Parameter: {target_parameter}")

    # Flatten once so column/row-vector targets line up with flattened estimates.
    target = np.asarray(target_parameter).ravel()

    for method_name, results_list in results_dict.items():
        n_replications = len(results_list)
        if n_replications == 0:
            raise ValueError(f"No results recorded for method '{method_name}'.")

        # One row per replication, one column per coefficient.
        results = np.asarray(results_list).reshape(n_replications, -1)
        errors = results - target

        # Calculate MSE (squared error averaged over coefficients, then replications)
        mse = np.mean(np.mean(errors**2, axis=1))

        # Calculate bias (per-coefficient mean error, averaged over coefficients;
        # note that signed biases of different coefficients can cancel here)
        bias = np.mean(np.mean(errors, axis=0))

        # Calculate variance (per-coefficient variance across replications, averaged)
        variance = np.mean(np.var(results, axis=0))

        print(f"\n{method_name}:")
        print(f"  MSE: {mse}")
        print(f"  Bias: {bias}")
        print(f"  Variance: {variance}")

In [5]:
# Load the project's R scripts into the embedded R session
r_files = ["semi_supervised_methods.R", "dataGeneration.R", "SupervisedEstimation.R"]
r_source = ro.r['source']
for script_path in r_files:
    r_source(script_path)

# Attach the R packages, in the same order as before
for load_command in ('library(MASS)', 'library(caret)', 'library(quantreg)'):
    ro.r(load_command)

# Python handles to the R functions called by the simulation
GenerateData, SupervisedEst, PSSE = (
    ro.r[r_function_name]
    for r_function_name in ('GenerateData', 'SupervisedEst', 'PSSE')
)

In [None]:
# Global parameters
n = 500   # Labeled data size
N = 700   # Unlabeled data size
p = 5     # Number of predictors
rep = 1000  # Number of replications
option = "i"  # Data setting
polyOrder = 4  # Polynomial order
tau_0 = 0.5  # Quantile level

# The data are simulated by the R function GenerateData, so R's own RNG has to
# be seeded for the runs to be reproducible; np.random.seed only affects NumPy.
r_set_seed = ro.r['set.seed']


def _ols_coefficients(X, y):
    """Fit an OLS regression of y on X and return [intercept, slope_1, ..., slope_p]."""
    model = LinearRegression()
    model.fit(X, y)
    return np.concatenate(([model.intercept_], model.coef_))


# Model type (depends only on `option`, so it is resolved once, up front,
# and an invalid option fails before any expensive computation starts)
if option in ["i", "W1", "S1"]:
    model_type = "linear"
elif option in ["ii", "W2", "S2"]:
    model_type = "logistic"
elif option in ["iii", "W3", "S3"]:
    model_type = "quantile"
else:
    raise ValueError("Unknown option value!")

# Compute target parameter using large labeled data
np.random.seed(1230988)
r_set_seed(1230988)
LargeLabelledData = GenerateData(n=10**7, p=p, option=option).rx2("Data.labelled")
# .ravel() keeps the target a flat coefficient vector even if R returns a matrix
target_parameter = np.array(
    SupervisedEst(LargeLabelledData, tau=tau_0, option=option).rx2("Est.coef")
).ravel()

results_proposed = []
results_supervised = []
results_pfnimputed = []
results_pfnppi = []

# Exactly `rep` replications (k = 0, ..., rep - 1)
for k in range(rep):
    # Generate data (seed both NumPy and the R session for this replication)
    seed = k + 20220122
    np.random.seed(seed)
    r_set_seed(seed)
    DesiredData = GenerateData(n=n, N=N, p=p, option=option)
    data_labelled = np.array(DesiredData.rx2("Data.labelled"))
    data_unlabelled = np.array(DesiredData.rx2("Data.unlabelled"))

    # Proposed and supervised method
    estimation_proposed = PSSE(numpy2ri.py2rpy(data_labelled), numpy2ri.py2rpy(data_unlabelled),
                               type=model_type, tau=tau_0, alpha=polyOrder)
    # Flatten the R estimates so they combine element-wise with the
    # 1-D [intercept, slopes] vectors built from scikit-learn below.
    hattheta_proposed = np.array(estimation_proposed.rx2("Hattheta")).ravel()
    results_proposed.append(hattheta_proposed)
    hattheta_supervised = np.array(
        SupervisedEst(numpy2ri.py2rpy(data_labelled), tau=tau_0, option=option).rx2("Est.coef")
    ).ravel()
    results_supervised.append(hattheta_supervised)

    # TabPFN (column 0 of the labelled data is the response, the rest are predictors)
    y_train = data_labelled[:, 0]
    X_train = data_labelled[:, 1:]
    X_test = data_unlabelled
    tabpfn_model = TabPFNRegressor()
    tabpfn_model.fit(X_train, y_train)
    y_hat_test = tabpfn_model.predict(X_test)
    y_hat_train = tabpfn_model.predict(X_train)

    ## TabPFN - Imputation: predicted responses for unlabelled rows, observed
    ## responses for labelled rows, then a single OLS fit on the stacked data
    X_all = np.vstack((X_test, X_train))
    y_imputed_all = np.hstack((y_hat_test, y_train))
    results_pfnimputed.append(_ols_coefficients(X_all, y_imputed_all))

    ## PPI Rectifier: OLS on the predictions for the labelled rows minus the
    ## supervised estimate from the same labelled rows
    delta = _ols_coefficients(X_train, y_hat_train) - hattheta_supervised

    ## TabPFN - PPI (use all): OLS on predictions for every row, corrected by delta
    y_hat_all = np.hstack((y_hat_test, y_hat_train))
    results_pfnppi.append(_ols_coefficients(X_all, y_hat_all) - delta)


[1] "NOTE: The coefficients estimate is computed only using labelled data of size 10000000."


In [None]:
# Pair each method's display label with its list of per-replication estimates
results_dict = dict(
    zip(
        (
            "Proposed Method (PSSE)",
            "Supervised Method (SupervisedEst)",
            "PFN Imputed Method",
            "PFN PPI Method",
        ),
        (results_proposed, results_supervised, results_pfnimputed, results_pfnppi),
    )
)

# Print MSE, bias and variance of every method against the target parameter
calculate_and_print_metrics(results_dict=results_dict, target_parameter=target_parameter)