In [None]:
import os
# Point R_HOME at the local R installation (8.3 short path, so no spaces)
# and reuse the same directory for R_USER. This is done ahead of the rpy2
# import below, presumably so rpy2 can locate R when it is loaded.
os.environ["R_USER"] = os.environ["R_HOME"] = r"C:/PROGRA~1/R/R-44~1.0"

import rpy2.robjects as ro
from rpy2.robjects import numpy2ri
import numpy as np
import pandas as pd
from tabpfn import TabPFNClassifier
from sklearn.linear_model import LogisticRegression

def calculate_and_print_metrics(results_dict, target_parameter):
    """
    Calculates and prints MSE, bias, and variance for multiple methods.

    Each method's replicate estimates are arranged as a 2-D array of shape
    (n_replications, n_parameters) and the target is flattened to 1-D before
    any arithmetic. Without this, estimates stored as column vectors of shape
    (k, 1) would stack to (n_replications, k, 1) and broadcast against a flat
    target into (n_replications, k, k), silently producing wrong metrics.

    Reported quantities, per method:
        MSE:      mean over replications of the per-replication mean squared
                  error across parameters.
        Bias:     mean over parameters of the per-parameter mean error
                  (signed, so opposite-signed biases can cancel).
        Variance: mean over parameters of the per-parameter variance across
                  replications (population variance, ``ddof=0``).

    Args:
        results_dict (dict): A dictionary where keys are method names (strings)
                             and values are lists of result arrays (one entry
                             per replication). An empty list is reported as
                             ``nan`` for all three metrics.
        target_parameter (np.array): The true parameter values. May have any
                             shape; it is flattened. A single value is
                             broadcast against every parameter.

    Returns:
        None. The metrics are only printed.

    Raises:
        ValueError: If a method's estimates do not have the same number of
            parameters as ``target_parameter`` (and the target is not a
            single value).
    """
    # Flatten so a (k, 1) or (1, k) target behaves exactly like a (k,) one.
    target = np.asarray(target_parameter).ravel()

    print("Performance Comparison:")
    print(f"Target Parameter: {target_parameter}")

    for method_name, results_list in results_dict.items():
        results = np.atleast_1d(np.asarray(results_list))

        if results.size == 0:
            # Nothing to summarise; report nan rather than failing on axis=1.
            mse = bias = variance = float("nan")
        else:
            if results.ndim != 2:
                # One row per replication, all remaining axes collapsed.
                results = results.reshape(results.shape[0], -1)

            if target.size not in (1, results.shape[1]):
                raise ValueError(
                    f"{method_name}: estimates have {results.shape[1]} "
                    f"parameter(s) but target_parameter has {target.size}"
                )

            errors = results - target

            # Calculate MSE
            mse = np.mean(np.mean(errors**2, axis=1))

            # Calculate bias
            bias = np.mean(np.mean(errors, axis=0))

            # Calculate variance
            variance = np.mean(np.var(results, axis=0))

        print(f"\n{method_name}:")
        print(f"  MSE: {mse}")
        print(f"  Bias: {bias}")
        print(f"  Variance: {variance}")

def _fit_logistic_coefficients(features, labels):
    """Fit a LogisticRegression and return [intercept, coef_1, ..., coef_p]."""
    # NOTE(review): scikit-learn's LogisticRegression applies an L2 penalty
    # (C=1.0) by default, so these are regularised coefficients. Confirm this
    # is intended when comparing against the R-side estimates.
    model = LogisticRegression()
    model.fit(features, labels)
    return np.concatenate(([model.intercept_[0]], model.coef_.ravel()))


# Activate automatic conversion between R and numpy
# NOTE(review): newer rpy2 releases reportedly deprecate global activation in
# favour of a local converter context - confirm against the installed version.
numpy2ri.activate()

# Load required R packages
ro.r('library(MASS)')
ro.r('library(caret)')
ro.r('library(quantreg)')

# Source necessary R scripts
ro.r('source("semi_supervised_methods.R")')
ro.r('source("dataGeneration.R")')
ro.r('source("SupervisedEstimation.R")')

# Global parameters
n = 300       # Labeled data size
N = 500       # Unlabeled data size
p = 7         # Dimension of predictor vector
rep = 1000    # Number of replications
option = "ii" # Data setting option
polyOrder = 4 # Polynomial order
tau_0 = 0.5   # Quantile level

# Determine model type once, up front: it depends only on `option`, and an
# unrecognised option should fail here rather than as a NameError deep inside
# the replication loop (after the expensive target computation).
if option in ["i", "W1", "S1"]:
    model_type = "linear"
elif option in ["ii", "W2", "S2"]:
    model_type = "logistic"
elif option in ["iii", "W3", "S3"]:
    model_type = "quantile"
else:
    raise ValueError(f"Unrecognised data setting option: {option!r}")

# Compute target parameter using a large dataset
ro.r('set.seed(1230988)')
LargeLabelledData = ro.r(f'GenerateData(n=10^7, p={p}, option="{option}")$Data.labelled')
target_parameter = ro.r('SupervisedEst')(LargeLabelledData, tau=tau_0, option=option).rx2("Est.coef")

# Storage for results
results_supervised = []
results_proposed = []
results_pfnimputed = []
results_pfnppi = []

# Run multiple replications
for k in range(1, rep + 1):
    # Distinct, reproducible R seed per replication
    ro.r(f'set.seed({k + 20220122})')

    # Data generation
    DesiredData = ro.r(f'GenerateData(n={n}, N={N}, p={p}, option="{option}")')
    data_labelled = DesiredData.rx2("Data.labelled")
    data_unlabelled = DesiredData.rx2("Data.unlabelled")

    # Supervised estimator
    hattheta_supervised = ro.r('SupervisedEst')(data_labelled, tau=tau_0, option=option).rx2("Est.coef")

    # Proposed estimator (PSSE)
    estimation_proposed = ro.r('PSSE')(data_labelled, data_unlabelled, type=model_type, sd=True, tau=tau_0, alpha=polyOrder)
    hattheta_proposed = estimation_proposed.rx2("Hattheta")

    # TabPFN: column 0 of the labelled data is the response, the remaining
    # columns are predictors; the unlabelled data is used as predictors only.
    # NOTE(review): a classifier + logistic regression is used regardless of
    # `option`; presumably only the logistic setting is meant to be run here.
    y_train = data_labelled[:, 0]
    X_train = data_labelled[:, 1:]
    X_test = data_unlabelled
    clf = TabPFNClassifier()
    clf.fit(X_train, y_train)
    y_hat_test = clf.predict(X_test)
    y_hat_train = clf.predict(X_train)

    ## TabPFN - Imputation: predicted labels on unlabelled rows, true labels
    ## on labelled rows
    X_all = np.vstack((X_test, X_train))
    y_imputed_all = np.hstack((y_hat_test, y_train))
    results_pfnimputed.append(_fit_logistic_coefficients(X_all, y_imputed_all))

    ## PPI Rectifier: (fit on predicted labels) - (supervised fit), both on
    ## the labelled rows. The supervised vector is flattened so that a
    ## (p+1, 1) column from R cannot broadcast the difference into a matrix.
    theta_supervised_flat = np.asarray(hattheta_supervised).ravel()
    delta = _fit_logistic_coefficients(X_train, y_hat_train) - theta_supervised_flat

    ## TabPFN - PPI (use all): fit on predicted labels everywhere, then
    ## subtract the rectifier
    y_hat_all = np.hstack((y_hat_test, y_hat_train))
    results_pfnppi.append(_fit_logistic_coefficients(X_all, y_hat_all) - delta)

    # Store results
    results_supervised.append(hattheta_supervised)
    results_proposed.append(hattheta_proposed)

# Convert lists to numpy arrays (all four, so downstream code sees one type)
results_supervised = np.array(results_supervised)
results_proposed = np.array(results_proposed)
results_pfnimputed = np.array(results_pfnimputed)
results_pfnppi = np.array(results_pfnppi)

In [None]:
# Pair each estimator's display label with its per-replication estimates.
# Insertion order is the order in which the metrics are printed.
results_dict = dict(
    [
        ("Proposed Method (PSSE)", results_proposed),
        ("Supervised Method (SupervisedEst)", results_supervised),
        ("PFN Imputed Method", results_pfnimputed),
        ("PFN PPI Method", results_pfnppi),
    ]
)

# Print MSE / bias / variance of every method against the target parameter.
calculate_and_print_metrics(results_dict, target_parameter)