In [1]:
import os
# Set up R environment
os.environ["R_HOME"] = r"C:/PROGRA~1/R/R-44~1.0"
os.environ["R_USER"] = os.environ["R_HOME"]

import rpy2.robjects as ro
from rpy2.robjects import numpy2ri
import numpy as np
import torch
from tabpfn import TabPFNClassifier
from sklearn.linear_model import LogisticRegression
import json  # For saving results

def calculate_and_print_metrics(results_dict, target_parameter):
    """Summarise MSE, bias and variance of each method's estimates.

    Despite its name, this function prints nothing; it only returns the
    summary.

    Args:
        results_dict: Mapping from method name to the collection of estimates
            for that method (anything ``np.array`` accepts); statistics are
            taken along axis 0.
        target_parameter: Reference value the estimates are compared with;
            it is subtracted from the stacked estimates using NumPy
            broadcasting.

    Returns:
        A dict keyed by method name. Each value holds the per-coordinate
        vectors (``MSE_Vec``, ``Bias_Vec``, ``Variance_Vec``, as lists) and
        their means over coordinates (``MSE_Avg``, ``Bias_Avg``,
        ``Variance_Avg``).
    """
    summary = {}
    for label, estimates in results_dict.items():
        estimates = np.array(estimates)
        errors = estimates - target_parameter

        # Per-coordinate statistics across axis 0.
        mse_by_coord = np.mean(errors**2, axis=0)
        bias_by_coord = np.mean(errors, axis=0)
        var_by_coord = np.var(estimates, axis=0)

        # Averaged entries come first, then the vectors, so the key order of
        # the returned dict (and of any JSON dump of it) stays stable.
        summary[label] = {
            "MSE_Avg": np.mean(mse_by_coord),
            "Bias_Avg": np.mean(bias_by_coord),
            "Variance_Avg": np.mean(var_by_coord),
            "MSE_Vec": mse_by_coord.tolist(),
            "Bias_Vec": bias_by_coord.tolist(),
            "Variance_Vec": var_by_coord.tolist(),
        }
    return summary

# Switch on rpy2's NumPy conversion layer before any R calls are made.
numpy2ri.activate()

# Attach the R packages, then load the project's R scripts (paths are
# relative to the current working directory). Order is preserved.
for r_package in ("MASS", "caret", "quantreg"):
    ro.r('library({})'.format(r_package))

for r_script in (
    "semi_supervised_methods.R",
    "dataGeneration.R",
    "SupervisedEstimation.R",
):
    ro.r('source("{}")'.format(r_script))

# Simulation grid: every combination of dimension p, labelled size n and
# unlabelled size N. p varies slowest and N fastest.
settings = [
    dict(n=labelled_size, N=unlabelled_size, p=dimension)
    for dimension in range(4, 10)
    for labelled_size in (300, 500)
    for unlabelled_size in (500, 1000, 2000)
]

rep = 500         # number of replications run for each setting
option = "ii"     # scenario passed to GenerateData / SupervisedEst
polyOrder = 4     # passed to PSSE as `alpha`
tau_0 = 0.5       # passed to the R estimators as `tau`

# Reference coefficient vector for each dimension p, obtained by running the
# supervised estimator on one very large (n = 10^6) labelled sample.
target_parameters = {}

unique_p_values = sorted({cfg["p"] for cfg in settings})
for p_val in unique_p_values:
    # Same fixed seed for every p so the reference values are reproducible.
    ro.r('set.seed(1230988)')
    LargeLabelledData = ro.r(
        f'GenerateData(n=10^6, p={p_val}, option="{option}")$Data.labelled'
    )
    reference_fit = ro.r('SupervisedEst')(LargeLabelledData, tau=tau_0, option=option)
    target_parameter_vec = reference_fit.rx2("Est.coef")
    target_parameters[p_val] = np.array(target_parameter_vec)

all_results = {}  # metrics for every setting, keyed by "n=..., N=..., p=..."


def _logistic_coefficients(features, labels):
    """Fit a logistic regression and return ``[intercept, coef_1, ..., coef_p]``.

    NOTE(review): ``LogisticRegression()`` with default arguments applies an
    L2 penalty (C=1.0). If the R estimators and the target parameter are
    unpenalised fits, these coefficients are shrunk relative to them; confirm
    whether the penalty is intended.
    """
    model = LogisticRegression()
    model.fit(features, labels)
    return np.concatenate(([model.intercept_[0]], model.coef_.ravel()))


# The working model depends only on `option`, so resolve it once up front and
# fail loudly on an unknown value instead of hitting a NameError (or silently
# reusing a stale `model_type`) at the PSSE call.
if option in ["i", "W1", "S1"]:
    model_type = "linear"
elif option in ["ii", "W2", "S2"]:
    model_type = "logistic"
elif option in ["iii", "W3", "S3"]:
    model_type = "quantile"
else:
    raise ValueError(f"Unrecognised option {option!r}: cannot determine model type")

# Look the R functions up once rather than on every replication.
supervised_est = ro.r('SupervisedEst')
psse = ro.r('PSSE')

for setting in settings:
    n = setting["n"]
    N = setting["N"]
    p = setting["p"]

    target_parameter_vec = target_parameters[p]

    results_supervised = []
    results_proposed = []
    results_pfnimputed = []
    results_pfnppi = []

    for k in range(1, rep + 1):
        # Replication-specific seed: replication k uses the same seed in
        # every setting.
        ro.r('set.seed({})'.format(k + 20220122))
        DesiredData = ro.r('GenerateData(n={}, N={}, p={}, option="{}")'.format(n, N, p, option))
        data_labelled = DesiredData.rx2("Data.labelled")
        data_unlabelled = DesiredData.rx2("Data.unlabelled")

        # Supervised baseline: labelled data only.
        hattheta_supervised = supervised_est(data_labelled, tau=tau_0, option=option).rx2("Est.coef")

        # Semi-supervised estimator from the sourced R code.
        estimation_proposed = psse(data_labelled, data_unlabelled, type=model_type, sd=True, tau=tau_0, alpha=polyOrder)
        hattheta_proposed = estimation_proposed.rx2("Hattheta")

        # TabPFN: column 0 of the labelled data is used as the response, the
        # remaining columns as features; the unlabelled data is used as-is.
        y_train = data_labelled[:, 0]
        X_train = data_labelled[:, 1:]
        X_test = data_unlabelled
        clf = TabPFNClassifier()
        clf.fit(X_train, y_train)
        y_hat_test = clf.predict(X_test)
        y_hat_train = clf.predict(X_train)

        ## TabPFN - Imputation: predicted labels for the unlabelled rows,
        ## observed labels for the labelled rows.
        X_all = np.vstack((X_test, X_train))
        y_imputed_all = np.hstack((y_hat_test, y_train))
        results_pfnimputed.append(_logistic_coefficients(X_all, y_imputed_all))

        ## PPI Rectifier: coefficients fitted to predicted labels on the
        ## labelled rows, minus the supervised estimate.
        delta = _logistic_coefficients(X_train, y_hat_train) - hattheta_supervised

        ## TabPFN - PPI (use all): fit on predicted labels for every row,
        ## then subtract the rectifier.
        y_hat_all = np.hstack((y_hat_test, y_hat_train))
        results_pfnppi.append(_logistic_coefficients(X_all, y_hat_all) - delta)

        results_supervised.append(np.array(hattheta_supervised))
        results_proposed.append(np.array(hattheta_proposed))

    results_dict = {
        "SLZ": np.array(results_proposed),
        "Supervised": np.array(results_supervised),
        "TabPFN Imputed": np.array(results_pfnimputed),
        "TabPFN Debiased": np.array(results_pfnppi),
    }

    metrics = calculate_and_print_metrics(results_dict, target_parameter_vec)
    all_results[f"n={n}, N={N}, p={p}"] = metrics  # Store results with settings

# Save results to a JSON file. The path lives in one variable so the
# completion message always names the file that was actually written
# (previously the message said "logistic.json" while the data went to
# "logistic_copy.json").
output_path = "logistic_copy.json"
with open(output_path, "w") as f:
    json.dump(all_results, f, indent=4)

print(f"Simulation complete. Results saved to {output_path}")

R[write to console]: Loading required package: ggplot2

R[write to console]: Loading required package: lattice

R[write to console]: Loading required package: SparseM

R[write to console]: 
Attaching package: 'SparseM'


R[write to console]: The following object is masked from 'package:base':

    backsolve




[1] "NOTE: The coefficients estimate is computed only using labelled data of size 1000000."
[1] "NOTE: The coefficients estimate is computed only using labelled data of size 1000000."
[1] "NOTE: The coefficients estimate is computed only using labelled data of size 1000000."
[1] "NOTE: The coefficients estimate is computed only using labelled data of size 1000000."
[1] "NOTE: The coefficients estimate is computed only using labelled data of size 1000000."
[1] "NOTE: The coefficients estimate is computed only using labelled data of size 1000000."
Simulation complete. Results saved to logistic.json
