In [17]:
import sys, os
import random   # >>> FIXED <<<
# Detect if running inside Jupyter
if "__file__" in globals():
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
else:
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

sys.path.append(project_root)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from utils.data_utils_newest import gen_star_from_x1, get_true_frequencies, gen_progressive
from utils.metrics import compute_mse
from utils.spl import random_split_perturb, random_split_estimate
from utils.rs_fd import rs_fd_perturb, rs_fd_estimate
from utils.rs_rfd import rs_rfd_perturb, rs_rfd_estimate
from utils.corr_rr_fixed_new import (
    corr_rr_phase1_spl,
    corr_rr_phase2_perturb,
    corr_rr_estimate,
    combine_phase_estimates,
    optimal_p_y,
    build_p_y_table,
)


mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rc('font', family='DejaVu Serif')

mpl.rcParams.update({
    'text.usetex': False,
    'font.size': 16,
    'axes.titlesize': 20,
    'axes.labelsize': 20,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
    'legend.fontsize': 20,
    'figure.titlesize': 20,
})
mpl.rcParams['mathtext.fontset'] = 'cm'
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.size'] = 30


def print_mse_table(model_name, epsilon, phase1_pcts, means, n_total=200):
    print(f"\n================ MSE TABLE ({model_name} MODEL, ε = {epsilon}) ================\n")

    n1_values = [int(n_total * (p/100)) for p in phase1_pcts]
    headers = ["Phase 1"] + [f"n1={v}" for v in n1_values]

    rows = []
    rows.append(["RS+RFD"] + [f"{means['RS+RFD'][i]:.3e}" for i in range(len(phase1_pcts))])
    rows.append(["Corr-RR"] + [f"{means['Corr-RR'][i]:.3e}" for i in range(len(phase1_pcts))])

    col_widths = [
        max(len(row[i]) for row in rows + [headers]) + 2
        for i in range(len(headers))
    ]

    print("".join(headers[i].ljust(col_widths[i]) for i in range(len(headers))))
    print("-" * sum(col_widths))

    for row in rows:
        print("".join(row[i].ljust(col_widths[i]) for i in range(len(headers))))

    print()


# =================================================================
#                 STAR MODEL SWEEP (PRINT ONLY)
# =================================================================
def sweep_vs_phase1_star_table(
    phase1_pcts,
    epsilon,
    n,
    R,
    domain,
    rho,
    d,
    seed,
    x1_marginal=None,
    q_marginal=None,
    use_corr_rr=True,
):
    # Global seeding for dataset generation
    np.random.seed(seed)      # >>> FIXED <<<
    random.seed(seed)         # >>> FIXED <<<

    if x1_marginal is None:
        x1_marginal = {v: 1 / len(domain) for v in domain}

    df = gen_star_from_x1(
        n=n, domain=domain, d=d,
        x1_marginal=x1_marginal,
        rho=rho, q_marginal=q_marginal,
        seed=seed
    )

    true_freqs = get_true_frequencies(df)

    means = {
        "RS+RFD": np.zeros(len(phase1_pcts)),
        "Corr-RR": np.zeros(len(phase1_pcts)),
    }

    for idx, pct in enumerate(phase1_pcts):
        frac = pct / 100

        for r in range(R):
            # Re-seed for deterministic perturbation
            np.random.seed(seed + idx*1000 + r)  # >>> FIXED <<<
            random.seed(seed + idx*1000 + r)     # >>> FIXED <<<

            # === RS+RFD ===
            est_I1, df_B1, dom1 = corr_rr_phase1_spl(df, epsilon, frac=frac)
            n1 = len(df) - len(df_B1)
            n2 = len(df_B1)

            pert_fd = rs_rfd_perturb(df_B1, dom1, est_I1, epsilon)
            est_II = rs_rfd_estimate(pert_fd, dom1, est_I1, epsilon)
            comb_fd = combine_phase_estimates(est_I1, est_II, n1, n2)

            means["RS+RFD"][idx] += np.mean([compute_mse(true_freqs[c], comb_fd[c]) for c in df.columns])

            # === Corr-RR ===
            if use_corr_rr:
                est_I2, df_B2, dom2 = corr_rr_phase1_spl(df, epsilon, frac=frac)
                n1c = len(df) - len(df_B2)
                n2c = len(df_B2)

                p_y = build_p_y_table(est_I2, n2c, dom2, epsilon)

                pert_corr = corr_rr_phase2_perturb(df_B2, epsilon, est_I2, dom2, p_y)
                est_IIc = corr_rr_estimate(pert_corr, dom2, epsilon)

                comb_rr = combine_phase_estimates(est_I2, est_IIc, n1c, n2c)

                means["Corr-RR"][idx] += np.mean([compute_mse(true_freqs[c], comb_rr[c]) for c in df.columns])

        means["RS+RFD"][idx] /= R
        means["Corr-RR"][idx] /= R

    return means


# =================================================================
#           PROGRESSIVE MODEL SWEEP (PRINT ONLY)
# =================================================================
def sweep_vs_phase1_progressive_table(
    phase1_pcts,
    epsilon,
    n,
    R,
    domain,
    rho,
    d,
    seed,
    x1_marginal=None,
    q_marginal=None,
    use_corr_rr=True,
):
    # Global seeding
    np.random.seed(seed)      # >>> FIXED <<<
    random.seed(seed)         # >>> FIXED <<<

    if x1_marginal is None:
        x1_marginal = {v: 1 / len(domain) for v in domain}

    df = gen_progressive(
        n=n, domain=domain, d=d,
        x1_marginal=x1_marginal,
        rho=rho, q_marginal=q_marginal,
        seed=seed
    )

    true_freqs = get_true_frequencies(df)

    means = {
        "RS+RFD": np.zeros(len(phase1_pcts)),
        "Corr-RR": np.zeros(len(phase1_pcts)),
    }

    for idx, pct in enumerate(phase1_pcts):
        frac = pct / 100

        for r in range(R):
            # Re-seed inside loop
            np.random.seed(seed + idx*1000 + r)  # >>> FIXED <<<
            random.seed(seed + idx*1000 + r)     # >>> FIXED <<<

            # RS+RFD
            est_I1, df_B1, dom1 = corr_rr_phase1_spl(df, epsilon, frac=frac)
            n1 = len(df) - len(df_B1)
            n2 = len(df_B1)

            pert_fd = rs_rfd_perturb(df_B1, dom1, est_I1, epsilon)
            est_II = rs_rfd_estimate(pert_fd, dom1, est_I1, epsilon)
            comb_fd = combine_phase_estimates(est_I1, est_II, n1, n2)

            means["RS+RFD"][idx] += np.mean([compute_mse(true_freqs[c], comb_fd[c]) for c in df.columns])

            # Corr-RR
            if use_corr_rr:
                est_I2, df_B2, dom2 = corr_rr_phase1_spl(df, epsilon, frac=frac)
                n1c = len(df) - len(df_B2)
                n2c = len(df_B2)

                p_y = build_p_y_table(est_I2, n2c, dom2, epsilon)

                pert_corr = corr_rr_phase2_perturb(df_B2, epsilon, est_I2, dom2, p_y)
                est_IIc = corr_rr_estimate(pert_corr, dom2, epsilon)

                comb_rr = combine_phase_estimates(est_I2, est_IIc, n1c, n2c)

                means["Corr-RR"][idx] += np.mean([compute_mse(true_freqs[c], comb_rr[c]) for c in df.columns])

        means["RS+RFD"][idx] /= R
        means["Corr-RR"][idx] /= R

    return means



# =================================================================
#           TOP-LEVEL EXPERIMENT DRIVER
# =================================================================
def run_phase1_experiment(
    model="STAR",
    epsilons=[0.1, 0.3, 0.5],
    phase1_pcts=[5,10,15,20,25,30,35,40,45,50],
    n=200,
    domain=[0,1,2,3],
    R=50,
    rho=0.9,
    d=2,
    seed=42,
    x1_marginal=None,
):

    # Global deterministic seed once
    np.random.seed(seed)      # >>> FIXED <<<
    random.seed(seed)         # >>> FIXED <<<
    
    for epsilon in epsilons:

        if model == "STAR":
            means = sweep_vs_phase1_star_table(
                phase1_pcts=phase1_pcts,
                epsilon=epsilon,
                n=n,
                R=R,
                domain=domain,
                rho=rho,
                d=d,
                seed=seed
            )

        else:
            means = sweep_vs_phase1_progressive_table(
                phase1_pcts=phase1_pcts,
                epsilon=epsilon,
                n=n,
                R=R,
                domain=domain,
                rho=rho,
                d=d,
                seed=seed
            )

        print_mse_table(model, epsilon, phase1_pcts, means, n_total=n)





In [18]:
if __name__ == "__main__":

    # ------------------------------
    # Common experiment settings
    # ------------------------------
    epsilons = [0.1, 0.3, 0.5]          # ε values to test
    phase1_pcts = [5,10,15,20,25,30,35,40,45,50]
    n = 200
    domain = [0,1,2,3]
    rho = 0.1
    d = 2
    R = 1                              # use R=50 for real experiment
    seed = 42
    x1_marginal = {0: 0.4, 1: 0.3, 2: 0.2, 3: 0.1}
    run_phase1_experiment(
        model="STAR",
        epsilons=epsilons,
        phase1_pcts=phase1_pcts,
        n=n,
        domain=domain,
        R=R,
        rho=rho,
        d=d,
        seed=seed,
        x1_marginal = x1_marginal,
    )





Phase 1  n1=10      n1=20      n1=30      n1=40      n1=50      n1=60      n1=70      n1=80      n1=90      n1=100     
-----------------------------------------------------------------------------------------------------------------------
RS+RFD   3.967e-01  5.143e-01  8.452e-01  1.344e+00  1.457e+00  2.166e+00  2.484e+00  2.467e+00  2.603e+00  6.565e+00  
Corr-RR  9.047e-01  8.852e+00  2.570e+00  1.408e+00  4.865e+00  2.656e+00  1.388e+00  1.693e+00  1.846e+00  6.395e+00  



Phase 1  n1=10      n1=20      n1=30      n1=40      n1=50      n1=60      n1=70      n1=80      n1=90      n1=100     
-----------------------------------------------------------------------------------------------------------------------
RS+RFD   6.178e-02  2.006e-01  2.304e-01  2.180e-01  8.971e-02  1.444e-01  3.402e-01  1.759e-01  4.240e-01  5.322e-01  
Corr-RR  4.314e-02  3.089e-01  1.590e-01  1.349e-01  2.118e-01  4.198e-01  1.922e-01  2.639e-01  1.558e-01  1.922e-01  



Phase 1  n1=10      n1=20      n

In [19]:
if __name__ == "__main__":

    # ------------------------------
    # Common experiment settings
    # ------------------------------
    epsilons = [0.1, 0.3, 0.5]          # ε values to test
    phase1_pcts = [5,10,15,20,25,30,35,40,45,50]
    n = 200
    domain = [0,1,2,3]
    rho = 0.1
    d = 2
    R = 1                              # use R=50 for real experiment
    seed = 42
    x1_marginal = {0: 0.4, 1: 0.3, 2: 0.2, 3: 0.1}
    run_phase1_experiment(
        model="PROGRESSIVE",
        epsilons=epsilons,
        phase1_pcts=phase1_pcts,
        n=n,
        domain=domain,
        R=R,
        rho=rho,
        d=d,
        seed=seed,
        x1_marginal = x1_marginal,
    )





Phase 1  n1=10      n1=20      n1=30      n1=40      n1=50      n1=60      n1=70      n1=80      n1=90      n1=100     
-----------------------------------------------------------------------------------------------------------------------
RS+RFD   1.279e-01  9.754e-01  1.370e+00  2.720e-01  1.012e+00  1.298e+00  8.923e-01  6.412e-01  3.158e+00  1.481e+00  
Corr-RR  1.372e+00  4.041e+00  1.516e+00  3.033e+00  4.030e+00  1.759e+00  8.255e+00  2.131e+00  1.477e+00  6.564e+00  



Phase 1  n1=10      n1=20      n1=30      n1=40      n1=50      n1=60      n1=70      n1=80      n1=90      n1=100     
-----------------------------------------------------------------------------------------------------------------------
RS+RFD   2.881e-02  8.077e-02  1.661e-01  2.010e-01  4.581e-01  1.008e-01  2.488e-01  3.972e-01  7.945e-01  3.849e-01  
Corr-RR  6.231e-02  9.674e-02  1.505e-01  6.818e-02  1.815e-01  1.268e-01  2.208e-01  4.483e-01  3.154e-01  6.409e-01  



Phase 1  n1=10      n1=20      n