In [11]:
# p_y_table_check_avg.py

import os
import numpy as np
import pandas as pd

# --- import your project pieces ---
from data_utils_new import correlated_data_generator, get_true_frequencies
from corr_rr_fixed import (
    corr_rr_phase1_spl,
    build_p_y_table,
    optimal_p_y,
)

# ---------- minimal builder (ordered pairs) ----------
def _build_p_y_table_minimal(est_I, epsilon, n2, domain, cols):
    """Build a p_y lookup for every ordered pair of distinct columns.

    Parameters
    ----------
    est_I : mapping
        Indexed by column name; the entries for the two columns of a pair
        are forwarded to ``optimal_p_y`` as its first two positional
        arguments.
    epsilon, n2, domain :
        Passed through unchanged to ``optimal_p_y`` as keyword arguments.
        The same ``domain`` is used for every pair.
    cols : iterable
        Column names; pairs ``(a, b)`` are formed for all ``a != b``.

    Returns
    -------
    dict
        ``{(a, b): float}`` holding one entry per ordered pair, so both
        ``(a, b)`` and ``(b, a)`` are present.
    """
    table = {}
    for first in cols:
        for second in cols:
            if first != second:
                p_value = optimal_p_y(
                    est_I[first],
                    est_I[second],
                    epsilon=epsilon,
                    n2=n2,
                    domain=domain,
                )
                table[(first, second)] = float(p_value)
    return table

# ---------- pretty checker ----------
def p_y_tables_for_epsilons(
    df,
    epsilons,
    frac_phase1_corr=0.1,
    use_minimal_builder=True,
    csv_dir=None,
    float_fmt="%.6f",
):
    """Compute one pairwise p_y matrix per epsilon.

    For every ``eps`` in ``epsilons`` this runs ``corr_rr_phase1_spl`` on
    ``df``, builds a p_y lookup for each ordered pair of columns, and lays
    the values out as a square DataFrame (rows = first column of the pair,
    columns = second column, NaN on the diagonal).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; its column names label the rows/columns of each table.
    epsilons : iterable
        Epsilon values; each one is used as a key of the returned dict.
    frac_phase1_corr : float, default 0.1
        Forwarded to ``corr_rr_phase1_spl`` as ``frac``.
    use_minimal_builder : bool, default True
        If True, use ``_build_p_y_table_minimal`` with the domain of the
        first column applied to every pair (assumes all columns share one
        domain -- TODO confirm). Otherwise use ``build_p_y_table`` with the
        per-column domain map.
    csv_dir : str or None, default None
        If given, the directory is created when missing and each table is
        written there as ``p_y_table_eps_<eps>.csv``.
    float_fmt : str, default "%.6f"
        printf-style format applied to the numbers in the CSV files.

    Returns
    -------
    dict
        ``{eps: pandas.DataFrame}`` with one square table per epsilon.
    """
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    cols = list(df.columns)
    results = {}

    for eps in epsilons:
        est_I, df_B, doms_stable = corr_rr_phase1_spl(df, eps, frac=frac_phase1_corr)
        # The size of the second frame returned by phase 1 is what the
        # builders receive as n2.
        n2 = len(df_B)

        if use_minimal_builder:
            common_domain = doms_stable[cols[0]]
            pmap = _build_p_y_table_minimal(est_I, eps, n2, common_domain, cols)
        else:
            pmap = build_p_y_table(est_I, n2=n2, domain_map=doms_stable, epsilon=eps)

        # Starts out all-NaN, so only off-diagonal cells need to be filled.
        mat = pd.DataFrame(index=cols, columns=cols, dtype=float)
        for a in cols:
            for b in cols:
                if a != b:
                    mat.loc[a, b] = float(pmap[(a, b)])

        # Previously csv_dir was created but nothing was ever written to it
        # and float_fmt was ignored; persist the table when requested.
        if csv_dir:
            csv_path = os.path.join(csv_dir, f"p_y_table_eps_{eps}.csv")
            mat.to_csv(csv_path, float_format=float_fmt)

        results[eps] = mat
    return results


# ---------- average over multiple runs ----------
def average_p_y_tables(
    df,
    epsilons,
    runs=5,
    frac_phase1_corr=0.1,
    use_minimal_builder=True,
    float_fmt="%.6f",
):
    """Average the p_y tables of ``p_y_tables_for_epsilons`` over several runs.

    Calls ``p_y_tables_for_epsilons`` ``runs`` times, sums the resulting
    tables element-wise per epsilon, divides by ``runs``, prints each
    averaged table and returns them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; its columns label the rows/columns of every table.
    epsilons : iterable
        Epsilon values. The iterable is materialised once and repeated
        values are dropped (first occurrence wins), so each epsilon is
        accumulated exactly once per run.
    runs : int, default 5
        Number of repetitions; must be at least 1.
    frac_phase1_corr, use_minimal_builder, float_fmt :
        Forwarded to ``p_y_tables_for_epsilons``; ``float_fmt`` is also the
        printf-style format used when printing the averaged tables.

    Returns
    -------
    dict
        ``{eps: pandas.DataFrame}`` of averaged tables with NaN diagonals.

    Raises
    ------
    ValueError
        If ``runs`` is smaller than 1 (the average would be 0/0).
    """
    if runs < 1:
        raise ValueError(f"runs must be a positive integer, got {runs!r}")

    # Materialise once and drop repeats while keeping order. A generator
    # would otherwise be exhausted after building `accum`, and a repeated
    # epsilon would be added to the same accumulator several times per run,
    # inflating its average.
    epsilons = list(dict.fromkeys(epsilons))

    cols = list(df.columns)
    accum = {eps: pd.DataFrame(0.0, index=cols, columns=cols) for eps in epsilons}

    for _ in range(runs):
        results = p_y_tables_for_epsilons(
            df,
            epsilons,
            frac_phase1_corr=frac_phase1_corr,
            use_minimal_builder=use_minimal_builder,
            csv_dir=None,
            float_fmt=float_fmt,
        )
        for eps in epsilons:
            # NaN cells (the diagonal) count as 0 while summing; note that
            # any off-diagonal NaN would be treated as 0 as well.
            accum[eps] = accum[eps].add(results[eps].fillna(0.0), fill_value=0.0)

    avg_results = {}
    for eps in epsilons:
        mat = accum[eps] / runs
        # Restore the undefined diagonal that was zero-filled for summing.
        for c in cols:
            mat.loc[c, c] = np.nan
        print(f"\n=== Averaged p_y table over {runs} runs (epsilon = {eps}) ===")
        with pd.option_context('display.float_format', lambda v: float_fmt % v):
            print(mat)
        avg_results[eps] = mat
    return avg_results




Empirical P[X2==X1]: 0.59975
Frequencies:
X1 {0: 0.4056, 1: 0.2933, 2: 0.1521, 3: 0.1018, 4: 0.0471}
X2 {0: 0.2711, 1: 0.2472, 2: 0.2089, 3: 0.17, 4: 0.1029}

=== Averaged p_y table over 40 runs (epsilon = 0.2) ===
         X1       X2
X1      NaN 0.804163
X2 0.803836      NaN

=== Averaged p_y table over 40 runs (epsilon = 0.3) ===
         X1       X2
X1      NaN 0.870766
X2 0.863206      NaN

=== Averaged p_y table over 40 runs (epsilon = 0.4) ===
         X1       X2
X1      NaN 0.920774
X2 0.902454      NaN


In [18]:

 

# ---------------- example ----------------
# Example cell: generate a two-attribute dataset with correlated_data_generator
# (fixed seed), print how often X1 and X2 agree, then print the averaged p_y
# tables for three epsilon values.
if __name__ == "__main__":
    domain = [0, 1, 2,3,4]
    # Per-attribute marginals over `domain`; X1 is given as a value->prob dict,
    # the others as probability lists (presumably ordered like `domain` -- TODO confirm).
    base_marginals = {
        'X1': {0:0.40, 1:0.30, 2:0.15, 3:0.10, 4:0.05},    # skewed X1
        'X2': [0.10, 0.20, 0.30, 0.25, 0.15],              # skewed X2
        'default': [0.05, 0.15, 0.20, 0.25, 0.35],         # fallback for any other Xi
    }
    # NOTE(review): the third element looks like the target P[X2==X1]; the recorded
    # run of this cell printed an empirical value of ~0.51 for it (not 0.6).
    correlations = [('X1','X2', 0.51)]

    df = correlated_data_generator(
        domain=domain,
        n=20000,
        correlations=correlations,
        total_attributes=2,
        seed=7,
        base_marginals=base_marginals,
    )
    # Within this cell `freqs` is only consumed by the commented-out printing below.
    freqs = get_true_frequencies(df)
    # Fraction of rows where both attributes hold the same value.
    print("Empirical P[X2==X1]:", (df['X1'] == df['X2']).mean())
    # print("Frequencies:")
    # for col, f in freqs.items():
    #     print(col, {k: round(v,4) for k,v in f.items()})


    eps_list = [0.2, 0.3, 0.4]

    # Return value is discarded: average_p_y_tables prints each averaged table itself.
    _ = average_p_y_tables(
        df,
        eps_list,
        runs=100,          # number of repetitions
        frac_phase1_corr=0.2,
        use_minimal_builder=True,
    )


Empirical P[X2==X1]: 0.5107

=== Averaged p_y table over 100 runs (epsilon = 0.2) ===
         X1       X2
X1      NaN 0.764068
X2 0.765367      NaN

=== Averaged p_y table over 100 runs (epsilon = 0.3) ===
         X1       X2
X1      NaN 0.852736
X2 0.825795      NaN

=== Averaged p_y table over 100 runs (epsilon = 0.4) ===
         X1       X2
X1      NaN 0.913300
X2 0.871516      NaN


In [19]:

# ---------------- example ----------------
# Example cell: same setup as the previous example cell but with the
# correlation value raised to 0.99; prints the X1/X2 agreement rate and the
# averaged p_y tables for three epsilon values.
if __name__ == "__main__":
    domain = [0, 1, 2,3,4]
    # Per-attribute marginals over `domain`; X1 is given as a value->prob dict,
    # the others as probability lists (presumably ordered like `domain` -- TODO confirm).
    base_marginals = {
        'X1': {0:0.40, 1:0.30, 2:0.15, 3:0.10, 4:0.05},    # skewed X1
        'X2': [0.10, 0.20, 0.30, 0.25, 0.15],              # skewed X2
        'default': [0.05, 0.15, 0.20, 0.25, 0.35],         # fallback for any other Xi
    }
    # NOTE(review): the third element looks like the target P[X2==X1]; the recorded
    # run of this cell printed an empirical value of ~0.99 for it (not 0.6).
    correlations = [('X1','X2', 0.99)]

    df = correlated_data_generator(
        domain=domain,
        n=20000,
        correlations=correlations,
        total_attributes=2,
        seed=7,
        base_marginals=base_marginals,
    )
    # Within this cell `freqs` is only consumed by the commented-out printing below.
    freqs = get_true_frequencies(df)
    # Fraction of rows where both attributes hold the same value.
    print("Empirical P[X2==X1]:", (df['X1'] == df['X2']).mean())
    # print("Frequencies:")
    # for col, f in freqs.items():
    #     print(col, {k: round(v,4) for k,v in f.items()})


    eps_list = [0.2, 0.3, 0.4]

    # Return value is discarded: average_p_y_tables prints each averaged table itself.
    _ = average_p_y_tables(
        df,
        eps_list,
        runs=100,          # number of repetitions
        frac_phase1_corr=0.2,
        use_minimal_builder=True,
    )


Empirical P[X2==X1]: 0.99065

=== Averaged p_y table over 100 runs (epsilon = 0.2) ===
         X1       X2
X1      NaN 0.785477
X2 0.814095      NaN

=== Averaged p_y table over 100 runs (epsilon = 0.3) ===
         X1       X2
X1      NaN 0.883422
X2 0.867161      NaN

=== Averaged p_y table over 100 runs (epsilon = 0.4) ===
         X1       X2
X1      NaN 0.923638
X2 0.900286      NaN
