In [92]:
import os
import glob
import pickle
import time

os.environ["PRIVBAYES_BIN"] = "./ydnpd/harness/synthesis/privbayes/mac_bin"

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from IPython.display import display, Markdown


import ydnpd
from additional import ADDITIONAL_EXPERIMENTS, ADDITIONAL_PATH
from ydnpd import EVALUATION_METRICS, ALL_EXPERIMENTS, Experiments

ADDITIONAL_DATASETS = sum(list(ADDITIONAL_EXPERIMENTS.values()), [])

with open("./results/harness.pkl", "rb") as f:
    utility_tasks_results = pickle.load(f)

for x in utility_tasks_results:
    if type(x["evaluation"]["error_rate_diff"]) == list:
        assert len(x["evaluation"]["error_rate_diff"]) == 1
        x["evaluation"]["error_rate_diff"] = x["evaluation"]["error_rate_diff"][0]

ALL_EXPERIMENTS_SIMPLIFIED = {name: exp.dev_names for name, exp in ALL_EXPERIMENTS.items()}
ADDITIONAL_EXPERIMENTS_SIMPLIFIED = {name: [x[0] for x in exp] for name, exp in ADDITIONAL_EXPERIMENTS.items()}
ALL_EXPERIMENTS_SIMPLIFIED_MERGED = {
    name: ADDITIONAL_EXPERIMENTS_SIMPLIFIED.get(name, []) + dev_names
    for name, dev_names in ALL_EXPERIMENTS_SIMPLIFIED.items()
}

reference_data_mapping = {
    "acs": "acs/national",
    "edad": "edad/2023",
    "we": "we/2023"
}

ALL_EXPERIMENTS_OBJ = {
    name: Experiments(reference_data_mapping.get(name, ""), dev_names)
    for name, dev_names in ALL_EXPERIMENTS_SIMPLIFIED_MERGED.items()
}

# remove all the real data and public data baselines,
# and now the MIX data
to_remove = [
    # 'acs/national',
    # 'edad/2023',
    # 'we/2023',
    # 'acs/massachusetts_upsampled',
    # 'edad/2020',
    # 'we/2018',
    ## MIX data
    # 'edad/gen-MIX-MAX',
    # 'acs/gen-MIX-MAX',
    # 'we/gen-MIX-MAX',
    # 'acs/gen-llama-MIX-MAX',
    # 'edad/gen-llama-MIX-MAX',
    # 'we/gen-llama-MIX-MAX',
    # 'acs/gen-claude-MIX-MAX',
    # 'edad/gen-claude-MIX-MAX',
    # 'we/gen-claude-MIX-MAX',
    # 'acs/gen-gpt-MIX-MAX',
    # 'edad/gen-gpt-MIX-MAX',
    # 'we/gen-gpt-MIX-MAX',
]

ALL_EXPERIMENTS_OBJ = {
    name: Experiments(exp.test_name, [dev for dev in exp.dev_names if dev not in to_remove])
    for name, exp in ALL_EXPERIMENTS_OBJ.items()
}

# NO MORE ABSOLUTE RANKINGS
# for hyperparameter tuning:
#   - we are going to do percent degradation
#   - as in how much worse is the method compared to the best method
#   - ok to aggregate across epsilon (or give conditional results)

# for privacy utility tradeoff:
#   - we are going to do a pareto frontier analysis
#   - we are going to do some sort of L1 advantage comparison
#   - here we care about epsilon [1,2,4]

# groupings of metrics

    # what is the pareto frontier? 
    # how correlated are the metrics with each other?

METRIC_DIRECTION = {
    # correlation based metrics 
    "total_variation_distance": "closer_to_zero_is_better",
    "pearson_corr_max_abs_diff": "closer_to_zero_is_better",
    "pearson_corr_avg_abs_diff": "closer_to_zero_is_better",
    "cramer_v_corr_max_abs_diff": "closer_to_zero_is_better",
    "cramer_v_corr_avg_abs_diff": "closer_to_zero_is_better",

    # marginals based metrics
    "marginals_3_max_abs_diff_error": "closer_to_zero_is_better",
    "marginals_3_avg_abs_diff_error": "closer_to_zero_is_better",
    "thresholded_marginals_3_max_abs_diff_error": "closer_to_zero_is_better",
    "thresholded_marginals_3_avg_abs_diff_error": "closer_to_zero_is_better",
    
    # classification based metrics
    "error_rate_diff": "closer_to_zero_is_better",  
    "aoc_diff": "closer_to_zero_is_better",  # auc

    # "error_rate_train_dataset": "closer_to_zero_is_better",
    # "error_rate_synth_dataset": "closer_to_zero_is_better",   
    # "aoc_train_dataset": "higher_is_better",
    # "aoc_synth_dataset": "higher_is_better",
}

metrics_marginals = [
    "marginals_3_max_abs_diff_error",
    "marginals_3_avg_abs_diff_error",
    "thresholded_marginals_3_max_abs_diff_error",
    "thresholded_marginals_3_avg_abs_diff_error"
]

metrics_correlations = [
    "total_variation_distance",
    "pearson_corr_max_abs_diff",
    "pearson_corr_avg_abs_diff",
    "cramer_v_corr_max_abs_diff",
    "cramer_v_corr_avg_abs_diff"
]

metrics_classification = [
    "error_rate_diff",
    "aoc_diff",
    # "error_rate_train_dataset",
    # "error_rate_synth_dataset",
    # "aoc_train_dataset",
    # "aoc_synth_dataset"
]

# error_rate_train_dataset / error_rate_synth_dataset / aoc_train_dataset / aoc_synth_dataset


In [93]:
ALL_EXPERIMENTS_OBJ

{'acs': Experiments(test_name='acs/national', dev_names=['acs/gen-MIX-MAX', 'acs/csv-gpt', 'acs/gen-gpt-MIX-MAX', 'acs/sdscm-gpt2', 'acs/gen-llama-MIX-MAX', 'acs/sdscm-olmo-1b-hf', 'acs/gen-gpt-MIX-UNIF', 'acs/gen-claude-MIX-MAX', 'acs/csv-claude', 'acs/gen-claude-MIX-UNIF', 'acs/gen-MIX-UNIF', 'acs/gen-llama-MIX-UNIF', 'acs/sdscm-llama-3-8b', 'acs/csv-llama', 'acs/national', 'acs/massachusetts_upsampled', 'acs/baseline_univariate', 'acs/baseline_domain', 'acs/arbitrary']),
 'edad': Experiments(test_name='edad/2023', dev_names=['edad/gen-MIX-MAX', 'edad/csv-gpt', 'edad/gen-gpt-MIX-MAX', 'edad/sdscm-gpt2', 'edad/gen-llama-MIX-MAX', 'edad/sdscm-olmo-1b-hf', 'edad/gen-gpt-MIX-UNIF', 'edad/gen-claude-MIX-MAX', 'edad/csv-claude', 'edad/gen-claude-MIX-UNIF', 'edad/gen-MIX-UNIF', 'edad/gen-llama-MIX-UNIF', 'edad/sdscm-llama-3-8b', 'edad/csv-llama', 'edad/2023', 'edad/2020', 'edad/baseline_univariate', 'edad/baseline_domain', 'edad/arbitrary']),
 'we': Experiments(test_name='we/2023', dev_name

In [100]:
utility_tasks_results[0:2]

[{'epsilon': 1,
  'synth_name': 'privbayes',
  'dataset_name': 'acs/national',
  'hparams': {'theta': 2, 'epsilon_split': 0.1},
  'evaluation': {'error_rate_train_dataset': 0.28339611706751666,
   'error_rate_synth_dataset': 0.3078817733990148,
   'pearson_corr_max_abs_diff': 0.17128663962155152,
   'pearson_corr_train_dataset': [[0, 0, 0, 0, 0, 0, 0],
    [0.14042261096610978, 0, 0, 0, 0, 0, 0],
    [0.04169438956264132, 0.26182419188444295, 0, 0, 0, 0, 0],
    [0.05412685965547898, 0.38847193160922217, 0.216169388767664, 0, 0, 0, 0],
    [0.15208903067542315,
     0.36340362674626603,
     0.27534127657565277,
     0.3214151651384918,
     0,
     0,
     0],
    [0.08215510231716827,
     0.31089067901984646,
     0.3460615926465102,
     0.2595535407398533,
     0.5219348794142432,
     0,
     0],
    [0.05912784231797163,
     0.2892374654111006,
     0.07127893655076525,
     0.7071067811865476,
     0.2808729271546567,
     0.2841759160273773,
     0]],
   'total_variation_dist

## Absolute rankings

In [94]:
# suppress warnings
import warnings
warnings.filterwarnings("ignore")

column_to_rank_on = 'correspond_test'
for experiments_name, experiments in ALL_EXPERIMENTS_OBJ.items():

    display(Markdown(f"## {experiments_name.upper()}"))

    datasets = {}

    for metric in METRIC_DIRECTION.keys():

        # if metric not in metrics_marginals:
        #     continue

        # if experiments_name != 'acs':
        #     continue

        display(Markdown(f"### {metric}"))
        with pd.option_context("display.max_rows", None):
            try: 
                df_extract_all = ydnpd.UtilityTask.evaluate(utility_tasks_results, experiments, metric).multiply(100).round(2)
                
                # flatten df_extract_all
                df_extract_all = df_extract_all.reset_index()

                df_extract_all_smaller = df_extract_all[['synth_name', 'experiment', 'epsilon', column_to_rank_on]] 

                # # rank based on lower or higher being better (closer_to_zero_is_better is default for lower_is_better)
                # metric_direction = METRIC_DIRECTION[metric]

                # df_extract_all_smaller.loc[:, "_rank_value"] = df_extract_all_smaller[column_to_rank_on]

                # if metric_direction == "closer_to_zero_is_better":
                #     df_extract_all_smaller.loc[:, "_rank_value"] = df_extract_all_smaller["_rank_value"].abs()
                #     ascending = True
                #     aggfunc = "min"
                # elif metric_direction == "lower_is_better":
                #     ascending = True
                #     aggfunc = "min"
                # else:  # "higher_is_better"
                #     ascending = False
                #     aggfunc = "max"

                # idx = df_extract_all_smaller.groupby(
                #     ['synth_name', 'experiment', 'epsilon']
                # )["_rank_value"].idxmin() if ascending else df_extract_all_smaller.groupby(
                #     ['synth_name', 'experiment', 'epsilon']
                # )["_rank_value"].idxmax()

                # df_best = df_extract_all_smaller.loc[idx].reset_index(drop=True)

                # df_experiment_best = df_best.groupby(['synth_name', 'epsilon', 'experiment'])[column_to_rank_on].min().reset_index()
                # df_experiment_best['rank'] = df_experiment_best.groupby(['synth_name', 'epsilon'])[column_to_rank_on].rank(method='dense', ascending=True)
                # df_experiment_best = df_experiment_best.sort_values(['synth_name', 'epsilon', 'rank'])

                # df_final = pd.merge(df_best, df_experiment_best[['synth_name', 'epsilon', 'experiment', 'rank']],
                #                     on=['synth_name', 'epsilon', 'experiment'],
                #                     how='left')

                # df_final_avg = df_final.groupby(['experiment','synth_name'])['rank'].mean().reset_index()

                # # for synth_name in df_final_avg['synth_name'].unique():
                # df_final_avg.groupby(['experiment','synth_name'])['rank'].mean().reset_index()
                df_extract_all_smaller.to_csv(f"./results/all_rankings/{metric}_{experiments_name}.csv")
                # df_final_avg.to_csv(f"./results/all_rankings/{metric}_{experiments_name}.csv")
            except Exception as e:
                print(f"Error: {e}")
                continue




## ACS

### total_variation_distance

### pearson_corr_max_abs_diff

### pearson_corr_avg_abs_diff

### cramer_v_corr_max_abs_diff

### cramer_v_corr_avg_abs_diff

KeyboardInterrupt: 

In [80]:
# load from results/all_rankings
all_results = {}
for filepath in glob.glob("./results/all_rankings/*"):

    df = pd.read_csv(filepath)
    all_results[filepath.split('/')[3].split('.')[0]] = df

In [81]:
all_results.keys()

dict_keys(['pearson_corr_avg_abs_diff_edad', 'marginals_3_max_abs_diff_error_we', 'cramer_v_corr_avg_abs_diff_we', 'cramer_v_corr_max_abs_diff_edad', 'aoc_diff_we', 'total_variation_distance_acs', 'cramer_v_corr_avg_abs_diff_acs', 'marginals_3_avg_abs_diff_error_edad', 'error_rate_diff_acs', 'cramer_v_corr_max_abs_diff_acs', 'total_variation_distance_we', 'cramer_v_corr_avg_abs_diff_edad', 'thresholded_marginals_3_max_abs_diff_error_acs', 'total_variation_distance_edad', 'thresholded_marginals_3_avg_abs_diff_error_edad', 'pearson_corr_avg_abs_diff_we', 'pearson_corr_max_abs_diff_acs', 'pearson_corr_avg_abs_diff_acs', 'error_rate_synth_dataset_we', 'pearson_corr_max_abs_diff_edad', 'error_rate_train_dataset_we', 'super_ranking', 'thresholded_marginals_3_max_abs_diff_error_we', 'thresholded_marginals_3_avg_abs_diff_error_acs', 'marginals_3_max_abs_diff_error_edad', 'marginals_3_avg_abs_diff_error_acs', 'marginals_3_avg_abs_diff_error_we', 'pearson_corr_max_abs_diff_we', 'marginals_3_max_

In [83]:
df_max_marg = all_results['marginals_3_max_abs_diff_error_acs']
df_avg_marg = all_results['marginals_3_avg_abs_diff_error_acs']
df_max_marg_t = all_results['thresholded_marginals_3_max_abs_diff_error_acs']
df_avg_marg_t = all_results['thresholded_marginals_3_avg_abs_diff_error_acs']

### Marginals

In [84]:
df_max_marg[(df_max_marg['epsilon'] == 4)].sort_values('correspond_test').groupby(['synth_name']).describe()['correspond_test']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
synth_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aim_jax,19.0,0.617895,0.34076,0.4,0.42,0.42,0.81,1.31
gem,19.0,36.252632,0.122149,36.15,36.17,36.24,36.27,36.57
privbayes,19.0,3.491053,0.555077,3.08,3.08,3.31,3.555,4.95


In [85]:
df_avg_marg[(df_avg_marg['epsilon'] == 4)].sort_values('correspond_test').groupby(['synth_name']).describe()['correspond_test']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
synth_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aim_jax,19.0,4.937368,0.111597,4.81,4.81,5.03,5.03,5.03
gem,19.0,179.752105,0.455785,178.85,179.37,179.88,180.015,180.63
privbayes,19.0,15.430526,2.81991,14.33,14.33,14.33,15.01,25.38


In [86]:
df_max_marg_t[(df_max_marg_t['epsilon'] == 4)].sort_values('correspond_test').groupby(['synth_name']).describe()['correspond_test']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
synth_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aim_jax,19.0,0.614737,0.296111,0.48,0.48,0.5,0.5,1.28
gem,19.0,52.255263,0.368682,51.63,51.88,52.31,52.56,52.68
privbayes,19.0,4.126316,2.290085,2.58,2.58,2.88,4.74,8.94


In [91]:
df_avg_marg_t[(df_avg_marg_t['epsilon'] == 4)].sort_values('correspond_test').groupby(['synth_name', 'experiment']).describe()['correspond_test'].sort_values('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
synth_name,experiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
aim_jax,acs/national/acs/national,1.0,0.85,,0.85,0.85,0.85,0.85,0.85
aim_jax,acs/national/acs/baseline_domain,1.0,0.85,,0.85,0.85,0.85,0.85,0.85
aim_jax,acs/national/acs/csv-claude,1.0,0.85,,0.85,0.85,0.85,0.85,0.85
aim_jax,acs/national/acs/csv-gpt,1.0,0.85,,0.85,0.85,0.85,0.85,0.85
aim_jax,acs/national/acs/csv-llama,1.0,0.85,,0.85,0.85,0.85,0.85,0.85
aim_jax,acs/national/acs/gen-MIX-MAX,1.0,0.85,,0.85,0.85,0.85,0.85,0.85
aim_jax,acs/national/acs/sdscm-llama-3-8b,1.0,0.85,,0.85,0.85,0.85,0.85,0.85
aim_jax,acs/national/acs/gen-claude-MIX-MAX,1.0,0.85,,0.85,0.85,0.85,0.85,0.85
aim_jax,acs/national/acs/sdscm-gpt2,1.0,0.85,,0.85,0.85,0.85,0.85,0.85
aim_jax,acs/national/acs/gen-gpt-MIX-UNIF,1.0,0.85,,0.85,0.85,0.85,0.85,0.85


## Correlations

In [87]:
# df_extract_all_smaller[(df_extract_all_smaller['epsilon'] == 4)].sort_values('correspond_test').groupby(['synth_name', 'experiment']).describe()['correspond_test']

In [48]:
df_final_avg[df_final_avg['synth_name'] == 'privbayes'].sort_values('rank')

Unnamed: 0,experiment,synth_name,rank
47,acs/national/acs/national,privbayes,1.0
56,acs/national/acs/sdscm-olmo-1b-hf,privbayes,1.25
50,acs/national/acs/sdscm-gpt2,privbayes,1.25
11,acs/national/acs/csv-claude,privbayes,1.25
17,acs/national/acs/csv-llama,privbayes,1.25
53,acs/national/acs/sdscm-llama-3-8b,privbayes,1.5
44,acs/national/acs/massachusetts_upsampled,privbayes,1.75
41,acs/national/acs/gen-llama-MIX-UNIF,privbayes,1.75
35,acs/national/acs/gen-gpt-MIX-UNIF,privbayes,1.75
32,acs/national/acs/gen-gpt-MIX-MAX,privbayes,1.75


In [39]:
# slightly hacky, just grabbing all files in the rankings folder
RANKINGS_FOLDER = "./results/all_rankings"

ranking_files = glob.glob(os.path.join(RANKINGS_FOLDER, "*.csv"))

dfs = []

for file_path in ranking_files:
    file_name = os.path.basename(file_path) 
    file_stem, _ = os.path.splitext(file_name)

    parts = file_stem.split("_")
    experiments_name = parts[-1]           
    metric = "_".join(parts[:-1])          

    df = pd.read_csv(file_path)

    df["metric"] = metric
    df["experiments_name"] = experiments_name

    dfs.append(df)

all_ranks_df = pd.concat(dfs, ignore_index=True)

# ccreate a super ranking by averaging ranks across all metrics
# for each (experiment, synth_name) within each experiments_name 
super_rank_df = (
    all_ranks_df
    .groupby(["experiments_name", "experiment", "synth_name"], as_index=False)["rank"]
    .mean()
    .rename(columns={"rank": "super_rank"})
)

super_rank_df = super_rank_df.sort_values(["experiments_name", "super_rank"])

super_rank_df.to_csv(os.path.join(RANKINGS_FOLDER, "super_ranking.csv"), index=False)

print("super ranking saved as ", os.path.join(RANKINGS_FOLDER, "super_ranking.csv"))


super ranking saved as  ./results/all_rankings/super_ranking.csv


In [40]:
df_privbayes = super_rank_df[super_rank_df["synth_name"] == "privbayes"].copy()
df_gem       = super_rank_df[super_rank_df["synth_name"] == "gem"].copy()
df_aim_jax   = super_rank_df[super_rank_df["synth_name"] == "aim_jax"].copy()

In [41]:
df_privbayes[df_privbayes['experiments_name'] == 'acs']

Unnamed: 0,experiments_name,experiment,synth_name,super_rank
47,acs,acs/national/acs/national,privbayes,1.0
17,acs,acs/national/acs/csv-llama,privbayes,1.633333
41,acs,acs/national/acs/gen-llama-MIX-UNIF,privbayes,1.766667
44,acs,acs/national/acs/massachusetts_upsampled,privbayes,2.022727
11,acs,acs/national/acs/csv-claude,privbayes,2.1
14,acs,acs/national/acs/csv-gpt,privbayes,2.1
29,acs,acs/national/acs/gen-claude-MIX-UNIF,privbayes,2.1
38,acs,acs/national/acs/gen-llama-MIX-MAX,privbayes,2.1
26,acs,acs/national/acs/gen-claude-MIX-MAX,privbayes,2.283333
32,acs,acs/national/acs/gen-gpt-MIX-MAX,privbayes,2.283333


In [42]:
df_privbayes[df_privbayes['experiments_name'] == 'we']

Unnamed: 0,experiments_name,experiment,synth_name,super_rank
290,we,we/2023/we/2023,privbayes,1.0
287,we,we/2023/we/2018,privbayes,2.681818
335,we,we/2023/we/sdscm-gpt2,privbayes,2.7
317,we,we/2023/we/gen-claude-MIX-MAX,privbayes,2.883333
341,we,we/2023/we/sdscm-olmo-1b-hf,privbayes,2.983333
338,we,we/2023/we/sdscm-llama-3-8b,privbayes,3.033333
293,we,we/2023/we/arbitrary,privbayes,3.15
320,we,we/2023/we/gen-claude-MIX-UNIF,privbayes,3.15
308,we,we/2023/we/csv-llama,privbayes,3.216667
305,we,we/2023/we/csv-gpt,privbayes,3.3


In [43]:
df_privbayes[df_privbayes['experiments_name'] == 'edad']

Unnamed: 0,experiments_name,experiment,synth_name,super_rank
62,edad,edad/2023/edad/2023,privbayes,1.0
83,edad,edad/2023/edad/gen-MIX-MAX,privbayes,1.944444
59,edad,edad/2023/edad/2020,privbayes,2.055556
65,edad,edad/2023/edad/arbitrary,privbayes,2.194444
74,edad,edad/2023/edad/csv-claude,privbayes,2.194444
77,edad,edad/2023/edad/csv-gpt,privbayes,2.194444
86,edad,edad/2023/edad/gen-MIX-UNIF,privbayes,2.361111
80,edad,edad/2023/edad/csv-llama,privbayes,2.388889
92,edad,edad/2023/edad/gen-claude-MIX-UNIF,privbayes,2.388889
89,edad,edad/2023/edad/gen-claude-MIX-MAX,privbayes,2.527778


In [34]:
super_rank_df["method"] = super_rank_df["experiment"].str.split("/").str[-1]

df_method_avg = (
    super_rank_df
    .groupby("method", as_index=False)["super_rank"]
    .mean()
    .rename(columns={"super_rank": "avg_super_rank"})
)

df_method_avg["super_super_rank"] = df_method_avg["avg_super_rank"].rank(method="dense", ascending=True)

df_method_avg = df_method_avg.sort_values("super_super_rank")

In [35]:
df_method_avg

Unnamed: 0,method,avg_super_rank,super_super_rank
11,gen-claude-MIX-MAX,2.422222,1.0
7,csv-gpt,2.524691,2.0
3,arbitrary,2.591975,3.0
10,gen-MIX-UNIF,2.624074,4.0
12,gen-claude-MIX-UNIF,2.675926,5.0
9,gen-MIX-MAX,2.681481,6.0
13,gen-gpt-MIX-MAX,2.707407,7.0
8,csv-llama,2.724691,8.0
6,csv-claude,2.767901,9.0
14,gen-gpt-MIX-UNIF,2.855556,10.0


## Calculating rankings based on advantage
### TODO: Currently buggy, need to fix.

In [15]:
# suppress warnings
import warnings
warnings.filterwarnings("ignore")

column_to_rank_on = 'correspond_test'
for experiments_name, experiments in ALL_EXPERIMENTS_OBJ.items():

    display(Markdown(f"## {experiments_name.upper()}"))

    datasets = {}

    for metric in ydnpd.EVALUATION_METRICS:

        display(Markdown(f"### {metric}"))
        with pd.option_context("display.max_rows", None):
            try: 
                df_extract_all = ydnpd.UtilityTask.evaluate(utility_tasks_results, experiments, metric).multiply(100).round(2)
                
                # flatten df_extract_all
                df_extract_all = df_extract_all.reset_index()

                df_extract_all_smaller = df_extract_all[['synth_name', 'experiment', 'epsilon', column_to_rank_on]] 

                metric_direction = METRIC_DIRECTION[metric]

                df_extract_all_smaller["_raw_value"] = df_extract_all_smaller[column_to_rank_on]

                grp = df_extract_all_smaller.groupby(["experiment", "epsilon"])["_raw_value"]

                if metric_direction in ["closer_to_zero_is_better", "lower_is_better"]:
                    # baseline is best  or lowest in that group
                    baseline_series = grp.transform("min")
                    df_extract_all_smaller["_advantage"] = (
                        baseline_series - df_extract_all_smaller["_raw_value"]
                    ) / baseline_series
                else:
                    # baseline is best or highest in that group
                    baseline_series = grp.transform("max")
                    df_extract_all_smaller["_advantage"] = (
                        df_extract_all_smaller["_raw_value"] - baseline_series
                    ) / baseline_series

                df_extract_all_smaller = df_extract_all_smaller.sort_values(
                    by=["experiment", "epsilon", "_advantage"],
                    ascending=[True, True, False],
                )

                df_extract_all_smaller["_advantage_rank"] = df_extract_all_smaller.groupby(
                    ["experiment", "epsilon"]
                ).cumcount() + 1

                df_experiment_best = (
                    df_extract_all_smaller
                    .groupby(["synth_name", "experiment"], as_index=False)["_advantage_rank"]
                    .min()
                )

                df_final_avg = (
                    df_experiment_best
                    .groupby(["experiment", "synth_name"])["_advantage_rank"]
                    .mean()
                    .reset_index()
                )

                df_final_avg.to_csv(f"./results/advantage_rankings/{metric}_{experiments_name}_advantage.csv")

            except Exception as e:
                print(f"Error: {e}")
                continue



## ACS

### total_variation_distance

### marginals_3_max_abs_diff_error

### marginals_3_avg_abs_diff_error

### thresholded_marginals_3_max_abs_diff_error

### thresholded_marginals_3_avg_abs_diff_error

### pearson_corr_max_abs_diff

### pearson_corr_avg_abs_diff

### cramer_v_corr_max_abs_diff

### cramer_v_corr_avg_abs_diff

### error_rate_diff

### error_rate_train_dataset

### error_rate_synth_dataset

### aoc_diff

### aoc_train_dataset

### aoc_synth_dataset

## EDAD

### total_variation_distance

### marginals_3_max_abs_diff_error

### marginals_3_avg_abs_diff_error

### thresholded_marginals_3_max_abs_diff_error

### thresholded_marginals_3_avg_abs_diff_error

### pearson_corr_max_abs_diff

### pearson_corr_avg_abs_diff

### cramer_v_corr_max_abs_diff

### cramer_v_corr_avg_abs_diff

### error_rate_diff

Error in extractor function:
 - synth_name: aim_jax
 - epsilon: 1
 - dev_name: edad/gen-MIX-MAX
 - test_name: edad/2023
 - Error: Cannot index by location index with a non-integer key
Error in evaluate method: Cannot index by location index with a non-integer key
Error: Cannot index by location index with a non-integer key


### error_rate_train_dataset

Error in extractor function:
 - synth_name: aim_jax
 - epsilon: 1
 - dev_name: edad/gen-MIX-MAX
 - test_name: edad/2023
 - Error: Cannot index by location index with a non-integer key
Error in evaluate method: Cannot index by location index with a non-integer key
Error: Cannot index by location index with a non-integer key


### error_rate_synth_dataset

Error in extractor function:
 - synth_name: aim_jax
 - epsilon: 1
 - dev_name: edad/gen-MIX-MAX
 - test_name: edad/2023
 - Error: Cannot index by location index with a non-integer key
Error in evaluate method: Cannot index by location index with a non-integer key
Error: Cannot index by location index with a non-integer key


### aoc_diff

Error in extractor function:
 - synth_name: aim_jax
 - epsilon: 1
 - dev_name: edad/gen-MIX-MAX
 - test_name: edad/2023
 - Error: Cannot index by location index with a non-integer key
Error in evaluate method: Cannot index by location index with a non-integer key
Error: Cannot index by location index with a non-integer key


### aoc_train_dataset

Error in extractor function:
 - synth_name: aim_jax
 - epsilon: 1
 - dev_name: edad/gen-MIX-MAX
 - test_name: edad/2023
 - Error: Cannot index by location index with a non-integer key
Error in evaluate method: Cannot index by location index with a non-integer key
Error: Cannot index by location index with a non-integer key


### aoc_synth_dataset

Error in extractor function:
 - synth_name: aim_jax
 - epsilon: 1
 - dev_name: edad/gen-MIX-MAX
 - test_name: edad/2023
 - Error: Cannot index by location index with a non-integer key
Error in evaluate method: Cannot index by location index with a non-integer key
Error: Cannot index by location index with a non-integer key


## WE

### total_variation_distance

### marginals_3_max_abs_diff_error

### marginals_3_avg_abs_diff_error

### thresholded_marginals_3_max_abs_diff_error

### thresholded_marginals_3_avg_abs_diff_error

### pearson_corr_max_abs_diff

### pearson_corr_avg_abs_diff

### cramer_v_corr_max_abs_diff

### cramer_v_corr_avg_abs_diff

### error_rate_diff

### error_rate_train_dataset

### error_rate_synth_dataset

### aoc_diff

### aoc_train_dataset

### aoc_synth_dataset

In [14]:
df_final_avg

Unnamed: 0,experiment,synth_name,_advantage_rank
0,we/2023/we/arbitrary,aim_jax,1.25
1,we/2023/we/arbitrary,gem,3.0
2,we/2023/we/arbitrary,privbayes,1.75
3,we/2023/we/baseline_domain,aim_jax,1.75
4,we/2023/we/baseline_domain,gem,3.0
5,we/2023/we/baseline_domain,privbayes,1.25
6,we/2023/we/baseline_univariate,aim_jax,1.25
7,we/2023/we/baseline_univariate,gem,3.0
8,we/2023/we/baseline_univariate,privbayes,1.75
9,we/2023/we/csv-claude,aim_jax,1.25
