In [29]:
import os
import glob
import pickle
import time
import json

os.environ["PRIVBAYES_BIN"] = "./ydnpd/harness/synthesis/privbayes/mac_bin"

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from IPython.display import display, Markdown


import ydnpd
from additional import ADDITIONAL_EXPERIMENTS, ADDITIONAL_PATH
from ydnpd import EVALUATION_METRICS, ALL_EXPERIMENTS, Experiments

ADDITIONAL_DATASETS = sum(list(ADDITIONAL_EXPERIMENTS.values()), [])

with open("./results/harness.pkl", "rb") as f:
    utility_tasks_results = pickle.load(f)

for x in utility_tasks_results:
    if type(x["evaluation"]["error_rate_diff"]) == list:
        assert len(x["evaluation"]["error_rate_diff"]) == 1
        x["evaluation"]["error_rate_diff"] = x["evaluation"]["error_rate_diff"][0]
# keep only results where dataset name contains "acs"
# acs_results = [x for x in utility_tasks_results if "acs" in x["dataset_name"].lower()]
utility_tasks_results = utility_tasks_results
# also get rid of results where synth is gem
utility_tasks_results = [x for x in utility_tasks_results if x["synth_name"] != "gem"]
# check which metrics are nan for which datasets
# NOTE: there are some datasets for which there are more nans. we will just not include...
missing_metrics = {}
for x in utility_tasks_results:
    for metric in EVALUATION_METRICS:
        if np.isnan(x["evaluation"][metric]):
            missing_metrics[x["dataset_name"]] = missing_metrics.get(x["dataset_name"], []) + [metric]
all_results = utility_tasks_results.copy()

reference_data_mapping = {
    "acs": "acs/national",
    "edad": "edad/2023",
    "we": "we/2023",
}

METRIC_DIRECTION = {
    "total_variation_distance": "closer_to_zero_is_better",
    "pearson_corr_max_abs_diff": "closer_to_zero_is_better",
    "pearson_corr_avg_abs_diff": "closer_to_zero_is_better",
    "cramer_v_corr_max_abs_diff": "closer_to_zero_is_better",
    "cramer_v_corr_avg_abs_diff": "closer_to_zero_is_better",
    "marginals_3_max_abs_diff_error": "closer_to_zero_is_better",
    "marginals_3_avg_abs_diff_error": "closer_to_zero_is_better",
    "thresholded_marginals_3_max_abs_diff_error": "closer_to_zero_is_better",
    "thresholded_marginals_3_avg_abs_diff_error": "closer_to_zero_is_better",
    "error_rate_diff": "closer_to_zero_is_better",
    "aoc_diff": "closer_to_zero_is_better",
}

CORRELATION_METRICS = [
    "total_variation_distance",
    "pearson_corr_max_abs_diff",
    "pearson_corr_avg_abs_diff",
    "cramer_v_corr_max_abs_diff",
    "cramer_v_corr_avg_abs_diff",
]
MARGINALS_METRICS = [
    "marginals_3_max_abs_diff_error",
    "marginals_3_avg_abs_diff_error",
    "thresholded_marginals_3_max_abs_diff_error",
    "thresholded_marginals_3_avg_abs_diff_error",
]
CLASSIFICATION_METRICS = [
    "error_rate_diff",
    "aoc_diff",
]

df = pd.DataFrame(all_results)

df_evaluation = pd.json_normalize(df["evaluation"])

def dict_to_sorted_json_str(d):
    return str(json.dumps(d, sort_keys=True)) # str(

df["hparams_str"] = df["hparams"].apply(dict_to_sorted_json_str)

df_combined = pd.concat(
    [df.drop(columns=["hparams", "evaluation"]),
     df_evaluation],
    axis=1
)

# group by hparams_str, take the mean
df_combined = df_combined[['hparams_str', 'synth_name', 'epsilon', 'dataset_name'] + list(METRIC_DIRECTION.keys())].groupby(['hparams_str', 'synth_name', 'epsilon', 'dataset_name']).mean().reset_index()

def get_reference_dataset_name(ds_name):
    if ds_name.startswith("acs/"):
        return reference_data_mapping["acs"]
    elif ds_name.startswith("edad/"):
        return reference_data_mapping["edad"]
    elif ds_name.startswith("we/"):
        return reference_data_mapping["we"]
    else:
        raise ValueError(f"Unknown dataset name: {ds_name}")

df_combined["reference_dataset_name"] = df_combined["dataset_name"].apply(get_reference_dataset_name)

def find_best_performance_rows(subdf, metric):
    direction = METRIC_DIRECTION[metric]
    if direction == "closer_to_zero_is_better":
        best_val = subdf[metric].abs().min()
        best_hparams_str = subdf[subdf[metric].abs() == best_val]["hparams_str"].iloc[0]
        return subdf[subdf[metric].abs() == best_val], best_hparams_str
    else:
        raise ValueError(f"everything is closer_to_zero_is_better now")
    
rows_for_report = []

# skipped tracker
skipped = {}
group_cols = ["synth_name", "epsilon", "reference_dataset_name"]
for (synth_name, epsilon, reference_dataset_name), group_df in df_combined.groupby(group_cols):
    # identify the reference dataset within this group
    ref_df = group_df[group_df["dataset_name"] == reference_dataset_name]
    if ref_df.empty:
        continue

    # find the 'true best performance' for each metric in that reference subset
    true_best_performance = {}
    for metric in METRIC_DIRECTION.keys():
        best_rows, best_hparams = find_best_performance_rows(ref_df, metric)
        # just pick the first row in case of tie
        best_row = best_rows.iloc[0]
        true_best_val = np.abs(best_row[metric])
        true_best_performance[metric] = (true_best_val, best_row, best_hparams)

    # for each dataset in the group, figure out which hyperparams you'd pick
    for ds_name, ds_group_df in group_df.groupby("dataset_name"):

        # then do that for each metric (or each metric block)
        for metric in METRIC_DIRECTION.keys():
            try:
                best_ds_rows, _ = find_best_performance_rows(ds_group_df, metric)
                chosen_row = best_ds_rows.iloc[0]  # pick first in tie
                chosen_hparams_str = chosen_row["hparams_str"]

                # now we see if that same hparams_str is present in the reference df
                ref_match = ref_df[ref_df["hparams_str"] == chosen_hparams_str]
                if ref_match.empty:
                    # means reference never had that exact set of hyperparams
                    raise ValueError(f"Reference dataset {ref_ds_name} never had hyperparams {chosen_hparams_str} for metric {metric}")

                # how does it perform on the reference dataset?
                row_in_ref = ref_match.iloc[0]
                perf_on_ref = np.abs(row_in_ref[metric])

                # get the "true best" value for that metric
                (true_best_val, _, best_hparams) = true_best_performance[metric]

                # define percent_degradation = (candidate - best) / abs(best), if best != 0, else 0
                if true_best_val == 0:
                    pct_degradation = 0
                else:
                    pct_degradation = (perf_on_ref - true_best_val) / abs(true_best_val)

                # store in our report
                rows_for_report.append({
                    "synth_name": synth_name,
                    "epsilon": epsilon,
                    "dataset_name": ds_name,
                    "metric": metric,
                    "chosen_hparams_str": chosen_hparams_str,
                    "chosen_val_on_dataset": np.abs(chosen_row[metric]),
                    "perf_on_reference": perf_on_ref,
                    "true_best_on_reference": true_best_val,
                    "pct_degradation_on_ref": pct_degradation,
                    "best_hparams": best_hparams,
                    "reference_dataset_name": reference_dataset_name,
                })
            except Exception as e:
                print(f"Skipping {synth_name}, {epsilon}, {ds_name}, {metric}: {e}")
                if (synth_name, epsilon, ds_name, metric) not in skipped:
                    skipped[(synth_name, epsilon, ds_name, metric)] = 0
                skipped[(synth_name, epsilon, ds_name, metric)] += 1


report_df = pd.DataFrame(rows_for_report)

METRIC_GROUPS = {
    "correlation_metrics": [
        "total_variation_distance",
        "pearson_corr_max_abs_diff",
        "pearson_corr_avg_abs_diff",
        "cramer_v_corr_max_abs_diff",
        "cramer_v_corr_avg_abs_diff",
    ],
    "marginals_metrics": [
        "marginals_3_max_abs_diff_error",
        "marginals_3_avg_abs_diff_error",
        "thresholded_marginals_3_max_abs_diff_error",
        "thresholded_marginals_3_avg_abs_diff_error",
    ],
    "classification_metrics": [
        "error_rate_diff",
        "aoc_diff",
    ],
}

# reverse mapping, metric -> group
metric_to_group = {}
for group, metrics in METRIC_GROUPS.items():
    for metric in metrics:
        metric_to_group[metric] = group

# metric_group to column to report_df
report_df['metric_group'] = report_df['metric'].map(metric_to_group)


Skipping aim_jax, 1, edad/gen-llama-MIX-MAX, error_rate_diff: single positional indexer is out-of-bounds
Skipping aim_jax, 1, edad/gen-llama-MIX-MAX, aoc_diff: single positional indexer is out-of-bounds
Skipping aim_jax, 1, edad/gen-llama-MIX-UNIF, error_rate_diff: single positional indexer is out-of-bounds
Skipping aim_jax, 1, edad/gen-llama-MIX-UNIF, aoc_diff: single positional indexer is out-of-bounds
Skipping aim_jax, 4, edad/gen-llama-MIX-MAX, error_rate_diff: single positional indexer is out-of-bounds
Skipping aim_jax, 4, edad/gen-llama-MIX-MAX, aoc_diff: single positional indexer is out-of-bounds
Skipping aim_jax, 4, edad/gen-llama-MIX-UNIF, error_rate_diff: single positional indexer is out-of-bounds
Skipping aim_jax, 4, edad/gen-llama-MIX-UNIF, aoc_diff: single positional indexer is out-of-bounds
Skipping aim_jax, 8, edad/gen-llama-MIX-MAX, error_rate_diff: single positional indexer is out-of-bounds
Skipping aim_jax, 8, edad/gen-llama-MIX-MAX, aoc_diff: single positional indexe

In [83]:
import pandas as pd
import numpy as np

epsilons = [1, 4, 8, 16]

all_distance_dfs = []
normalized_comparison = False
for synth_method in ['privbayes', 'aim_jax']:
    print()
    print(f"synthesizer: {synth_method}")

    filtered_df = report_df[
        (report_df['synth_name'] == synth_method) &
        (report_df['epsilon'].isin(epsilons))
    ]

    aggregated_df = filtered_df.groupby(['dataset_name', 'epsilon', 'reference_dataset_name'])['chosen_val_on_dataset'].mean().reset_index()
    for ref_ds_name in reference_data_mapping.values():
        print(f'dataset_name: {ref_ds_name}')
        # agg chosen_val_on_dataset to take the mean across all metrics for each dataset_name, epsilon
        ref_agged_df = aggregated_df[aggregated_df['reference_dataset_name'] == ref_ds_name]
        
        # take epsilons as columns and dataset_names as rows
        pivot_df = ref_agged_df.pivot(index='dataset_name', columns='epsilon', values='chosen_val_on_dataset')

        # drop datasets that do not have all epsilon values (if any), not fair comparison
        pivot_df = pivot_df.dropna()

        # optionally normalize the values so that they're all in the range [0, 1]
        # NOTE: this way, its a relative comparison, so you choose the epsilon value
        # based on a leveling out in performance

        # NOTE: NORMALIZE BASED ON PERCENT DIFFERENCE FROM THE REFERENCE DATASET
        if normalized_comparison:
            normalized_df = pivot_df.copy()
            for dataset in normalized_df.index:
                row = normalized_df.loc[dataset]
                min_val = row.min()
                max_val = row.max()
                if max_val - min_val != 0:
                    normalized_df.loc[dataset] = (row - min_val) / (max_val - min_val)
                else:
                    normalized_df.loc[dataset] = 0
        else:
            normalized_df = pivot_df

        if ref_ds_name not in normalized_df.index:
            raise ValueError(f"ref dataset {ref_ds_name} not present in the data")

        reference_vector = normalized_df.loc[ref_ds_name].values

        # we'll do both L1 and L2 distance between the vectors 
        def compute_distances(row, ref_vector):
            l1 = np.sum(np.abs(row - ref_vector))
            l2 = np.sqrt(np.sum((row - ref_vector) ** 2))
            return pd.Series({'L1_distance': l1, 'L2_distance': l2})

        distance_df = normalized_df.apply(lambda row: compute_distances(row.values, reference_vector), axis=1)

        distance_df = distance_df.reset_index()

        distance_df_sorted = distance_df.sort_values(['L2_distance'])

        # add columns for the reference dataset and synth
        distance_df_sorted['reference_dataset_name'] = ref_ds_name
        distance_df_sorted['synth_name'] = synth_method

        print("method by method performance compared to reference dataset")
        display(distance_df_sorted)

        all_distance_dfs.append(distance_df_sorted)

all_distance_df = pd.concat(all_distance_dfs)


synthesizer: privbayes
dataset_name: acs/national
method by method performance compared to reference dataset


Unnamed: 0,dataset_name,L1_distance,L2_distance,reference_dataset_name,synth_name
15,acs/national,0.0,0.0,acs/national,privbayes
13,acs/gen-llama-MIX-UNIF,0.012449,0.007701,acs/national,privbayes
11,acs/gen-gpt-MIX-UNIF,0.020296,0.01129,acs/national,privbayes
10,acs/gen-gpt-MIX-MAX,0.030478,0.018491,acs/national,privbayes
0,acs/arbitrary,0.036601,0.019285,acs/national,privbayes
9,acs/gen-claude-MIX-UNIF,0.039533,0.02118,acs/national,privbayes
8,acs/gen-claude-MIX-MAX,0.044508,0.022775,acs/national,privbayes
7,acs/gen-MIX-UNIF,0.042845,0.023545,acs/national,privbayes
5,acs/csv-llama,0.046354,0.023681,acs/national,privbayes
14,acs/massachusetts_upsampled,0.044207,0.024193,acs/national,privbayes


dataset_name: edad/2023
method by method performance compared to reference dataset


Unnamed: 0,dataset_name,L1_distance,L2_distance,reference_dataset_name,synth_name
1,edad/2023,0.0,0.0,edad/2023,privbayes
9,edad/gen-MIX-UNIF,0.033387,0.019713,edad/2023,privbayes
0,edad/2020,0.029888,0.025777,edad/2023,privbayes
8,edad/gen-MIX-MAX,0.06377,0.035555,edad/2023,privbayes
10,edad/gen-claude-MIX-MAX,0.075978,0.044395,edad/2023,privbayes
2,edad/arbitrary,0.084282,0.04656,edad/2023,privbayes
11,edad/gen-claude-MIX-UNIF,0.093852,0.060421,edad/2023,privbayes
12,edad/gen-gpt-MIX-MAX,0.109174,0.07305,edad/2023,privbayes
13,edad/gen-gpt-MIX-UNIF,0.137058,0.0938,edad/2023,privbayes
16,edad/sdscm-gpt2,0.140473,0.09567,edad/2023,privbayes


dataset_name: we/2023
method by method performance compared to reference dataset


Unnamed: 0,dataset_name,L1_distance,L2_distance,reference_dataset_name,synth_name
1,we/2023,0.0,0.0,we/2023,privbayes
0,we/2018,0.02348,0.012075,we/2023,privbayes
16,we/sdscm-gpt2,0.054788,0.02874,we/2023,privbayes
17,we/sdscm-llama-3-8b,0.046439,0.033881,we/2023,privbayes
11,we/gen-claude-MIX-UNIF,0.066486,0.037118,we/2023,privbayes
18,we/sdscm-olmo-1b-hf,0.072805,0.04465,we/2023,privbayes
10,we/gen-claude-MIX-MAX,0.072341,0.052992,we/2023,privbayes
9,we/gen-MIX-UNIF,0.100489,0.056841,we/2023,privbayes
14,we/gen-llama-MIX-MAX,0.118205,0.059499,we/2023,privbayes
2,we/arbitrary,0.119361,0.061687,we/2023,privbayes



synthesizer: aim_jax
dataset_name: acs/national
method by method performance compared to reference dataset


Unnamed: 0,dataset_name,L1_distance,L2_distance,reference_dataset_name,synth_name
15,acs/national,0.0,0.0,acs/national,aim_jax
14,acs/massachusetts_upsampled,0.012819,0.007163,acs/national,aim_jax
2,acs/baseline_univariate,0.03708,0.019127,acs/national,aim_jax
0,acs/arbitrary,0.065158,0.034998,acs/national,aim_jax
1,acs/baseline_domain,0.216506,0.11045,acs/national,aim_jax
9,acs/gen-claude-MIX-UNIF,0.279047,0.142468,acs/national,aim_jax
10,acs/gen-gpt-MIX-MAX,0.276766,0.143138,acs/national,aim_jax
11,acs/gen-gpt-MIX-UNIF,0.284307,0.146147,acs/national,aim_jax
7,acs/gen-MIX-UNIF,0.488423,0.245117,acs/national,aim_jax
18,acs/sdscm-olmo-1b-hf,0.492873,0.247592,acs/national,aim_jax


dataset_name: edad/2023
method by method performance compared to reference dataset


Unnamed: 0,dataset_name,L1_distance,L2_distance,reference_dataset_name,synth_name
1,edad/2023,0.0,0.0,edad/2023,aim_jax
0,edad/2020,0.019807,0.013077,edad/2023,aim_jax
15,edad/gen-llama-MIX-UNIF,0.070555,0.043404,edad/2023,aim_jax
3,edad/baseline_domain,0.080603,0.053108,edad/2023,aim_jax
2,edad/arbitrary,0.107766,0.057467,edad/2023,aim_jax
4,edad/baseline_univariate,0.089014,0.064493,edad/2023,aim_jax
14,edad/gen-llama-MIX-MAX,0.114937,0.065222,edad/2023,aim_jax
18,edad/sdscm-olmo-1b-hf,0.12808,0.067715,edad/2023,aim_jax
13,edad/gen-gpt-MIX-UNIF,0.133243,0.069111,edad/2023,aim_jax
16,edad/sdscm-gpt2,0.16667,0.093901,edad/2023,aim_jax


dataset_name: we/2023
method by method performance compared to reference dataset


Unnamed: 0,dataset_name,L1_distance,L2_distance,reference_dataset_name,synth_name
1,we/2023,0.0,0.0,we/2023,aim_jax
17,we/sdscm-llama-3-8b,0.028089,0.016442,we/2023,aim_jax
0,we/2018,0.034502,0.018057,we/2023,aim_jax
18,we/sdscm-olmo-1b-hf,0.10768,0.054201,we/2023,aim_jax
11,we/gen-claude-MIX-UNIF,0.122781,0.063557,we/2023,aim_jax
15,we/gen-llama-MIX-UNIF,0.138539,0.069922,we/2023,aim_jax
16,we/sdscm-gpt2,0.151997,0.076119,we/2023,aim_jax
14,we/gen-llama-MIX-MAX,0.219781,0.110309,we/2023,aim_jax
10,we/gen-claude-MIX-MAX,0.221324,0.111898,we/2023,aim_jax
2,we/arbitrary,0.229769,0.116135,we/2023,aim_jax


### TODO: pareto frontier of these results

In [92]:
import numpy as np
import pandas as pd

reference_data_mapping = {
    "acs": "acs/national",
    "edad": "edad/2023",
    "we": "we/2023",
}

public_data_mapping = {
    "acs": "acs/massachusetts_upsampled",
    "edad": "edad/2020",
    "we": "we/2018",
}

all_distance_df = all_distance_df[~all_distance_df['dataset_name'].isin(reference_data_mapping.values())]
all_distance_df = all_distance_df[~all_distance_df['dataset_name'].isin(public_data_mapping.values())]

def is_pareto_efficient(costs):
    #find the Pareto-efficient points for array n_points, n_costs
    is_efficient = np.ones(costs.shape[0], dtype=bool)
    for i, c in enumerate(costs):
        if is_efficient[i]:
            # any point that is dominated by point i is not efficient
            is_efficient[is_efficient] = np.any(costs[is_efficient] < c, axis=1)
            is_efficient[i] = True 
    return is_efficient

# pivot for L1 and L2 for each reference dataset
pivot_l1 = all_distance_df.pivot_table(
    index=['synth_name', 'dataset_name'],
    columns='reference_dataset_name',
    values='L1_distance'
).reset_index()

pivot_l2 = all_distance_df.pivot_table(
    index=['synth_name', 'dataset_name'],
    columns='reference_dataset_name',
    values='L2_distance'
).reset_index()

# for each pivot, for each dataset_name, remove text before "/". this will lead to duplicate dataset names.
pivot_l1['dataset_name'] = pivot_l1['dataset_name'].apply(lambda x: x.split("/")[1])
pivot_l2['dataset_name'] = pivot_l2['dataset_name'].apply(lambda x: x.split("/")[1])

# now, for each row, there will be 1 column with a value, and the rest will be NaN
# collapse so that each dataset name is only 1 row, with no NaNs
pivot_l1 = pivot_l1.groupby(['dataset_name','synth_name']).first().reset_index()
pivot_l2 = pivot_l2.groupby(['dataset_name','synth_name']).first().reset_index()

# rename columns of each so that end with _L1 and _L2
pivot_l1.columns = ['dataset_name', 'synth_name'] + [f"{ref}_L1" for ref in pivot_l1.columns if ref not in ['synth_name', 'dataset_name']]
pivot_l2.columns = ['dataset_name', 'synth_name'] + [f"{ref}_L2" for ref in pivot_l2.columns if ref not in ['synth_name', 'dataset_name']]

merged_pivot = pd.merge(pivot_l1, pivot_l2, on=['synth_name', 'dataset_name'])

In [93]:
synthesizers = merged_pivot['synth_name'].unique()

for synth in synthesizers:
    synth_df = merged_pivot[merged_pivot['synth_name'] == synth]
    print()
    print(f"pareto frontier for {synth}")
    
    distance_cols = [col for col in merged_pivot.columns if col.endswith('_L1') or col.endswith('_L2')]
    cost_matrix = synth_df[distance_cols].values
    
    pareto_mask = is_pareto_efficient(cost_matrix)
    pareto_front = synth_df[pareto_mask]
    
    display(pareto_front[['dataset_name'] + distance_cols])

    # # also compute pareto frontier for average of L1 and average L2
    # synth_df['avg_L1'] = synth_df[[col for col in synth_df.columns if col.endswith('_L1')]].mean(axis=1)
    # synth_df['avg_L2'] = synth_df[[col for col in synth_df.columns if col.endswith('_L2')]].mean(axis=1)
    # avg_cost_matrix = synth_df[['avg_L1', 'avg_L2']].values

    # avg_pareto_mask = is_pareto_efficient(avg_cost_matrix)
    # avg_pareto_front = synth_df[avg_pareto_mask]

    # display(avg_pareto_front[['dataset_name', 'avg_L1', 'avg_L2']])



pareto frontier for aim_jax


Unnamed: 0,dataset_name,acs/national_L1,edad/2023_L1,we/2023_L1,acs/national_L2,edad/2023_L2,we/2023_L2
0,arbitrary,0.065158,0.107766,0.229769,0.034998,0.057467,0.116135
2,baseline_domain,0.216506,0.080603,0.26054,0.11045,0.053108,0.130499
4,baseline_univariate,0.03708,0.089014,0.341716,0.019127,0.064493,0.170908
18,gen-claude-MIX-UNIF,0.279047,0.174896,0.122781,0.142468,0.097494,0.063557
26,gen-llama-MIX-UNIF,0.833273,0.070555,0.138539,0.418143,0.043404,0.069922
30,sdscm-llama-3-8b,0.51587,0.168664,0.028089,0.258592,0.095078,0.016442
32,sdscm-olmo-1b-hf,0.492873,0.12808,0.10768,0.247592,0.067715,0.054201



pareto frontier for privbayes


Unnamed: 0,dataset_name,acs/national_L1,edad/2023_L1,we/2023_L1,acs/national_L2,edad/2023_L2,we/2023_L2
1,arbitrary,0.036601,0.084282,0.119361,0.019285,0.04656,0.061687
13,gen-MIX-MAX,0.040503,0.06377,0.134219,0.029562,0.035555,0.068837
15,gen-MIX-UNIF,0.042845,0.033387,0.100489,0.023545,0.019713,0.056841
17,gen-claude-MIX-MAX,0.044508,0.075978,0.072341,0.022775,0.044395,0.052992
19,gen-claude-MIX-UNIF,0.039533,0.093852,0.066486,0.02118,0.060421,0.037118
21,gen-gpt-MIX-MAX,0.030478,0.109174,0.168691,0.018491,0.07305,0.086322
23,gen-gpt-MIX-UNIF,0.020296,0.137058,0.132964,0.01129,0.0938,0.075356
27,gen-llama-MIX-UNIF,0.012449,0.203435,0.136571,0.007701,0.120746,0.069144
29,sdscm-gpt2,0.167606,0.140473,0.054788,0.086104,0.09567,0.02874
31,sdscm-llama-3-8b,0.129806,0.133589,0.046439,0.06846,0.097246,0.033881
