# Fetch and parse ACS benchmark results under a given directory
Each ACS benchmark run outputs a JSON file. This script collects all such files under a given root directory, parses them, and aggregates them into a more easily digestible pandas DataFrame.

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

**[Action required]** Set `RESULTS_ROOT_DIR` to the root results directory path:

In [2]:
# Root directory that is searched for benchmark results files
RESULTS_ROOT_DIR = Path("/fast/groups/sf", "folktexts-results", "2024-08-28_2")

Set the local path to the root data directory (needed only to train baseline ML methods):

In [3]:
# Root directory of the local data
DATA_DIR = Path("/fast/groups/sf", "data")

Important results columns:

In [4]:
# Names of the results columns this notebook relies on.
# Keys that come from a run's "config" entry carry a "config_" prefix
# (added by `parse_results_dict`).

# Model name; parsed and prettified by `parse_results_dict`
model_col = "config_model_name"
# Task name; part of each row's unique identifier
task_col = "config_task_name"
# Prompting flag: truthy rows are labelled "Num", all others "QA"
numeric_prompt_col = "config_numeric_risk_prompting"

# Feature subset of the run; a value of None is replaced by "full"
feature_subset_col = "config_feature_subset"
# Path of the CSV file read by `load_model_scores_df`
predictions_path_col = "predictions_path"

Helper function to parse each dictionary containing benchmark results:

In [5]:
from utils import (
    num_features_helper,
    parse_model_name,
    get_non_instruction_tuned_name,
    prettify_model_name,
)

def parse_results_dict(dct) -> dict:
    """Flatten one benchmark-results dictionary into a single-level dict.

    The input is left untouched; a shallow copy is edited and returned. The
    "plots" entry is dropped, every key of the "config" entry is hoisted to
    the top level with a "config_" prefix, and model-name / feature-count
    columns are derived.

    Raises:
        KeyError: if the model-name or feature-subset key is missing.
        AssertionError: if any top-level value is still a dict.
    """
    # Work on a copy so the caller's object is never modified
    flat = dct.copy()

    # Plot paths are not needed in the aggregated results
    flat.pop("plots", None)

    # Hoist every config entry to the top level, prefixed with "config_"
    flat.update({f"config_{key}": val for key, val in flat.pop("config", {}).items()})

    # Model name in its parsed, base (non-instruction-tuned) and pretty forms
    parsed_name = parse_model_name(flat[model_col])
    base_name = get_non_instruction_tuned_name(parsed_name)
    flat[model_col] = parsed_name
    flat["base_name"] = base_name
    flat["name"] = prettify_model_name(parsed_name)

    # The model counts as instruction-tuned when its base name differs from its own name
    flat["is_inst"] = base_name != parsed_name

    # Number of features; -1 is the value requested via `max_features_return`
    feature_subset = flat[feature_subset_col]
    num_features = num_features_helper(feature_subset, max_features_return=-1)
    flat["num_features"] = num_features
    flat["uses_all_features"] = (feature_subset is None) or (num_features == -1)

    # A missing feature subset is recorded as "full"
    if feature_subset is None:
        flat[feature_subset_col] = "full"

    # Every result must now live at the top level
    assert all(not isinstance(val, dict) for val in flat.values())
    return flat


Iteratively search the root directory for results files matching the given regex:

In [6]:
from utils import find_files, load_json

# Results file name pattern, e.g. "results.bench-<digits>.json".
# Both dots use a character class so they match a literal "." only; a bare "."
# after "results" would match any character.
pattern = r'^results[.]bench-(?P<hash>\d+)[.]json$'

# Find results files and aggregate them, keyed by the name of each file's
# parent directory. A later file whose parent directory has the same name
# overwrites the earlier entry.
results = {}
for file_path in tqdm(find_files(RESULTS_ROOT_DIR, pattern)):
    results[Path(file_path).parent.name] = parse_results_dict(load_json(file_path))

# Fail early: nothing downstream can work without at least one result
if len(results) == 0:
    raise RuntimeError(f"Couldn't find any results at {RESULTS_ROOT_DIR}")

print(f"Found {len(results)} benchmark results.")

0it [00:00, ?it/s]

Found 224 benchmark results.


Aggregate results into a single DataFrame, generate a unique identifier for each row, and drop potential duplicates:

In [7]:
def row_id(row) -> str:
    """Build the unique identifier of a results row.

    Joins the model name, the task name, the number of features and the
    prompting style with double underscores. The prompting style is "Num"
    when the numeric-prompting flag is truthy and "QA" otherwise.
    """
    prompting_style = "QA" if not row[numeric_prompt_col] else "Num"
    return f"{row[model_col]}__{row[task_col]}__{row['num_features']}__{prompting_style}"


# One row per parsed results dictionary
df = pd.DataFrame(list(results.values()))
print(f"{df.shape=}")

# Row count before de-duplication, used to report how many rows were dropped
len_with_dups = len(df)

# Attach the identifier, drop duplicated experiments, then index by identifier.
# `verify_integrity=True` raises if two remaining rows share an identifier.
df = (
    df.assign(id=df.apply(row_id, axis=1))
    .drop_duplicates(subset=["name", "is_inst", "num_features", task_col, numeric_prompt_col])
    .set_index("id", drop=True, verify_integrity=True)
)

num_dropped = len_with_dups - len(df)
if num_dropped != 0:
    print(f"Dropping {num_dropped} duplicates!")
    print(f"{df.shape=}")

df.shape=(224, 58)
Dropping 20 duplicates!
df.shape=(204, 58)


Load scores DFs and analyze score distribution:

In [8]:
def load_model_scores_df(df_row: pd.Series) -> pd.DataFrame:
    """Read the scores CSV referenced by a results row.

    Args:
        df_row: One row of the results DataFrame.

    Returns:
        The CSV at the row's predictions path, read with its first column as
        the index, or None when the row has no predictions-path entry or the
        entry is missing (NaN/None).
    """
    # No predictions-path entry at all
    if predictions_path_col not in df_row:
        return None

    # The entry exists but holds no path
    scores_path = df_row[predictions_path_col]
    if pd.isna(scores_path):
        return None

    return pd.read_csv(scores_path, index_col=0)

In [9]:
import logging

from folktexts.evaluation import compute_best_threshold
from sklearn import metrics
from scipy import stats

# Number of samples used to fit the one-parameter binarization threshold!
N = 100

# Names of the columns produced by this cell
fit_thresh_col = f"fit_thresh_on_{N}"
fit_acc_col = "fit_thresh_accuracy"

optimal_thres_col = "optimal_thresh"
optimal_acc_col = "optimal_thresh_accuracy"

score_stdev_col = "score_stdev"
score_mean_col = "score_mean"

# Maps each result identifier (index of `df`) to its dictionary of score statistics
scores_stats = dict()

# NOTE: the loop variable is deliberately not named `row_id`, so that it does
# not overwrite the `row_id()` helper function defined in an earlier cell.
for result_id, row in tqdm(df.iterrows(), total=len(df)):

    # Load model scores; skip this result when its scores file is unavailable
    # (otherwise the lookups below would fail on `None`)
    scores_df = load_model_scores_df(row)
    if scores_df is None:
        logging.error(f"Couldn't find scores for {result_id}")
        continue

    # Extract scores and labels
    risk_scores = scores_df["risk_score"].to_numpy()
    labels = scores_df["label"].to_numpy()

    # Sample N rows to fit threshold (fixed seed for reproducibility)
    scores_df_sample = scores_df.sample(n=N, random_state=42)

    # Compute the best threshold on the N-row sample and on the full data
    fit_thr = compute_best_threshold(y_true=scores_df_sample["label"], y_pred_scores=scores_df_sample["risk_score"])
    opt_thr = compute_best_threshold(y_true=labels, y_pred_scores=risk_scores)

    # Evaluate accuracy of both thresholds on the full data
    fit_acc = metrics.accuracy_score(labels, (risk_scores >= fit_thr).astype(int))
    opt_acc = metrics.accuracy_score(labels, (risk_scores >= opt_thr).astype(int))

    # Save results
    scores_stats[result_id] = {
        fit_thresh_col: fit_thr,
        fit_acc_col: fit_acc,
        optimal_thres_col: opt_thr,
        optimal_acc_col: opt_acc,
        score_stdev_col: np.std(risk_scores),
        score_mean_col: np.mean(risk_scores),
    }

  0%|          | 0/204 [00:00<?, ?it/s]

Update results DF with scores statistics:

In [10]:
# One row of score statistics per result identifier (the dict keys become the index)
scores_stats_df = pd.DataFrame(list(scores_stats.values()), index=list(scores_stats))
print(f"{scores_stats_df.shape=}")

# Append the statistics columns to the results; rows are aligned on the index
results_df = pd.concat([df, scores_stats_df], axis=1)
results_df.sample(2)

scores_stats_df.shape=(204, 6)


Unnamed: 0,accuracy,accuracy_diff,accuracy_ratio,balanced_accuracy,balanced_accuracy_diff,balanced_accuracy_ratio,brier_score_loss,ece,ece_quantile,equalized_odds_diff,...,is_inst,num_features,uses_all_features,config_dataset_subsampling,fit_thresh_on_100,fit_thresh_accuracy,optimal_thresh,optimal_thresh_accuracy,score_stdev,score_mean
Mixtral-8x22B-Instruct-v0.1__ACSPublicCoverage__-1__Num,0.742904,0.196134,0.748374,0.615484,0.152825,0.776835,0.183955,0.04505,,0.838043,...,True,-1,True,,0.35,0.747375,0.35,0.747375,0.203367,0.280604
gemma-2-9b__ACSTravelTime__-1__Num,0.48083,0.177731,0.688513,0.537608,0.129063,0.804449,0.23995,0.041956,,0.254823,...,False,-1,True,,0.5,0.48083,0.5,0.48083,0.103281,0.478345


Check if any results are missing:

In [None]:
# Expected number of experiments per (model, task) pair: the largest number of
# rows found for any pair. `size()` counts rows directly; counting distinct
# values per column (`nunique`) would ignore NaNs and fail on unhashable cells.
experiments_per_model_task_pair = results_df.groupby([model_col, task_col]).size().max()

# Report every (model, task) combination that has fewer rows than expected,
# including combinations with no rows at all
for m in results_df[model_col].unique():
    for t in results_df[task_col].unique():
        match_ = results_df[(results_df[model_col] == m) & (results_df[task_col] == t)]
        if len(match_) < experiments_per_model_task_pair:
            print(f"Couldn't find all results for m={m}, t={t}")

Finally, save results DF to the results root directory:

In [12]:
from utils import get_current_timestamp
# Timestamped output file inside the results root directory
aggregated_results_path = Path(RESULTS_ROOT_DIR) / f"aggregated_results.{get_current_timestamp()}.csv"
results_df.to_csv(aggregated_results_path)

In [13]:
# Show every result whose pretty model name is "Mixtral 8x22B (it)"
results_df.loc[results_df["name"] == "Mixtral 8x22B (it)"]

Unnamed: 0,accuracy,accuracy_diff,accuracy_ratio,balanced_accuracy,balanced_accuracy_diff,balanced_accuracy_ratio,brier_score_loss,ece,ece_quantile,equalized_odds_diff,...,is_inst,num_features,uses_all_features,config_dataset_subsampling,fit_thresh_on_100,fit_thresh_accuracy,optimal_thresh,optimal_thresh_accuracy,score_stdev,score_mean
Mixtral-8x22B-Instruct-v0.1__ACSEmployment__-1__Num,0.794139,0.068837,0.91801,0.794511,0.083952,0.900219,0.143711,0.060081,,0.223129,...,True,-1,True,,0.35,0.801385,0.3,0.801385,0.328532,0.394042
Mixtral-8x22B-Instruct-v0.1__ACSEmployment__-1__QA,0.505264,0.126127,0.784963,0.472782,0.139435,0.728744,0.394009,0.380765,0.372983,0.209254,...,True,-1,True,,0.033075,0.600354,0.014055,0.603206,0.264285,0.184625
Mixtral-8x22B-Instruct-v0.1__ACSIncome__-1__Num,0.767882,0.096716,0.885938,0.770218,0.16522,0.791933,0.16607,0.107078,,0.553366,...,True,-1,True,,0.65,0.769967,0.55,0.767882,0.298323,0.474955
Mixtral-8x22B-Instruct-v0.1__ACSIncome__-1__QA,0.75967,0.103618,0.877798,0.771723,0.176697,0.780013,0.219576,0.213193,0.211182,0.511348,...,True,-1,True,,0.119239,0.747336,0.705858,0.765593,0.47678,0.468819
Mixtral-8x22B-Instruct-v0.1__ACSTravelTime__-1__Num,0.593454,0.345798,0.612706,0.543727,0.077242,0.866188,0.240309,0.1174,,0.278552,...,True,-1,True,,0.45,0.61423,0.28,0.614114,0.139488,0.321938
Mixtral-8x22B-Instruct-v0.1__ACSTravelTime__-1__QA,0.591204,0.334034,0.625882,0.535092,0.07795,0.865126,0.329639,0.313096,0.312959,0.192446,...,True,-1,True,,0.095382,0.605652,0.067536,0.586302,0.150709,0.125353
Mixtral-8x22B-Instruct-v0.1__ACSPublicCoverage__-1__QA,0.723515,0.235237,0.693097,0.54978,0.08115,0.860716,0.249538,0.242453,0.235682,0.210744,...,True,-1,True,,0.014062,0.711418,0.01407,0.718534,0.172813,0.062757
Mixtral-8x22B-Instruct-v0.1__ACSPublicCoverage__-1__Num,0.742904,0.196134,0.748374,0.615484,0.152825,0.776835,0.183955,0.04505,,0.838043,...,True,-1,True,,0.35,0.747375,0.35,0.747375,0.203367,0.280604
Mixtral-8x22B-Instruct-v0.1__ACSMobility__-1__QA,0.39036,0.077569,0.808205,0.503362,0.201014,0.60647,0.3948,0.400191,0.401589,0.642148,...,True,-1,True,,0.731091,0.531629,0.592644,0.446919,0.236867,0.645204
Mixtral-8x22B-Instruct-v0.1__ACSMobility__-1__Num,0.735143,0.145699,0.828962,0.5,0.0,1.0,0.196996,0.053519,,0.0,...,True,-1,True,,0.25,0.474007,0.25,0.474007,0.049587,0.211395


---