# Render paper plots and tables

In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

Load aggregated results data
(can be obtained using the `parse-acs-results.ipynb` notebook).

In [2]:
ACS_AGG_RESULTS_PATH = Path("/fast/groups/sf/folktexts-results") / "2024-07-03" / "aggregated_results.2024.07.09-23.12.24.csv"

In [3]:
results_df = pd.read_csv(ACS_AGG_RESULTS_PATH, index_col=0)
results_df.head(2)

Unnamed: 0,accuracy,accuracy_diff,accuracy_ratio,balanced_accuracy,balanced_accuracy_diff,balanced_accuracy_ratio,brier_score_loss,ece,ece_quantile,equalized_odds_diff,...,name,is_inst,num_features,uses_all_features,fit_thresh_on_100,fit_thresh_accuracy,optimal_thresh,optimal_thresh_accuracy,score_stdev,score_mean
gemma-2-9b-it__ACSIncome__-1,0.78003,0.096459,0.888661,0.753813,0.249373,0.688385,0.203177,0.194715,0.179097,0.694508,...,Gemma 2 9B (it),True,-1,True,0.085142,0.777837,0.007575,0.761892,0.447796,0.32814
Meta-Llama-3-70B__ACSTravelTime__-1,0.548924,0.231092,0.607143,0.591093,0.119547,0.804942,0.241007,0.092395,,0.317821,...,Llama 3 70B,False,-1,True,0.53122,0.614482,0.531059,0.608148,0.048256,0.521286


### Run baseline ML classifiers on the benchmark ACS tasks

In [4]:
DATA_DIR = Path("/fast/groups/sf") / "data"

In [5]:
ALL_TASKS = [
    "ACSIncome",
    "ACSMobility",
    "ACSEmployment",
    "ACSTravelTime",
    "ACSPublicCoverage",
]

List all baseline classifiers here:

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier    # NOTE: requires `pip install xgboost`

baselines = {
    "LR": LogisticRegression(),
    "GBM": HistGradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
}

In [7]:
from folktexts.acs.acs_dataset import ACSDataset
from folktexts.evaluation import evaluate_predictions
from collections import defaultdict

def fit_and_eval(
    clf,
    X_train, y_train,
    X_test, y_test, s_test,
    fillna=False,
) -> dict:
    """Fit and evaluate a given classifier on the given data."""
    assert len(X_train) == len(y_train) and len(X_test) == len(y_test) == len(s_test)

    train_nan_count = X_train.isna().any(axis=1).sum()
    if fillna and train_nan_count > 0:
        # Fill NaNs with value=-1
        X_train = X_train.fillna(axis="columns", value=-1)
        X_test = X_test.fillna(axis="columns", value=-1)

    # Fit on train data
    clf.fit(X_train, y_train)

    # Evaluate on test data
    y_test_scores = clf.predict_proba(X_test)[:, -1]
    return evaluate_predictions(
        y_true=y_test.to_numpy(),
        y_pred_scores=y_test_scores,
        sensitive_attribute=s_test,
        threshold=0.5,
    )

def run_baselines(baselines, tasks) -> dict:
    """Run baseline classifiers on all acs tasks."""
    baseline_results = defaultdict(dict)

    # Prepare progress bar
    progress_bar = tqdm(
        total=len(tasks) * len(baselines),
        leave=True,
    )

    for task in tasks:
        progress_bar.set_postfix({"task": task})

        # Load ACS task data
        acs_dataset = ACSDataset.make_from_task(task=task, cache_dir=DATA_DIR)
    
        # Get train/test data
        X_train, y_train = acs_dataset.get_train()
        X_test, y_test = acs_dataset.get_test()
    
        # Get sensitive attribute test data
        s_test = None
        if acs_dataset.task.sensitive_attribute is not None:
            s_test = acs_dataset.get_sensitive_attribute_data().loc[y_test.index]
    
        for clf_name, clf in baselines.items():
            progress_bar.set_postfix({"task": task, "clf": clf_name})

            try:
                baseline_results[task][clf_name] = fit_and_eval(
                    clf=clf,
                    X_train=X_train, y_train=y_train,
                    X_test=X_test, y_test=y_test, s_test=s_test,
                    fillna=(clf_name == "LR"),
                )
            except Exception as err:
                logging.error(err)
            finally:
                progress_bar.update()

    return baseline_results

Flatten results and add extra columns.

In [8]:
def parse_baseline_results(baseline_results) -> list:
    """Flatten and parse baseline results."""
    parsed_results_list = list()
    
    for task, task_results in baseline_results.items():
    
        for clf, clf_results in task_results.items():
            parsed_results = clf_results.copy()
    
            parsed_results["config_task_name"] = task
            parsed_results["config_model_name"] = clf
            parsed_results["name"] = clf
            parsed_results["num_features"] = -1
            parsed_results["uses_all_features"] = True
    
            parsed_results_list.append(parsed_results)

    return parsed_results_list

Check if baseline results were already computed. If so, load csv; otherwise, compute and save.

In [9]:
BASELINE_RESULTS_PATH = ACS_AGG_RESULTS_PATH.parent / "baseline-results.csv"

# If saved results exists: load
if BASELINE_RESULTS_PATH.exists():
    print(f"Loading pre-computed baseline results from {BASELINE_RESULTS_PATH.as_posix()}")
    baselines_df = pd.read_csv(BASELINE_RESULTS_PATH, index_col=0)

# Compute baseline results
else:
    # Compute baseline results
    baseline_results = run_baselines(baselines, tasks=ALL_TASKS)

    # Parse results
    parsed_results_list = parse_baseline_results(baseline_results)

    # Construct DF
    baselines_df = pd.DataFrame(parsed_results_list, index=[r["name"] for r in parsed_results_list])

    # Save DF to disk
    baselines_df.to_csv(BASELINE_RESULTS_PATH)

# Show 2 random rows
baselines_df.sample(2)

  0%|          | 0/15 [00:00<?, ?it/s]

Loading ACS data...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Loading ACS data...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Loading ACS data...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Loading ACS data...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Loading ACS data...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,threshold,n_samples,n_positives,n_negatives,model_name,accuracy,tpr,fnr,fpr,tnr,...,equalized_odds_ratio,equalized_odds_diff,roc_auc,ece,ece_quantile,config_task_name,config_model_name,name,num_features,uses_all_features
XGBoost,0.5,166450,61233,105217,,0.817309,0.732089,0.267911,0.133096,0.866904,...,0.077643,0.519188,0.894961,0.004253,0.00362,ACSIncome,XGBoost,XGBoost,-1,True
XGBoost,0.5,146665,64285,82380,,0.702997,0.547453,0.452547,0.175625,0.824375,...,0.06776,0.306513,0.76954,0.017382,0.01798,ACSTravelTime,XGBoost,XGBoost,-1,True


In [10]:
all_results_df = pd.concat((results_df, baselines_df))
print(f"{all_results_df.shape=}")

all_results_df.shape=(115, 64)


## Render results table for each task

In [11]:
table_metrics = ["ece", "brier_score_loss", "roc_auc", "accuracy", "fit_thresh_accuracy", "score_stdev"] #, "score_mean"]

model_col = "config_model_name"
task_col = "config_task_name"

Add model size and model family columns:

In [12]:
from folktexts.llm_utils import get_model_size_B

all_results_df["model_size"] = [
    (
        get_model_size_B(row["name"], default=float("nan"))
        if row["name"] not in baselines else "-"
    )
    for _, row in all_results_df.iterrows()
]

def get_model_family(model_name) -> str:
    if "llama" in model_name.lower():
        return "Llama"
    elif "mistral" in model_name.lower() or "mixtral" in model_name.lower():
        return "Mistral"
    elif "gemma" in model_name.lower():
        return "Gemma"
    elif "yi" in model_name.lower():
        return "Yi"
    elif "qwen" in model_name.lower():
        return "Qwen"
    else:
        return "-"

all_results_df["model_family"] = [get_model_family(row[model_col]) for _, row in all_results_df.iterrows()]
all_results_df.groupby([task_col, "model_family"])["accuracy"].count()

config_task_name   model_family
ACSEmployment      -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
ACSIncome          -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
ACSMobility        -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
ACSPublicCoverage  -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
ACSTravelTime      -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
Name: accuracy, dtype: int64

In [13]:
from utils import prettify_model_name

for task in ALL_TASKS:
    task_df = all_results_df[all_results_df[task_col] == task]

    latex_table = task_df.sort_values(["model_family", "model_size", "is_inst"], ascending=False).set_index(model_col)[table_metrics].round(3)
    latex_table = latex_table.rename(columns=lambda col: col.replace("_", " ")).fillna("-")

    # Prettify model names
    latex_table["Model"] = [
        prettify_model_name(id_) if id_ not in baselines.keys() else id_
        for id_, row in latex_table.iterrows()
    ]
    latex_table.set_index("Model", drop=True, inplace=True)

    print(f"*** {task.upper()} ***\n")
    print(latex_table.to_latex(float_format="%.2f"))
    print("")

*** ACSINCOME ***

\begin{tabular}{lrrrrll}
\toprule
 & ece & brier score loss & roc auc & accuracy & fit thresh accuracy & score stdev \\
Model &  &  &  &  &  &  \\
\midrule
Yi 34B (chat) & 0.19 & 0.19 & 0.86 & 0.72 & 0.74 & 0.34 \\
Yi 34B & 0.24 & 0.22 & 0.85 & 0.62 & 0.74 & 0.20 \\
Mixtral 8x22B (it) & 0.21 & 0.22 & 0.85 & 0.76 & 0.75 & 0.48 \\
Mixtral 8x22B & 0.17 & 0.19 & 0.85 & 0.68 & 0.77 & 0.10 \\
Mixtral 8x7B (it) & 0.16 & 0.18 & 0.86 & 0.78 & 0.78 & 0.42 \\
Mixtral 8x7B & 0.17 & 0.21 & 0.83 & 0.65 & 0.74 & 0.06 \\
Mistral 7B (it) & 0.21 & 0.22 & 0.83 & 0.77 & 0.77 & 0.42 \\
Mistral 7B & 0.20 & 0.23 & 0.80 & 0.73 & 0.74 & 0.04 \\
Llama 3 70B (it) & 0.27 & 0.27 & 0.86 & 0.69 & 0.78 & 0.42 \\
Llama 3 70B & 0.20 & 0.20 & 0.86 & 0.70 & 0.77 & 0.14 \\
Llama 3 8B (it) & 0.32 & 0.30 & 0.85 & 0.62 & 0.78 & 0.37 \\
Llama 3 8B & 0.25 & 0.26 & 0.81 & 0.38 & 0.69 & 0.05 \\
Gemma 2 27B (it) & 0.34 & 0.35 & 0.71 & 0.57 & 0.67 & 0.36 \\
Gemma 2 27B & 0.20 & 0.29 & 0.51 & 0.53 & 0.39 & 0.23 \

## Render plots

In [None]:
# TODO