# Render paper plots and tables

In [1]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

Load aggregated results data
(can be obtained using the `parse-acs-results.ipynb` notebook).

In [2]:
ACS_AGG_RESULTS_PATH = Path("/fast/groups/sf/folktexts-results") / "2024-07-03" / "aggregated_results.2024.07.09-23.12.24.csv"

In [3]:
results_df = pd.read_csv(ACS_AGG_RESULTS_PATH, index_col=0)
results_df.head(2)

Unnamed: 0,accuracy,accuracy_diff,accuracy_ratio,balanced_accuracy,balanced_accuracy_diff,balanced_accuracy_ratio,brier_score_loss,ece,ece_quantile,equalized_odds_diff,...,name,is_inst,num_features,uses_all_features,fit_thresh_on_100,fit_thresh_accuracy,optimal_thresh,optimal_thresh_accuracy,score_stdev,score_mean
gemma-2-9b-it__ACSIncome__-1,0.78003,0.096459,0.888661,0.753813,0.249373,0.688385,0.203177,0.194715,0.179097,0.694508,...,Gemma 2 9B (it),True,-1,True,0.085142,0.777837,0.007575,0.761892,0.447796,0.32814
Meta-Llama-3-70B__ACSTravelTime__-1,0.548924,0.231092,0.607143,0.591093,0.119547,0.804942,0.241007,0.092395,,0.317821,...,Llama 3 70B,False,-1,True,0.53122,0.614482,0.531059,0.608148,0.048256,0.521286


## Run baseline ML classifiers on the benchmark ACS tasks

In [4]:
DATA_DIR = Path("/fast/groups/sf") / "data"

In [5]:
ALL_TASKS = [
    "ACSIncome",
    "ACSMobility",
    "ACSEmployment",
    "ACSTravelTime",
    "ACSPublicCoverage",
]

List all baseline classifiers here:

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier    # NOTE: requires `pip install xgboost`

baselines = {
    "LR": LogisticRegression(),
    "GBM": HistGradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
}

In [7]:
from folktexts.acs.acs_dataset import ACSDataset
from folktexts.evaluation import evaluate_predictions
from collections import defaultdict

def fit_and_eval(
    clf,
    X_train, y_train,
    X_test, y_test, s_test,
    fillna=False,
) -> dict:
    """Fit and evaluate a given classifier on the given data."""
    assert len(X_train) == len(y_train) and len(X_test) == len(y_test) == len(s_test)

    train_nan_count = X_train.isna().any(axis=1).sum()
    if fillna and train_nan_count > 0:
        # Fill NaNs with value=-1
        X_train = X_train.fillna(axis="columns", value=-1)
        X_test = X_test.fillna(axis="columns", value=-1)

    # Fit on train data
    clf.fit(X_train, y_train)

    # Evaluate on test data
    y_test_scores = clf.predict_proba(X_test)[:, -1]
    return evaluate_predictions(
        y_true=y_test.to_numpy(),
        y_pred_scores=y_test_scores,
        sensitive_attribute=s_test,
        threshold=0.5,
    )

def run_baselines(baselines, tasks) -> dict:
    """Run baseline classifiers on all acs tasks."""
    baseline_results = defaultdict(dict)

    # Prepare progress bar
    progress_bar = tqdm(
        total=len(tasks) * len(baselines),
        leave=True,
    )

    for task in tasks:
        progress_bar.set_postfix({"task": task})

        # Load ACS task data
        acs_dataset = ACSDataset.make_from_task(task=task, cache_dir=DATA_DIR)
    
        # Get train/test data
        X_train, y_train = acs_dataset.get_train()
        X_test, y_test = acs_dataset.get_test()
    
        # Get sensitive attribute test data
        s_test = None
        if acs_dataset.task.sensitive_attribute is not None:
            s_test = acs_dataset.get_sensitive_attribute_data().loc[y_test.index]
    
        for clf_name, clf in baselines.items():
            progress_bar.set_postfix({"task": task, "clf": clf_name})

            try:
                baseline_results[task][clf_name] = fit_and_eval(
                    clf=clf,
                    X_train=X_train, y_train=y_train,
                    X_test=X_test, y_test=y_test, s_test=s_test,
                    fillna=(clf_name == "LR"),
                )
            except Exception as err:
                logging.error(err)
            finally:
                progress_bar.update()

    return baseline_results

Flatten results and add extra columns.

In [8]:
def parse_baseline_results(baseline_results) -> list:
    """Flatten and parse baseline results."""
    parsed_results_list = list()
    
    for task, task_results in baseline_results.items():
    
        for clf, clf_results in task_results.items():
            parsed_results = clf_results.copy()
    
            parsed_results["config_task_name"] = task
            parsed_results["config_model_name"] = clf
            parsed_results["name"] = clf
            parsed_results["num_features"] = -1
            parsed_results["uses_all_features"] = True
    
            parsed_results_list.append(parsed_results)

    return parsed_results_list

Check if baseline results were already computed. If so, load csv; otherwise, compute and save.

In [9]:
BASELINE_RESULTS_PATH = ACS_AGG_RESULTS_PATH.parent / "baseline-results.csv"

# If saved results exists: load
if BASELINE_RESULTS_PATH.exists():
    print(f"Loading pre-computed baseline results from {BASELINE_RESULTS_PATH.as_posix()}")
    baselines_df = pd.read_csv(BASELINE_RESULTS_PATH, index_col=0)

# Compute baseline results
else:
    # Compute baseline results
    baseline_results = run_baselines(baselines, tasks=ALL_TASKS)

    # Parse results
    parsed_results_list = parse_baseline_results(baseline_results)

    # Construct DF
    baselines_df = pd.DataFrame(parsed_results_list, index=[r["name"] for r in parsed_results_list])

    # Save DF to disk
    baselines_df.to_csv(BASELINE_RESULTS_PATH)

# Show 2 random rows
baselines_df.sample(2)

Loading pre-computed baseline results from /fast/groups/sf/folktexts-results/2024-07-03/baseline-results.csv


Unnamed: 0,threshold,n_samples,n_positives,n_negatives,model_name,accuracy,tpr,fnr,fpr,tnr,...,equalized_odds_ratio,equalized_odds_diff,roc_auc,ece,ece_quantile,config_task_name,config_model_name,name,num_features,uses_all_features
XGBoost,0.5,113829,33971,79858,,0.80165,0.515175,0.484825,0.076486,0.923514,...,0.123704,0.368044,0.839742,0.004371,0.004271,ACSPublicCoverage,XGBoost,XGBoost,-1,True
LR,0.5,113829,33971,79858,,0.725826,0.161697,0.838303,0.034198,0.965802,...,0.145795,0.153757,0.696108,0.026235,0.026164,ACSPublicCoverage,LR,LR,-1,True


In [10]:
all_results_df = pd.concat((results_df, baselines_df))
print(f"{all_results_df.shape=}")

all_results_df.shape=(115, 64)


## Prepare results table for each task

In [11]:
table_metrics = ["ece", "brier_score_loss", "roc_auc", "accuracy", "fit_thresh_accuracy", "score_stdev"] #, "score_mean"]

model_col = "config_model_name"
task_col = "config_task_name"

Add model size and model family columns:

In [12]:
from folktexts.llm_utils import get_model_size_B

all_results_df["model_size"] = [
    (
        get_model_size_B(row["name"], default=float("nan"))
        if row["name"] not in baselines else "-"
    )
    for _, row in all_results_df.iterrows()
]

def get_model_family(model_name) -> str:
    if "llama" in model_name.lower():
        return "Llama"
    elif "mistral" in model_name.lower() or "mixtral" in model_name.lower():
        return "Mistral"
    elif "gemma" in model_name.lower():
        return "Gemma"
    elif "yi" in model_name.lower():
        return "Yi"
    elif "qwen" in model_name.lower():
        return "Qwen"
    else:
        return "-"

all_results_df["model_family"] = [get_model_family(row[model_col]) for _, row in all_results_df.iterrows()]
all_results_df.groupby([task_col, "model_family"])["accuracy"].count()

config_task_name   model_family
ACSEmployment      -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
ACSIncome          -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
ACSMobility        -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
ACSPublicCoverage  -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
ACSTravelTime      -               3
                   Gemma           8
                   Llama           4
                   Mistral         6
                   Yi              2
Name: accuracy, dtype: int64

In [73]:
arr = np.random.random(10)
low_pct, high_pct = np.percentile(arr, [0, 100])
low_pct, high_pct

(0.00391486975786115, 0.9873627772436349)

In [117]:
def model_sort_key(name, task_df):
    """Sort key for table rows."""

    if "llama" in name.lower():
        key = 1000
    elif "mixtral" in name.lower():
        key = 900
    elif "mistral" in name.lower():
        key = 800
    elif "yi" in name.lower():
        key = 700
    elif "gemma" in name.lower():
        key = 600
    else:
        return 0

    row = task_df.loc[name]
    return key + int(row["is_inst"]) + (row["model_size"] // 3)


In [125]:
def latex_colored_float_format(val, all_values, higher_is_better=True):
    """Map a cell's value to its colored latex code.

    Current definition:
    - use cyan color gradient for good values;
    - use orange color gradient for bad values;
    - use no color for anything in between;
    """
    min_val, max_val = np.min(all_values), np.max(all_values)

    low_pct_val, high_pct_val = [
        min_val + (max_val - min_val) * interp_point
        for interp_point in [0.1, 0.9]
    ]

    # Use rounded value or original value for coloring?
    # > Using rounded value for consistency in table
    # val = np.round(val, decimals=2)

    # Use no color for middle 33% of values
    if low_pct_val <= val <= high_pct_val:
        return f"{val:.2f}"

    if val < low_pct_val:
        color = "orange" if higher_is_better else "cyan"
        color_value = 100 * (
            (low_pct_val - val) / (low_pct_val - min_val))
        
    elif val > high_pct_val:
        color = "cyan" if higher_is_better else "orange"
        color_value = 100 * (
            (val - high_pct_val) / (max_val - high_pct_val))

    else:
        raise RuntimeError(f"{val}")

    # Note: halving `color_value` to have softer colors
    color_value /= 4

    return (
        r"\cellcolor{"
        + f"{color}!{color_value:.1f}"
        + r"} "
        + f"{val:.2f}"
    )

higher_is_better_cols = {"roc auc", "accuracy", "fit thresh accuracy"}

## Output latex results tables - colored!

In [129]:
from utils import prettify_model_name

for task in ALL_TASKS:
    task_df = all_results_df[all_results_df[task_col] == task]

    # Sort table rows
    sorted_df_index = sorted(
        task_df.index.tolist(),
        key=lambda id_: model_sort_key(id_, task_df),
        reverse=True,
    )

    # Remove Gemma 2 models
    sorted_df_index = [id_ for id_ in sorted_df_index if "gemma-2-" not in id_]
    
    # latex_table = task_df.sort_values(["model_family", "model_size", "is_inst"], ascending=False).set_index(model_col)[table_metrics].round(3)
    latex_table = task_df.loc[sorted_df_index].set_index(model_col)[table_metrics].round(3)
    latex_table = latex_table.rename(columns=lambda col: col.replace("_", " ")).fillna("-")

    # Prettify model names
    latex_table["Model"] = [
        prettify_model_name(id_) if id_ not in baselines.keys() else id_
        for id_, row in latex_table.iterrows()
    ]
    latex_table.set_index("Model", drop=True, inplace=True)

    for col in set(latex_table.columns.tolist()) - {"score stdev"}:
        index_without_baselines = [name for name in latex_table.index if name not in baselines]
        index_baselines = list(baselines.keys())

        col_data = latex_table.loc[index_without_baselines, col].copy()

        new_col_data = [
            latex_colored_float_format(
                val=col_data.loc[id_], all_values=col_data,
                higher_is_better=col in higher_is_better_cols,
            )
            for id_ in index_without_baselines
        ] + [
            latex_table.loc[id_, col]
            for id_ in index_baselines
        ]

        latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data


    print(f"*** {task.upper()} ***\n")
    print(latex_table.to_latex(float_format="%.2f"))
    print("")

*** ACSINCOME ***

\begin{tabular}{lllllll}
\toprule
 & ece & brier score loss & roc auc & accuracy & fit thresh accuracy & score stdev \\
Model &  &  &  &  &  &  \\
\midrule
Llama 3 70B (it) & 0.27 & 0.27 & \cellcolor{cyan!25.0} 0.86 & 0.69 & \cellcolor{cyan!15.3} 0.78 & 0.42 \\
Llama 3 70B & 0.20 & \cellcolor{cyan!14.9} 0.20 & \cellcolor{cyan!20.8} 0.86 & 0.70 & \cellcolor{cyan!10.5} 0.77 & 0.14 \\
Llama 3 8B (it) & 0.32 & 0.30 & \cellcolor{cyan!13.3} 0.85 & 0.62 & \cellcolor{cyan!15.3} 0.78 & 0.37 \\
Llama 3 8B & 0.25 & 0.26 & 0.81 & \cellcolor{orange!20.2} 0.38 & 0.69 & 0.05 \\
Mixtral 8x22B (it) & 0.21 & \cellcolor{cyan!3.6} 0.22 & \cellcolor{cyan!11.2} 0.85 & \cellcolor{cyan!11.1} 0.76 & 0.75 & 0.48 \\
Mixtral 8x22B & \cellcolor{cyan!13.7} 0.17 & \cellcolor{cyan!22.2} 0.19 & \cellcolor{cyan!16.5} 0.85 & 0.68 & \cellcolor{cyan!7.3} 0.77 & 0.10 \\
Mixtral 8x7B (it) & \cellcolor{cyan!17.3} 0.16 & \cellcolor{cyan!25.0} 0.18 & \cellcolor{cyan!19.7} 0.86 & \cellcolor{cyan!25.0} 0.78 & 

  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_without_baselines + index_baselines, col] = new_col_data
  latex_table.loc[index_with

## Render paper plots

Load scores distributions for each model (and with varying degrees of information).

In [None]:
# TODO

#### 1st page illustrative plot

In [None]:
# TODO

#### Score distributions for base/instruct pairs