# Setup Notebook

In [1]:
# Execute the shared setup notebook inside this kernel.
# NOTE(review): the cells below use several names that are not defined in the
# visible part of this notebook (df_agg_full, cols_non_eval, cols_performance,
# cols_fairness, main_fairness_metric, ANALYSIS_OUTPUT_DIR) - presumably they
# are created by the setup notebook; confirm.
%run analysis__setup.ipynb

## Prepare Data


In [None]:
# Keep only a single evaluation configuration so the data stays manageable
is_selected_config = (
    (df_agg_full["sett_eval_fairness_grouping"] == "race-all")
    & (df_agg_full["sett_eval_exclude_subgroups"] == "keep-in-eval")
    & (df_agg_full["sett_eval_on_subset"] == "full")
)

# Only the non-eval, performance and fairness columns are carried forward
# (i.e. the eval setting columns are dropped)
kept_columns = cols_non_eval + cols_performance + cols_fairness

df_agg = df_agg_full.loc[is_selected_config, kept_columns].reset_index(drop = True)

df_agg.shape

# Calculate Variable Importance

## Use a functional ANOVA (fANOVA) to Analyze Setting Importance

Based on the following paper:

Hutter, F., Hoos, H., & Leyton-Brown, K. (2014). An Efficient Approach for Assessing Hyperparameter Importance. Proceedings of the 31st International Conference on Machine Learning, 754–762. https://proceedings.mlr.press/v32/hutter14.html


In [None]:
from fairness_multiverse.analysis import MultiverseFanova

# fANOVA on the full filtered data: the non-eval columns are the features,
# the main fairness metric is the outcome
m_fanova = MultiverseFanova(
    features = df_agg[cols_non_eval],
    outcome = df_agg[main_fairness_metric],
)


In [None]:
# The return value (if any) is shown as the cell output.
# NOTE(review): presumably this reports the importance of each setting on its
# own, without interactions - confirm against MultiverseFanova.
m_fanova.quantify_individual_importance()

In [None]:
# Quantify importance on the full filtered data and hand over a CSV path in the
# analysis output directory via save_to.
# NOTE(review): judging by the file name the result presumably includes
# interactions between settings - confirm against MultiverseFanova.
m_fanova.quantify_importance(save_to = ANALYSIS_OUTPUT_DIR / "fanova_importance_interactions-overall.csv")

In [None]:
# Ask the wrapped fanova object for its 5 most important pairwise marginals
# and print them as plain text
best_p_margs = m_fanova.fanova.get_most_important_pairwise_marginals(n=5)
print(best_p_margs)

In [None]:
from fanova import visualizer

# Visualizer built from the fitted fanova object and its configuration space;
# the analysis output directory is passed as a plain string
vis = visualizer.Visualizer(
    m_fanova.fanova,
    m_fanova.configuration_space,
    directory = str(ANALYSIS_OUTPUT_DIR),
)

In [None]:
# Plot the marginal of the parameter passed as 0.
# NOTE(review): 0 is presumably a positional index into the configuration
# space; which setting that is depends on its ordering - confirm.
vis.plot_marginal(0)

In [None]:
# Plot the pairwise marginal of the two settings named below
vis.plot_pairwise_marginal(['sett_exclude_features', 'sett_exclude_subgroups'])

## Quantify Importance with Partial Data

In [None]:
from tqdm import tqdm

# How often the partial analysis is repeated for every fraction
N_ITERATIONS = 10

# Base output directory for the partial fANOVA results; created up front
# (including missing parents) so later cells can write into it
PARTIAL_FANOVA_DIR = ANALYSIS_OUTPUT_DIR / "partial_fanova" / "overall"
PARTIAL_FANOVA_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
from fairness_multiverse.analysis import MultiverseFanova
import joblib

def quantify_importance_for_fraction(fraction: float, base_directory = PARTIAL_FANOVA_DIR, random_state = None):
    """Run the fANOVA importance analysis on a random subset of ``df_agg``.

    A share of the rows of the global ``df_agg`` is sampled (pandas default:
    without replacement), a ``MultiverseFanova`` is built on that subset and
    ``quantify_importance`` is called with ``save_to`` pointing into
    ``<base_directory>/fraction-<fraction>/``. The file name contains a hash of
    the sampled data, so different subsets are written to different files.

    Args:
        fraction: Share of rows to sample; must satisfy ``0 < fraction <= 1``.
        base_directory: Directory below which the per-fraction directory is
            created (missing parents are created as well).
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the subset reproducible. The default ``None`` keeps the previous
            unseeded behaviour. Note that passing the same seed repeatedly
            yields the same subset and therefore the same output file name.

    Returns:
        The path that was handed to ``quantify_importance`` as ``save_to``.

    Raises:
        ValueError: If ``fraction`` is not within ``(0, 1]``.
    """
    # Fail early with a clear message instead of fitting on an empty frame
    # (fraction == 0) or relying on an error from deep inside pandas.
    if not 0 < fraction <= 1:
        raise ValueError(f"fraction must be in (0, 1], got {fraction!r}")

    # Get random subset of the data
    df = df_agg.sample(frac = fraction, random_state = random_state).reset_index(drop = True)
    data_hash = joblib.hash(df)

    # Create directory for this fraction; parents = True so that a custom
    # base_directory which does not exist yet works as well
    directory = base_directory / f"fraction-{fraction}"
    directory.mkdir(parents = True, exist_ok = True)

    # NOTE(review): the file name says "majmin" although this notebook writes
    # into the "overall" directory - possibly a copy-paste leftover. Left
    # unchanged because downstream code may look for this exact pattern; confirm.
    output_path = directory / f"partial-fanova_importance_interactions-majmin-{fraction}-{data_hash}.csv"

    # Run FANOVA on subset
    partial_fanova = MultiverseFanova(features = df[cols_non_eval], outcome = df[main_fairness_metric])
    partial_fanova.quantify_importance(save_to = output_path)

    return output_path

In [None]:
# Repeat the partial analysis N_ITERATIONS times, each on a new random 1% sample
for _run in tqdm(range(N_ITERATIONS)):
    quantify_importance_for_fraction(fraction = 0.01)

In [None]:
# Repeat the partial analysis N_ITERATIONS times, each on a new random 5% sample
for _run in tqdm(range(N_ITERATIONS)):
    quantify_importance_for_fraction(fraction = 0.05)

In [None]:
# Repeat the partial analysis N_ITERATIONS times, each on a new random 10% sample
for _run in tqdm(range(N_ITERATIONS)):
    quantify_importance_for_fraction(fraction = 0.1)

In [None]:
# Repeat the partial analysis N_ITERATIONS times, each on a new random 20% sample
for _run in tqdm(range(N_ITERATIONS)):
    quantify_importance_for_fraction(fraction = 0.2)