In [1]:
# Decision dimensions of this multiverse analysis. The trailing comment on each
# line lists the options that are available for that dimension.
# NOTE(review): list-valued dimensions appear to be the ones that are expanded
# into sub-universes further down (evaluated without re-fitting) — TODO confirm
# against the multiversum documentation.
settings = {
    "dimensions": {
        "scale": "scale", # "scale", "do-not-scale",
        "encode_categorical": "one-hot", # "ordinal", "one-hot"
        "stratify_split": "none", # "none", "target", "protected-attribute", "both",
        "model": "elasticnet", # "logreg", "rf", "svm", "gbm", "elasticnet"
        "cutoff": ["raw_0.5", "quantile_0.1", "quantile_0.25"],
        "preprocess_age": "quantiles_3", # "none", "bins_10", "quantiles_3", "quantiles_4"
        "preprocess_income": "bins_10000", # "none", "log", "bins_10000", "quantiles_3", "quantiles_4"
        "exclude_features": "none", # "none", "race", "sex", "race-sex"
        "exclude_subgroups": "drop-name_race_Some Other Race alone", # "keep-all", "drop-smallest_race_2", "keep-largest_race_1", "keep-largest_race_2", "drop-name_race_Some Other Race alone"
        "eval_fairness_grouping": ["majority-minority", "race-all"],
        "eval_exclude_subgroups": ["exclude-in-eval", "keep-in-eval"],
        "eval_on_subset": [
            "full",
            # Largest PUMA region
            "locality-largest-only",
            # PUMA region w/ highest share of white people
            "locality-whitest-only",
            # PUMA regions belonging to a large city
            "locality-city-la",
            "locality-city-sf",
            # Exclude military personnel from test dataset
            "exclude-military",
            # Exclude non US citizens from test dataset
            "exclude-non-citizens",
        ],
        "fairness_definition": ["sensitivity", "precision"],
    }
}

In [2]:
from multiversum import Universe

# Create the universe object for this run from the settings declared above
universe_analysis = Universe(settings=settings)

# Get the parsed universe settings and the seed to use for this run
universe = universe_analysis.dimensions
seed = universe_analysis.seed

Always use the same seed


In [None]:
import numpy as np
# Seed numpy's global random number generator; later cells draw the
# random_state values for their estimators from it via np.random.randint.
parsed_seed = int(seed)
np.random.seed(parsed_seed)
print(f"Using Seed: {parsed_seed}")

## Loading Data

### (Down)load Data from Census


In [None]:
from pathlib import Path
import pandas as pd
from folktables import ACSDataSource

# Data source: person-level records of the 2018 1-year ACS survey
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')

# Use custom caching of data
cache_dir = Path("data")
cache_dir.mkdir(exist_ok=True)
cache_file = cache_dir / "dataset.csv.gz"
if not cache_file.exists():
    # Cache miss: load dataset via folktables (if necessary download it from
    # the internet) and write it to the cache for subsequent runs
    dataset = data_source.get_data(states=["CA"], download=True)
    dataset.to_csv(cache_file, index=False)
else:
    print(f"Loading data from cache: {cache_file}")
    dataset = pd.read_csv(cache_file)

In [5]:
# Download additional definition data for the survey; it is passed to
# generate_categories below to build the category mappings of the features.
definition_df = data_source.get_definitions(download=True)

### Perform Pre-Processing for Selected Task

- **ACSIncome**: predict whether an individual’s income is above $50,000, after filtering the ACS PUMS data sample to only include individuals above the age of 16, who reported usual working hours of at least 1 hour per week in the past year, and an income of at least $100. The threshold of $50,000 was chosen so that this dataset can serve as a replacement to UCI Adult, but we also offer datasets with other income cutoffs described in Appendix B.
- **ACSPublicCoverage**: predict whether an individual is covered by public health insurance, after filtering the ACS PUMS data sample to only include individuals under the age of 65, and those with an income of less than $30,000. This filtering focuses the prediction problem on low-income individuals who are not eligible for Medicare.
- **ACSMobility**: predict whether an individual had the same residential address one year ago, after filtering the ACS PUMS data sample to only include individuals between the ages of 18 and 35. This filtering increases the difficulty of the prediction task, as the base rate of staying at the same address is above 90% for the general population.
- **ACSEmployment**: predict whether an individual is employed, after filtering the ACS PUMS data sample to only include individuals between the ages of 16 and 90.
- **ACSTravelTime**: predict whether an individual has a commute to work that is longer than 20 minutes, after filtering the ACS PUMS data sample to only include individuals who are employed and above the age of 16. The threshold of 20 minutes was chosen as it is the US-wide median travel time to work in the 2018 ACS PUMS data release

- The selected story and task have implications for which fairness metric makes the most sense in the end.


In [6]:
from copy import deepcopy
from folktables import generate_categories
from folktables import ACSPublicCoverage

# Normally you would create the task with a snippet such as
# features, label, group = ACSEmployment.df_to_numpy(acs_data)
# but this severely limits how many protected groups we can examine
# and it further removes the feature labels.

# Work on a deep copy, so that extending the feature list below does not
# modify the ACSPublicCoverage object imported from folktables
task = deepcopy(ACSPublicCoverage)

# Additional features to extract, that are not part of the task
extra_feature_cols = ["PUMA"]
task._features.extend(extra_feature_cols)

categories = generate_categories(features=task.features, definition_df=definition_df)
features_org, label_org, group_org = task.df_to_pandas(dataset, categories=categories)

# Set the extra features aside and immediately keep them out of the model
# features, before they could leak into the task. features_org stays untouched
# as a reference to the original state of the features.
extra_features = features_org[extra_feature_cols].copy()
features = features_org.drop(columns=extra_feature_cols)

label = label_org.copy()
group = group_org.copy()


In [None]:
# Let's see the data: preview the first 10 rows of the model features
features.head(10)

## Preprocessing Data


### Exclude Protected Features

In [8]:
# Note: str.split always returns at least one element (n >= 1), even for an empty string!
excluded_features = universe["exclude_features"].split("-")
excluded_features_dictionary = {"race": "RAC1P", "sex": "SEX", "immigration": "NATIVITY"}

# Translate nice names to column names, skipping empty entries and the "none" placeholder
excluded_feature_columns = []
for feature_name in excluded_features:
    if len(feature_name) > 0 and feature_name != "none":
        excluded_feature_columns.append(excluded_features_dictionary[feature_name])

if excluded_feature_columns:
    print(f"Dropping features: {excluded_feature_columns}")
    features.drop(columns=excluded_feature_columns, inplace=True)


### Continuous Variables: Binning / Log-Scaling / Keeping Them As-Is

In [9]:

from math import ceil, floor
from typing import List, Optional, Tuple
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FunctionTransformer, make_pipeline
from sklearn.preprocessing import KBinsDiscretizer

def continuous_var_will_be_binned(configuration: str):
    """Tell whether a preprocessing configuration turns the variable into bins.

    Args:
        configuration: The preprocessing option, e.g. "none", "log",
            "bins_10" or "quantiles_3".

    Returns:
        True if the configuration starts with "quantiles" or "bins".
    """
    binning_prefixes = ("quantiles", "bins")
    return any(configuration.startswith(prefix) for prefix in binning_prefixes)

# Modified version of pandas.cut, that also supports DataFrames instead of just Series
def cut_df(df, **kwargs) -> pd.DataFrame:
    """Bin the values of a Series, or of every column of a DataFrame, via pandas.cut.

    Adapted from https://datascience.stackexchange.com/questions/75787/how-to-use-columntransformer-and-functiontransformer-to-apply-the-same-function

    Args:
        df: A pandas Series or DataFrame holding the values to bin.
        **kwargs: Passed on unchanged to pandas.cut (e.g. bins, labels).

    Returns:
        The binned data: the direct result of pandas.cut for a Series, or a
        DataFrame with pandas.cut applied to each column.

    Raises:
        TypeError: If df is neither a Series nor a DataFrame.
    """
    if isinstance(df, pd.Series):
        return pd.cut(df, **kwargs)
    if isinstance(df, pd.DataFrame):
        return df.apply(pd.cut, axis=0, **kwargs)
    # Raising a plain string is not valid in Python 3 and would hide this
    # message behind an unrelated TypeError, so raise a proper exception.
    raise TypeError("Unsupported type of data in cut_df")

def preprocess_continuous(
    source_data: pd.DataFrame, column_name: str, configuration: str
) -> Tuple[Optional[ColumnTransformer], Optional[List[int]]]:
    """Preprocess a continuous variable.

    Args:
        source_data: The source data containing the variable to be
            preprocessed. Only used to derive the bin edges for the
            "bins_<step>" configuration.
        column_name: The name of the column to be preprocessed.
        configuration: How to preprocess the column: "none" (keep as-is),
            "log" (log1p transform), "bins_<step>" (equal-width bins of the
            given step size) or "quantiles_<n>" (n quantile bins).

    Returns:
        A tuple containing the ColumnTransformer to be used for preprocessing
        and the list of binned values (if applicable). Both are None for
        "none"; the list of binned values is None for "log".

    Raises:
        ValueError: If the configuration (or its binning method) is not
            supported.
    """
    if configuration == "none":
        # Skip transformation if "none" is specified
        return (None, None)
    elif configuration == "log":
        transformer = make_pipeline(
            # Calculate the log (+1 to gracefully handle 0)
            # log1p is undefined (NaN) for values below -1, so the resulting
            # NAs are replaced with 0 (NAs cannot be handled by all algorithms)
            FunctionTransformer(lambda df: np.log1p(df.astype("float")).fillna(0))
        )
        binned_values = None
    elif continuous_var_will_be_binned(configuration=configuration):
        method, value = configuration.split("_")
        if method == "quantiles":
            n_bins = int(value)
            transformer = KBinsDiscretizer(
                n_bins=n_bins,
                encode="ordinal",
                strategy="quantile",
                random_state=np.random.randint(10000)
            )
        elif method == "bins":
            step = int(value)
            # Round the observed range outwards to multiples of the step size
            round_min = floor(source_data[column_name].min() / step) * step
            round_max = ceil(source_data[column_name].max() / step) * step
            if round_max <= round_min:
                # Constant column that sits exactly on an edge: still
                # produce one valid bin instead of a single edge
                round_max = round_min + step
            bins = list(range(round_min, round_max, step)) + [round_max]

            print(
                f"Generated bins transformer for {column_name} with the following bins: {bins}"
            )

            transformer = FunctionTransformer(
                cut_df,
                kw_args={
                    "bins": bins,
                    "labels": False,
                    "retbins": False,
                    # Without this the lowest edge is exclusive, so a minimum
                    # that falls exactly on it would be turned into NaN
                    "include_lowest": True,
                },
            )
            # bins holds the edges, so there is one bin fewer than edges
            n_bins = len(bins) - 1
        else:
            raise ValueError(
                "Unsupported method for preprocessing continuous variable: " + method
            )

        binned_values = list(range(n_bins))
    else:
        raise ValueError(
            "Unsupported configuration for preprocessing continuous variable: "
            + configuration
        )

    # Transform only the selected column and pass all other columns through
    column_transformer = ColumnTransformer(
        [(f"bin_{column_name}_{configuration}", transformer, [column_name])],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    column_transformer.set_output(transform="pandas")
    return (column_transformer, binned_values)

In [None]:
from sklearn.pipeline import make_pipeline

# Build one transformer per continuous variable, based on the universe settings
transformer_age, bins_age = preprocess_continuous(
    source_data=features,
    column_name="AGEP",
    configuration=universe["preprocess_age"],
)
transformer_income, bins_income = preprocess_continuous(
    source_data=features,
    column_name="PINCP",
    configuration=universe["preprocess_income"],
)

# Chain both steps: age is processed first, income second
continuous_processor = make_pipeline(transformer_age, transformer_income)

### Categorical Variables: One-Hot or Ordinal Encoding

In [11]:
from sklearn.pipeline import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Columns that have an entry in categories and are still part of the features
all_categorical_columns = list(set(categories.keys()) & set(features.columns))

# For which columns is ordinal encoding even an option?
# Further candidates that are currently not selected: MAR, SEX, DIS, ESP, CIT,
# MIG, ANC, NATIVITY, DEAR, DEYE, DREM, ESR, ST, FER, RAC1P
categorical_columns_to_transform = ['SCHL', 'MIL']

# Support to-be-binned continuous variables
def add_binned_variable_to_categorical_transformation(colname, values):
    """Register a binned continuous column for categorical encoding.

    Appends the column to the global categorical_columns_to_transform list and
    stores an identity mapping of its bin values in the global categories dict.
    Does nothing when values is None (i.e. the column is not binned).

    Args:
        colname: Name of the binned column.
        values: The bin values of the column, or None.
    """
    if values is None:
        return
    categorical_columns_to_transform.append(colname)
    categories[colname] = {bin_value: bin_value for bin_value in values}

add_binned_variable_to_categorical_transformation("AGEP", bins_age)
add_binned_variable_to_categorical_transformation("PINCP", bins_income)

def nested_list(all_categories, columns_to_use):
    """Create a nested list of category values for the given columns.

    Args:
        all_categories: Dict mapping a column name to the mapping of that
            column's categories; only the values of each mapping are used.
        columns_to_use: The columns to include. The order is kept and
            repeated column names are only included once.

    Returns:
        A list with one list of category values per (distinct) column.
    """
    # dict.fromkeys removes repeated column names while preserving their order
    distinct_columns = dict.fromkeys(columns_to_use)
    return [list(all_categories[column].values()) for column in distinct_columns]

# Encoder for the columns where ordinal encoding is an option, as chosen by the universe
if (universe["encode_categorical"] == "ordinal"):
    categorical_transformer = OrdinalEncoder(
        categories = nested_list(categories, categorical_columns_to_transform),
    )
elif (universe["encode_categorical"] == "one-hot"):
    categorical_transformer = OneHotEncoder(
        categories = nested_list(categories, categorical_columns_to_transform),
        sparse_output=False
    )
else:
    # Raising a plain string is not valid in Python 3, so raise a proper exception
    raise ValueError("Unsupported universe option for encode_categorical")

# One-Hot Encode all other categorical columns
# Sorted, because the iteration order of a set of strings can differ between
# Python processes, which would make the order of the encoded columns
# irreproducible across runs.
other_categorical_columns = sorted(set(all_categorical_columns) - set(categorical_columns_to_transform))
other_transformer = OneHotEncoder(
    categories = nested_list(categories, other_categorical_columns),
    sparse_output=False
)

# Encode both groups of categorical columns, pass all remaining columns through
categorical_preprocessor = ColumnTransformer([
        ("encode_categorical", categorical_transformer, categorical_columns_to_transform),
        ("encode_categorical_rest", other_transformer, other_categorical_columns),
    ],
    remainder='passthrough',
)

## Split Data


In [12]:
# Select stratification strategy
if universe["stratify_split"] == "none":
    stratify = None
elif universe["stratify_split"] == "target":
    stratify = label
elif universe["stratify_split"] == "protected-attribute":
    stratify = features_org["RAC1P"]
elif universe["stratify_split"] == "both":
    # Concatenate both columns
    stratify = features_org["RAC1P"].astype(str) + "-" + label["PUBCOV"].astype(str)
else:
    # Fail loudly instead of leaving stratify undefined (or silently reusing
    # a stale value from an earlier run of this cell)
    raise ValueError("Unsupported universe option for stratify_split: " + str(universe["stratify_split"]))

stratify

In [13]:
from sklearn.model_selection import train_test_split

# Note: The analysis originally used two distinct seeds, one for numpy (defaulting to 2023) and one for the train_test_split (defaulting to 0).
# To allow for exact reproducibility of the original results, as well as specification of only a single seed we base this second seed off the first one.
# If you adapt this code for your own analysis feel free to remove this line and replace it e.g. with a call to numpy.random.randint.
split_random_state = abs(parsed_seed - 2023)

# Split model features, labels, groups and the original features with the
# same row assignment; 20% of the rows go into the test set
X_train, X_test, y_train, y_true, group_train, group_test, org_train, org_test = train_test_split(
    features, label, group, features_org,
    test_size=0.2,
    random_state=split_random_state,
    stratify=stratify,
)

## Post-Splitting Processing

Processing steps that have to happen after the train/test split, e.g. because they only affect the training data.

### Exclude Certain Subgroups

In [None]:
# Extract configuration. Expected format: "<method>_<column>_<value>", or just "keep-all".
# Split at most twice, so that a subgroup name containing underscores stays intact.
exclude_subgroups_config = universe["exclude_subgroups"].split("_", 2)
if len(exclude_subgroups_config) == 1:
    exclude_subgroups_config = (exclude_subgroups_config[0], None, None)
elif len(exclude_subgroups_config) != 3:
    raise Exception("Unsupported configuration for exclude_subgroups:" + universe["exclude_subgroups"])
excl_subgroups_method, excl_subgroup_colname, excl_subgroups_value = exclude_subgroups_config

if excl_subgroup_colname == "race":
    excl_subgroup_column = "RAC1P"
    # value_counts() orders the subgroups from most to least frequent
    excl_subgroup_counts = org_train[excl_subgroup_column].value_counts()
elif excl_subgroups_method != "keep-all":
    raise Exception("Unsupported configuration for exclude_subgroups:" + universe["exclude_subgroups"])

if excl_subgroups_method == "keep-all":
    # Don't need to do anything
    excl_subgroup_column = None
    excl_subgroup_values = []
else:
    if excl_subgroups_method == "drop-smallest":
        drop_smallest_n = int(excl_subgroups_value)
        excl_subgroup_values = list(excl_subgroup_counts.tail(drop_smallest_n).index)
    elif excl_subgroups_method == "keep-largest":
        keep_largest_n = int(excl_subgroups_value)
        # Everything after the n largest groups is excluded. Slicing (instead
        # of tail(len - n)) also behaves correctly when n exceeds the number
        # of groups: nothing is excluded then.
        excl_subgroup_values = list(excl_subgroup_counts.index[keep_largest_n:])
    elif excl_subgroups_method == "drop-name":
        excl_subgroup_values = [excl_subgroups_value]
    elif excl_subgroups_method == "keep-names":
        # Start with all groups and remove the ones that should be kept
        excl_subgroup_values = list(excl_subgroup_counts.index)
        for group_to_keep in excl_subgroups_value.split("-"):
            if group_to_keep not in excl_subgroup_values:
                raise ValueError(
                    f"Unknown subgroup to keep in exclude_subgroups: {group_to_keep}"
                )
            excl_subgroup_values.remove(group_to_keep)
    else:
        raise Exception("Unsupported configuration for exclude_subgroups:" + universe["exclude_subgroups"])

    # The column is always set at this point: every method other than
    # "keep-all" requires a supported column (checked above)
    print(f"Dropping values: {excl_subgroup_values}")
    keep_rows_mask = ~org_train[excl_subgroup_column].isin(excl_subgroup_values)

    n_rows_to_drop = (~keep_rows_mask).sum()
    if n_rows_to_drop > 0:
        print(f"Dropping N = {n_rows_to_drop} ({n_rows_to_drop / len(keep_rows_mask):.2%}) rows from {excl_subgroup_colname}")
        # Only the training data is filtered here; the test data is handled
        # per sub-universe during evaluation
        X_train = X_train[keep_rows_mask]
        y_train = y_train[keep_rows_mask]
        group_train = group_train[keep_rows_mask]

## Fitting the Model

Select which model to fit

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Instantiate the model chosen by the universe. The random_state of each model
# is drawn from numpy's (seeded) global random number generator.
if (universe["model"] == "logreg"):
    model = LogisticRegression(random_state=np.random.randint(10000))
elif (universe["model"] == "rf"):
    model = RandomForestClassifier(random_state=np.random.randint(10000))
elif (universe["model"] == "svm"):
    # probability=True is required, because predict_proba is called on the
    # fitted model later on; without it SVC offers no probability estimates
    model = SVC(probability=True, random_state=np.random.randint(10000))
elif (universe["model"] == "gbm"):
    model = GradientBoostingClassifier(random_state=np.random.randint(10000))
elif (universe["model"] == "elasticnet"):
    model = LogisticRegression(penalty = 'elasticnet', solver = 'saga', l1_ratio = 0.5, random_state=np.random.randint(10000))
else:
    # Raising a plain string is not valid in Python 3, so raise a proper exception
    raise ValueError("Unsupported universe.model")

In [16]:
from multiversum.universe import predict_w_threshold

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# This cell rebinds `model` to the full pipeline. When the cell is re-run,
# `model` already is that pipeline, so unwrap its final estimator first;
# otherwise the whole pipeline would be nested inside itself.
if isinstance(model, Pipeline) and "model" in model.named_steps:
    estimator = model.named_steps["model"]
else:
    estimator = model

model = Pipeline([
    ("continuous_processor", continuous_processor),
    ("categorical_preprocessor", categorical_preprocessor),
    # A None step is skipped by the pipeline, i.e. no scaling is applied
    ("scale", StandardScaler() if universe["scale"] == "scale" else None),
    ("model", estimator),
])

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)
y_pred_default = predict_w_threshold(y_prob, 0.5)

# Naive prediction
accuracy_score(y_true = y_true, y_pred = y_pred_default)

In [None]:
# Evaluate: show the class predictions of the fitted pipeline for the test set
model.predict(X_test)

## (Fairness) Metrics

- Using [Fairlearn](https://fairlearn.org/v0.8/quickstart.html)


In [19]:
import numpy as np

# Flag every test row as "majority" or "minority" on the protected attribute.
# The majority is the most frequent value in the full (pre-split) data.
colname_to_bin = "RAC1P"
majority_value = features_org[colname_to_bin].mode().iloc[0]

org_test["majmin"] = np.where(
    org_test[colname_to_bin].eq(majority_value), "majority", "minority"
)

In [20]:
# Example: compute the metrics of a single sub-universe, using the first
# option of each of the following list-valued dimensions
example_universe = universe.copy()
for dimension in ("cutoff", "eval_fairness_grouping", "fairness_definition"):
    example_universe[dimension] = example_universe[dimension][0]

fairness_dict, metric_frame = universe_analysis.compute_sub_universe_metrics(
    example_universe, y_pred_prob=y_prob, y_test=y_true, org_test=org_test
)

In [None]:
# Generate all sub-universes of this universe and show how many there are
sub_universes = universe_analysis.generate_sub_universes()
len(sub_universes)

In [22]:
# Only needs to be run once, so disabled for now

# Calculate PUMA area with the highest share of people with PUBCOV == 1
# puma_coverage = org_train.join(y_train).groupby("PUMA")["PUBCOV"].mean()
# puma_coverage_highest = puma_coverage.sort_values(ascending=False).head(1)
# puma_coverage_highest


In [23]:
import pandas as pd

# Uses excl_subgroup_column, excl_subgroup_colname and excl_subgroup_values from the global scope

def filter_sub_universe_data(sub_universe, org_test):
    """Build the row mask of the test data to evaluate a sub-universe on.

    Args:
        sub_universe: Dictionary with the sub-universe settings; the keys
            "eval_exclude_subgroups" and "eval_on_subset" are read.
        org_test: Test data with the original features. Depending on the
            settings the columns "PUMA", "RAC1P", "MIL" and "CIT" are used.

    Returns:
        A boolean mask with one entry per row of org_test, True for the rows
        to keep (a numpy array if no filter was applied, otherwise the result
        of combining it with pandas boolean Series).

    Raises:
        ValueError: If "eval_exclude_subgroups" or "eval_on_subset" holds an
            unsupported option.
    """
    # Generate an all True mask to start with
    keep_rows_mask = np.ones(org_test.shape[0], dtype=bool)

    # Potentially remove any subgroups from the test set
    eval_exclude_subgroups = sub_universe["eval_exclude_subgroups"]
    if eval_exclude_subgroups == "exclude-in-eval":
        if excl_subgroup_column is not None:
            assert excl_subgroup_values is not None

            exclude_subgroup_eval_mask = ~org_test[excl_subgroup_column].isin(excl_subgroup_values)
            keep_rows_mask = keep_rows_mask & exclude_subgroup_eval_mask

            n_rows_to_drop = (~exclude_subgroup_eval_mask).sum()
            print(f"[drop subgroups] Dropping N = {n_rows_to_drop} ({n_rows_to_drop / len(keep_rows_mask):.2%}) rows from {excl_subgroup_colname}")
    elif eval_exclude_subgroups == "keep-in-eval":
        pass
    else:
        # Raising a plain string is not valid in Python 3
        raise ValueError("Unsupported eval_exclude_subgroups")

    # Potentially use a smaller and more "convenient" subset of the data to do evaluation on
    eval_on_subset = sub_universe["eval_on_subset"]
    if eval_on_subset != "full":
        if eval_on_subset.startswith("locality"):
            # Filter based on locality / region
            # Step 1: Decide which regions to keep
            if eval_on_subset == "locality-largest-only":
                # Use the largest PUMA region
                puma_regions_to_keep = [org_test["PUMA"].value_counts().idxmax()]
            elif eval_on_subset == "locality-most-privileged":
                # Use the PUMA region with the highest share of PUBCOV = 1
                # (hard-coded result of the disabled cell above)
                puma_regions_to_keep = [2904] # 2904 -> 62.8% of PUBCOV
            elif eval_on_subset == "locality-whitest-only":
                # Find the majority class on the prot. attribute
                majority_class = org_test["RAC1P"].value_counts().index[0]

                # Find the PUMA region with the highest share of the majority class
                counts = pd.DataFrame()
                counts["full"] = org_test["PUMA"].value_counts(sort=False)
                counts["majority"] = org_test[org_test["RAC1P"] == majority_class]["PUMA"].value_counts(sort=False)
                counts["fraction"] = counts["majority"] / counts["full"]

                # Use the PUMA region with the highest share of the majority class
                majority_puma_id = counts.sort_values(by="fraction", ascending=False).index[0]
                puma_regions_to_keep = [majority_puma_id]
            elif eval_on_subset == "locality-city-la":
                puma_regions_to_keep = list(range(3701, 3769+1))
            elif eval_on_subset == "locality-city-sf":
                puma_regions_to_keep = list(range(7501, 7507+1))
            else:
                # Without this branch an unknown locality option would fail
                # below with an unbound puma_regions_to_keep
                raise ValueError("Unsupported eval_on_subset")

            # Step 2: Keep only those regions
            print(f"Keeping the following PUMA regions: {puma_regions_to_keep}")
            eval_on_subset_mask = org_test["PUMA"].isin(puma_regions_to_keep)
        elif eval_on_subset == "exclude-military":
            # Only keep non-military personnel
            eval_on_subset_mask = (org_test["MIL"].isin(["Never served in the military", "N/A (less than 17 years old)"]))
        elif eval_on_subset == "exclude-non-citizens":
            # Only keep US citizens
            eval_on_subset_mask = ~(org_test["CIT"] == "Not a citizen of the U.S.")
        else:
            raise ValueError("Unsupported eval_on_subset")

        keep_rows_mask = keep_rows_mask & eval_on_subset_mask

        n_rows_to_drop = (~eval_on_subset_mask).sum()
        print(f"[subset] Dropping N = {n_rows_to_drop} ({n_rows_to_drop / len(keep_rows_mask):.2%}) rows")

    n_rows_to_drop = (~keep_rows_mask).sum()
    print(f"[TOTAL] Dropping N = {n_rows_to_drop} ({n_rows_to_drop / len(keep_rows_mask):.2%}) rows. Final size: {keep_rows_mask.sum()}.")

    return keep_rows_mask

In [24]:
from typing import Dict
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    balanced_accuracy_score,
    f1_score
)
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import (
    false_positive_rate,
    false_negative_rate,
    selection_rate,
    count
)
from fairlearn.metrics import (
    equalized_odds_difference,
    equalized_odds_ratio,
    demographic_parity_difference,
    demographic_parity_ratio,
)
from multiversum.universe import add_dict_to_df, flatten_dict

# Performance metrics, keyed by their display name. compute_metrics passes
# them to a MetricFrame, which evaluates them split by the fairness group.
metrics = {
    "accuracy": accuracy_score,
    "balanced accuracy": balanced_accuracy_score,
    "f1": f1_score,
    "precision": precision_score, # = ppv
    "recall": recall_score, # = sensitivity, tpr
    "false positive rate": false_positive_rate,
    "false negative rate": false_negative_rate,
    "selection rate": selection_rate,
    "count": count,
}

# Explicit fairness metrics, keyed by their output name. compute_metrics calls
# each of them with y_true, y_pred and the sensitive features.
fairness_metrics = {
    "equalized_odds_difference": equalized_odds_difference,
    "equalized_odds_ratio": equalized_odds_ratio,
    "demographic_parity_difference": demographic_parity_difference,
    "demographic_parity_ratio": demographic_parity_ratio,
}

def compute_metrics(
    sub_universe: Dict,
    y_pred_prob: pd.Series,
    y_test: pd.Series,
    org_test: pd.DataFrame,
) -> Tuple[dict, MetricFrame]:
    """
    Computes a set of metrics for a given sub-universe.

    Args:
        sub_universe: A dictionary containing the parameters for the
            sub-universe. The keys "cutoff", "eval_fairness_grouping" and
            "fairness_definition" are read.
        y_pred_prob: The predicted probabilities. For "quantile" cutoffs
            it is indexed as a 2-D array whose column 1 is used to derive
            the threshold.
        y_test: The true labels.
        org_test: A pandas dataframe containing the test data, including
            variables that were not used as features. Must contain the
            column used for the fairness grouping ("majmin" or "RAC1P").

    Returns:
        A tuple containing the dict of explicit fairness metrics (including
        "custom_fairness_metric") and the MetricFrame holding the
        performance metrics split by fairness groups.

    Raises:
        ValueError: If the cutoff type, the fairness grouping or the
            fairness definition is not supported.
    """
    # Determine cutoff for predictions, formatted as "<type>_<value>"
    cutoff_type, cutoff_value = sub_universe["cutoff"].split("_")
    cutoff_value = float(cutoff_value)

    if cutoff_type == "raw":
        threshold = cutoff_value
    elif cutoff_type == "quantile":
        probabilities_true = y_pred_prob[:, 1]
        threshold = np.quantile(probabilities_true, cutoff_value)
    else:
        # Fail early instead of hitting an unbound threshold below
        raise ValueError("Unsupported cutoff type: " + cutoff_type)

    fairness_grouping = sub_universe["eval_fairness_grouping"]
    if fairness_grouping == "majority-minority":
        fairness_group_column = "majmin"
    elif fairness_grouping == "race-all":
        fairness_group_column = "RAC1P"
    else:
        # Fail early instead of hitting an unbound column name below
        raise ValueError("Unsupported fairness grouping: " + str(fairness_grouping))

    y_pred = predict_w_threshold(y_pred_prob, threshold)

    # Compute fairness metrics
    fairness_dict = {
        name: metric(
            y_true=y_test,
            y_pred=y_pred,
            sensitive_features=org_test[fairness_group_column],
        )
        for name, metric in fairness_metrics.items()
    }

    # Compute "normal" metrics (but split by fairness column)
    metric_frame = MetricFrame(
        metrics=metrics,
        y_true=y_test,
        y_pred=y_pred,
        sensitive_features=org_test[fairness_group_column],
    )

    # Compute the maximum difference on the chosen metric
    fairness_definition = sub_universe["fairness_definition"]
    if fairness_definition == "sensitivity": # sensitivity = recall = true positive rate
        metric_col = "recall"
    elif fairness_definition == "precision":
        metric_col = "precision"
    else:
        # Raising a plain string is not valid in Python 3
        raise ValueError("Unsupported fairness definition: " + str(fairness_definition))
    fairness_dict["custom_fairness_metric"] = metric_frame.difference()[metric_col]

    return (fairness_dict, metric_frame)

def visit_sub_universe(
    sub_universe, y_pred_prob, y_test, org_test, filter_data
) -> pd.DataFrame:
    """
    Compute the metrics of a single sub-universe.

    Sub-universes are theoretically distinct universes of decisions that
    can be evaluated without re-fitting a model. They are only treated
    separately for performance reasons, to avoid computing each of them
    from scratch.

    Args:
        sub_universe: A dictionary containing the parameters for the
            sub-universe.
        y_pred_prob: The predicted probabilities.
        y_test: The true labels.
        org_test: A pandas dataframe containing the test data, including
            variables that were not used as features.
        filter_data: A function that is called with the sub-universe
            settings and the test data (keyword arguments sub_universe and
            org_test) and returns a boolean mask of the rows to evaluate.

    Returns:
        A pandas dataframe with the universe information, the size of the
        evaluated test data and, if any rows remain, the metrics.
    """
    universe_row = pd.DataFrame(index=[universe_analysis.universe_id])
    final_output = universe_analysis._add_universe_info(
        universe_row, overwrite_dimensions=sub_universe
    )

    # Select the rows of the test data this sub-universe is evaluated on
    data_mask = filter_data(sub_universe=sub_universe, org_test=org_test)
    test_size_n = data_mask.sum()
    final_output["test_size_n"] = test_size_n
    final_output["test_size_frac"] = data_mask.sum() / len(data_mask)

    # Without any remaining rows there is nothing to compute metrics on
    if not test_size_n > 0:
        return final_output

    fairness_dict, metric_frame = compute_metrics(
        sub_universe,
        y_pred_prob[data_mask],
        y_test[data_mask],
        org_test[data_mask],
    )

    # Main fairness metrics, then overall performance, then performance by group
    final_output = add_dict_to_df(final_output, fairness_dict, prefix="fair_main_")
    overall_metrics = dict(metric_frame.overall)
    final_output = add_dict_to_df(final_output, overall_metrics, prefix="perf_ovrl_")
    group_metrics = flatten_dict(metric_frame.by_group)
    final_output = add_dict_to_df(final_output, group_metrics, prefix="perf_grp_")

    return final_output

In [None]:
sub_universes = universe_analysis.generate_sub_universes()

# Evaluate every sub-universe; each one contributes one data frame of results
final_outputs = [
    visit_sub_universe(
        sub_universe=sub_universe,
        y_pred_prob=y_prob,
        y_test=y_true,
        org_test=org_test,
        filter_data=filter_sub_universe_data,
    ).reset_index(drop=True)
    for sub_universe in sub_universes
]
final_output = pd.concat(final_outputs)

# Write the final output file
universe_analysis.save_data(final_output, add_info=False)
