In [None]:
# Overview of the values each analysis setting can take.
# NOTE(review): conduct_analysis matches the string "marital == married",
# not "marital_status == married" as listed here -- confirm which spelling
# the observed settings actually use.
possible_settings = dict(
    dataset=["ORG-017-DS-0", "ORG-017-DS-2"],
    protected_attrs=[["age"], ["job"], ["job_and_age"], ["marital_status"]],
    process_protected=[
        "none",
        "marital_status == married",
        "age >= 35; job: privileged vs. unprivileged",
        "age >= 35",
        "age >= 25",
        "age >= 25 and age < 60",
    ],
)

In [None]:
# A single hand-picked settings combination for trying out one analysis run.
# "?" stands for an unknown processing of the protected attribute.
chosen_settings = dict(
    dataset="ORG-017-DS-0",
    protected_attrs=["age"],
    process_protected="?",
)

Write a single function to run the full analysis?
- get proper settings
- preprocess protected
- select features
- one-hot encode all categorical variables
- fit different models
- calculate fairness & performance metrics
- determine best model (based on fairness & performance metrics)


In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    balanced_accuracy_score,
    f1_score,
)
from fairlearn.metrics import (
    true_positive_rate,
    true_negative_rate,
    false_positive_rate,
    false_negative_rate,
    selection_rate,
    count,
)
from fairlearn.metrics import (
    equalized_odds_difference,
    equalized_odds_ratio,
    demographic_parity_difference,
    demographic_parity_ratio,
)

# Per-model performance metrics; each callable is invoked with
# y_true / y_pred and its key becomes a column name in the results.
metrics_to_calculate = dict(
    [
        ("accuracy", accuracy_score),
        ("balanced accuracy", balanced_accuracy_score),
        ("f1", f1_score),
        ("precision", precision_score),
        ("true positive rate", true_positive_rate),
        ("true negative rate", true_negative_rate),
        ("false positive rate", false_positive_rate),
        ("false negative rate", false_negative_rate),
        ("selection rate", selection_rate),
        ("count", count),
    ]
)

# Group-fairness metrics; each callable additionally receives the protected
# attribute(s) via sensitive_features.
fairness_metrics_to_calculate = dict(
    [
        ("equalized_odds_difference", equalized_odds_difference),
        ("equalized_odds_ratio", equalized_odds_ratio),
        ("demographic_parity_difference", demographic_parity_difference),
        ("demographic_parity_ratio", demographic_parity_ratio),
    ]
)

In [None]:
import pandas as pd
import hashlib

def hash_df(df):
    """Return the hexadecimal MD5 digest of ``df`` serialised as CSV text.

    The frame is rendered with ``df.to_csv()`` (default arguments), encoded
    to bytes and hashed, so two frames get the same hash exactly when their
    CSV renderings are identical.
    """
    csv_bytes = df.to_csv().encode()
    digest = hashlib.md5(csv_bytes)
    return digest.hexdigest()

In [None]:
import pandas as pd
import numpy as np
import json
import random
import torch
from pathlib import Path
import shutil

from autogluon.tabular import TabularDataset, TabularPredictor

from sklearn.model_selection import train_test_split
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config

# Output directory for the CSV files written by conduct_analysis;
# created (including missing parents) if it does not exist yet.
GENERATED_DATA_DIR = Path("./generated_data")
GENERATED_DATA_DIR.mkdir(parents=True, exist_ok=True)

def conduct_analysis(settings, seed = 80539):
    """Run one complete analysis (data preparation, model fitting, metrics).

    Parameters
    ----------
    settings : dict
        Must provide the keys ``"dataset"`` (dataset identifier),
        ``"protected_attrs"`` (list of protected column names) and
        ``"process_protected"`` (name of the pre-processing that is applied
        to the protected attribute(s)).
    seed : int, default 80539
        Seed for NumPy's global generator. The seeds for ``random``,
        ``torch`` and the train/test split are drawn from that generator.

    Returns
    -------
    pandas.DataFrame
        One row per model fitted by the predictor, holding the model name,
        seed, settings, base rates, hashes of the train/test features and
        all entries of ``metrics_to_calculate`` and
        ``fairness_metrics_to_calculate``.

    Raises
    ------
    ValueError
        If ``settings["dataset"]`` or ``settings["process_protected"]`` is
        not one of the supported values.

    Notes
    -----
    Side effects: prints progress information, writes several CSV files to
    ``GENERATED_DATA_DIR`` and deletes the predictor's model directory once
    the metrics have been computed.
    """
    print(f"Running Analysis w/ Settings: {str(settings)}")

    # Default seed is the zip code of our institute :)
    # Seed NumPy first and derive every other seed from it so that a single
    # number fixes all sources of randomness used below.
    np.random.seed(seed)
    seed_2 = np.random.randint(0, 2**32 - 1)
    random.seed(seed_2)
    seed_3 = np.random.randint(0, 2**32 - 1)
    torch.manual_seed(seed_3)
    seed_4 = np.random.randint(0, 2**32 - 1)

    # Load dataset (fail early and explicitly on an unknown identifier
    # instead of running into an unbound `df` further down).
    dataset_paths = {
        "ORG-017-DS-0": "data/raw/bank-additional-full.csv",
        "ORG-017-DS-2": "data/raw/bank-full.csv",
    }
    dataset = settings["dataset"]
    if dataset not in dataset_paths:
        raise ValueError(f"Unsupported value for setting dataset: {dataset!r}")
    df = pd.read_csv(dataset_paths[dataset], sep=";")
    col_target = "y"

    # Pre-process protected attribute(s)
    process_protected = settings["process_protected"]
    if process_protected == "age >= 35":
        df["age"] = df["age"] >= 35
    elif process_protected == "age >= 25":
        df["age"] = df["age"] >= 25
    elif process_protected == "age >= 25 and age < 60":
        df["age"] = (df["age"] >= 25) & (df["age"] < 60)
    elif process_protected == "?":
        # Mirror most common decision on unknown processing
        df["age"] = (df["age"] >= 25) & (df["age"] < 60)
    elif process_protected == "age >= 35; job: privileged vs. unprivileged":
        df["age"] = df["age"] >= 35

        df["job"] = df["job"].replace({
            'management': "privileged",
            'technician': "privileged",
            'admin.': "privileged",
            'self-employed': "privileged",
            'entrepreneur': "privileged",
            'blue-collar': "unprivileged",
            'services': "unprivileged",
            'retired': "unprivileged",
            'unemployed': "unprivileged",
            'housemaid': "unprivileged",
            'student': "unprivileged",
            'unknown': "unprivileged",
        })

        # Create a new attribute that identifies the most unprivileged group
        # as a combination of being in the vulnerable group for both age and
        # job: the value is False only for rows that are neither in a
        # privileged job nor aged 35 or older.
        df["job_and_age"] = (df["job"] == "privileged") | df["age"]

        # Drop the original job and age attributes since we cannot exclude them
        # by default as it would mess with how the fairness metric is computed.
        df.drop(columns=["job", "age"], inplace=True)
    elif process_protected == "marital == married":
        df["marital"] = df["marital"] == "married"
    elif process_protected == "none":
        pass
    else:
        # Raising a plain string is itself a TypeError in Python 3, so raise
        # a real exception that names the offending value.
        raise ValueError(
            f"Unsupported value for setting process_protected: {process_protected!r}"
        )

    # Use all features
    col_features = df.columns
    print(f"All columns: {col_features}")
    # Remove protected attributes as possible features
    col_features = set(col_features) - set(settings["protected_attrs"])
    col_features.remove(col_target)

    # ==== One-hot Encoding ====
    # Identify all non-numeric columns in df
    categorical_features = df[list(col_features)].select_dtypes(include=["object"]).columns
    # One-hot encode all categorical features
    df_onehot = pd.get_dummies(df[categorical_features])
    print(df_onehot)
    new_features_onehot = df_onehot.columns
    # Add one-hot encoded features
    df = df.join(df_onehot)

    # Use the one-hot encoded features instead of the original ones
    col_features = set(col_features) | set(new_features_onehot)
    col_features = set(col_features) - set(categorical_features)

    print(f"Features: {col_features}")

    # Prepare final data (sorted so the column order does not depend on
    # set iteration order)
    final_data = df[sorted(col_features)]
    # Turn target into a binary (i.e. not string)
    final_target = df[col_target] == "yes"
    final_protected  = df[settings["protected_attrs"]]

    X_train, X_test, y_train, y_test, prot_train, prot_test = train_test_split(
        final_data,
        final_target,
        final_protected,
        random_state=seed_4
    )

    label = "target"
    full_train = X_train.copy()
    full_train[label] = y_train
    full_test = X_test.copy()
    full_test[label] = y_test

    ds_train = TabularDataset(full_train)

    # Default AutoGluon model set plus a linear model ("LR") with default
    # hyperparameters.
    hyperparameters = get_hyperparameter_config("default")
    hyperparameters["LR"] = {}

    predictor = TabularPredictor(
        label=label
    ).fit(
        ds_train,
        hyperparameters=hyperparameters
    )
    # Predictions are computed per model in the loop below; no separate
    # prediction with the default (best) model is needed here.
    print(predictor.leaderboard(full_test))

    output = {}

    # Combine all settings-values into one string
    settings_string = '-'.join(str(value) for value in settings.values())

    X_train.to_csv(GENERATED_DATA_DIR / f"{settings_string}-{seed}-X_train.csv")
    X_test.to_csv(GENERATED_DATA_DIR / f"{settings_string}-{seed}-X_test.csv")
    final_data.to_csv(GENERATED_DATA_DIR / f"{settings_string}-{seed}-X_all.csv")

    final_all_combined_again = pd.concat([
        final_data,
        final_target.to_frame("target"),
        final_protected,
    ], axis=1, ignore_index=False)
    final_all_combined_again.to_csv(GENERATED_DATA_DIR / f"{settings_string}-full_data.csv")

    base_rates = final_protected.value_counts().to_dict()
    hash_train = hash_df(X_train)
    hash_test = hash_df(X_test)
    for name in predictor.model_names():

        y_pred = predictor.predict(TabularDataset(X_test), model = name)

        # Prepare output dict with metrics etc.
        output[name] = {
            'model': name,
            'seed': seed,
            'settings': settings,
            'base_rates': base_rates,
            'hash_train': hash_train,
            'hash_test': hash_test,
        }
        # Compute metrics
        standard_metrics = {
            metric_name: metric_fn(
                y_true=y_test,
                y_pred=y_pred,
            )
            for metric_name, metric_fn in metrics_to_calculate.items()
        }
        output[name].update(standard_metrics)
        fairness_metrics = {
            metric_name: metric_fn(
                y_true=y_test,
                y_pred=y_pred,
                sensitive_features=prot_test,
            )
            for metric_name, metric_fn in fairness_metrics_to_calculate.items()
        }
        output[name].update(fairness_metrics)

    # Generate dataframe from output dict (one row per model)
    df_output = pd.DataFrame.from_dict(output).transpose()

    # Clean up model files
    # Let's prefix the predictor path with "local directory",
    # just to be extra safe with the recursive delete :)
    # NOTE(review): this presumably relies on predictor.path being a
    # relative path -- confirm for the installed AutoGluon version.
    shutil.rmtree(Path(f"./{predictor.path}"))

    return df_output



## Test a single analysis

In [None]:
# output = conduct_analysis(settings = chosen_settings)
# output

# Run The Big Analysis

In [None]:
# Show the overview of possible setting values for reference.
possible_settings

In [None]:
import json

# NOTE(review): `paper_cols` is not used below. If the CSV contains further
# columns, they take part in drop_duplicates() and end up in every settings
# record -- confirm whether the frame should be restricted to these columns.
paper_cols = ["new_dataset_id", "col_protected", "processing_protected"]
bank_papers = pd.read_csv("data/raw/bank-papers.csv")

# Process
# Results are assigned back to the column instead of calling inplace methods
# on `bank_papers[col]`: such chained inplace calls operate on a temporary
# object and no longer update the frame once pandas uses Copy-on-Write.
bank_papers["processing_protected"] = (
    bank_papers["processing_protected"]
    .replace("age < 25 or age > 60", "age >= 25 and age < 60")  # Are we ok with this?
    .fillna("none")
)

# Extract protected colnames: the JSON object's keys become a frozenset, and
# the combination of "age" and "job" is mapped to the derived "job_and_age"
# attribute.
bank_papers["col_protected"] = (
    bank_papers["col_protected"]
    .apply(json.loads)
    .apply(lambda x: frozenset(x.keys()))
    .apply(
        lambda cols: frozenset({"job_and_age"})
        if cols == frozenset({"age", "job"})
        else cols
    )
)

# Rename columns to match settings
bank_papers = bank_papers.rename(columns={
    "new_dataset_id": "dataset",
    "col_protected": "protected_attrs",
    "processing_protected": "process_protected",
})

# Remove duplicate configurations (this needs frozenset or tuple to be hashable)
bank_papers = bank_papers.drop_duplicates()

# Back to lists for use as settings; sorted so that the order of several
# protected attributes does not depend on set iteration order.
bank_papers["protected_attrs"] = bank_papers["protected_attrs"].apply(
    lambda cols: sorted(cols)
)

bank_papers

In [None]:
# One settings dict per remaining row of bank_papers (column name -> value).
observed_settings = bank_papers.to_dict("records")
observed_settings

In [None]:
# Seeds for the repeated runs: the original seed first, followed by nine
# values drawn from random.org (range 1 - 10000).
seeds = [80539, 8811, 1563, 9401, 3032, 8060, 2412, 6371, 9192, 115]

# Collect one result frame per (seed, setting) combination, seed-major.
# Results are appended one by one so that everything finished so far stays
# available in `outputs` if a run fails or is interrupted.
outputs = []
for seed in seeds:
    for sett in observed_settings:
        outputs.append(conduct_analysis(settings=sett, seed=seed))


In [None]:
# Stack the result frames of all runs into one table, store it as CSV
# and display it.
final_data = pd.concat(outputs)
final_data.to_csv("bank-analysis-results.csv")
final_data

In [None]:
# Histogram of the accuracy values across all models, settings and seeds.
final_data["accuracy"].plot.hist()