### Imports

In [1]:
# Import analysis libraries
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# from notebooks/ directory
import sys, os

# ADDITIONS:
from facet.inspection import RAIDataBaisCheck
from facet.inspection import RAIFairnessScenarios

### Input parameters

In [2]:
# --- Parameters for the bias check ---
# Location of the input CSV
model_input_path = "./adult.csv"
# Dependent variable we measure
dependent = "rich"
# Protected group for which we compare outcomes
protected_group = "race"
# Value of the dependent variable that counts as the advantaged outcome.
# The leading space is part of the value (cf. dummy column names such as
# 'workclass_ Private' produced from the same file).
outcome_advantage = " >50K"
# Class of the protected group we test for a (dis)advantage
pg_disadvantage = " Black"
# Name of the binary outcome column derived from `dependent`
dependent_bin = "rich_outcome"
# NOTE(review): thresh_min / thresh_max are not referenced in the visible
# notebook cells -- presumably threshold search bounds; confirm or remove.
thresh_min = 0.6
thresh_max = 1

# --- Additional parameters for the toy model [skip if you have your own model] ---
# target_rate = 1 - desired (target) positive rate, i.e. the outcome rate is the
# percentage of the population that is classified as predicted outcome 1.
target_rate = 0.84
# Share of rows held out for testing
test_size = 0.4
# Seed for reproducibility
random_state = 777
# Number of trees in the random forest
n_estimators = 400
# Bias detection threshold
bias_detect_thresh = 0.2
# NOTE(review): data_name is not referenced in the visible notebook cells.
data_name = "adult_dataset"

### Read data

In [3]:
# Load the raw dataset from the configured path
model_input = pd.read_csv(model_input_path)

# Number the individuals using the row index
model_input['ID'] = model_input.index
# Keep a handle on the raw (not yet binarised) protected-group labels
pg_array = model_input[protected_group]

# Select dependent and PG -- measure historical bias
# Binary outcome flag: 1 where the dependent variable equals the advantaged outcome, else 0
model_input[dependent_bin] = model_input[dependent].eq(outcome_advantage).astype("int64")

# Binary protected-group flag: 1 where the row belongs to the disadvantaged class, else 0
# (this overwrites the original protected-group column in place)
model_input[protected_group] = model_input[protected_group].eq(pg_disadvantage).astype("int64")

##### Column lists used to create dummies for categorical variables and normalize numeric ones
# 'race' is listed as numeric because it has just been converted to a 0/1 flag above
num_vars = [
    'age',
    'fnlwgt',
    'education-num',
    'race',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

cat_vars = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'sex',
    'native-country',
]


In [4]:
def normalize_num(df, num_cols):
    """
    Min-max scale the numeric columns of a dataframe.

    Args:
        df: pandas df containing the numeric columns to scale
        num_cols: list of numeric column names

    Result:
        Returns a new dataframe with a fresh 0..n-1 index: the columns of
        ``df`` that are not in ``num_cols`` come first, followed by the
        scaled ``num_cols``. The input dataframe is not modified.
    """
    # Scale the selected columns on their raw numpy values
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(df[num_cols].values)
    scaled_part = pd.DataFrame(scaled_values, columns=num_cols)

    # Remaining columns, re-indexed so they line up row-by-row with the scaled part
    untouched_part = df.drop(num_cols, axis=1).reset_index(drop=True)

    return pd.concat([untouched_part, scaled_part], axis=1)

In [5]:
# One-hot encode all categorical variables and min-max scale all numeric ones,
# then assemble the model-ready frame as: [target, ID] + dummies + scaled numerics
model_data_norm = normalize_num(model_input, num_vars)
model_data_num = model_data_norm[num_vars]
model_data_cat = pd.get_dummies(model_input[cat_vars], columns=cat_vars)
model_data = pd.concat(
    [
        model_data_norm[[dependent_bin, "ID"]],
        model_data_cat,
        model_data_num,
    ],
    axis=1,
)

# Report the dimensions of the final machine-learning ready data frame
print("Model data shape: ", model_data.shape)

Model data shape:  (32561, 106)


In [6]:
# Preview the first rows of the model-ready frame to sanity-check the encoding and scaling
model_data.head()

Unnamed: 0,rich_outcome,ID,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,age,fnlwgt,education-num,race,capital-gain,capital-loss,hours-per-week
0,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0.30137,0.044302,0.8,0.0,0.02174,0.0,0.397959
1,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0.452055,0.048238,0.8,0.0,0.0,0.0,0.122449
2,0,2,0,0,0,0,1,0,0,0,...,1,0,0,0.287671,0.138113,0.533333,0.0,0.0,0.0,0.397959
3,0,3,0,0,0,0,1,0,0,0,...,1,0,0,0.493151,0.151068,0.4,1.0,0.0,0.0,0.397959
4,0,4,0,0,0,0,1,0,0,0,...,0,0,0,0.150685,0.221488,0.8,1.0,0.0,0.0,0.397959


### 01 - Bias Index Check

In [7]:
# Configure the historical-bias check on the binary outcome column, grouped by
# the protected-group flag. Column names come from the parameter cell so the
# check follows the configuration instead of repeating the literals.
RAI_bias_checker = RAIDataBaisCheck(
    protected_group=protected_group,
    test_col=dependent_bin,
    pvalue_threshold=0.1,
    test_type="z-test",
    is_2_sided=False,
)

In [8]:
# Run the check on the prepared frame (binary outcome and binary protected-group columns).
# The recorded output is a pair whose values match the .biased and .p_value attributes shown below.
RAI_bias_checker.fit(model_input)

(True, 1.1361393491156106e-58)

In [9]:
# Boolean verdict stored on the fitted checker
RAI_bias_checker.biased

True

In [10]:
# p-value stored on the fitted checker (presumably compared against pvalue_threshold -- confirm in the library)
RAI_bias_checker.p_value

1.1361393491156106e-58

In [11]:
# Outcome shares per protected-group value (each row sums to 1 in the recorded output)
RAI_bias_checker.historic_crosstab

rich_outcome,0,1
race,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.746781,0.253219
1,0.87612,0.12388


### 03 - Fairness Scenarios

In [12]:
def train_model(
    model_input, dependent, protected_group, test_size, random_state, n_estimators
):
    """
    Train a toy random-forest classifier and return the arrays needed for
    the fairness scenario analysis.

    Every column of ``model_input`` other than ``dependent`` is used as a
    feature. This includes the protected-group column and any identifier
    column present in the frame.

    Parameters
    ----------
    model_input = pandas DataFrame with the model-ready features and the dependent column
    dependent = name of the outcome column to compare rates for (binary)
    protected_group = name of the column indicating a protected group (binary)
    test_size = train test split to use
    random_state = seed used for both the train/test split and the random forest
    n_estimators = number of trees in the random forest

    Returns
    -------
    outcome_array = single-column DataFrame with the true test-set outcome (binary)
    pg_array = single-column DataFrame with the test-set protected group indicator (binary)
    preds_proba = test-set predicted probabilities (column 1 of predict_proba)
    preds_naive = test-set class predictions from the model's predict()

    """
    # Split into train and test sets
    # (to speed up a run, sub-sample model_input before calling this function)
    train, test = train_test_split(
        model_input, test_size=test_size, random_state=random_state
    )

    # Separate target and features for both splits
    y_train = train[dependent]
    X_train = train.drop(labels=[dependent], axis=1)

    y_test = test[dependent]
    X_test = test.drop(labels=[dependent], axis=1)

    # Time model training (perf_counter is monotonic, so the elapsed time
    # cannot be distorted by system clock adjustments)
    start = time.perf_counter()

    # Build RF model; seed it with the caller's random_state so the
    # parameter controls the whole run, not only the split
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        class_weight="balanced",
        random_state=random_state,
    )
    rf_model.fit(X_train, y_train)

    end = time.perf_counter()
    print("Seconds to run: ", end - start)

    # Output model predictions and probabilities
    preds_proba_raw = rf_model.predict_proba(X_test)
    preds_proba = np.array(preds_proba_raw)[:, 1]  # keep column 1 only
    preds_naive = rf_model.predict(X_test)

    # True outcome and protected-group indicator for the test rows
    outcome_array = test[[dependent]]
    pg_array = test[[protected_group]]

    # Overall model accuracy - logloss
    ll = log_loss(y_test, preds_proba)
    print("log-loss", ll)

    return (outcome_array, pg_array, preds_proba, preds_naive)


In [13]:
### If running your own model, skip train_model and create the required binary arrays below yourself.
### The protected-group and outcome column names come from the parameter cell;
### they are not re-assigned here so that the configuration stays the single source of truth.

### Required model inputs [update with your own model]
# outcome_array = binary y_true for your validation set -- what you are trying to predict
# pg_array = binary protected group array -- 1/0 indicator where 1 is the PG (ex. black, female)
# preds_proba = your model predicted scores for each record in your validation set = output from your model predict_proba() function from sklearn
# preds_naive = your model predictions at optimal threshold -- use the one selected for your baseline / champion model

# Code example -- where rf_model is your model
#     preds_proba_raw = rf_model.predict_proba(X_test)
#     preds_proba = np.array(preds_proba_raw)[:, 1]

# Train model - generate data frames for scenario analysis
outcome_array, pg_array, preds_proba, preds_naive = train_model(
    model_data,
    dependent_bin,
    protected_group,
    test_size,
    random_state,
    n_estimators,
)

# Unwrap the single-column frames using the configured column names
y_true = outcome_array[dependent_bin]
y_pred_proba = preds_proba
y_pred_naive = preds_naive
pg = pg_array[protected_group]

Seconds to run:  8.967778444290161
log-loss 0.3139295316819027


In [14]:
# Configure the fairness scenario simulator.
# bias_detect_thresh is taken from the parameter cell (same value as the former literal 0.2).
# NOTE(review): target_rate is passed as the literal 0.9 although the parameter
# cell defines target_rate = .84; confirm which value is intended before
# replacing the literal with the configured variable.
RAI_fairness_simulator = RAIFairnessScenarios(
    target_rate=0.9,
    bias_detect_thresh=bias_detect_thresh,
)

In [15]:
# Run the scenario comparison. Arguments, in order: true outcomes, model class
# predictions, predicted probabilities, protected-group indicator.
# The recorded output is a table with one row per scenario (Naive, Threshold Best,
# Historic Parity, Demographic Parity).
RAI_fairness_simulator.fit(y_true,
                           y_pred_naive,
                           y_pred_proba,
                           pg)

Unnamed: 0_level_0,Bias_Test,Bias_Index,Accuracy,TP,FN,TN,FP,Non_PG_Outcome_Rate,PG_Outcome_Rate
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1 - Naive,Fail,0.4488,0.8618,0.7583,0.2417,0.8868,0.1132,0.2051,0.092
2 - Threshold Best,Fail,0.4009,0.8389,0.8891,0.1109,0.8334,0.1666,0.1056,0.0423
3 - Historic Parity,Fail,0.4934,0.8397,0.8893,0.1107,0.8341,0.1659,0.1056,0.0521
4 - Demographic Parity,Pass,1.0022,0.8383,0.8848,0.1152,0.8331,0.1669,0.0999,0.1001


In [16]:
# Single threshold stored by the fitted simulator (presumably for the "Threshold Best" scenario -- confirm)
RAI_fairness_simulator.thresh_best

0.7325

In [17]:
# Threshold for the protected group (presumably in the "Historic Parity" scenario -- confirm)
RAI_fairness_simulator.thresh_hist_pg

0.684258721844227

In [18]:
# Threshold for the non-protected group (presumably in the "Historic Parity" scenario -- confirm)
RAI_fairness_simulator.thresh_hist_non_pg

0.735

In [19]:
# Threshold for the protected group (presumably in the "Demographic Parity" scenario -- confirm)
RAI_fairness_simulator.thresh_demog_pg

0.47324999999999984

In [20]:
# Threshold for the non-protected group (presumably in the "Demographic Parity" scenario -- confirm)
RAI_fairness_simulator.thresh_demog_non_pg

0.7475

In [21]:
# Prediction array stored for the naive scenario (presumably the y_pred_naive passed to fit -- confirm)
RAI_fairness_simulator.preds_naive

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
# Prediction array stored by the simulator (presumably for the "Threshold Best" scenario -- confirm)
RAI_fairness_simulator.preds_threshold

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
# Prediction array stored by the simulator (presumably for the "Historic Parity" scenario -- confirm)
RAI_fairness_simulator.preds_historic

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
# Prediction array stored by the simulator (presumably for the "Demographic Parity" scenario -- confirm)
RAI_fairness_simulator.preds_demographic

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
# TODO (next steps):
# - add unit tests
# - add exception handling
# - collect feature suggestions