### Imports

In [3]:
# Import analysis libraries
import time
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# from notebooks/ directory
import sys, os

# ADDITIONS:
from src.RAI import DataBiasChecker, ModelBiasRanker, ModelBiasHandler

ImportError: cannot import name 'LGMBClassifierDF' from 'sklearndf.classification.extra' (/Users/drobkovgleb/miniconda3/lib/python3.7/site-packages/sklearndf/classification/extra/__init__.py)

### Input parameters

In [4]:
# set parameters for bias check
model_input_path = './data/adult.csv'
dependent = 'rich'  # dependent variable we measure
protected_group = 'race'  # protected group for which we compare outcomes
outcome_advantage = " >50K"
pg_disadvantage = " Black" ## this is the class we want to test for whether an advantage exists
dependent_bin = 'rich_outcome'
thresh_min = .6
thresh_max = 1

# set additional parameters for the toy model [skip if you have your own model]
target_rate = .84  # target_rate = 1 - desired (target) positive rate, i.e., outome rate: percentage of the population that classified as predicted outcome 1.
test_size = .4
random_state = 777
n_estimators = 400
bias_detect_thresh = .2
data_name = 'adult_dataset'

### Read data

In [5]:
47 +
7# Read in data
model_input = pd.read_csv(model_input_path)

# Create IDs to number individuals
pg_array = model_input[protected_group]

# Select dependent and PG -- measure historical bias
# Encode chose parameter for disadvantaged class as binary variable
model_input[dependent_bin] = model_input[dependent].apply(lambda x: 1 if x == outcome_advantage else 0)

# Encode chose parameter for disadvantaged class as binary variable
model_input[protected_group] = model_input[protected_group].apply(lambda x: 1 if x == pg_disadvantage else 0)

##### Run functions to create dummies for categorical variables and normalize numeric
num_vars = ['age', 'fnlwgt', 'education-num', 'race', 'capital-gain', 'capital-loss',
            'hours-per-week']

cat_vars = ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'sex', 'native-country']


In [6]:
def normalize_num(df, num_cols):
    """
    This function normalizes numeric variables

    Args:
        df: pandas df with numeric columns to normalize
        num_cols: list of numeric column names

    Result:
        Returns same dataframe with normalized data
    """

    x = df[num_cols].values  # returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df_num = pd.DataFrame(x_scaled)
    df_num.columns = num_cols
    df_num = df_num.reset_index(drop=True)

    df_drop = df.drop(num_cols, axis=1)
    df_drop = df_drop.reset_index(drop=True)
    df_final = pd.concat([df_drop, df_num], axis=1)

    return df_final

In [7]:
# This is an internal function which one-hot encodes all categorical and normalizes all numeric variables
model_data_norm = normalize_num(model_input, num_vars)
model_data_cat = pd.get_dummies(model_input[cat_vars], columns=cat_vars)
model_data_num = model_data_norm[num_vars]
model_data = pd.concat([model_data_norm[[dependent_bin, "ID"]], model_data_cat, model_data_num], axis=1)

# Print shape of final machine-learning ready data frame
print("Model data shape: ", model_data.shape)

Model data shape:  (32561, 106)


In [8]:
model_data.head()

Unnamed: 0,rich_outcome,ID,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,age,fnlwgt,education-num,race,capital-gain,capital-loss,hours-per-week
0,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0.30137,0.044302,0.8,0.0,0.02174,0.0,0.397959
1,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0.452055,0.048238,0.8,0.0,0.0,0.0,0.122449
2,0,2,0,0,0,0,1,0,0,0,...,1,0,0,0.287671,0.138113,0.533333,0.0,0.0,0.0,0.397959
3,0,3,0,0,0,0,1,0,0,0,...,1,0,0,0.493151,0.151068,0.4,1.0,0.0,0.0,0.397959
4,0,4,0,0,0,0,1,0,0,0,...,0,0,0,0.150685,0.221488,0.8,1.0,0.0,0.0,0.397959


### 01 - Bias Index Check

In [9]:
bias_checker = DataBiasChecker(pvalue_threshold=0.1, 
                               test_type="z-test",
                               is_2_sided=False)

In [10]:
pg = model_input[protected_group]
y = model_input["rich_outcome"]

bias_checker.fit(pg, y)

<src.RAI._data_bias_check.DataBiasChecker at 0x7f9f00033d90>

In [11]:
bias_checker.biased_

True

In [12]:
bias_checker.p_value_

1.1361393491156106e-58

In [13]:
bias_checker.historic_crosstab_

rich_outcome,0,1
race,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.746781,0.253219
1,0.87612,0.12388


### 03 - Fairness Scenarios

In [14]:
X_train = model_data.drop(labels=["rich_outcome"], axis=1)
y_train = model_data["rich_outcome"]
pg_train = model_data[protected_group]

grid = {"n_estimators": [100, 200, 400],
        "max_depth": [4, 6, 8, 10]}
rf_clf = RandomForestClassifier(class_weight="balanced", random_state=42)
clf_cv = GridSearchCV(rf_clf, grid, cv=5, scoring="roc_auc", refit=True, verbose=3, n_jobs=-1)
clf_cv.fit(X_train, y_train)

best_estimator = clf_cv.best_estimator_
best_params = clf_cv.best_params_
print(best_params)

# best_estimator = RandomForestClassifier(max_depth=10, n_estimators=400, class_weight="balanced", random_state=42)
# best_estimator.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  58 out of  60 | elapsed:  1.1min remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.1min finished


{'max_depth': 10, 'n_estimators': 400}


In [15]:
bias_ranker = ModelBiasRanker(best_estimator,
                              pos_rate=0.1,
                              bias_tolerance=0.2)

In [16]:
bias_ranker.fit(X_train, y_train, pg_train)

<src.RAI._model_bias_ranker.ModelBiasRanker at 0x7f9f38048fd0>

In [17]:
bias_ranker.results_

Unnamed: 0,Bias Test,Bias Index,Accuracy,TP,FN,TN,FP,Non PG Positive Rate,PG Positive Rate
Naive,Fail,0.4997,0.8023,0.5549,0.4451,0.9618,0.0382,0.4119,0.2058
Threshold Best,Fail,0.3232,0.8376,0.8919,0.1081,0.8316,0.1684,0.107,0.0346
Historic Parity,Fail,0.49,0.8375,0.8913,0.1087,0.8315,0.1685,0.1052,0.0515
Demographic Parity,Pass,1.0018,0.8366,0.8867,0.1133,0.831,0.169,0.1,0.1002


In [18]:
bias_handler = ModelBiasHandler(best_estimator,
                                pos_rate=0.1,
                                bias_tolerance=0.2,
                                method="demog_parity")

In [19]:
bias_handler.fit(X_train, y_train, pg_train)

<src.RAI._model_bias_explain.ModelBiasHandler at 0x7f9f90d3f7d0>

In [20]:
bias_handler.bias_test_

'Pass'

In [21]:
bias_handler.bias_index_

1.0018185167079552

In [22]:
bias_handler.acc_

0.8365529314210252

In [23]:
bias_handler.TP_

0.8867055572612834

In [24]:
bias_handler.FN_

0.11329444273871661

In [25]:
bias_handler.TN_

0.8309787059787059

In [26]:
bias_handler.FP_

0.169021294021294

In [27]:
bias_handler.non_pg_rate_

0.10001019125590244

In [28]:
bias_handler.pg_rate_

0.1001920614596671

In [29]:
bias_handler.predict(X_train, pg_train)

array([0, 0, 0, ..., 0, 0, 1])

In [30]:
bias_handler.thresh_pg_

0.6603530294609407

In [31]:
bias_handler.thresh_non_pg_

0.8343797758352146

In [32]:
# Steps Ahead:
# - unit tests
# - exception handling