# Adult - Bias
This notebook computes the gender bias of scores developed on the Adult dataset.
It uses several different bias metrics.

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from fairscoring.metrics import bias_pe, bias_eo, bias_cal, WassersteinMetric, CalibrationMetric
from fairscoring.metrics.roc import bias_roc, bias_xroc

from tqdm.notebook import tqdm

In [2]:
from warnings import simplefilter

# Silence every FutureWarning so that it does not clutter the cell outputs.
simplefilter("ignore", FutureWarning)

## Load and pre-process data
### Load Adult data

In [3]:
# Column names for the raw data file, which is read without a header row.
feature_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income',
]

In [4]:
# Location of the raw Adult data file.
dataURL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Fields are separated by a comma followed by a space; a separator longer
# than one character is handled by the python parsing engine.
df = pd.read_csv(
    dataURL,
    sep=', ',
    names=feature_names,
    header=None,
    engine="python",
)

### Feature Engineering

In [5]:
# Boolean flag: is the native country recorded as 'United-States'?
df['native_country_bin'] = df['native_country'].eq('United-States')

In [6]:
# Numerical input features.
num_features = [
    'age',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'education_num',
]

# Categorical input features ('relationship' is not used as a feature).
cat_features = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'race',
    'sex',
    'native_country_bin',
]

In [7]:
# Merge fine-grained categories into coarser groups, column by column.
# Values that do not appear as a key are left unchanged.
df = df.replace({
    'workclass': {
        '?': 'Other/Unknown',
        'Federal-gov': 'Government',
        'Local-gov': 'Government',
        'Never-worked': 'Other/Unknown',
        'Private': 'Private',
        'Self-emp-inc': 'Self-emp',
        'Self-emp-not-inc': 'Self-emp',
        'State-gov': 'Government',
        'Without-pay': 'Other/Unknown',
    },
    'education': {
        '10th': '1-12th',
        '11th': '1-12th',
        '12th': '1-12th',
        '1st-4th': '1-12th',
        '5th-6th': '1-12th',
        '7th-8th': '1-12th',
        '9th': '1-12th',
        'Assoc-acdm': 'Assoc',
        'Assoc-voc': 'Assoc',
        'Bachelors': 'University/College',
        'Doctorate': 'University/College',
        'HS-grad': 'HS-grad',
        'Masters': 'University/College',
        'Preschool': '1-12th',
        'Prof-school': 'University/College',
        'Some-college': 'University/College',
    },
    'marital_status': {
        'Married-AF-spouse': 'Married',
        'Married-civ-spouse': 'Married',
        'Married-spouse-absent': 'Married',
        'Divorced': 'Div/Sep/Wid',
        'Separated': 'Div/Sep/Wid',
        'Widowed': 'Div/Sep/Wid',
    },
    'relationship': {
        'Husband': 'Spouse/Partner',
        'Wife': 'Spouse/Partner',
        'Unmarried': 'Unmarried',
    },
    'occupation': {
        'Adm-clerical': 'White-Collar',
        'Craft-repair': 'Blue-Collar',
        'Exec-managerial': 'White-Collar',
        'Farming-fishing': 'Blue-Collar',
        'Handlers-cleaners': 'Blue-Collar',
        'Machine-op-inspct': 'Blue-Collar',
        'Other-service': 'Service',
        'Priv-house-serv': 'Service',
        'Prof-specialty': 'Professional',
        'Protective-serv': 'Service',
        'Tech-support': 'Service',
        'Transport-moving': 'Blue-Collar',
        '?': 'Other/Unknown',
        'Armed-Forces': 'Other/Unknown',
    },
})

### Encoding

In [8]:
# Keep a copy of the original gender labels; they are put back after encoding.
gender_column = df["sex"].copy()

# Fit the ordinal encoder on the categorical features and overwrite those
# columns with their integer codes.
ordinal_enc = OrdinalEncoder()
df[cat_features] = ordinal_enc.fit_transform(df[cat_features])
df[cat_features] = df[cat_features].astype(int)

# Restore the un-encoded gender column.
df["sex"] = gender_column

In [9]:
# One-hot encode the categorical features. Casting to str first ensures that
# the integer-coded columns are expanded into dummy columns as well.
categorical = df[cat_features].astype(str).pipe(pd.get_dummies)

# Min-max scale the numerical features (returns a plain array).
numerical = MinMaxScaler().fit_transform(df[num_features])

In [10]:
# Encode the income label as integer classes. The fitted encoder is kept so
# that class labels can be translated to their codes later on.
encoder = LabelEncoder().fit(df['income'])
target = encoder.transform(df['income'])

## Training
### Train-Test Split

In [11]:
# Combine the one-hot encoded categorical features with the scaled numerical
# features into one design matrix.
# The numerical block is labelled directly from `num_features` (instead of a
# hand-written positional rename map that has to be kept in sync with it) and
# reuses the index of the categorical block so that rows align explicitly.
log_reg_data = pd.concat(
    [
        pd.DataFrame(categorical),
        pd.DataFrame(numerical, columns=num_features, index=categorical.index),
    ],
    axis=1,
)

In [12]:
# Hold out 30% of the rows as test split; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    log_reg_data,
    target,
    test_size=0.3,
    random_state=43,
)

### Train LogReg Model
#### Cross-Validation to check for stability

In [13]:
# Five shuffled folds with a fixed seed.
shuffle = KFold(n_splits=5, shuffle=True, random_state=2579)
logreg = LogisticRegression(max_iter=1000)

# One ROC AUC value per fold, computed on the training split only.
ROC_Values = cross_val_score(
    estimator=logreg,
    X=X_train,
    y=y_train,
    scoring="roc_auc",
    cv=shuffle,
)

print('\nROC AUC values for 5-fold Cross Validation:\n', ROC_Values)
print('\nStandard Deviation of ROC AUC of the models:', round(ROC_Values.std(), 3))
print('\nFinal Average ROC AUC of the model:', round(ROC_Values.mean(), 3))


ROC AUC values for 5-fold Cross Validation:
 [0.90249847 0.89177676 0.8820583  0.89022666 0.8969462 ]

Standard Deviation of ROC AUC of the models: 0.007

Final Average ROC AUC of the model: 0.893


#### Final Model

In [14]:
# Fit the final logistic regression on the whole training split.
logreg.fit(X_train, y_train)

# Scores are taken from column 1 of predict_proba, i.e. the probability of
# the second class in `logreg.classes_`.
y_pred = logreg.predict_proba(X_test)[:, 1]
y_pred_train = logreg.predict_proba(X_train)[:, 1]

# Compare test and train performance.
roc_score_logreg = roc_auc_score(y_test, y_pred)
roc_score_logreg_train = roc_auc_score(y_train, y_pred_train)

print('The ROC-AUC of the Logistic Regression is', roc_score_logreg)
print('The train-ROC-AUC of the Logistic Regression is', roc_score_logreg_train)

The ROC-AUC of the Logistic Regression is 0.8975588173788007
The train-ROC-AUC of the Logistic Regression is 0.8942243079704495


### Train debiased LogReg Model
#### Remove Gender Information

In [15]:
# Show which columns sit at positions 22 and 23 of the design matrix.
X_train.columns.take([22, 23])

Index(['sex_Female', 'sex_Male'], dtype='object')

In [16]:
# Remove the one-hot encoded gender columns from both splits.
# The columns are selected by their 'sex_' prefix rather than by hard-coded
# positions, so the right columns are dropped even if the column order or
# the number of categories changes.
sex_columns = [col for col in X_train.columns if str(col).startswith("sex_")]

X_train_wosex = X_train.drop(columns=sex_columns)
X_test_wosex = X_test.drop(columns=sex_columns)

#### Cross-Validation to check for stability

In [17]:
# Same fold definition as for the model that includes gender.
shuffle = KFold(n_splits=5, shuffle=True, random_state=2579)
logreg_wosex = LogisticRegression(max_iter=1000)

# One ROC AUC value per fold, using the features without gender.
ROC_Values = cross_val_score(
    estimator=logreg_wosex,
    X=X_train_wosex,
    y=y_train,
    scoring="roc_auc",
    cv=shuffle,
)

print('\nROC AUC values for 5-fold Cross Validation:\n', ROC_Values)
print('\nStandard Deviation of ROC AUC of the models:', round(ROC_Values.std(), 3))
print('\nFinal Average ROC AUC of the model:', round(ROC_Values.mean(), 3))


ROC AUC values for 5-fold Cross Validation:
 [0.90207961 0.89145549 0.88137445 0.88927664 0.89602997]

Standard Deviation of ROC AUC of the models: 0.007

Final Average ROC AUC of the model: 0.892


#### Final Model

In [18]:
# Fit a fresh logistic regression on the training split without gender columns.
logreg_wosex = LogisticRegression(max_iter=1000).fit(X_train_wosex, y_train)

# Scores are taken from column 1 of predict_proba.
y_pred_wosex = logreg_wosex.predict_proba(X_test_wosex)[:, 1]
y_pred_train_wosex = logreg_wosex.predict_proba(X_train_wosex)[:, 1]

# Compare test and train performance.
roc_score_logreg_wosex = roc_auc_score(y_test, y_pred_wosex)
roc_score_logreg_wosex_train = roc_auc_score(y_train, y_pred_train_wosex)

print('The ROC-AUC of the Logistic Regression is', roc_score_logreg_wosex)
print('The train-ROC-AUC of the Logistic Regression is', roc_score_logreg_wosex_train)

The ROC-AUC of the Logistic Regression is 0.8968531931820867
The train-ROC-AUC of the Logistic Regression is 0.8935059284878036


### Train XGBoost Model
#### Cross-Validation to check for stability

In [19]:
# Same fold definition as for the logistic regression models.
shuffle = KFold(n_splits=5, shuffle=True, random_state=2579)
xgb_model = xgb.XGBClassifier()

# One ROC AUC value per fold, computed on the training split only.
ROC_Values = cross_val_score(
    estimator=xgb_model,
    X=X_train,
    y=y_train,
    scoring="roc_auc",
    cv=shuffle,
)

print('\nROC AUC values for 5-fold Cross Validation:\n', ROC_Values)
print('\nStandard Deviation of ROC AUC of the models:', round(ROC_Values.std(), 3))
print('\nFinal Average ROC AUC of the model:', round(ROC_Values.mean(), 3))


ROC AUC values for 5-fold Cross Validation:
 [0.92175832 0.9203497  0.91333443 0.91947067 0.92419827]

Standard Deviation of ROC AUC of the models: 0.004

Final Average ROC AUC of the model: 0.92


#### Final Model

In [20]:
# Fit the final XGBoost model on the whole training split.
xgb_model.fit(X_train, y_train)

# Scores are taken from column 1 of predict_proba.
y_pred_xgb = xgb_model.predict_proba(X_test)[:, 1]
y_pred_train_xgb = xgb_model.predict_proba(X_train)[:, 1]

# Compare test and train performance.
roc_score_xgb = roc_auc_score(y_test, y_pred_xgb)
roc_score_xgb_train = roc_auc_score(y_train, y_pred_train_xgb)

# The labels name the XGBoost model; they previously said
# "Logistic Regression", a copy-paste leftover from the cells above.
print('The ROC-AUC of the XGBoost model is', roc_score_xgb)
print('The train-ROC-AUC of the XGBoost model is', roc_score_xgb_train)

The ROC-AUC of the Logistic Regression is 0.9221733121562541
The train-ROC-AUC of the Logistic Regression is 0.9495695617402895


## Bias Measures
### Prepare Dataset

In [21]:
# Original (un-encoded) gender labels for the rows of the test split.
attribute = df.loc[X_test.index, "sex"]

# The two attribute values that are compared.
groups = ['Female', 'Male']

# Encoded class value of the ">50K" income label, used as favorable outcome.
favorable_target = encoder.transform([">50K"])[0]

# (display name, test-split scores) for every model under evaluation.
models = [
    ("LogReg", y_pred),
    ("LogReg (debiased)", y_pred_wosex),
    ("XGBoost", y_pred_xgb),
]

### List of bias metrics

In [22]:
metrics = [
    # Predefined metrics
    bias_eo,    # Standardized Equal Opportunity
    bias_pe,    # Standardized Predictive Equality
    bias_cal,   # Standardized Calibration Equality
    bias_roc,   # ROC-Bias
    bias_xroc,  # xROC-Bias
    # "(U)" variants, configured with score_transform="rescale"
    WassersteinMetric(
        fairness_type="EO",
        name="Equal Opportunity (U)",
        score_transform="rescale",
    ),
    WassersteinMetric(
        fairness_type="PE",
        name="Predictive Equality (U)",
        score_transform="rescale",
    ),
    CalibrationMetric(
        weighting="scores",
        name="Calibration (U)",
        score_transform="rescale",
    ),
]

### Compute Bias Metrics
Compute all bias metrics for the dataset

In [23]:
# Evaluate every metric on every model's test scores.
# Each entry is a (metric, model name, bias result) triple; metrics form the
# outer loop, models the inner loop.
results = [
    (
        metric,
        model,
        metric.bias(
            scores, y_test, attribute,
            groups=groups,
            favorable_target=favorable_target,
            min_score=0, max_score=1,
            n_permute=1000, seed=2579,
        ),
    )
    for metric in tqdm(metrics)
    for model, scores in models
]

  0%|          | 0/8 [00:00<?, ?it/s]

### Result Table
This corresponds to table 3 in the publication.

In [24]:
# Models vertically arranged
# Format every (metric, model, bias) triple as one row of strings.
# The rows get their own name so that `results` keeps the raw bias objects;
# overwriting `results` here would make this cell fail when run a second time.
rows = [
    [
        metric.name,
        model,
        f"{bias.bias:.3f}",
        f"{100*bias.pos_component:.0f}%",
        f"{100*bias.neg_component:.0f}%",
        f"{bias.p_value:.2f}",
    ]
    for metric, model, bias in results
]

# Long-format table indexed by (metric, model).
df_v = pd.DataFrame(
    rows, columns=["metric", "model", "total", "pos", "neg", "p-value"]
).set_index(["metric", "model"])

In [25]:
# Models horizontally arranged
model_names = [name for name, _ in models]

# Build one block of result columns per model. The metric names are taken
# once, from the first model's block, and later become the row index.
metric_col = None
blocks = []
for name in model_names:
    # Rows of the vertical table that belong to this model, with the
    # (metric, model) index turned back into ordinary columns.
    block = df_v[df_v.index.get_level_values("model") == name].reset_index()
    if metric_col is None:
        metric_col = block["metric"]
    # Keep only the result columns; new frames are created instead of
    # dropping columns in place.
    blocks.append(block.drop(columns=["metric", "model"]))

# Place the blocks side by side under a top-level header per model.
df_h = pd.concat([metric_col] + blocks, axis=1, keys=[""] + model_names)
df_h = df_h.set_index(df_h.columns[0])
df_h.index.names = ["Metric"]
df_h

Unnamed: 0_level_0,LogReg,LogReg,LogReg,LogReg,LogReg (debiased),LogReg (debiased),LogReg (debiased),LogReg (debiased),XGBoost,XGBoost,XGBoost,XGBoost
Unnamed: 0_level_1,total,pos,neg,p-value,total,pos,neg,p-value,total,pos,neg,p-value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Equal Opportunity,0.107,0%,100%,0.0,0.069,0%,100%,0.0,0.057,1%,99%,0.0
Predictive Equality,0.164,0%,100%,0.0,0.121,0%,100%,0.0,0.143,0%,100%,0.0
Calibration,0.052,22%,78%,0.0,0.045,55%,45%,0.01,0.05,52%,48%,0.0
ROC bias,0.05,98%,2%,0.0,0.051,98%,2%,0.0,0.033,98%,2%,0.0
xROC bias,0.205,0%,100%,0.0,0.151,0%,100%,0.0,0.129,0%,100%,0.0
Equal Opportunity (U),0.161,0%,100%,0.0,0.104,0%,100%,0.0,0.087,0%,100%,0.0
Predictive Equality (U),0.118,0%,100%,0.0,0.098,0%,100%,0.0,0.101,0%,100%,0.0
Calibration (U),0.105,20%,80%,0.0,0.102,50%,50%,0.0,0.138,62%,38%,0.0
