# German Credit Risk - Bias
This notebook computes the gender bias of models developed on the *German Credit Risk* dataset.

__Source__: [https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data](https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data)

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

from sklearn.datasets import fetch_openml

from fairscoring.metrics import bias_pe, bias_eo, bias_cal, WassersteinMetric, CalibrationMetric
from fairscoring.metrics.roc import bias_roc, bias_xroc

from tqdm.notebook import tqdm

In [2]:
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

## Load and pre-process data
### Load German Credit Risk data from [OpenML](https://api.openml.org/d/46116)

In [3]:
# OpenML dataset id of the German Credit Risk data used throughout this notebook.
openML_ID = 46116

# Fetch the dataset bundle from OpenML.
data = fetch_openml(data_id=openML_ID)

# Work on a copy of the feature table so that `data.data` keeps the raw values
# (the raw "Sex" column is read again when the bias inputs are prepared).
features, target = data.data.copy(), data.target

### Preprocessing

In [4]:
# Note: an earlier revision dropped an "Unnamed:_0" index column at this point;
# that step is disabled and nothing is dropped here.

# Missing account information becomes its own category, 'no_inf'.
# The cast to object allows a label that is not among the existing values.
for column in ('Saving accounts', 'Checking account'):
    features[column] = features[column].astype(object).fillna('no_inf')

# Cosmetic fix: strip the stray single quotes around one of the purpose labels.
features['Purpose'] = features['Purpose'].replace(
    to_replace="'domestic appliances'",
    value="domestic appliances",
)

In [5]:
# Numerical features (min-max scaled later on).
num_columns = [
    'Credit amount',
    'Duration',
]

# Categorical features (ordinal-encoded, then one-hot encoded later on).
cat_columns = [
    'Job',
    'Housing',
    'Saving accounts',
    'Checking account',
    'Purpose',
    'Sex',
]

### Encoding

In [6]:
# Learn one integer code per category value for every categorical column.
ordinal_enc = OrdinalEncoder()
ordinal_enc.fit(features[cat_columns])

# Overwrite the categorical columns with their integer codes.
features[cat_columns] = ordinal_enc.transform(features[cat_columns]).astype(int)

In [7]:
# One-hot encode the categorical codes. They are cast to str first so that
# get_dummies expands every column; the first level of each column is dropped.
cat_as_text = features[cat_columns].astype(str)
categorical = pd.get_dummies(cat_as_text, drop_first=True)

# Rescale the numerical columns with a min-max scaler fitted on the full table.
scaler = MinMaxScaler()
numerical = scaler.fit_transform(features[num_columns])

In [8]:
# Map the string class labels to integer codes. The fitted encoder is kept
# because it is used later to look up the code of the "good" label.
target_encoder = LabelEncoder().fit(target)
target = target_encoder.transform(target)

## Training
### Train-Test Split

In [9]:
# Assemble the design matrix: one-hot encoded categoricals followed by the
# scaled numerical features.
#
# The scaled values are wrapped in a frame that carries the real column names
# and the same row index as `categorical`. This keeps the concat aligned row
# by row even if the feature table does not have a default RangeIndex, and it
# avoids renaming the positional labels 0/1 afterwards.
numerical_df = pd.DataFrame(numerical, columns=num_columns, index=categorical.index)
log_reg_data = pd.concat([categorical, numerical_df], axis=1)

In [10]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    log_reg_data.astype(float),
    target.astype(int),
    test_size=0.33,
    random_state=42,
)

### Train LogReg Model
#### Cross-Validation to check for stability

In [11]:
# Shuffled 5-fold cross-validation on the training split with a fixed seed.
shuffle = KFold(n_splits=5, shuffle=True, random_state=2579)
logreg = LogisticRegression(max_iter=1000)

# One ROC AUC value per fold.
ROC_Values = cross_val_score(
    logreg,
    X_train,
    y_train,
    cv=shuffle,
    scoring="roc_auc",
)

# Report the per-fold values together with their spread and average.
print('\nROC AUC values for 5-fold Cross Validation:\n', ROC_Values)
print('\nStandard Deviation of ROC AUC of the models:', round(ROC_Values.std(), 3))
print('\nFinal Average ROC AUC of the model:', round(ROC_Values.mean(), 3))


ROC AUC values for 5-fold Cross Validation:
 [0.62074468 0.76400111 0.78967544 0.75111461 0.71436404]

Standard Deviation of ROC AUC of the models: 0.059

Final Average ROC AUC of the model: 0.728


#### Final Model

In [12]:
# Fit the final logistic regression with statsmodels on the whole training split.
# NOTE(review): X_train is passed as-is (no sm.add_constant call), so the model
# appears to be fitted without an intercept - confirm that this is intended.
logreg = sm.Logit(y_train, X_train).fit()

# Model predictions for the test dataset and for the training dataset.
y_pred = logreg.predict(X_test)
y_pred_train = logreg.predict(X_train)

# Hard class labels obtained by rounding each prediction.
prediction_test = [round(score) for score in y_pred]
prediction_train = [round(score) for score in y_pred_train]


Optimization terminated successfully.
         Current function value: 0.506663
         Iterations 6


### Train debiased LogReg Model
#### Remove Gender Information

In [13]:
# Identify the gender dummy column(s) by name instead of by a hard-coded
# position. The position silently points at a different feature as soon as
# the number of categories in any earlier column changes.
sex_columns = [col for col in X_train.columns if str(col).startswith("Sex_")]
assert sex_columns, "no 'Sex_*' dummy column found in X_train"

# Training and test design matrices without any gender information.
X_train_wosex = X_train.drop(columns=sex_columns)
X_test_wosex = X_test.drop(columns=sex_columns)

#### Cross-Validation to check for stability

In [14]:
# Same shuffled 5-fold protocol (and seed) as for the full model, but on the
# training features without the gender column.
shuffle = KFold(n_splits=5, shuffle=True, random_state=2579)
logreg_wosex = LogisticRegression(max_iter=1000)
ROC_Values = cross_val_score(
    logreg_wosex, X_train_wosex, y_train,
    cv=shuffle, scoring="roc_auc",
)

# Per-fold values first, then their spread and average.
print('\nROC AUC values for 5-fold Cross Validation:\n', ROC_Values)
print('\nStandard Deviation of ROC AUC of the models:', round(ROC_Values.std(), 3))
print('\nFinal Average ROC AUC of the model:', round(ROC_Values.mean(), 3))


ROC AUC values for 5-fold Cross Validation:
 [0.6087766  0.77904709 0.78507539 0.73826383 0.71299342]

Standard Deviation of ROC AUC of the models: 0.064

Final Average ROC AUC of the model: 0.725


#### Final Model

In [15]:
# Fit the final gender-free logistic regression with statsmodels.
logreg_wosex = sm.Logit(y_train, X_train_wosex).fit()

# Model predictions on the test and training splits.
y_pred_wosex = logreg_wosex.predict(X_test_wosex)
y_pred_train_wosex = logreg_wosex.predict(X_train_wosex)

# Discriminatory power on both splits.
roc_score_logreg_wosex = roc_auc_score(y_test, y_pred_wosex)
roc_score_logreg_wosex_train = roc_auc_score(y_train, y_pred_train_wosex)

for message, value in (
    ('The ROC-AUC of the Logistic Regression is', roc_score_logreg_wosex),
    ('The train-ROC-AUC of the Logistic Regression is', roc_score_logreg_wosex_train),
):
    print(message, value)

Optimization terminated successfully.
         Current function value: 0.510914
         Iterations 6
The ROC-AUC of the Logistic Regression is 0.7712395693717844
The train-ROC-AUC of the Logistic Regression is 0.765014029809344


## Bias Measures
### Prepare Dataset

In [16]:
# Raw (un-encoded) "Sex" values of the test rows, taken from the original download.
attribute = data.data["Sex"].loc[X_test.index]

# The two attribute values that are compared against each other.
groups = ['female', 'male']

# Integer code that the label encoder assigned to the "good" outcome.
(favorable_target,) = target_encoder.transform(["good"])

# (display name, test-set scores) for every model under evaluation.
models = list(zip(
    ("LogReg", "LogReg (debiased)"),
    (y_pred, y_pred_wosex),
))

### List of bias metrics

In [17]:
# Ready-made metric objects imported from fairscoring.
predefined_metrics = [
    bias_eo,     # Standardized Equal Opportunity
    bias_pe,     # Standardized Predictive Equality
    bias_cal,    # Standardized Calibration Equality
    bias_roc,    # ROC-Bias
    bias_xroc,   # xROC-Bias
]

# Additional "(U)" variants that are configured with score_transform="rescale".
rescaled_metrics = [
    WassersteinMetric(fairness_type="EO", name="Equal Opportunity (U)", score_transform="rescale"),
    WassersteinMetric(fairness_type="PE", name="Predictive Equality (U)", score_transform="rescale"),
    CalibrationMetric(weighting="scores", name="Calibration (U)", score_transform="rescale"),
]

# Evaluation order: predefined metrics first, rescaled variants afterwards.
metrics = predefined_metrics + rescaled_metrics

### Compute Bias Metrics
Compute all bias metrics for the dataset

In [18]:
# Settings shared by every bias computation: the compared groups, the favorable
# outcome code, the score range, and the permutation count and seed.
bias_options = dict(
    groups=groups,
    favorable_target=favorable_target,
    min_score=0, max_score=1,
    n_permute=1000, seed=2579,
)

# Evaluate every metric on every model; the progress bar advances once per metric.
results = []
for metric in tqdm(metrics):
    for model_name, scores in models:
        outcome = metric.bias(scores, y_test, attribute, **bias_options)
        # Keep the metric object itself; its name is read when the tables are built.
        results.append((metric, model_name, outcome))

  0%|          | 0/8 [00:00<?, ?it/s]

  fraction_of_positives = np.where(nonzero, bin_true / bin_total, np.nan)
  mean_predicted_value = np.where(nonzero, bin_sums / bin_total, np.nan)
  fraction_of_positives = np.where(nonzero, bin_true / bin_total, np.nan)
  mean_predicted_value = np.where(nonzero, bin_sums / bin_total, np.nan)
  fraction_of_positives = np.where(nonzero, bin_true / bin_total, np.nan)
  mean_predicted_value = np.where(nonzero, bin_sums / bin_total, np.nan)


### Result Table I
_Models vertically arranged._

This corresponds to Table C2 in the publication.

In [19]:
# Format every (metric, model, bias result) triple as one row of display strings.
# The rows go into their own list rather than overwriting `results`, so the raw
# bias objects stay available and this cell can be re-run without failing on
# the tuple unpacking.
rows = [
    [
        metric.name,
        model,
        f"{bias.bias:.3f}",                  # total bias
        f"{100*bias.pos_component:.0f}%",    # positive component, in percent
        f"{100*bias.neg_component:.0f}%",    # negative component, in percent
        f"{bias.p_value:.2f}",               # p-value
    ]
    for metric, model, bias in results
]

# Long-format table indexed by (metric, model).
df = pd.DataFrame(
    rows, columns=["metric", "model", "total", "pos", "neg", "p-value"]
).set_index(["metric", "model"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,total,pos,neg,p-value
metric,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Equal Opportunity,LogReg,0.083,1%,99%,0.04
Equal Opportunity,LogReg (debiased),0.048,93%,7%,0.32
Predictive Equality,LogReg,0.092,0%,100%,0.09
Predictive Equality,LogReg (debiased),0.025,62%,38%,0.99
Calibration,LogReg,0.291,46%,54%,0.35
Calibration,LogReg (debiased),0.299,58%,42%,0.26
ROC bias,LogReg,0.044,98%,2%,0.8
ROC bias,LogReg (debiased),0.05,98%,2%,0.69
xROC bias,LogReg,0.133,0%,100%,0.02
xROC bias,LogReg (debiased),0.057,93%,7%,0.54


### Result Table II
_Models horizontally arranged._

This corresponds to Table 2 in the publication.

In [20]:
# Display names of the evaluated models, in their original order.
model_names = [name for name, _ in models]

# One sub-table per model: select that model's rows from the (metric, model)
# index. `xs` drops the "model" level, so each block is indexed by metric only.
blocks = [df.xs(name, level="model") for name in model_names]

# Put the per-model blocks side by side. The rows are aligned on the metric
# name and the model name becomes the top level of the column header.
df2 = pd.concat(blocks, axis=1, keys=model_names)
df2.index.names = ["Metric"]

In [21]:
# Show the side-by-side table: one row per metric, one column group per model.
df2

Unnamed: 0_level_0,LogReg,LogReg,LogReg,LogReg,LogReg (debiased),LogReg (debiased),LogReg (debiased),LogReg (debiased)
Unnamed: 0_level_1,total,pos,neg,p-value,total,pos,neg,p-value
Metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Equal Opportunity,0.083,1%,99%,0.04,0.048,93%,7%,0.32
Predictive Equality,0.092,0%,100%,0.09,0.025,62%,38%,0.99
Calibration,0.291,46%,54%,0.35,0.299,58%,42%,0.26
ROC bias,0.044,98%,2%,0.8,0.05,98%,2%,0.69
xROC bias,0.133,0%,100%,0.02,0.057,93%,7%,0.54
Equal Opportunity (U),0.041,3%,97%,0.13,0.036,97%,3%,0.23
Predictive Equality (U),0.078,1%,99%,0.1,0.024,74%,26%,0.98
Calibration (U),0.246,40%,60%,0.57,0.225,75%,25%,0.84
