# Analyzing Fair PCA on different datasets


## Exploratory Data Analysis


### Imports


In [None]:
from sklearn.model_selection import train_test_split
import dython
import numpy as np
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the ProPublica recidivism dataset (stored one-hot encoded).
df_recidivism = pd.read_csv('data/propublica_data_for_fairml.csv')

# Every one-hot group in the file lacks one level; rebuild that level as
# "all other indicators of the group are 0".
other_races = ['Other', 'African_American', 'Asian', 'Hispanic', 'Native_American']
other_ages = ['Age_Above_FourtyFive', 'Age_Below_TwentyFive']
df_recidivism['Caucasian'] = df_recidivism[other_races].eq(0).all(axis=1).astype(int)
df_recidivism['Between_TwentyFive_And_FourtyFive'] = (
    df_recidivism[other_ages].eq(0).all(axis=1).astype(int))
df_recidivism['Male'] = df_recidivism['Female'].eq(0).astype(int)

# Revert the one-hot encoding: each group of indicator columns collapses into
# a single categorical column holding the name of the active indicator.
races = other_races + ['Caucasian']
genders = ['Female', 'Male']
age_group = other_ages + ['Between_TwentyFive_And_FourtyFive']
for label_column, indicator_columns in (
    ('Race', races),
    ('Gender', genders),
    ('Age_Group', age_group),
):
    df_recidivism[label_column] = df_recidivism[indicator_columns].idxmax(axis=1)
    df_recidivism = df_recidivism.drop(columns=indicator_columns)

# Remove the score_factor column, then restrict the analysis to the two race
# groups that are compared.
df_recidivism = df_recidivism.drop(columns='score_factor')
keep_race = df_recidivism["Race"].isin(["African_American", "Caucasian"])
df_recidivism = df_recidivism[keep_race]


In [None]:
# Categorical columns created when the one-hot encoding was reverted.
cat_cols = ["Race", "Gender", "Age_Group"]

# Pairwise associations between all columns; `cat_cols` are treated as
# nominal and column labels are marked with their type.
dython.nominal.associations(
    df_recidivism,
    nominal_columns=cat_cols,
    mark_columns=True,
)

In [None]:
# Plot the distribution of every column, split by race, one subplot per column.
# The number of subplots follows the DataFrame instead of being hard-coded, so
# the cell neither leaves empty axes nor fails when columns are added/removed.
fig, axes = plt.subplots(len(df_recidivism.columns), figsize=(20, 40))
for i, feature in enumerate(df_recidivism.columns):
    sns.histplot(data=df_recidivism, x=feature, hue='Race', ax=axes[i], palette='Set2')
plt.show()


In [None]:
# One-hot encode the categorical columns (first level of each is dropped).
dummies_df_recividism = pd.get_dummies(
    df_recidivism, columns=cat_cols, drop_first=True)

# Separate the target column from the candidate features.
labels = dummies_df_recividism["Two_yr_Recidivism"]
features = dummies_df_recividism.drop(columns="Two_yr_Recidivism")

# Fix an explicit column order with the protected attributes (race, gender)
# at the end.
feature_order = [
    "Number_of_Priors",
    "Misdemeanor",
    "Age_Group_Age_Below_TwentyFive",
    "Age_Group_Between_TwentyFive_And_FourtyFive",
    "Race_Caucasian",
    "Gender_Male",
]
features = features[feature_order]

# Collect (and print) the positions of the protected-attribute columns.
indices = []
for position, column_name in enumerate(features.columns):
    is_protected = "Race" in column_name or "Gender" in column_name
    if is_protected:
        print("Column ID: %s" % position, "(%s)" % column_name)
        indices.append(position)

print(indices)

groups = features.iloc[:, indices]

# 70/30 split; the group indicators are passed through the same split so they
# stay row-aligned with the features and labels.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features.values,
    labels.values.reshape(-1),
    groups,
    test_size=0.3,
    random_state=0,
    shuffle=True,
)


In [None]:
# The last two columns of the feature matrices hold the protected attributes;
# slice each matrix into its non-protected and protected parts.
nonprotected, protected = X_train[:, :-2], X_train[:, -2:]
nonprotected_test, protected_test = X_test[:, :-2], X_test[:, -2:]


In [None]:
def calculate_metrics(y_true, y_pred):
    """
    Compute binary classification metrics from 0/1 labels and predictions.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth (correct) target values, encoded as 0 / 1.
    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier, encoded as 0 / 1.

    Returns
    -------
    metrics_dict : dict
        Dictionary with the following float entries:

        ``'recall'``
            TP / (TP + FN), the true positive rate.
        ``'precision'``
            TP / (TP + FP).
        ``'f1_score'``
            Harmonic mean of precision and recall.
        ``'accuracy'``
            *Balanced* accuracy: the mean of the true positive rate and the
            true negative rate (not the plain fraction of correct
            predictions).

    Notes
    -----
    A ratio with a zero denominator is reported as 0.0 instead of nan/inf.
    For the balanced accuracy, a class that does not occur in ``y_true`` is
    left out of the average (the convention used by scikit-learn's
    ``balanced_accuracy_score``); it is 0.0 when ``y_true`` is empty.
    """
    # Coerce to arrays so that plain lists are compared element-wise.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    TP = int(np.sum(np.logical_and(y_pred == 1, y_true == 1)))
    FP = int(np.sum(np.logical_and(y_pred == 1, y_true == 0)))
    TN = int(np.sum(np.logical_and(y_pred == 0, y_true == 0)))
    FN = int(np.sum(np.logical_and(y_pred == 0, y_true == 1)))

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 rather than nan / inf.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    specificity = _ratio(TN, TN + FP)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    # Average the per-class recall over the classes actually present.
    class_recalls = [rate
                     for rate, support in ((recall, TP + FN), (specificity, TN + FP))
                     if support]
    accuracy = sum(class_recalls) / len(class_recalls) if class_recalls else 0.0

    metrics_dict = {'recall': recall,
                    'precision': precision,
                    'f1_score': f1_score,
                    'accuracy': accuracy}

    return metrics_dict


In [63]:
# Calculate statistical parity, equalized odds and equalized outcome for all groups
def calculate_fairness_metrics(y_true, y_pred, groups):
    """
    Calculate statistical parity, equalized odds and equalized outcome for all groups.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth labels, encoded as 0 / 1.
    y_pred : array-like of shape (n_samples,)
        Predicted labels, encoded as 0 / 1.
    groups : pandas.DataFrame or mapping of str -> array-like of shape (n_samples,)
        One 0 / 1 indicator column per protected attribute. Rows are matched
        to ``y_true`` / ``y_pred`` by position, not by index label. Column
        names must be strings.

    Returns
    -------
    metrics : dict
        Maps ``"<column name><value>"`` (value is 0 or 1) to a dict with:

        ``'statistical_parity'``
            Mean prediction inside the subgroup.
        ``'equalized_odds'``
            Mean prediction among the subgroup's true positives minus the
            mean prediction among its true negatives.
        ``'equalized_outcome'``
            Mean prediction among the subgroup's true positives.

        A value is nan when the samples it is averaged over are empty.
    """
    # Coerce to arrays so lists and pandas objects are indexed by position.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    def _mean_or_nan(values):
        # np.mean of an empty selection warns before returning nan; return
        # nan directly so empty subgroups do not flood the output.
        return np.mean(values) if values.size else np.float64(np.nan)

    metrics = {}
    for group in groups:
        membership = np.asarray(groups[group])
        for i in (0, 1):
            in_group = membership == i
            y_true_group = y_true[in_group]
            y_pred_group = y_pred[in_group]
            rate_on_positives = _mean_or_nan(y_pred_group[y_true_group == 1])
            rate_on_negatives = _mean_or_nan(y_pred_group[y_true_group == 0])
            metrics[group + str(i)] = {
                'statistical_parity': _mean_or_nan(y_pred_group),
                'equalized_odds': rate_on_positives - rate_on_negatives,
                'equalized_outcome': rate_on_positives,
            }
    return metrics


def calculate_balanced_accuracy_groups(y_true, y_pred, groups):
    """
    Calculate balanced accuracy for all groups.

    For every column in ``groups`` and each of the values 0 and 1, the
    samples whose column value equals that value are selected and scored
    with ``calculate_metrics``.

    Returns a dict mapping ``"<column name><value>"`` to
    ``{'balanced_accuracy': <score>}``.
    """
    scores = {}
    for column in groups:
        membership = groups[column]
        for level in (0, 1):
            rows = np.where(membership == level)
            subgroup_metrics = calculate_metrics(y_true[rows], y_pred[rows])
            scores[column + str(level)] = {
                'balanced_accuracy': subgroup_metrics['accuracy'],
            }
    return scores

### with DecisionTreeClassifier


In [None]:
# Baseline model: a depth-4 decision tree fitted on the full training matrix.
dt = DecisionTreeClassifier(max_depth=4)
dt.fit(X_train, y_train)

# Predict on the held-out set and report every metric.
y_pred = dt.predict(X_test)
metric_scores = calculate_metrics(y_test, y_pred)
for description, metric_key in (
    ("Balanced accuracy on test set:", 'accuracy'),
    ("Precision on test set:", 'precision'),
    ("Recall on test set:", 'recall'),
    ("F1 score on test set:", 'f1_score'),
):
    print(description, metric_scores[metric_key])

In [None]:
# Calculate fairness metrics for all groups.
# The labels are first normalised to plain 0/1 integers, because
# calculate_fairness_metrics selects samples with `== 1` / `== 0`; the
# normalised array is what gets passed on (it used to be built but ignored).
y_test_ = np.array([1 if y else 0 for y in y_test])
fairness_metrics = calculate_fairness_metrics(y_test_, y_pred, group_test)
for key, value in fairness_metrics.items():
    print(key, ":",  value)

In [None]:
# Estimate the uncertainty of the balanced accuracy with a percentile bootstrap.
n_bootstraps = 1000
bootstrapped_scores = []
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = np.random.randint(low=0, high=len(y_pred), size=len(y_pred))
    if len(np.unique(y_test[indices])) < 2:
        # Balanced accuracy needs at least one positive and one negative
        # sample to be defined: reject the resample
        continue

    score = calculate_metrics(y_test[indices], y_pred[indices])['accuracy']  # balanced accuracy
    bootstrapped_scores.append(score)

# 2.5th / 97.5th percentiles give the 95% interval. Both bounds use the
# fixed-point spec `.3f`; a bare `.3` would print 3 significant digits.
print("Confidence interval for the accuracy score: [{:0.3f} - {:0.3f}]".format(
    np.percentile(bootstrapped_scores, 2.5),
    np.percentile(bootstrapped_scores, 97.5)))

## Fair PCA


In [None]:
# apply fair PCA

In [None]:
# run logistic regression on fair PCA data


In [None]:
# calculate fairness metrics and accuracy scores


In [None]:
# compare results amongst all datasets


## Loan defaulting


In [None]:
df_original = pd.read_csv('data/loan_default.csv')

# Dataset from https://www.kaggle.com/datasets/yasserh/loan-default-dataset?datasetId=1897041&sortBy=voteCount,
# Protected attribute is Gender
# Target is Status, 0 or 1 (default or not)

# Drop the columns that are excluded from the analysis
# NOTE(review): presumably identifiers and columns with many missing or
# near-constant values - confirm against the raw data.
df_loans = df_original.drop(['Region', 'Security_Type', 'dtir1', 'total_units', 'Secured_by',
                             'term', 'open_credit', 'year', 'rate_of_interest', 'Interest_rate_spread',
                             'Upfront_charges', 'loan_limit', 'construction_type',
                             'co-applicant_credit_type', 'ID'], axis=1)

# Keep only rows with a single, known gender: drop 'Sex Not Available' and
# 'Joint'. The explicit copy makes df_loans an independent frame, so the
# column assignments below do not act on a view of the unfiltered data.
df_loans = df_loans[~df_loans['Gender'].isin(['Sex Not Available', 'Joint'])].copy()

# Replace missing values with the column mode.
# Assign the filled column back instead of calling fillna(inplace=True) on
# `df_loans[col]`: that is chained assignment, which warns in recent pandas
# and leaves df_loans unchanged under Copy-on-Write.
mode_imputed_columns = ['approv_in_adv', 'loan_purpose', 'Neg_ammortization',
                        'property_value', 'income', 'LTV']
for column in mode_imputed_columns:
    df_loans[column] = df_loans[column].fillna(df_loans[column].mode()[0])

df_loans.hist(bins=30, figsize=(25, 25))
plt.show()


In [None]:
# Get the list of categorical (non-numeric) columns.
# The list is built in DataFrame column order rather than through a set
# difference: set iteration order for strings changes between interpreter
# runs, which would make the order of the one-hot encoded columns (and any
# positional indexing into them) non-reproducible.
# NOTE(review): `_get_numeric_data` is a private pandas method; consider the
# public `select_dtypes` once its equivalence for this data is confirmed.
num_cols = df_loans._get_numeric_data().columns
cat_cols = [column for column in df_loans.columns if column not in num_cols]


In [None]:
# Pairwise associations between all loan columns, with `cat_cols` treated as
# nominal. The trailing semicolon keeps the notebook from echoing the call's
# return value.
dython.nominal.associations(
    df_loans,
    nominal_columns=cat_cols,
    mark_columns=True,
    figsize=(12, 12),
);


In [None]:
# Plot the distribution of every column, split by gender, one subplot per
# column. The number of subplots follows the DataFrame instead of being
# hard-coded, so the cell keeps working when columns are added or removed.
fig, axes = plt.subplots(len(df_loans.columns), figsize=(20, 40))
for i, feature in enumerate(df_loans.columns):
    sns.histplot(data=df_loans, x=feature, hue='Gender',
                 ax=axes[i], palette='Set2')
plt.show()


In [None]:
# One-hot encode the categorical columns (first level of each is dropped).
dummies_df_loans = pd.get_dummies(
    df_loans, columns=cat_cols, drop_first=True)

# Separate the target column from the features.
labels = dummies_df_loans["Status"]
features = dummies_df_loans.drop(columns="Status")

# Collect (and print) the positions of the protected-attribute columns.
indices = []
for position, column_name in enumerate(features.columns):
    if "Gender" in column_name:
        print("Column ID: %s" % position, "(%s)" % column_name)
        indices.append(position)

print(indices)

groups = features.iloc[:, indices]

# 70/30 split; the group indicators are passed through the same split so they
# stay row-aligned with the features and labels.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features.values,
    labels.values.reshape(-1),
    groups,
    test_size=0.3,
    random_state=0,
    shuffle=True,
)


In [None]:
# Remove the protected (gender) column from X_train and X_test and save it as
# the protected variable. Its position is looked up by name instead of being
# hard-coded: it depends on the column order produced by the one-hot encoding.
gender_positions = [position
                    for position, column_name in enumerate(features.columns)
                    if "Gender" in column_name]
if len(gender_positions) != 1:
    raise ValueError(
        "expected exactly one 'Gender' column in features, found %d"
        % len(gender_positions))
protected_col = gender_positions[0]

protected = X_train[:, protected_col]
nonprotected = np.delete(X_train, protected_col, axis=1)

protected_test = X_test[:, protected_col]
nonprotected_test = np.delete(X_test, protected_col, axis=1)


In [None]:
# Baseline model: a depth-4 decision tree fitted on the full training matrix.
dt = DecisionTreeClassifier(max_depth=4)
dt.fit(X_train, y_train)

# Predict on the held-out set and report every metric.
y_pred = dt.predict(X_test)
metric_scores = calculate_metrics(y_test, y_pred)
for description, metric_key in (
    ("Balanced accuracy on test set:", 'accuracy'),
    ("Precision on test set:", 'precision'),
    ("Recall on test set:", 'recall'),
    ("F1 score on test set:", 'f1_score'),
):
    print(description, metric_scores[metric_key])

In [None]:
# Calculate fairness metrics for all groups.
# The labels are first normalised to plain 0/1 integers, because
# calculate_fairness_metrics selects samples with `== 1` / `== 0`; the
# normalised array is what gets passed on (it used to be built but ignored).
y_test_ = np.array([1 if y else 0 for y in y_test])
fairness_metrics = calculate_fairness_metrics(y_test_, y_pred, group_test)
for key, value in fairness_metrics.items():
    print(key, ":",  value)

In [None]:
# Estimate the uncertainty of the balanced accuracy with a percentile bootstrap.
n_bootstraps = 1000
bootstrapped_scores = []
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = np.random.randint(low=0, high=len(y_pred), size=len(y_pred))
    if len(np.unique(y_test[indices])) < 2:
        # Balanced accuracy needs at least one positive and one negative
        # sample to be defined: reject the resample
        continue

    score = calculate_metrics(y_test[indices], y_pred[indices])['accuracy']  # balanced accuracy
    bootstrapped_scores.append(score)

# 2.5th / 97.5th percentiles give the 95% interval. Both bounds use the
# fixed-point spec `.3f`; a bare `.3` would print 3 significant digits.
print("Confidence interval for the accuracy score: [{:0.3f} - {:0.3f}]".format(
    np.percentile(bootstrapped_scores, 2.5),
    np.percentile(bootstrapped_scores, 97.5)))