In [1]:
# import pandas and numpy

import pandas as pd
import numpy as np


# import sklearn

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


# fairlearn metrics

import fairlearn
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, demographic_parity_ratio, demographic_parity_difference
from fairlearn.metrics import false_negative_rate, equalized_odds_ratio, equalized_odds_difference


# fairlearn reductions

from fairlearn.reductions import DemographicParity, EqualizedOdds


# fairlearn postprocessing

from fairlearn.postprocessing import ThresholdOptimizer

In [3]:
# read the preprocessed pilot / non-pilot dataset from the Data folder,
# then preview its first 5 rows

data = pd.read_csv(filepath_or_buffer = "../Data/preprocessed_pilot_non-pilot_data.csv")
data.head(n = 5)

Unnamed: 0,Pilot,Age,Gender,PSS,JSS,MFI,GF,PF,RA,RM,MF
0,yes,21,male,0.464286,0.4,0.53,0.65,0.75,0.65,0.5,0.1
1,yes,19,female,0.410714,0.5,0.568421,0.5,0.933333,0.8,0.45,0.25
2,yes,21,male,0.357143,0.25,0.54,0.55,0.35,0.6,0.4,0.8
3,yes,19,male,0.160714,0.0,0.5,0.55,0.45,0.45,0.5,0.55
4,yes,18,female,0.392857,0.3,0.42,0.3,0.4,0.6,0.5,0.3


In [4]:
# count how many rows carry each label ("yes" / "no") in the Pilot column

data.loc[:, "Pilot"].value_counts()

Pilot
yes    28
no     20
Name: count, dtype: int64

In [5]:
# the two classes need the same number of rows for class label balance; there are fewer "no" rows
# than "yes" rows, so a random sample of "yes" rows is drawn whose size equals the number of "no" rows
# (20 in the value counts shown above) instead of hard-coding that count
# note: sample() raises a ValueError if there are ever more "no" rows than "yes" rows

yes = data[data["Pilot"] == "yes"]
n_no = int((data["Pilot"] == "no").sum())
yes_sample = yes.sample(n = n_no, random_state = 42)

In [6]:
# display the randomly sampled "yes" rows to check the draw
# (the rows keep the index labels they had in data)

yes_sample

Unnamed: 0,Pilot,Age,Gender,PSS,JSS,MFI,GF,PF,RA,RM,MF
9,yes,18,male,0.285714,0.0,0.53,0.75,0.65,0.35,0.45,0.45
25,yes,19,female,0.625,1.0,0.5,0.45,0.55,0.7,0.4,0.4
8,yes,20,female,0.321429,0.2,0.57,0.5,0.55,0.55,0.6,0.65
21,yes,23,female,0.375,0.6,0.43,0.15,0.35,0.6,0.55,0.5
0,yes,21,male,0.464286,0.4,0.53,0.65,0.75,0.65,0.5,0.1
12,yes,19,female,0.482143,0.4,0.32,0.25,0.45,0.2,0.25,0.45
17,yes,21,male,0.642857,0.4,0.3,0.1,0.2,0.5,0.4,0.3
22,yes,21,male,0.428571,0.25,0.58,0.5,0.65,0.45,0.6,0.7
11,yes,18,male,0.392857,0.15,0.46,0.45,0.2,0.6,0.55,0.5
13,yes,26,male,0.25,0.2,0.67,0.65,0.7,0.7,0.65,0.65


In [7]:
# keep every "no" row, then stack the sampled "yes" rows on top of them
# so that both classes live in a single dataframe

no = data.loc[data["Pilot"].eq("no")]
final_data = pd.concat(objs = [yes_sample, no], axis = 0)

In [8]:
# display the combined dataframe (sampled "yes" rows followed by the "no" rows)

final_data

Unnamed: 0,Pilot,Age,Gender,PSS,JSS,MFI,GF,PF,RA,RM,MF
9,yes,18,male,0.285714,0.0,0.53,0.75,0.65,0.35,0.45,0.45
25,yes,19,female,0.625,1.0,0.5,0.45,0.55,0.7,0.4,0.4
8,yes,20,female,0.321429,0.2,0.57,0.5,0.55,0.55,0.6,0.65
21,yes,23,female,0.375,0.6,0.43,0.15,0.35,0.6,0.55,0.5
0,yes,21,male,0.464286,0.4,0.53,0.65,0.75,0.65,0.5,0.1
12,yes,19,female,0.482143,0.4,0.32,0.25,0.45,0.2,0.25,0.45
17,yes,21,male,0.642857,0.4,0.3,0.1,0.2,0.5,0.4,0.3
22,yes,21,male,0.428571,0.25,0.58,0.5,0.65,0.45,0.6,0.7
11,yes,18,male,0.392857,0.15,0.46,0.45,0.2,0.6,0.55,0.5
13,yes,26,male,0.25,0.2,0.67,0.65,0.7,0.7,0.65,0.65


In [9]:
# one-hot encode the two categorical columns (Pilot and Gender) with pandas dummy variables;
# every other column is passed through unchanged

one_hot_encoded_data = pd.get_dummies(data = final_data, columns = ["Pilot", "Gender"])

In [10]:
# display the encoded dataframe: Pilot and Gender are each replaced by two boolean dummy columns
one_hot_encoded_data

Unnamed: 0,Age,PSS,JSS,MFI,GF,PF,RA,RM,MF,Pilot_no,Pilot_yes,Gender_female,Gender_male
9,18,0.285714,0.0,0.53,0.75,0.65,0.35,0.45,0.45,False,True,False,True
25,19,0.625,1.0,0.5,0.45,0.55,0.7,0.4,0.4,False,True,True,False
8,20,0.321429,0.2,0.57,0.5,0.55,0.55,0.6,0.65,False,True,True,False
21,23,0.375,0.6,0.43,0.15,0.35,0.6,0.55,0.5,False,True,True,False
0,21,0.464286,0.4,0.53,0.65,0.75,0.65,0.5,0.1,False,True,False,True
12,19,0.482143,0.4,0.32,0.25,0.45,0.2,0.25,0.45,False,True,True,False
17,21,0.642857,0.4,0.3,0.1,0.2,0.5,0.4,0.3,False,True,False,True
22,21,0.428571,0.25,0.58,0.5,0.65,0.45,0.6,0.7,False,True,False,True
11,18,0.392857,0.15,0.46,0.45,0.2,0.6,0.55,0.5,False,True,False,True
13,26,0.25,0.2,0.67,0.65,0.7,0.7,0.65,0.65,False,True,False,True


In [11]:
# each categorical column produced two complementary dummy columns; keep only one of each pair
# by selecting every column except Pilot_no and Gender_female

one_hot_encoded_data = one_hot_encoded_data.loc[
    :, [column not in ("Pilot_no", "Gender_female") for column in one_hot_encoded_data.columns]
]
one_hot_encoded_data

Unnamed: 0,Age,PSS,JSS,MFI,GF,PF,RA,RM,MF,Pilot_yes,Gender_male
9,18,0.285714,0.0,0.53,0.75,0.65,0.35,0.45,0.45,True,True
25,19,0.625,1.0,0.5,0.45,0.55,0.7,0.4,0.4,True,False
8,20,0.321429,0.2,0.57,0.5,0.55,0.55,0.6,0.65,True,False
21,23,0.375,0.6,0.43,0.15,0.35,0.6,0.55,0.5,True,False
0,21,0.464286,0.4,0.53,0.65,0.75,0.65,0.5,0.1,True,True
12,19,0.482143,0.4,0.32,0.25,0.45,0.2,0.25,0.45,True,False
17,21,0.642857,0.4,0.3,0.1,0.2,0.5,0.4,0.3,True,True
22,21,0.428571,0.25,0.58,0.5,0.65,0.45,0.6,0.7,True,True
11,18,0.392857,0.15,0.46,0.45,0.2,0.6,0.55,0.5,True,True
13,26,0.25,0.2,0.67,0.65,0.7,0.7,0.65,0.65,True,True


In [13]:
# give the remaining dummy columns their original names back
# for Pilot, True = Yes and False = No
# for Gender, True = Male and False = Female
# the renamed frame is assigned back instead of using inplace = True, so the frame that was produced
# by the earlier column selection is never mutated in place

one_hot_encoded_data = one_hot_encoded_data.rename(columns = {"Pilot_yes" : "Pilot", "Gender_male" : "Gender"})

In [14]:
# display the dataframe after renaming: the boolean columns are now called Pilot and Gender
one_hot_encoded_data

Unnamed: 0,Age,PSS,JSS,MFI,GF,PF,RA,RM,MF,Pilot,Gender
9,18,0.285714,0.0,0.53,0.75,0.65,0.35,0.45,0.45,True,True
25,19,0.625,1.0,0.5,0.45,0.55,0.7,0.4,0.4,True,False
8,20,0.321429,0.2,0.57,0.5,0.55,0.55,0.6,0.65,True,False
21,23,0.375,0.6,0.43,0.15,0.35,0.6,0.55,0.5,True,False
0,21,0.464286,0.4,0.53,0.65,0.75,0.65,0.5,0.1,True,True
12,19,0.482143,0.4,0.32,0.25,0.45,0.2,0.25,0.45,True,False
17,21,0.642857,0.4,0.3,0.1,0.2,0.5,0.4,0.3,True,True
22,21,0.428571,0.25,0.58,0.5,0.65,0.45,0.6,0.7,True,True
11,18,0.392857,0.15,0.46,0.45,0.2,0.6,0.55,0.5,True,True
13,26,0.25,0.2,0.67,0.65,0.7,0.7,0.65,0.65,True,True


In [15]:
# split the dataframe into model inputs and target
# X holds every column except Pilot (note that this includes Gender), y is the Pilot column

X = one_hot_encoded_data.loc[:, [column != "Pilot" for column in one_hot_encoded_data.columns]]

y = one_hot_encoded_data.loc[:, "Pilot"]

In [16]:
# function to perform grid search cross validation and determine the optimal hyperparameters for the decision tree
# the number of folds is chosen by the caller through cv (this notebook calls it with 7)
# best_params are the best parameters
# best_score is the mean cross-validated score of the best parameter combination

def grid_search(X, y, cv, random_state = None):
    """Tune a DecisionTreeClassifier with GridSearchCV and return the best estimator.

    The search covers criterion, max_depth (3 to 14), min_samples_split (2 to 5)
    and min_samples_leaf (2 to 5). The best parameters and the best score are
    printed before returning.

    Parameters
    ----------
    X : features passed to GridSearchCV.fit
    y : target passed to GridSearchCV.fit
    cv : cross-validation setting forwarded to GridSearchCV (e.g. the number of folds)
    random_state : optional seed forwarded to DecisionTreeClassifier. The default of
        None leaves the tree unseeded, exactly as before, so repeated calls may pick
        different trees; pass an int to make a call reproducible.

    Returns
    -------
    The best_estimator_ attribute of the fitted GridSearchCV object.
    """
    param_grid = {"criterion" : ["gini", "entropy"], "max_depth": np.arange(3, 15),
                  "min_samples_split": [2, 3, 4, 5], "min_samples_leaf": [2, 3, 4, 5]}

    # random_state = None matches the DecisionTreeClassifier default, so existing calls are unaffected
    decision_tree = DecisionTreeClassifier(random_state = random_state)

    grid_search_cv = GridSearchCV(decision_tree, param_grid, cv = cv)
    grid_search_cv.fit(X, y)

    print("Best Parameters: ", grid_search_cv.best_params_)
    print("Best Score: ", grid_search_cv.best_score_)

    # return best estimator to use for the decision tree
    return grid_search_cv.best_estimator_

In [24]:
# tune one decision tree and score it on X, y -- the same rows it was fitted on --
# so the two numbers printed below are in-sample accuracy, not held-out accuracy
# first print: fraction classified correctly (approximately 90% in the recorded run)
# second print: number classified correctly (approximately 36/40 samples in the recorded run)

best_estimator = grid_search(X, y, 7)
y_pred = best_estimator.predict(X)

print(accuracy_score(y_true = y, y_pred = y_pred, normalize = True))
print(accuracy_score(y_true = y, y_pred = y_pred, normalize = False))

Best Parameters:  {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Score:  0.7047619047619047
0.9
36


In [25]:
# accumulators for the metric values measured BEFORE the mitigation algorithm,
# one entry per iteration of the 30-iteration experiment
# for DEMOGRAPHIC PARITY the tracked metrics are the selection rate (per gender),
# the demographic parity ratio, and the demographic parity difference

female_sr_before, male_sr_before = [], []

dpr_before, dpd_before = [], []

In [26]:
# accumulators for the metric values measured AFTER the mitigation algorithm,
# one entry per iteration of the 30-iteration experiment
# for DEMOGRAPHIC PARITY the tracked metrics are the selection rate (per gender),
# the demographic parity ratio, and the demographic parity difference

female_sr_after, male_sr_after = [], []

dpr_after, dpd_after = [], []

In [27]:
# run the experiment 30 times; every iteration
#   - gets a tuned decision tree from grid search cv and its y_pred values
#   - records the metric values before the mitigation algorithm
#   - records the metric values after the mitigation algorithm
# metric values: selection rate, demographic parity ratio, demographic parity difference
# mitigation algorithm: threshold optimizer (use demographic_parity for constraint)
# note: predictions are made on X, the same rows the models are fitted on, so every metric is in-sample

N_ITERATIONS = 30

# empty the result lists first so that re-running this cell starts from scratch instead of
# appending a second batch of values to the ones left over from the previous run
for metric_values in (female_sr_before, male_sr_before, dpr_before, dpd_before,
                      female_sr_after, male_sr_after, dpr_after, dpd_after):
    metric_values.clear()

# inputs that do not change between iterations
# True = Male, False = Female
gender = X["Gender"]
metrics = {"Selection Rate" : selection_rate}

for i in range(1, N_ITERATIONS + 1):
    # get best estimator from grid search cv
    best_estimator = grid_search(X, y, 7)

    # get y_pred values
    y_pred = best_estimator.predict(X)

    print("ITERATION: ", i)

    # metrics before mitigation
    metric_frame = MetricFrame(metrics = metrics, y_true = y, y_pred = y_pred, sensitive_features = gender)

    # by_group is read by position: row 0 is taken as Female (False) and row 1 as Male (True)
    # NOTE(review): this relies on the groups being listed in sorted order -- confirm if the encoding changes
    female_sr_before.append(metric_frame.by_group["Selection Rate"].iloc[0])
    male_sr_before.append(metric_frame.by_group["Selection Rate"].iloc[1])

    dpr_before.append(demographic_parity_ratio(y_true = y, y_pred = y_pred,
                                               sensitive_features = gender,
                                               method = "between_groups"))
    dpd_before.append(demographic_parity_difference(y_true = y, y_pred = y_pred,
                                                    sensitive_features = gender,
                                                    method = "between_groups"))

    # threshold optimizer with demographic parity
    threshold_optimizer = ThresholdOptimizer(estimator = best_estimator, constraints = "demographic_parity",
                                             predict_method = "predict_proba", prefit = False)

    # fit the model and get y_pred values
    threshold_optimizer.fit(X, y, sensitive_features = gender)
    y_pred_optimized = threshold_optimizer.predict(X, sensitive_features = gender)

    # metrics after mitigation (same positional convention as above)
    metric_frame_optimized = MetricFrame(metrics = metrics, y_true = y, y_pred = y_pred_optimized,
                                         sensitive_features = gender)

    female_sr_after.append(metric_frame_optimized.by_group["Selection Rate"].iloc[0])
    male_sr_after.append(metric_frame_optimized.by_group["Selection Rate"].iloc[1])

    dpr_after.append(demographic_parity_ratio(y_true = y, y_pred = y_pred_optimized,
                                              sensitive_features = gender,
                                              method = "between_groups"))
    dpd_after.append(demographic_parity_difference(y_true = y, y_pred = y_pred_optimized,
                                                   sensitive_features = gender,
                                                   method = "between_groups"))

Best Parameters:  {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 3}
Best Score:  0.6809523809523809
ITERATION:  1
Best Parameters:  {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Score:  0.680952380952381
ITERATION:  2
Best Parameters:  {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Score:  0.7047619047619047
ITERATION:  3
Best Parameters:  {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 4}
Best Score:  0.6761904761904762
ITERATION:  4
Best Parameters:  {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Score:  0.7047619047619047
ITERATION:  5
Best Parameters:  {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 4}
Best Score:  0.6809523809523809
ITERATION:  6
Best Parameters:  {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best

In [28]:
# gather the per-iteration metric lists under their column titles (in display order)
# and turn them into one dataframe with a row per iteration

results = dict([
    ("Female Selection Rate Before", female_sr_before),
    ("Male Selection Rate Before", male_sr_before),
    ("Female Selection Rate After", female_sr_after),
    ("Male Selection Rate After", male_sr_after),
    ("Demographic Parity Ratio Before", dpr_before),
    ("Demographic Parity Ratio After", dpr_after),
    ("Demographic Parity Difference Before", dpd_before),
    ("Demographic Parity Difference After", dpd_after),
])

metric_results = pd.DataFrame(data = results)
metric_results

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Before,Demographic Parity Ratio After,Demographic Parity Difference Before,Demographic Parity Difference After
0,0.4,0.8,0.5,0.5,0.5,1.0,0.4,0.0
1,0.3,0.7,0.45,0.3,0.428571,0.666667,0.4,0.15
2,0.4,0.8,0.55,0.55,0.5,1.0,0.4,0.0
3,0.25,0.65,0.65,0.65,0.384615,1.0,0.4,0.0
4,0.3,0.7,0.45,0.45,0.428571,1.0,0.4,0.0
5,0.3,0.7,0.4,0.3,0.428571,0.75,0.4,0.1
6,0.3,0.75,0.45,0.45,0.4,1.0,0.45,0.0
7,0.4,0.8,0.3,0.4,0.5,0.75,0.4,0.1
8,0.4,0.8,0.35,0.4,0.5,0.875,0.4,0.05
9,0.3,0.65,0.4,0.35,0.461538,0.875,0.35,0.05


In [29]:
# take the mean of every metric column and lay the means out as a single-row dataframe

averages = metric_results.mean().to_frame().T
averages

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Before,Demographic Parity Ratio After,Demographic Parity Difference Before,Demographic Parity Difference After
0,0.358333,0.743333,0.456667,0.441667,0.480311,0.899823,0.385,0.045


In [30]:
# write both dataframes (per-iteration results and their averages) to csv files
# in the Data folder, leaving the index out of the files

metric_results.to_csv(path_or_buf = "../Data/demographic_parity_metric_results.csv", index = False)
averages.to_csv(path_or_buf = "../Data/demographic_parity_averages.csv", index = False)