In [1]:
# import pandas, numpy, and matplotlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# import sklearn

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix


# fairlearn metrics

import fairlearn
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, demographic_parity_ratio, demographic_parity_difference
from fairlearn.metrics import equalized_odds_ratio, equalized_odds_difference
from fairlearn.metrics import false_negative_rate, false_positive_rate


# fairlearn reductions

from fairlearn.reductions import DemographicParity, EqualizedOdds


# fairlearn postprocessing

from fairlearn.postprocessing import ThresholdOptimizer

In [2]:
# read the preprocessed CoughVID table from its relative path, then preview the first five rows

coughvid_csv = "../../preprocessed_coughvid_data.csv"
data = pd.read_csv(coughvid_csv)
data.head(n = 5)

Unnamed: 0,age,reported_gender,pcr_test_result_inferred,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
0,old,male,negative,-832.542,47.722702,-18.37968,31.09029,-16.050894,22.458717,-5.056018,...,0.914785,4.65387,-5.853877,-2.536242,1.582973,5.607704,1.397359,-1.465576,-1.254243,2.220975
1,young,male,negative,-1035.8025,70.914444,24.486706,39.24166,8.321938,-6.705366,3.596952,...,-2.712213,-4.592238,-7.682649,-9.517369,-4.125193,-2.689272,2.242155,6.358678,-0.130439,-2.016235
2,old,male,negative,-964.15137,4.123369,11.5602,13.998945,15.475512,9.942076,8.380241,...,-1.308795,-2.059448,-3.164607,0.26054,-3.896403,1.36875,-3.289319,1.358415,-0.305018,0.843219
3,young,male,negative,-1062.9386,64.8412,1.44924,-36.003242,-30.50861,-12.251176,-9.075456,...,1.255003,0.851643,1.888769,2.743327,3.055347,2.76582,1.751094,0.627919,0.001254,-0.341351
4,young,male,negative,-1081.5917,56.35535,32.41554,20.493723,18.165644,15.897766,9.830666,...,4.560422,5.526708,1.660786,-0.236842,0.213741,-0.044513,-0.322685,2.002674,3.273654,-0.833895


In [3]:
# count how many rows carry each label (positive / negative) in the pcr_test_result_inferred column

data["pcr_test_result_inferred"].value_counts()

pcr_test_result_inferred
negative    3628
positive     351
Name: count, dtype: int64

In [4]:
# there are fewer positive rows than negative rows, so the negatives are randomly down-sampled to the 
# number of positives (351 in this dataset) to balance the two class labels
# the sample size is derived from the data instead of being hard-coded, so the classes stay balanced 
# if the csv changes; random_state is fixed so the same rows are drawn on every run

n_positives = int((data["pcr_test_result_inferred"] == "positive").sum())

negatives = data[data["pcr_test_result_inferred"] == "negative"]
negatives_sample = negatives.sample(n = n_positives, random_state = 42)

In [5]:
# display the randomly sampled negative rows selected in the previous cell

negatives_sample

Unnamed: 0,age,reported_gender,pcr_test_result_inferred,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
651,young,female,negative,-804.52057,45.377100,-31.813538,42.266250,-17.959140,18.009663,-6.986963,...,-0.701048,6.424246,1.226897,-4.203454,3.776806,6.025266,-2.874931,1.349523,-2.373121,-2.181817
1997,young,male,negative,-1128.03060,4.701783,4.637610,4.536282,4.405311,4.253508,4.089700,...,0.075996,0.020223,-0.039056,-0.099452,-0.156535,-0.204818,-0.238855,-0.254334,-0.249006,-0.223185
2629,young,male,negative,-717.64360,144.797360,-92.674150,-8.846588,-6.478678,32.609436,-5.960134,...,-0.335263,-5.825594,-2.752297,9.036324,2.285958,2.308557,-4.228990,5.600593,-2.786079,-4.325424
3481,young,male,negative,-1069.29790,65.386406,40.854310,24.750729,16.177006,21.704578,18.544870,...,0.821477,0.514392,0.135383,-4.324539,-3.605980,1.527615,-2.622375,-0.064671,0.515271,-2.083465
2740,young,female,negative,-779.96450,67.735320,-42.256126,46.131350,-23.374468,64.034454,-15.859959,...,-0.616912,-15.159307,-6.337787,4.724842,-3.471275,3.111603,1.815155,0.767159,-7.081983,-8.282387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2481,old,male,negative,-1060.42970,67.301155,22.629314,10.356158,2.091388,-7.763188,-8.596008,...,-0.221658,-0.989659,-2.704707,1.433190,0.182784,-3.280553,0.983729,1.442970,-2.403072,-1.104054
2677,old,male,negative,-973.44763,56.432728,-25.203697,-8.190401,-22.668957,-2.653147,-14.151488,...,3.716814,7.788183,-13.340111,0.485227,-1.185603,-3.014235,-0.433026,3.928296,-1.173150,-6.413605
2146,young,female,negative,-964.34204,63.396470,-32.943695,10.561619,4.241620,32.009727,9.146317,...,-10.837431,-7.194852,-2.091639,-2.310403,3.245666,3.676422,-4.493384,-4.022187,2.465693,2.778678
1298,old,female,negative,-1125.76730,6.170935,2.837262,-1.377946,-5.522353,-6.865581,-6.309572,...,-0.496309,-0.869521,-0.381020,-0.428697,0.030043,0.651106,0.446867,0.735546,0.474544,-0.079790


In [6]:
# pull out every positive row, then stack the sampled negatives on top of the positives so that both 
# classes live in one dataframe (the original row indices are kept)

is_positive = data["pcr_test_result_inferred"] == "positive"
positives = data[is_positive]
final_data = pd.concat([negatives_sample, positives])

In [7]:
# display the combined dataframe (sampled negative rows first, followed by all positive rows)

final_data

Unnamed: 0,age,reported_gender,pcr_test_result_inferred,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
651,young,female,negative,-804.52057,45.377100,-31.813538,42.266250,-17.959140,18.009663,-6.986963,...,-0.701048,6.424246,1.226897,-4.203454,3.776806,6.025266,-2.874931,1.349523,-2.373121,-2.181817
1997,young,male,negative,-1128.03060,4.701783,4.637610,4.536282,4.405311,4.253508,4.089700,...,0.075996,0.020223,-0.039056,-0.099452,-0.156535,-0.204818,-0.238855,-0.254334,-0.249006,-0.223185
2629,young,male,negative,-717.64360,144.797360,-92.674150,-8.846588,-6.478678,32.609436,-5.960134,...,-0.335263,-5.825594,-2.752297,9.036324,2.285958,2.308557,-4.228990,5.600593,-2.786079,-4.325424
3481,young,male,negative,-1069.29790,65.386406,40.854310,24.750729,16.177006,21.704578,18.544870,...,0.821477,0.514392,0.135383,-4.324539,-3.605980,1.527615,-2.622375,-0.064671,0.515271,-2.083465
2740,young,female,negative,-779.96450,67.735320,-42.256126,46.131350,-23.374468,64.034454,-15.859959,...,-0.616912,-15.159307,-6.337787,4.724842,-3.471275,3.111603,1.815155,0.767159,-7.081983,-8.282387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916,young,male,positive,-825.06430,76.595960,11.107402,44.856045,25.264648,21.166021,25.613499,...,-8.914774,13.665854,-0.904290,-7.707289,-3.369097,5.814813,-7.910339,-9.233421,4.961718,-2.636617
3928,young,female,positive,-743.87320,163.496220,-9.669670,19.465970,-69.647280,-17.570038,-68.491590,...,-0.031399,-0.992662,-1.351948,-2.733991,1.361803,3.712362,5.674425,-2.668595,-7.222219,2.014791
3944,young,female,positive,-873.42487,68.492600,-6.219747,29.446487,-22.424980,-0.172477,-1.039133,...,-9.669401,-1.912023,2.587761,4.959102,7.186164,9.428362,-2.131184,-3.735090,0.440364,2.148107
3955,old,female,positive,-595.73150,86.894060,-40.133774,60.106552,-43.743233,47.801560,-38.907402,...,-3.292270,-2.790255,3.632284,-2.890401,2.294373,10.285553,5.016209,1.449564,0.146818,-1.475920


In [8]:
# one-hot encode the three categorical columns with pandas; every category becomes its own 
# column named <column>_<category>

categorical_columns = ["age", "reported_gender", "pcr_test_result_inferred"]
one_hot_encoded_data = pd.get_dummies(final_data, columns = categorical_columns)

In [9]:
# keep a single indicator column per categorical variable by discarding the complementary dummy 
# columns; names in the list that are not present in the dataframe are simply ignored
# NOTE(review): once reported_gender_other is dropped, a row whose reported gender was "other" has 
# reported_gender_male == False and can no longer be told apart from a female row - confirm that 
# the preprocessed data contains no such rows

redundant_dummies = ["age_young", 
                     "reported_gender_female", 
                     "reported_gender_other", 
                     "pcr_test_result_inferred_negative"]

keep_column = ~one_hot_encoded_data.columns.isin(redundant_dummies)
one_hot_encoded_data = one_hot_encoded_data.loc[:, keep_column]
one_hot_encoded_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,34,35,36,37,38,39,40,age_old,reported_gender_male,pcr_test_result_inferred_positive
651,-804.52057,45.377100,-31.813538,42.266250,-17.959140,18.009663,-6.986963,12.297134,13.349513,9.448143,...,-4.203454,3.776806,6.025266,-2.874931,1.349523,-2.373121,-2.181817,False,False,False
1997,-1128.03060,4.701783,4.637610,4.536282,4.405311,4.253508,4.089700,3.921380,3.753706,3.588913,...,-0.099452,-0.156535,-0.204818,-0.238855,-0.254334,-0.249006,-0.223185,False,True,False
2629,-717.64360,144.797360,-92.674150,-8.846588,-6.478678,32.609436,-5.960134,-5.321093,-15.653751,-2.557728,...,9.036324,2.285958,2.308557,-4.228990,5.600593,-2.786079,-4.325424,False,True,False
3481,-1069.29790,65.386406,40.854310,24.750729,16.177006,21.704578,18.544870,16.456585,6.457840,-3.852306,...,-4.324539,-3.605980,1.527615,-2.622375,-0.064671,0.515271,-2.083465,False,True,False
2740,-779.96450,67.735320,-42.256126,46.131350,-23.374468,64.034454,-15.859959,-15.263950,-20.909666,-2.185685,...,4.724842,-3.471275,3.111603,1.815155,0.767159,-7.081983,-8.282387,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916,-825.06430,76.595960,11.107402,44.856045,25.264648,21.166021,25.613499,17.175945,34.172028,39.736153,...,-7.707289,-3.369097,5.814813,-7.910339,-9.233421,4.961718,-2.636617,False,True,True
3928,-743.87320,163.496220,-9.669670,19.465970,-69.647280,-17.570038,-68.491590,-22.430347,-9.634733,-15.103171,...,-2.733991,1.361803,3.712362,5.674425,-2.668595,-7.222219,2.014791,False,False,True
3944,-873.42487,68.492600,-6.219747,29.446487,-22.424980,-0.172477,-1.039133,15.284597,-8.199100,4.415235,...,4.959102,7.186164,9.428362,-2.131184,-3.735090,0.440364,2.148107,False,False,True
3955,-595.73150,86.894060,-40.133774,60.106552,-43.743233,47.801560,-38.907402,24.900967,-13.270189,-3.497858,...,-2.890401,2.294373,10.285553,5.016209,1.449564,0.146818,-1.475920,True,False,True


In [10]:
# give the remaining indicator columns short names
# Age:    True = Old,                  False = Young
# Gender: True = Male,                 False = Female
# Covid:  True = Positive Test Result, False = Negative Test Result

dummy_to_label = {
    "age_old" : "Age",
    "reported_gender_male" : "Gender",
    "pcr_test_result_inferred_positive" : "Covid",
}

one_hot_encoded_data.rename(columns = dummy_to_label, inplace = True)

In [11]:
# display the dataframe after the indicator columns were renamed to Age, Gender, and Covid
one_hot_encoded_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,34,35,36,37,38,39,40,Age,Gender,Covid
651,-804.52057,45.377100,-31.813538,42.266250,-17.959140,18.009663,-6.986963,12.297134,13.349513,9.448143,...,-4.203454,3.776806,6.025266,-2.874931,1.349523,-2.373121,-2.181817,False,False,False
1997,-1128.03060,4.701783,4.637610,4.536282,4.405311,4.253508,4.089700,3.921380,3.753706,3.588913,...,-0.099452,-0.156535,-0.204818,-0.238855,-0.254334,-0.249006,-0.223185,False,True,False
2629,-717.64360,144.797360,-92.674150,-8.846588,-6.478678,32.609436,-5.960134,-5.321093,-15.653751,-2.557728,...,9.036324,2.285958,2.308557,-4.228990,5.600593,-2.786079,-4.325424,False,True,False
3481,-1069.29790,65.386406,40.854310,24.750729,16.177006,21.704578,18.544870,16.456585,6.457840,-3.852306,...,-4.324539,-3.605980,1.527615,-2.622375,-0.064671,0.515271,-2.083465,False,True,False
2740,-779.96450,67.735320,-42.256126,46.131350,-23.374468,64.034454,-15.859959,-15.263950,-20.909666,-2.185685,...,4.724842,-3.471275,3.111603,1.815155,0.767159,-7.081983,-8.282387,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916,-825.06430,76.595960,11.107402,44.856045,25.264648,21.166021,25.613499,17.175945,34.172028,39.736153,...,-7.707289,-3.369097,5.814813,-7.910339,-9.233421,4.961718,-2.636617,False,True,True
3928,-743.87320,163.496220,-9.669670,19.465970,-69.647280,-17.570038,-68.491590,-22.430347,-9.634733,-15.103171,...,-2.733991,1.361803,3.712362,5.674425,-2.668595,-7.222219,2.014791,False,False,True
3944,-873.42487,68.492600,-6.219747,29.446487,-22.424980,-0.172477,-1.039133,15.284597,-8.199100,4.415235,...,4.959102,7.186164,9.428362,-2.131184,-3.735090,0.440364,2.148107,False,False,True
3955,-595.73150,86.894060,-40.133774,60.106552,-43.743233,47.801560,-38.907402,24.900967,-13.270189,-3.497858,...,-2.890401,2.294373,10.285553,5.016209,1.449564,0.146818,-1.475920,True,False,True


In [12]:
# split the dataframe into the feature matrix X (every column except Covid, so Age and Gender 
# are included as features) and the target vector y (the Covid column)

is_feature = one_hot_encoded_data.columns != "Covid"
X = one_hot_encoded_data.loc[:, is_feature]

y = one_hot_encoded_data["Covid"]

In [13]:
def grid_search(X, y, cv, random_state = None):
    """Tune a decision tree with an exhaustive, cross-validated grid search.

    Every combination of criterion, max_depth (3 through 14), min_samples_split and 
    min_samples_leaf in the grid below is scored with GridSearchCV. No scoring argument is 
    passed, so GridSearchCV's default scoring is used. The best parameter combination and its 
    cross-validated score are printed.

    Parameters
    ----------
    X : feature matrix passed to GridSearchCV.fit
    y : target values passed to GridSearchCV.fit
    cv : cross validation setting forwarded to GridSearchCV (the notebook passes 5, i.e. 5 folds)
    random_state : optional seed forwarded to DecisionTreeClassifier; the default None keeps the 
        original unseeded behaviour, where repeated calls can select different trees. Pass an 
        int to make a call reproducible.

    Returns
    -------
    The best_estimator_ attribute of the fitted GridSearchCV, which the notebook uses as the 
    decision tree.
    """
    param_grid = {"criterion" : ["gini", "entropy"], "max_depth": np.arange(3, 15), 
                  "min_samples_split": [2, 3, 4, 5], "min_samples_leaf": [2, 3, 4, 5]}
    
    decision_tree = DecisionTreeClassifier(random_state = random_state)
    
    grid_search_cv = GridSearchCV(decision_tree, param_grid, cv = cv)
    grid_search_cv.fit(X, y)
    
    # best_params_ is the winning parameter combination, best_score_ is its cross-validated score
    print("Best Parameters: ", grid_search_cv.best_params_)
    print("Best Score: ", grid_search_cv.best_score_)
    
    # return best estimator to use for the decision tree
    return grid_search_cv.best_estimator_

In [14]:
# tune and fit the decision tree, then score it on the very same rows it was fit on
# NOTE: both printed numbers are therefore training accuracy (about 0.84, i.e. 589 of the 702 
# samples, in the saved run); the cross-validated "Best Score" printed by grid_search (about 0.58 
# in the saved run) is the number that was measured on held-out folds

best_estimator = grid_search(X, y, 5)
y_pred = best_estimator.predict(X)

fraction_correct = accuracy_score(y, y_pred, normalize = True)
count_correct = accuracy_score(y, y_pred, normalize = False)

print(fraction_correct)
print(count_correct)

Best Parameters:  {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best Score:  0.5826545086119554
0.8390313390313391
589


In [15]:
# lists to hold metric values for gender before mitigation algorithm for each of the 30 iterations
# for DEMOGRAPHIC PARITY, only looking at selection rate, demographic parity ratio, and demographic parity 
# difference
# (the accuracy / fnr / fpr / equalized odds lists that used to be parked here inside a string 
# literal were unused and have been removed)

dp_female_selection_rate_before = []
dp_male_selection_rate_before = []

dp_dpr_gender_before = []
dp_dpd_gender_before = []

In [16]:
# lists to hold metric values for gender after mitigation algorithm for each of the 30 iterations
# for DEMOGRAPHIC PARITY, only looking at selection rate, demographic parity ratio, and demographic parity 
# difference
# (the accuracy / fnr / fpr / equalized odds lists that used to be parked here inside a string 
# literal were unused and have been removed)

dp_female_selection_rate_after = []
dp_male_selection_rate_after = []

dp_dpr_gender_after = []
dp_dpd_gender_after = []

In [17]:
# lists to hold metric values for age before mitigation algorithm for each of the 30 iterations
# for DEMOGRAPHIC PARITY, only looking at selection rate, demographic parity ratio, and demographic parity 
# difference
# (the accuracy / fnr / fpr / equalized odds lists that used to be parked here inside a string 
# literal were unused and have been removed)

dp_young_selection_rate_before = []
dp_old_selection_rate_before = []

dp_dpr_age_before = []
dp_dpd_age_before = []

In [18]:
# lists to hold metric values for age after mitigation algorithm for each of the 30 iterations
# for DEMOGRAPHIC PARITY, only looking at selection rate, demographic parity ratio, and demographic parity 
# difference
# (the accuracy / fnr / fpr / equalized odds lists that used to be parked here inside a string 
# literal were unused and have been removed)

dp_young_selection_rate_after = []
dp_old_selection_rate_after = []

dp_dpr_age_after = []
dp_dpd_age_after = []

In [19]:
# repeat the tune / measure / mitigate / re-measure experiment N_ITERATIONS times
# each iteration:
#   1. grid_search tunes and fits a decision tree on (X, y) and y_pred is predicted on X
#   2. the metrics are recorded for y_pred ("before" mitigation)
#   3. a threshold optimizer (constraint: demographic_parity) is fit and predicts on X
#   4. the same metrics are recorded for the optimizer's predictions ("after" mitigation)
# steps 2-4 run first with Gender and then with Age as the sensitive feature
# metric values: selection rate, demographic parity ratio, demographic parity difference
# NOTE: every prediction is made on the same rows the models were fit on


N_ITERATIONS = 30


def _dp_summary(y_true, y_pred, sensitive):
    """Return the four demographic parity metrics for one set of predictions.

    The result is the tuple (selection rate of the first group, selection rate of the second 
    group, demographic parity ratio, demographic parity difference), where the groups are the 
    rows of MetricFrame.by_group for ``sensitive``.
    """
    frame = MetricFrame(metrics = {"Selection Rate" : selection_rate}, y_true = y_true, y_pred = y_pred, 
                        sensitive_features = sensitive)
    rates = frame.by_group["Selection Rate"]
    
    ratio = demographic_parity_ratio(y_true = y_true, y_pred = y_pred, 
                                     sensitive_features = sensitive, 
                                     method = "between_groups")
    difference = demographic_parity_difference(y_true = y_true, y_pred = y_pred, 
                                               sensitive_features = sensitive, 
                                               method = "between_groups")
    
    # positional lookup: assumes by_group lists the False group (Female / Young) first and the 
    # True group (Male / Old) second - TODO confirm against the by_group index
    return rates.iloc[0], rates.iloc[1], ratio, difference


def _dp_before_after(estimator, X, y, y_pred, sensitive):
    """Measure demographic parity before and after threshold optimization.

    ``y_pred`` holds the unmitigated predictions of ``estimator``. A ThresholdOptimizer with the 
    demographic_parity constraint is then fit on (X, y) and used to predict on X again. Returns 
    the pair (before, after), each a tuple in the order produced by _dp_summary.
    """
    before = _dp_summary(y, y_pred, sensitive)
    
    # prefit = False: the optimizer is asked to fit the estimator itself rather than reuse the 
    # fit that produced y_pred
    threshold_optimizer = ThresholdOptimizer(estimator = estimator, 
                                             constraints = "demographic_parity", 
                                             predict_method = "predict_proba", 
                                             prefit = False)
    threshold_optimizer.fit(X, y, sensitive_features = sensitive)
    y_pred_optimized = threshold_optimizer.predict(X, sensitive_features = sensitive)
    
    after = _dp_summary(y, y_pred_optimized, sensitive)
    return before, after


# result lists per sensitive feature, in the same order as the tuples returned by _dp_summary
# True = Male, False = Female
gender_lists_before = (dp_female_selection_rate_before, dp_male_selection_rate_before, 
                       dp_dpr_gender_before, dp_dpd_gender_before)
gender_lists_after = (dp_female_selection_rate_after, dp_male_selection_rate_after, 
                      dp_dpr_gender_after, dp_dpd_gender_after)

# True = Old and False = Young
age_lists_before = (dp_young_selection_rate_before, dp_old_selection_rate_before, 
                    dp_dpr_age_before, dp_dpd_age_before)
age_lists_after = (dp_young_selection_rate_after, dp_old_selection_rate_after, 
                   dp_dpr_age_after, dp_dpd_age_after)

# start from empty lists so that re-running this cell replaces the previous results instead of 
# appending a second set of rows to them
for results in gender_lists_before + gender_lists_after + age_lists_before + age_lists_after:
    results.clear()


for i in range(1, N_ITERATIONS + 1):
    # get best estimator from grid search cv
    best_estimator = grid_search(X, y, 5)
    
    # get y_pred values
    y_pred = best_estimator.predict(X)
    
    print("ITERATION: ", i)
    
    # gender is evaluated first and age second, both against the same estimator and y_pred
    for column, lists_before, lists_after in (("Gender", gender_lists_before, gender_lists_after), 
                                              ("Age", age_lists_before, age_lists_after)):
        before, after = _dp_before_after(best_estimator, X, y, y_pred, X[column])
        
        for results, value in zip(lists_before, before):
            results.append(value)
        for results, value in zip(lists_after, after):
            results.append(value)

Best Parameters:  {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 5}
Best Score:  0.5840020263424519
ITERATION:  1
Best Parameters:  {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best Score:  0.5825633232016212
ITERATION:  2
Best Parameters:  {'criterion': 'entropy', 'max_depth': 11, 'min_samples_leaf': 3, 'min_samples_split': 5}
Best Score:  0.5769807497467072
ITERATION:  3
Best Parameters:  {'criterion': 'entropy', 'max_depth': 13, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best Score:  0.5798176291793313
ITERATION:  4
Best Parameters:  {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 4}
Best Score:  0.5825835866261397
ITERATION:  5
Best Parameters:  {'criterion': 'entropy', 'max_depth': 14, 'min_samples_leaf': 3, 'min_samples_split': 3}
Best Score:  0.5812158054711246
ITERATION:  6
Best Parameters:  {'criterion': 'entropy', 'max_depth': 14, 'min_samples_leaf': 3, 'm

In [20]:
# collect the per-iteration gender metrics into one dataframe: one row per iteration, one column 
# per metric, columns in the order listed below

dp_results_gender = dict([
    ("Female Selection Rate Before", dp_female_selection_rate_before),
    ("Male Selection Rate Before", dp_male_selection_rate_before),
    ("Female Selection Rate After", dp_female_selection_rate_after),
    ("Male Selection Rate After", dp_male_selection_rate_after),
    ("Demographic Parity Ratio Gender Before", dp_dpr_gender_before),
    ("Demographic Parity Ratio Gender After", dp_dpr_gender_after),
    ("Demographic Parity Difference Gender Before", dp_dpd_gender_before),
    ("Demographic Parity Difference Gender After", dp_dpd_gender_after),
])

dp_metric_results_gender = pd.DataFrame(dp_results_gender)
dp_metric_results_gender

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Gender Before,Demographic Parity Ratio Gender After,Demographic Parity Difference Gender Before,Demographic Parity Difference Gender After
0,0.342308,0.248869,0.357692,0.355204,0.727032,0.993042,0.093439,0.002489
1,0.338462,0.251131,0.346154,0.330317,0.741979,0.954248,0.08733,0.015837
2,0.426923,0.321267,0.434615,0.436652,0.752517,0.995337,0.105656,0.002036
3,0.503846,0.402715,0.430769,0.418552,0.799282,0.971639,0.101131,0.012217
4,0.346154,0.246606,0.357692,0.359729,0.712418,0.99434,0.099548,0.002036
5,0.496154,0.400452,0.426923,0.432127,0.807114,0.987958,0.095701,0.005204
6,0.507692,0.404977,0.415385,0.409502,0.797683,0.985839,0.102715,0.005882
7,0.515385,0.39819,0.419231,0.418552,0.772608,0.998381,0.117195,0.000679
8,0.496154,0.400452,0.407692,0.41629,0.807114,0.979348,0.095701,0.008597
9,0.342308,0.248869,0.353846,0.346154,0.727032,0.978261,0.093439,0.007692


In [21]:
# average every gender metric over the iterations and lay the means out as a single-row dataframe

gender_means = dp_metric_results_gender.mean()
dp_averages_gender = gender_means.to_frame().T
dp_averages_gender

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Gender Before,Demographic Parity Ratio Gender After,Demographic Parity Difference Gender Before,Demographic Parity Difference Gender After
0,0.414615,0.31448,0.387436,0.386878,0.752724,0.98191,0.100136,0.006983


In [22]:
# collect the per-iteration age metrics into one dataframe: one row per iteration, one column 
# per metric, columns in the order listed below

dp_results_age = dict([
    ("Young Selection Rate Before", dp_young_selection_rate_before),
    ("Old Selection Rate Before", dp_old_selection_rate_before),
    ("Young Selection Rate After", dp_young_selection_rate_after),
    ("Old Selection Rate After", dp_old_selection_rate_after),
    ("Demographic Parity Ratio Age Before", dp_dpr_age_before),
    ("Demographic Parity Ratio Age After", dp_dpr_age_after),
    ("Demographic Parity Difference Age Before", dp_dpd_age_before),
    ("Demographic Parity Difference Age After", dp_dpd_age_after),
])

dp_metric_results_age = pd.DataFrame(dp_results_age)
dp_metric_results_age

Unnamed: 0,Young Selection Rate Before,Old Selection Rate Before,Young Selection Rate After,Old Selection Rate After,Demographic Parity Ratio Age Before,Demographic Parity Ratio Age After,Demographic Parity Difference Age Before,Demographic Parity Difference Age After
0,0.377551,0.164516,0.382653,0.377419,0.435745,0.986323,0.213035,0.005234
1,0.380102,0.16129,0.382653,0.390323,0.424334,0.980351,0.218812,0.00767
2,0.484694,0.203226,0.482143,0.474194,0.419287,0.983513,0.281468,0.007949
3,0.612245,0.222581,0.466837,0.5,0.363548,0.933673,0.389664,0.033163
4,0.382653,0.158065,0.382653,0.36129,0.413075,0.944172,0.224589,0.021363
5,0.584184,0.248387,0.471939,0.493548,0.425187,0.956216,0.335797,0.02161
6,0.596939,0.248387,0.47449,0.506452,0.416101,0.936891,0.348552,0.031962
7,0.596939,0.245161,0.469388,0.441935,0.410698,0.941515,0.351777,0.027452
8,0.602041,0.225806,0.459184,0.454839,0.375068,0.990538,0.376234,0.004345
9,0.377551,0.164516,0.380102,0.387097,0.435745,0.98193,0.213035,0.006995


In [23]:
# average every age metric over the iterations and lay the means out as a single-row dataframe

age_means = dp_metric_results_age.mean()
dp_averages_age = age_means.to_frame().T
dp_averages_age

Unnamed: 0,Young Selection Rate Before,Old Selection Rate Before,Young Selection Rate After,Old Selection Rate After,Demographic Parity Ratio Age Before,Demographic Parity Ratio Age After,Demographic Parity Difference Age Before,Demographic Parity Difference Age After
0,0.474405,0.196237,0.428912,0.433656,0.416671,0.963838,0.278168,0.016357


In [24]:
# write the per-iteration results and the averages, for gender and for age, to csv files in the 
# working directory; the row index is left out of every file

csv_exports = [
    (dp_metric_results_gender, "demographic_parity_metric_results_by_gender_coughvid_data.csv"),
    (dp_averages_gender, "demographic_parity_averages_for_gender_coughvid_data.csv"),
    (dp_metric_results_age, "demographic_parity_metric_results_by_age_coughvid_data.csv"),
    (dp_averages_age, "demographic_parity_averages_for_age_coughvid_data.csv"),
]

for frame, csv_name in csv_exports:
    frame.to_csv(csv_name, index = False)