In [1]:
# import pandas and numpy

import pandas as pd
import numpy as np


# import sklearn

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


# fairlearn metrics

from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, demographic_parity_ratio, demographic_parity_difference
from fairlearn.metrics import false_negative_rate, equalized_odds_ratio, equalized_odds_difference


# fairlearn reductions

from fairlearn.reductions import DemographicParity, EqualizedOdds


# fairlearn postprocessing

from fairlearn.postprocessing import ThresholdOptimizer

In [2]:
# Read the preprocessed CoughVID table from its CSV file and preview the
# first five rows.

data = pd.read_csv(filepath_or_buffer = "../../preprocessed_coughvid_data.csv")
data.head(n = 5)

Unnamed: 0,age,reported_gender,pcr_test_result_inferred,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
0,old,male,negative,-832.542,47.722702,-18.37968,31.09029,-16.050894,22.458717,-5.056018,...,0.914785,4.65387,-5.853877,-2.536242,1.582973,5.607704,1.397359,-1.465576,-1.254243,2.220975
1,young,male,negative,-1035.8025,70.914444,24.486706,39.24166,8.321938,-6.705366,3.596952,...,-2.712213,-4.592238,-7.682649,-9.517369,-4.125193,-2.689272,2.242155,6.358678,-0.130439,-2.016235
2,old,male,negative,-964.15137,4.123369,11.5602,13.998945,15.475512,9.942076,8.380241,...,-1.308795,-2.059448,-3.164607,0.26054,-3.896403,1.36875,-3.289319,1.358415,-0.305018,0.843219
3,young,male,negative,-1062.9386,64.8412,1.44924,-36.003242,-30.50861,-12.251176,-9.075456,...,1.255003,0.851643,1.888769,2.743327,3.055347,2.76582,1.751094,0.627919,0.001254,-0.341351
4,young,male,negative,-1081.5917,56.35535,32.41554,20.493723,18.165644,15.897766,9.830666,...,4.560422,5.526708,1.660786,-0.236842,0.213741,-0.044513,-0.322685,2.002674,3.273654,-0.833895


In [3]:
# Count how many rows carry each PCR test label (positive vs. negative) to
# see how imbalanced the classes are.

data["pcr_test_result_inferred"].value_counts()

pcr_test_result_inferred
negative    3628
positive     351
Name: count, dtype: int64

In [4]:
# Balance the class labels by undersampling the majority ("negative") class:
# draw exactly as many negative rows as there are positive rows. The sample
# size is derived from the data (351 positives in the current CSV) rather than
# hard-coded, so the classes stay balanced if the input file changes.
# random_state = 42 keeps the draw reproducible.

label_column = "pcr_test_result_inferred"
n_positives = int((data[label_column] == "positive").sum())

negatives = data[data[label_column] == "negative"]
negatives_sample = negatives.sample(n = n_positives, random_state = 42)

In [5]:
# Display the random sample of negative rows drawn in the previous cell
# (the notebook renders a truncated preview of the frame).

negatives_sample

Unnamed: 0,age,reported_gender,pcr_test_result_inferred,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
651,young,female,negative,-804.52057,45.377100,-31.813538,42.266250,-17.959140,18.009663,-6.986963,...,-0.701048,6.424246,1.226897,-4.203454,3.776806,6.025266,-2.874931,1.349523,-2.373121,-2.181817
1997,young,male,negative,-1128.03060,4.701783,4.637610,4.536282,4.405311,4.253508,4.089700,...,0.075996,0.020223,-0.039056,-0.099452,-0.156535,-0.204818,-0.238855,-0.254334,-0.249006,-0.223185
2629,young,male,negative,-717.64360,144.797360,-92.674150,-8.846588,-6.478678,32.609436,-5.960134,...,-0.335263,-5.825594,-2.752297,9.036324,2.285958,2.308557,-4.228990,5.600593,-2.786079,-4.325424
3481,young,male,negative,-1069.29790,65.386406,40.854310,24.750729,16.177006,21.704578,18.544870,...,0.821477,0.514392,0.135383,-4.324539,-3.605980,1.527615,-2.622375,-0.064671,0.515271,-2.083465
2740,young,female,negative,-779.96450,67.735320,-42.256126,46.131350,-23.374468,64.034454,-15.859959,...,-0.616912,-15.159307,-6.337787,4.724842,-3.471275,3.111603,1.815155,0.767159,-7.081983,-8.282387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2481,old,male,negative,-1060.42970,67.301155,22.629314,10.356158,2.091388,-7.763188,-8.596008,...,-0.221658,-0.989659,-2.704707,1.433190,0.182784,-3.280553,0.983729,1.442970,-2.403072,-1.104054
2677,old,male,negative,-973.44763,56.432728,-25.203697,-8.190401,-22.668957,-2.653147,-14.151488,...,3.716814,7.788183,-13.340111,0.485227,-1.185603,-3.014235,-0.433026,3.928296,-1.173150,-6.413605
2146,young,female,negative,-964.34204,63.396470,-32.943695,10.561619,4.241620,32.009727,9.146317,...,-10.837431,-7.194852,-2.091639,-2.310403,3.245666,3.676422,-4.493384,-4.022187,2.465693,2.778678
1298,old,female,negative,-1125.76730,6.170935,2.837262,-1.377946,-5.522353,-6.865581,-6.309572,...,-0.496309,-0.869521,-0.381020,-0.428697,0.030043,0.651106,0.446867,0.735546,0.474544,-0.079790


In [6]:
# Select every positive row, then stack the sampled negatives on top of the
# positives to form a single, class-balanced dataframe. The original row
# index labels are kept.

positives = data.loc[data["pcr_test_result_inferred"] == "positive"]
final_data = pd.concat(objs = [negatives_sample, positives], axis = 0)

In [7]:
# Display the combined dataframe: the sampled negative rows followed by all
# positive rows.

final_data

Unnamed: 0,age,reported_gender,pcr_test_result_inferred,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
651,young,female,negative,-804.52057,45.377100,-31.813538,42.266250,-17.959140,18.009663,-6.986963,...,-0.701048,6.424246,1.226897,-4.203454,3.776806,6.025266,-2.874931,1.349523,-2.373121,-2.181817
1997,young,male,negative,-1128.03060,4.701783,4.637610,4.536282,4.405311,4.253508,4.089700,...,0.075996,0.020223,-0.039056,-0.099452,-0.156535,-0.204818,-0.238855,-0.254334,-0.249006,-0.223185
2629,young,male,negative,-717.64360,144.797360,-92.674150,-8.846588,-6.478678,32.609436,-5.960134,...,-0.335263,-5.825594,-2.752297,9.036324,2.285958,2.308557,-4.228990,5.600593,-2.786079,-4.325424
3481,young,male,negative,-1069.29790,65.386406,40.854310,24.750729,16.177006,21.704578,18.544870,...,0.821477,0.514392,0.135383,-4.324539,-3.605980,1.527615,-2.622375,-0.064671,0.515271,-2.083465
2740,young,female,negative,-779.96450,67.735320,-42.256126,46.131350,-23.374468,64.034454,-15.859959,...,-0.616912,-15.159307,-6.337787,4.724842,-3.471275,3.111603,1.815155,0.767159,-7.081983,-8.282387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916,young,male,positive,-825.06430,76.595960,11.107402,44.856045,25.264648,21.166021,25.613499,...,-8.914774,13.665854,-0.904290,-7.707289,-3.369097,5.814813,-7.910339,-9.233421,4.961718,-2.636617
3928,young,female,positive,-743.87320,163.496220,-9.669670,19.465970,-69.647280,-17.570038,-68.491590,...,-0.031399,-0.992662,-1.351948,-2.733991,1.361803,3.712362,5.674425,-2.668595,-7.222219,2.014791
3944,young,female,positive,-873.42487,68.492600,-6.219747,29.446487,-22.424980,-0.172477,-1.039133,...,-9.669401,-1.912023,2.587761,4.959102,7.186164,9.428362,-2.131184,-3.735090,0.440364,2.148107
3955,old,female,positive,-595.73150,86.894060,-40.133774,60.106552,-43.743233,47.801560,-38.907402,...,-3.292270,-2.790255,3.632284,-2.890401,2.294373,10.285553,5.016209,1.449564,0.146818,-1.475920


In [8]:
# One-hot encode the three categorical columns with pandas; every category
# value becomes its own indicator column named "<column>_<value>".

categorical_columns = ["age", "reported_gender", "pcr_test_result_inferred"]
one_hot_encoded_data = pd.get_dummies(final_data, columns = categorical_columns)

In [9]:
# Keep a single indicator column per attribute (age_old, reported_gender_male,
# pcr_test_result_inferred_positive) and discard the other dummy columns.
# Names in the list that are not present in the frame are silently skipped.
# NOTE(review): "reported_gender_other" is discarded too, so a row with that
# value would have reported_gender_male == False and be indistinguishable from
# a female row -- confirm whether such rows exist in the data.

redundant_dummy_columns = [
    "age_young",
    "reported_gender_female",
    "reported_gender_other",
    "pcr_test_result_inferred_negative",
]
one_hot_encoded_data = one_hot_encoded_data.drop(columns = redundant_dummy_columns,
                                                 errors = "ignore")
one_hot_encoded_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,34,35,36,37,38,39,40,age_old,reported_gender_male,pcr_test_result_inferred_positive
651,-804.52057,45.377100,-31.813538,42.266250,-17.959140,18.009663,-6.986963,12.297134,13.349513,9.448143,...,-4.203454,3.776806,6.025266,-2.874931,1.349523,-2.373121,-2.181817,False,False,False
1997,-1128.03060,4.701783,4.637610,4.536282,4.405311,4.253508,4.089700,3.921380,3.753706,3.588913,...,-0.099452,-0.156535,-0.204818,-0.238855,-0.254334,-0.249006,-0.223185,False,True,False
2629,-717.64360,144.797360,-92.674150,-8.846588,-6.478678,32.609436,-5.960134,-5.321093,-15.653751,-2.557728,...,9.036324,2.285958,2.308557,-4.228990,5.600593,-2.786079,-4.325424,False,True,False
3481,-1069.29790,65.386406,40.854310,24.750729,16.177006,21.704578,18.544870,16.456585,6.457840,-3.852306,...,-4.324539,-3.605980,1.527615,-2.622375,-0.064671,0.515271,-2.083465,False,True,False
2740,-779.96450,67.735320,-42.256126,46.131350,-23.374468,64.034454,-15.859959,-15.263950,-20.909666,-2.185685,...,4.724842,-3.471275,3.111603,1.815155,0.767159,-7.081983,-8.282387,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916,-825.06430,76.595960,11.107402,44.856045,25.264648,21.166021,25.613499,17.175945,34.172028,39.736153,...,-7.707289,-3.369097,5.814813,-7.910339,-9.233421,4.961718,-2.636617,False,True,True
3928,-743.87320,163.496220,-9.669670,19.465970,-69.647280,-17.570038,-68.491590,-22.430347,-9.634733,-15.103171,...,-2.733991,1.361803,3.712362,5.674425,-2.668595,-7.222219,2.014791,False,False,True
3944,-873.42487,68.492600,-6.219747,29.446487,-22.424980,-0.172477,-1.039133,15.284597,-8.199100,4.415235,...,4.959102,7.186164,9.428362,-2.131184,-3.735090,0.440364,2.148107,False,False,True
3955,-595.73150,86.894060,-40.133774,60.106552,-43.743233,47.801560,-38.907402,24.900967,-13.270189,-3.497858,...,-2.890401,2.294373,10.285553,5.016209,1.449564,0.146818,-1.475920,True,False,True


In [10]:
# Give the three indicator columns short names. Meaning of the boolean values:
#   Age:    True = Old,                  False = Young
#   Gender: True = Male,                 False = Female
#   Covid:  True = Positive test result, False = Negative test result

short_column_names = {
    "age_old" : "Age",
    "reported_gender_male" : "Gender",
    "pcr_test_result_inferred_positive" : "Covid",
}
one_hot_encoded_data = one_hot_encoded_data.rename(columns = short_column_names)

In [11]:
# Display the frame again to check the renamed Age / Gender / Covid columns.
one_hot_encoded_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,34,35,36,37,38,39,40,Age,Gender,Covid
651,-804.52057,45.377100,-31.813538,42.266250,-17.959140,18.009663,-6.986963,12.297134,13.349513,9.448143,...,-4.203454,3.776806,6.025266,-2.874931,1.349523,-2.373121,-2.181817,False,False,False
1997,-1128.03060,4.701783,4.637610,4.536282,4.405311,4.253508,4.089700,3.921380,3.753706,3.588913,...,-0.099452,-0.156535,-0.204818,-0.238855,-0.254334,-0.249006,-0.223185,False,True,False
2629,-717.64360,144.797360,-92.674150,-8.846588,-6.478678,32.609436,-5.960134,-5.321093,-15.653751,-2.557728,...,9.036324,2.285958,2.308557,-4.228990,5.600593,-2.786079,-4.325424,False,True,False
3481,-1069.29790,65.386406,40.854310,24.750729,16.177006,21.704578,18.544870,16.456585,6.457840,-3.852306,...,-4.324539,-3.605980,1.527615,-2.622375,-0.064671,0.515271,-2.083465,False,True,False
2740,-779.96450,67.735320,-42.256126,46.131350,-23.374468,64.034454,-15.859959,-15.263950,-20.909666,-2.185685,...,4.724842,-3.471275,3.111603,1.815155,0.767159,-7.081983,-8.282387,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916,-825.06430,76.595960,11.107402,44.856045,25.264648,21.166021,25.613499,17.175945,34.172028,39.736153,...,-7.707289,-3.369097,5.814813,-7.910339,-9.233421,4.961718,-2.636617,False,True,True
3928,-743.87320,163.496220,-9.669670,19.465970,-69.647280,-17.570038,-68.491590,-22.430347,-9.634733,-15.103171,...,-2.733991,1.361803,3.712362,5.674425,-2.668595,-7.222219,2.014791,False,False,True
3944,-873.42487,68.492600,-6.219747,29.446487,-22.424980,-0.172477,-1.039133,15.284597,-8.199100,4.415235,...,4.959102,7.186164,9.428362,-2.131184,-3.735090,0.440364,2.148107,False,False,True
3955,-595.73150,86.894060,-40.133774,60.106552,-43.743233,47.801560,-38.907402,24.900967,-13.270189,-3.497858,...,-2.890401,2.294373,10.285553,5.016209,1.449564,0.146818,-1.475920,True,False,True


In [12]:
# Split the frame into the feature matrix X (every column except "Covid",
# which means the Age and Gender indicators are part of the features) and the
# target vector y (the "Covid" column).

X = one_hot_encoded_data.drop(columns = "Covid")

y = one_hot_encoded_data.loc[:, "Covid"]

In [13]:
# grid search with cross validation to pick the decision tree hyperparameters

def grid_search(X, y, cv, random_state = None):
    """Tune a DecisionTreeClassifier with GridSearchCV and return the best fitted tree.

    Prints the best hyperparameter combination and its mean cross-validated
    score, then returns the corresponding estimator.

    Parameters
    ----------
    X : features used to fit the trees.
    y : target labels.
    cv : cross-validation setting forwarded to GridSearchCV (the notebook
        passes 5, i.e. five folds).
    random_state : optional seed forwarded to DecisionTreeClassifier. The
        default (None) leaves the tree unseeded, exactly as before, so repeated
        calls may return different trees; pass an int for a reproducible run.

    Returns
    -------
    The `best_estimator_` of the fitted GridSearchCV.
    """
    # NOTE(review): scikit-learn only splits a node when both children keep at
    # least min_samples_leaf samples, so with min_samples_leaf >= 2 most of the
    # min_samples_split values below should produce identical trees. The grid
    # is kept unchanged so the reported "Best Parameters" stay comparable with
    # earlier runs -- confirm before pruning it.
    param_grid = {"criterion" : ["gini", "entropy"], "max_depth": np.arange(5, 15),
                  "min_samples_split": [2, 3, 4, 5], "min_samples_leaf": [2, 3, 4, 5]}

    decision_tree = DecisionTreeClassifier(random_state = random_state)

    grid_search_cv = GridSearchCV(decision_tree, param_grid, cv = cv)
    grid_search_cv.fit(X, y)

    print("Best Parameters: ", grid_search_cv.best_params_)
    print("Best Score: ", grid_search_cv.best_score_)

    # return best estimator to use for the decision tree
    return grid_search_cv.best_estimator_

In [14]:
# Accuracy of the tuned decision tree, measured on the same rows (X, y) that
# were passed to the grid search: approximately 75%, i.e. about 532 of 702
# samples classified correctly. The cross-validated "Best Score" printed by
# grid_search (about 0.58 in the recorded run) is noticeably lower.

best_estimator = grid_search(X, y, cv = 5)
y_pred = best_estimator.predict(X)

# first the fraction of correct predictions, then the raw count
for as_fraction in (True, False):
    print(accuracy_score(y, y_pred, normalize = as_fraction))

Best Parameters:  {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 3}
Best Score:  0.5840020263424519
0.7578347578347578
532


In [15]:
# Per-iteration metric values for GENDER, BEFORE the mitigation algorithm
# (one entry is appended per iteration of the experiment loop).
# For EQUALIZED ODDS only the false negative rate, the equalized odds ratio and
# the equalized odds difference are tracked; accuracy, selection rate, false
# positive rate and the demographic parity metrics are no longer collected.

# false negative rate per group
eo_female_fnr_before, eo_male_fnr_before = [], []

# equalized odds ratio / equalized odds difference
eo_eor_gender_before, eo_eod_gender_before = [], []

In [16]:
# Per-iteration metric values for GENDER, AFTER the mitigation algorithm
# (one entry is appended per iteration of the experiment loop).
# For EQUALIZED ODDS only the false negative rate, the equalized odds ratio and
# the equalized odds difference are tracked; accuracy, selection rate, false
# positive rate and the demographic parity metrics are no longer collected.

# false negative rate per group
eo_female_fnr_after, eo_male_fnr_after = [], []

# equalized odds ratio / equalized odds difference
eo_eor_gender_after, eo_eod_gender_after = [], []

In [17]:
# Per-iteration metric values for AGE, BEFORE the mitigation algorithm
# (one entry is appended per iteration of the experiment loop).
# For EQUALIZED ODDS only the false negative rate, the equalized odds ratio and
# the equalized odds difference are tracked; accuracy, selection rate, false
# positive rate and the demographic parity metrics are no longer collected.

# false negative rate per group
eo_young_fnr_before, eo_old_fnr_before = [], []

# equalized odds ratio / equalized odds difference
eo_eor_age_before, eo_eod_age_before = [], []

In [18]:
# Per-iteration metric values for AGE, AFTER the mitigation algorithm
# (one entry is appended per iteration of the experiment loop).
# For EQUALIZED ODDS only the false negative rate, the equalized odds ratio and
# the equalized odds difference are tracked; accuracy, selection rate, false
# positive rate and the demographic parity metrics are no longer collected.

# false negative rate per group
eo_young_fnr_after, eo_old_fnr_after = [], []

# equalized odds ratio / equalized odds difference
eo_eor_age_after, eo_eod_age_after = [], []

In [19]:
# Repeat the equalized-odds experiment N_ITERATIONS times. Each iteration:
#   1. re-runs the grid search to obtain a freshly fitted decision tree,
#   2. predicts on X (the same rows the tree was fitted on),
#   3. records the metric values BEFORE the mitigation algorithm,
#   4. fits a threshold optimizer (constraint: equalized_odds) per sensitive
#      feature and records the metric values AFTER the mitigation algorithm.
# metric values: false negative rate, equalized odds ratio, equalized odds difference
# mitigation algorithm: threshold optimizer (use equalized_odds for constraint)
#
# NOTE: this cell appends to the eo_* lists created in the earlier cells, so
# re-running it without re-creating those lists accumulates extra rows.

N_ITERATIONS = 30


def _record_equalized_odds(metrics, y_true, y_pred, sensitive_features,
                           fnr_first_group, fnr_second_group, eor_values, eod_values):
    """Compute the equalized-odds metrics for one set of predictions and store them.

    Parameters
    ----------
    metrics : dict handed to MetricFrame; must contain the key
        "False Negative Rate".
    y_true, y_pred : true labels and the predictions being evaluated.
    sensitive_features : group membership used to disaggregate the metrics.
    fnr_first_group, fnr_second_group : lists receiving the false negative
        rate found in the first / second row of MetricFrame.by_group
        (presumably the False group, then the True group -- confirm the
        ordering against the fairlearn version in use).
    eor_values, eod_values : lists receiving the equalized odds ratio and the
        equalized odds difference.

    Returns
    -------
    The MetricFrame built for these predictions.
    """
    frame = MetricFrame(metrics = metrics, y_true = y_true, y_pred = y_pred,
                        sensitive_features = sensitive_features)

    fnr_by_group = frame.by_group["False Negative Rate"]
    fnr_first_group.append(fnr_by_group.iloc[0])
    fnr_second_group.append(fnr_by_group.iloc[1])

    # The metric functions are called through the names imported at the top of
    # the notebook; the package name `fairlearn` itself is never bound by
    # those `from ... import ...` statements.
    try:
        eor_values.append(equalized_odds_ratio(y_true = y_true, y_pred = y_pred,
                                               sensitive_features = sensitive_features,
                                               method = "between_groups"))
    except ZeroDivisionError:
        # an undefined ratio is recorded as 0.0
        eor_values.append(0.0)

    eod_values.append(equalized_odds_difference(y_true = y_true, y_pred = y_pred,
                                                sensitive_features = sensitive_features,
                                                method = "between_groups"))
    return frame


def _fit_equalized_odds_optimizer(estimator, X, y, sensitive_features):
    """Fit a ThresholdOptimizer under the equalized-odds constraint and predict on X.

    Returns the fitted optimizer together with its predictions for X.
    """
    # NOTE(review): with prefit = False the optimizer presumably refits the
    # estimator itself instead of reusing the already fitted tree, so the
    # "after" metrics may come from a different tree than the "before"
    # metrics -- confirm against the fairlearn documentation.
    optimizer = ThresholdOptimizer(estimator = estimator, constraints = "equalized_odds",
                                   predict_method = "predict_proba",
                                   prefit = False)
    optimizer.fit(X, y, sensitive_features = sensitive_features)
    predictions = optimizer.predict(X, sensitive_features = sensitive_features)
    return optimizer, predictions


metrics_gender = {"False Negative Rate" : false_negative_rate}
metrics_age = {"False Negative Rate" : false_negative_rate}

for i in range(1, N_ITERATIONS + 1):
    # get best estimator from grid search cv
    best_estimator = grid_search(X, y, 5)

    # get y_pred values
    y_pred = best_estimator.predict(X)

    print("ITERATION: ", i)

    # ---- gender: True = Male, False = Female ----
    # metrics before mitigation
    metric_frame_gender = _record_equalized_odds(
        metrics_gender, y, y_pred, X["Gender"],
        eo_female_fnr_before, eo_male_fnr_before,
        eo_eor_gender_before, eo_eod_gender_before)

    # threshold optimizer with equalized odds for gender
    eo_threshold_optimizer_gender, eo_y_pred_optimized_gender = _fit_equalized_odds_optimizer(
        best_estimator, X, y, X["Gender"])

    # metrics after mitigation
    metric_frame_gender_optimized = _record_equalized_odds(
        metrics_gender, y, eo_y_pred_optimized_gender, X["Gender"],
        eo_female_fnr_after, eo_male_fnr_after,
        eo_eor_gender_after, eo_eod_gender_after)

    # ---- age: True = Old, False = Young ----
    # metrics before mitigation
    metric_frame_age = _record_equalized_odds(
        metrics_age, y, y_pred, X["Age"],
        eo_young_fnr_before, eo_old_fnr_before,
        eo_eor_age_before, eo_eod_age_before)

    # threshold optimizer with equalized odds for age
    eo_threshold_optimizer_age, eo_y_pred_optimized_age = _fit_equalized_odds_optimizer(
        best_estimator, X, y, X["Age"])

    # metrics after mitigation
    metric_frame_age_optimized = _record_equalized_odds(
        metrics_age, y, eo_y_pred_optimized_age, X["Age"],
        eo_young_fnr_after, eo_old_fnr_after,
        eo_eor_age_after, eo_eod_age_after)

Best Parameters:  {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 3}
Best Score:  0.5797365754812562
ITERATION:  1
Best Parameters:  {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 3}
Best Score:  0.5826241134751773
ITERATION:  2
Best Parameters:  {'criterion': 'entropy', 'max_depth': 14, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best Score:  0.5854711246200608
ITERATION:  3
Best Parameters:  {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 3}
Best Score:  0.5854407294832826
ITERATION:  4
Best Parameters:  {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 3, 'min_samples_split': 4}
Best Score:  0.5855116514690983
ITERATION:  5
Best Parameters:  {'criterion': 'entropy', 'max_depth': 11, 'min_samples_leaf': 3, 'min_samples_split': 4}
Best Score:  0.5798074974670719
ITERATION:  6
Best Parameters:  {'criterion': 'entropy', 'max_depth': 11, 'min_samples_leaf': 3, 'm

In [20]:
# Collect the per-iteration gender metrics into a dataframe, one column per
# metric and one row per iteration.

eo_results_gender = dict([
    ("Female False Negative Rate Before", eo_female_fnr_before),
    ("Male False Negative Rate Before", eo_male_fnr_before),
    ("Female False Negative Rate After", eo_female_fnr_after),
    ("Male False Negative Rate After", eo_male_fnr_after),
    ("Equalized Odds Ratio Gender Before", eo_eor_gender_before),
    ("Equalized Odds Ratio Gender After", eo_eor_gender_after),
    ("Equalized Odds Difference Gender Before", eo_eod_gender_before),
    ("Equalized Odds Difference Gender After", eo_eod_gender_after),
])

eo_metric_results_gender = pd.DataFrame(data = eo_results_gender)
eo_metric_results_gender

Unnamed: 0,Female False Negative Rate Before,Male False Negative Rate Before,Female False Negative Rate After,Male False Negative Rate After,Equalized Odds Ratio Gender Before,Equalized Odds Ratio Gender After,Equalized Odds Difference Gender Before,Equalized Odds Difference Gender After
0,0.412162,0.492611,0.209459,0.256158,0.863145,0.940929,0.080449,0.046698
1,0.391892,0.487685,0.27027,0.231527,0.355655,0.878661,0.095793,0.038743
2,0.195946,0.172414,0.168919,0.17734,0.553823,0.838329,0.043821,0.018941
3,0.418919,0.487685,0.27027,0.251232,0.881659,0.803347,0.068766,0.061454
4,0.25,0.221675,0.256757,0.295567,0.569038,0.803347,0.05387,0.03881
5,0.297297,0.349754,0.378378,0.344828,0.527197,0.527197,0.052456,0.033772
6,0.304054,0.344828,0.344595,0.334975,0.669456,0.71131,0.040774,0.010871
7,0.418919,0.487685,0.22973,0.256158,0.881659,0.96569,0.068766,0.026428
8,0.168919,0.187192,0.222973,0.216749,0.553823,0.937238,0.043821,0.006224
9,0.324324,0.330049,0.324324,0.339901,0.527197,0.843515,0.033772,0.015577


In [21]:
# Average every gender metric over all iterations and lay the result out as a
# single-row dataframe (one column per metric).

eo_averages_gender = eo_metric_results_gender.mean().to_frame().T
eo_averages_gender

Unnamed: 0,Female False Negative Rate Before,Male False Negative Rate Before,Female False Negative Rate After,Male False Negative Rate After,Equalized Odds Ratio Gender Before,Equalized Odds Ratio Gender After,Equalized Odds Difference Gender Before,Equalized Odds Difference Gender After
0,0.348649,0.401642,0.261712,0.261576,0.723673,0.862529,0.06458,0.031737


In [22]:
# Collect the per-iteration age metrics into a dataframe, one column per
# metric and one row per iteration.

eo_results_age = dict([
    ("Young False Negative Rate Before", eo_young_fnr_before),
    ("Old False Negative Rate Before", eo_old_fnr_before),
    ("Young False Negative Rate After", eo_young_fnr_after),
    ("Old False Negative Rate After", eo_old_fnr_after),
    ("Equalized Odds Ratio Age Before", eo_eor_age_before),
    ("Equalized Odds Ratio Age After", eo_eor_age_after),
    ("Equalized Odds Difference Age Before", eo_eod_age_before),
    ("Equalized Odds Difference Age After", eo_eod_age_after),
])

eo_metric_results_age = pd.DataFrame(data = eo_results_age)
eo_metric_results_age

Unnamed: 0,Young False Negative Rate Before,Old False Negative Rate Before,Young False Negative Rate After,Old False Negative Rate After,Equalized Odds Ratio Age Before,Equalized Odds Ratio Age After,Equalized Odds Difference Age Before,Equalized Odds Difference Age After
0,0.357798,0.62406,0.009174,0.022556,0.122881,0.972916,0.266262,0.015098
1,0.344037,0.616541,0.0,0.0,0.0,0.993185,0.272505,0.003799
2,0.03211,0.428571,0.43578,0.413534,0.042741,0.962068,0.396461,0.022246
3,0.357798,0.62406,0.041284,0.022556,0.122881,0.974856,0.266262,0.018728
4,0.077982,0.488722,0.522936,0.473684,0.032768,0.655367,0.41074,0.049252
5,0.206422,0.526316,0.504587,0.503759,0.061441,0.508621,0.319894,0.005552
6,0.201835,0.533835,0.545872,0.511278,0.061441,0.929217,0.332,0.034593
7,0.357798,0.62406,0.013761,0.022556,0.122881,0.974856,0.266262,0.013637
8,0.022936,0.43609,0.385321,0.413534,0.042741,0.655367,0.413154,0.028213
9,0.206422,0.526316,0.477064,0.511278,0.061441,0.655367,0.319894,0.034214


In [23]:
# Average every age metric over all iterations and lay the result out as a
# single-row dataframe (one column per metric).

eo_averages_age = eo_metric_results_age.mean().to_frame().T
eo_averages_age

Unnamed: 0,Young False Negative Rate Before,Old False Negative Rate Before,Young False Negative Rate After,Old False Negative Rate After,Equalized Odds Ratio Age Before,Equalized Odds Ratio Age After,Equalized Odds Difference Age Before,Equalized Odds Difference Age After
0,0.260398,0.574185,0.203823,0.201003,0.09016,0.866587,0.313788,0.018091


In [24]:
# Write the four result dataframes (per-iteration values and averages, for
# gender and for age) to CSV files in the working directory, without the
# index column.

csv_exports = [
    (eo_metric_results_gender, "equalized_odds_metric_results_by_gender_coughvid_data.csv"),
    (eo_averages_gender, "equalized_odds_averages_for_gender_coughvid_data.csv"),
    (eo_metric_results_age, "equalized_odds_metric_results_by_age_coughvid_data.csv"),
    (eo_averages_age, "equalized_odds_averages_for_age_coughvid_data.csv"),
]

for results_frame, csv_name in csv_exports:
    results_frame.to_csv(csv_name, index = False)