In [1]:
# import pandas and numpy

import pandas as pd
import numpy as np


# import sklearn

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


# fairlearn metrics

import fairlearn
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, demographic_parity_ratio, demographic_parity_difference
from fairlearn.metrics import false_negative_rate, equalized_odds_ratio, equalized_odds_difference


# fairlearn reductions

from fairlearn.reductions import DemographicParity, EqualizedOdds


# fairlearn postprocessing

from fairlearn.postprocessing import ThresholdOptimizer

In [29]:
# read the preprocessed Coswara dataset from disk and preview its first five rows

coswara_csv_path = "../Data/preprocessed_coswara_data.csv"
data = pd.read_csv(coswara_csv_path)
data.head(n = 5)

Unnamed: 0,Age,Gender,Covid Status,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
0,young,male,negative,-707.02594,56.42623,13.907639,17.147783,-2.012528,5.152593,15.529266,...,-0.237741,3.750417,10.081942,-9.41218,-3.947216,8.084291,-2.917649,3.01896,-1.533405,4.374303
1,young,male,negative,-427.06598,24.380165,53.18984,-27.081852,-17.267035,22.45013,-9.862848,...,0.229314,-5.317071,7.775287,5.483598,-1.584106,0.640845,-8.218983,8.01708,3.545668,9.140753
2,young,male,negative,-576.4634,189.39581,-47.16896,54.42118,-34.666393,48.699585,-40.861412,...,-1.983973,4.916521,5.208999,-3.601717,-7.056323,-7.671651,1.058864,1.054138,3.543286,-0.483198
3,young,male,negative,-660.1685,-41.418015,47.21138,-25.102112,-4.69951,19.00364,11.318865,...,-10.161366,2.952621,-4.387035,8.93989,3.173635,2.95326,4.881428,8.365202,0.733032,4.489803
4,young,female,negative,-832.4382,97.74585,-13.485252,11.668908,2.601781,8.706614,-4.443015,...,12.727421,6.819866,-5.170315,-6.187433,-8.128935,1.426312,-1.929544,6.119107,1.827243,-0.27218


In [30]:
# count how many rows carry each Covid Status label (positive vs. negative)

covid_status_counts = data["Covid Status"].value_counts()
covid_status_counts

Covid Status
negative    709
positive    372
Name: count, dtype: int64

In [31]:
# there are fewer positive rows than negative rows, so to balance the class labels the negative rows are
# randomly down-sampled (with a fixed seed) to the same number of rows as the positive class
# the sample size is counted from the data instead of being hard-coded, so it stays correct if the csv changes

covid_status = data["Covid Status"]
positive_count = int((covid_status == "positive").sum())

negatives = data[covid_status == "negative"]
negatives_sample = negatives.sample(n = positive_count, random_state = 42)

In [32]:
# display the random sample of negative rows drawn from the data
# (a bare expression on the last line of a cell is rendered as the cell output)

negatives_sample

Unnamed: 0,Age,Gender,Covid Status,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
470,young,male,negative,-543.15533,90.689514,28.970604,6.338865,9.591214,7.962400,8.999533,...,-5.214035,-0.327755,-1.228061,5.113654,-1.423935,2.840373,-1.566947,-3.431976,-3.701202,-2.503761
432,young,male,negative,-727.24084,164.063130,-19.572544,44.866737,-26.744781,22.994564,-29.662718,...,1.756853,0.415929,2.675990,0.054069,2.219139,-1.628018,-0.533565,-0.555858,1.751863,-1.296506
863,young,male,negative,-613.50780,17.359932,1.101616,49.072662,-48.691887,25.287312,-10.712454,...,-13.073709,0.920393,-6.196599,-0.008261,0.137898,-1.873153,-5.381612,-2.450135,-7.963347,-0.397056
549,young,male,negative,-710.35500,58.730850,22.247738,-13.321541,35.798714,23.302910,-3.378738,...,-4.017519,4.403957,-5.098961,-10.796859,-6.332717,-4.476096,-9.461203,7.662711,-10.089844,7.499491
424,young,male,negative,-1013.43750,82.807370,16.093830,9.230137,9.918889,17.411755,-1.945211,...,-1.060213,0.962077,-3.370188,-5.823066,0.351094,3.037789,-2.930956,-0.712987,-0.316106,-1.449122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,young,male,negative,-582.76960,194.736160,-26.117397,73.245820,-29.342022,32.692028,-27.051640,...,-2.787059,-13.819670,-18.762856,-12.226628,4.102395,0.530588,0.657076,0.077049,-3.855539,-1.892024
654,young,male,negative,-595.73627,120.411880,4.039909,7.232664,-7.251581,-18.158218,-14.684391,...,-4.092912,7.523797,-3.755001,8.982017,6.074190,-3.854938,-3.754892,-5.122377,-7.450123,4.354236
994,old,male,negative,-500.01987,180.660540,-127.236270,49.029655,-36.864563,21.681700,12.521856,...,-11.505336,-2.915974,8.138312,6.654276,-1.944061,-4.653092,4.477524,1.930779,0.670010,1.443905
308,young,male,negative,-552.64630,175.033450,-53.143684,87.825880,-29.637371,48.733900,-23.460133,...,-8.556097,-3.857112,1.875697,4.930209,-0.928397,-2.350164,3.783761,4.158632,-0.395517,2.957860


In [33]:
# pull out every positive row, then stack the down-sampled negatives and the positives into one balanced
# dataframe (negatives first, positives second)

is_positive = data["Covid Status"] == "positive"
positives = data.loc[is_positive]

balanced_parts = [negatives_sample, positives]
final_data = pd.concat(balanced_parts)

In [34]:
# display the combined dataframe (sampled negative rows followed by the positive rows)

final_data

Unnamed: 0,Age,Gender,Covid Status,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
470,young,male,negative,-543.15533,90.689514,28.970604,6.338865,9.591214,7.962400,8.999533,...,-5.214035,-0.327755,-1.228061,5.113654,-1.423935,2.840373,-1.566947,-3.431976,-3.701202,-2.503761
432,young,male,negative,-727.24084,164.063130,-19.572544,44.866737,-26.744781,22.994564,-29.662718,...,1.756853,0.415929,2.675990,0.054069,2.219139,-1.628018,-0.533565,-0.555858,1.751863,-1.296506
863,young,male,negative,-613.50780,17.359932,1.101616,49.072662,-48.691887,25.287312,-10.712454,...,-13.073709,0.920393,-6.196599,-0.008261,0.137898,-1.873153,-5.381612,-2.450135,-7.963347,-0.397056
549,young,male,negative,-710.35500,58.730850,22.247738,-13.321541,35.798714,23.302910,-3.378738,...,-4.017519,4.403957,-5.098961,-10.796859,-6.332717,-4.476096,-9.461203,7.662711,-10.089844,7.499491
424,young,male,negative,-1013.43750,82.807370,16.093830,9.230137,9.918889,17.411755,-1.945211,...,-1.060213,0.962077,-3.370188,-5.823066,0.351094,3.037789,-2.930956,-0.712987,-0.316106,-1.449122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1076,old,male,positive,-504.75726,55.995552,-4.536501,7.020762,-5.924695,2.774076,-7.127914,...,-5.155150,-7.399287,-7.925262,-5.619906,-5.644806,1.242933,-2.810986,-0.122058,-2.671896,0.472195
1077,young,male,positive,-723.34510,-5.390590,83.830670,-18.267603,23.992085,-17.434067,9.397023,...,-7.775133,16.577866,-2.542985,3.071694,-4.513388,-5.473712,0.011834,-0.797911,-1.411863,7.378503
1078,old,male,positive,-724.27490,96.186740,-65.017210,62.636177,-31.711601,48.414160,-7.423612,...,-4.680919,-4.848273,3.366529,-8.074186,-0.762944,1.346204,0.544383,-5.299528,0.331113,-0.893786
1079,young,male,positive,-525.03500,55.996940,-10.076442,20.016684,0.919657,-5.279638,10.089838,...,-1.043215,-2.091019,-5.030453,3.318240,-3.117649,-1.775076,-6.552867,-3.958253,-0.738340,1.371519


In [35]:
# one-hot encode the categorical columns with pandas dummy variables

categorical_columns = ["Age", "Gender", "Covid Status"]
one_hot_encoded_data = pd.get_dummies(data = final_data, columns = categorical_columns)

In [36]:
# display one_hot_encoded_data to inspect the dummy columns created for Age, Gender, and Covid Status

one_hot_encoded_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,37,38,39,40,Age_old,Age_young,Gender_female,Gender_male,Covid Status_negative,Covid Status_positive
470,-543.15533,90.689514,28.970604,6.338865,9.591214,7.962400,8.999533,5.858377,9.145512,11.388906,...,-1.566947,-3.431976,-3.701202,-2.503761,False,True,False,True,True,False
432,-727.24084,164.063130,-19.572544,44.866737,-26.744781,22.994564,-29.662718,2.005984,0.186737,15.917499,...,-0.533565,-0.555858,1.751863,-1.296506,False,True,False,True,True,False
863,-613.50780,17.359932,1.101616,49.072662,-48.691887,25.287312,-10.712454,-0.336582,-12.418156,-4.193436,...,-5.381612,-2.450135,-7.963347,-0.397056,False,True,False,True,True,False
549,-710.35500,58.730850,22.247738,-13.321541,35.798714,23.302910,-3.378738,10.243302,11.419808,2.328775,...,-9.461203,7.662711,-10.089844,7.499491,False,True,False,True,True,False
424,-1013.43750,82.807370,16.093830,9.230137,9.918889,17.411755,-1.945211,-8.004393,-8.872688,-14.920727,...,-2.930956,-0.712987,-0.316106,-1.449122,False,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1076,-504.75726,55.995552,-4.536501,7.020762,-5.924695,2.774076,-7.127914,-12.114542,-3.710113,-5.438988,...,-2.810986,-0.122058,-2.671896,0.472195,True,False,False,True,False,True
1077,-723.34510,-5.390590,83.830670,-18.267603,23.992085,-17.434067,9.397023,-25.693218,5.133441,-13.971018,...,0.011834,-0.797911,-1.411863,7.378503,False,True,False,True,False,True
1078,-724.27490,96.186740,-65.017210,62.636177,-31.711601,48.414160,-7.423612,6.581657,-14.663187,-3.385683,...,0.544383,-5.299528,0.331113,-0.893786,True,False,False,True,False,True
1079,-525.03500,55.996940,-10.076442,20.016684,0.919657,-5.279638,10.089838,23.879969,2.035708,-17.630470,...,-6.552867,-3.958253,-0.738340,1.371519,False,True,False,True,False,True


In [37]:
# each categorical column produced two complementary dummy columns, so one column of every pair is redundant
# keep Age_old, Gender_male, and Covid Status_positive and remove their counterparts

redundant_dummy_columns = ["Age_young", "Gender_female", "Covid Status_negative"]
keep_column = ~one_hot_encoded_data.columns.isin(redundant_dummy_columns)

one_hot_encoded_data = one_hot_encoded_data.loc[:, keep_column]
one_hot_encoded_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,34,35,36,37,38,39,40,Age_old,Gender_male,Covid Status_positive
470,-543.15533,90.689514,28.970604,6.338865,9.591214,7.962400,8.999533,5.858377,9.145512,11.388906,...,5.113654,-1.423935,2.840373,-1.566947,-3.431976,-3.701202,-2.503761,False,True,False
432,-727.24084,164.063130,-19.572544,44.866737,-26.744781,22.994564,-29.662718,2.005984,0.186737,15.917499,...,0.054069,2.219139,-1.628018,-0.533565,-0.555858,1.751863,-1.296506,False,True,False
863,-613.50780,17.359932,1.101616,49.072662,-48.691887,25.287312,-10.712454,-0.336582,-12.418156,-4.193436,...,-0.008261,0.137898,-1.873153,-5.381612,-2.450135,-7.963347,-0.397056,False,True,False
549,-710.35500,58.730850,22.247738,-13.321541,35.798714,23.302910,-3.378738,10.243302,11.419808,2.328775,...,-10.796859,-6.332717,-4.476096,-9.461203,7.662711,-10.089844,7.499491,False,True,False
424,-1013.43750,82.807370,16.093830,9.230137,9.918889,17.411755,-1.945211,-8.004393,-8.872688,-14.920727,...,-5.823066,0.351094,3.037789,-2.930956,-0.712987,-0.316106,-1.449122,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1076,-504.75726,55.995552,-4.536501,7.020762,-5.924695,2.774076,-7.127914,-12.114542,-3.710113,-5.438988,...,-5.619906,-5.644806,1.242933,-2.810986,-0.122058,-2.671896,0.472195,True,True,True
1077,-723.34510,-5.390590,83.830670,-18.267603,23.992085,-17.434067,9.397023,-25.693218,5.133441,-13.971018,...,3.071694,-4.513388,-5.473712,0.011834,-0.797911,-1.411863,7.378503,False,True,True
1078,-724.27490,96.186740,-65.017210,62.636177,-31.711601,48.414160,-7.423612,6.581657,-14.663187,-3.385683,...,-8.074186,-0.762944,1.346204,0.544383,-5.299528,0.331113,-0.893786,True,True,True
1079,-525.03500,55.996940,-10.076442,20.016684,0.919657,-5.279638,10.089838,23.879969,2.035708,-17.630470,...,3.318240,-3.117649,-1.775076,-6.552867,-3.958253,-0.738340,1.371519,False,True,True


In [39]:
# for Age, True = Old and False = Young
# for Gender, True = Male and False = Female
# for Covid, True = Positive Test Result and False = Negative Test Result
#
# rename returns a new dataframe that is assigned back to the same name instead of using inplace = True:
# the dataframe was produced by a .loc column selection, and mutating such a frame in place can raise
# pandas' SettingWithCopyWarning

one_hot_encoded_data = one_hot_encoded_data.rename(columns = {"Age_old" : "Age", "Gender_male" : "Gender",
                                                              "Covid Status_positive" : "Covid"})

In [40]:
# display the encoded dataframe to check the renamed Age, Gender, and Covid columns
one_hot_encoded_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,34,35,36,37,38,39,40,Age,Gender,Covid
470,-543.15533,90.689514,28.970604,6.338865,9.591214,7.962400,8.999533,5.858377,9.145512,11.388906,...,5.113654,-1.423935,2.840373,-1.566947,-3.431976,-3.701202,-2.503761,False,True,False
432,-727.24084,164.063130,-19.572544,44.866737,-26.744781,22.994564,-29.662718,2.005984,0.186737,15.917499,...,0.054069,2.219139,-1.628018,-0.533565,-0.555858,1.751863,-1.296506,False,True,False
863,-613.50780,17.359932,1.101616,49.072662,-48.691887,25.287312,-10.712454,-0.336582,-12.418156,-4.193436,...,-0.008261,0.137898,-1.873153,-5.381612,-2.450135,-7.963347,-0.397056,False,True,False
549,-710.35500,58.730850,22.247738,-13.321541,35.798714,23.302910,-3.378738,10.243302,11.419808,2.328775,...,-10.796859,-6.332717,-4.476096,-9.461203,7.662711,-10.089844,7.499491,False,True,False
424,-1013.43750,82.807370,16.093830,9.230137,9.918889,17.411755,-1.945211,-8.004393,-8.872688,-14.920727,...,-5.823066,0.351094,3.037789,-2.930956,-0.712987,-0.316106,-1.449122,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1076,-504.75726,55.995552,-4.536501,7.020762,-5.924695,2.774076,-7.127914,-12.114542,-3.710113,-5.438988,...,-5.619906,-5.644806,1.242933,-2.810986,-0.122058,-2.671896,0.472195,True,True,True
1077,-723.34510,-5.390590,83.830670,-18.267603,23.992085,-17.434067,9.397023,-25.693218,5.133441,-13.971018,...,3.071694,-4.513388,-5.473712,0.011834,-0.797911,-1.411863,7.378503,False,True,True
1078,-724.27490,96.186740,-65.017210,62.636177,-31.711601,48.414160,-7.423612,6.581657,-14.663187,-3.385683,...,-8.074186,-0.762944,1.346204,0.544383,-5.299528,0.331113,-0.893786,True,True,True
1079,-525.03500,55.996940,-10.076442,20.016684,0.919657,-5.279638,10.089838,23.879969,2.035708,-17.630470,...,3.318240,-3.117649,-1.775076,-6.552867,-3.958253,-0.738340,1.371519,False,True,True


In [41]:
# split the encoded dataframe: y is the Covid target column and X is every other column (the features)

y = one_hot_encoded_data["Covid"]

is_feature_column = one_hot_encoded_data.columns != "Covid"
X = one_hot_encoded_data.loc[:, is_feature_column]

In [72]:
# grid search cross validation to determine the optimal hyperparameters for the decision tree

def grid_search(X, y, cv, random_state = None):
    """Tune a decision tree with grid search cross validation and return the best tree.

    Every combination of criterion, max_depth (3 to 14), min_samples_split (2 to 5), and
    min_samples_leaf (2 to 5) is tried. The best parameters and the best score (the average
    cross validation performance of those parameters) are printed.

    Parameters
    ----------
    X : features, passed unchanged to GridSearchCV.fit
    y : target variable, passed unchanged to GridSearchCV.fit
    cv : cross validation setting forwarded to GridSearchCV (the notebook passes 5 for 5 folds)
    random_state : optional seed forwarded to DecisionTreeClassifier. The default None keeps the
        tree unseeded, so repeated calls on the same data may pick different parameters.

    Returns
    -------
    The best_estimator_ of the fitted GridSearchCV, to use as the decision tree.
    """
    param_grid = {
        "criterion" : ["gini", "entropy"],
        "max_depth" : np.arange(3, 15),
        "min_samples_split" : [2, 3, 4, 5],
        "min_samples_leaf" : [2, 3, 4, 5],
    }

    decision_tree = DecisionTreeClassifier(random_state = random_state)

    grid_search_cv = GridSearchCV(decision_tree, param_grid, cv = cv)
    grid_search_cv.fit(X, y)

    print("Best Parameters: ", grid_search_cv.best_params_)
    print("Best Score: ", grid_search_cv.best_score_)

    # return best estimator to use for the decision tree
    return grid_search_cv.best_estimator_

In [77]:
# accuracy score for the tuned decision tree, measured on the same X and y that were given to grid_search
# in the saved run this is approximately 78% (582/744 samples classified correctly)
# note: this is accuracy on the fitting data, not held-out accuracy; the cross validated "Best Score"
# printed by grid_search in the same run is approximately 0.56

best_estimator = grid_search(X, y, 5)
y_pred = best_estimator.predict(X)

# first the fraction of correct predictions, then the count of correct predictions
for as_fraction in (True, False):
    print(accuracy_score(y, y_pred, normalize = as_fraction))

Best Parameters:  {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best Score:  0.5604298929802285
0.782258064516129
582


In [88]:
# result lists for gender BEFORE the mitigation algorithm, one entry per iteration (30 iterations)
# for DEMOGRAPHIC PARITY, only the selection rate, demographic parity ratio, and demographic parity
# difference are recorded

dp_female_selection_rate_before, dp_male_selection_rate_before = [], []
dp_dpr_gender_before, dp_dpd_gender_before = [], []

In [89]:
# result lists for gender AFTER the mitigation algorithm, one entry per iteration (30 iterations)
# for DEMOGRAPHIC PARITY, only the selection rate, demographic parity ratio, and demographic parity
# difference are recorded

dp_female_selection_rate_after, dp_male_selection_rate_after = [], []
dp_dpr_gender_after, dp_dpd_gender_after = [], []

In [90]:
# result lists for age BEFORE the mitigation algorithm, one entry per iteration (30 iterations)
# for DEMOGRAPHIC PARITY, only the selection rate, demographic parity ratio, and demographic parity
# difference are recorded

dp_young_selection_rate_before, dp_old_selection_rate_before = [], []
dp_dpr_age_before, dp_dpd_age_before = [], []

In [91]:
# result lists for age AFTER the mitigation algorithm, one entry per iteration (30 iterations)
# for DEMOGRAPHIC PARITY, only the selection rate, demographic parity ratio, and demographic parity
# difference are recorded

dp_young_selection_rate_after, dp_old_selection_rate_after = [], []
dp_dpr_age_after, dp_dpd_age_after = [], []

In [92]:
# run the experiment N_ITERATIONS (30) times; each iteration:
#   1. gets the best estimator from grid search cv and its y_pred values on X
#   2. gets the metric values before the mitigation algorithm
#   3. fits the mitigation algorithm and gets the metric values after the mitigation algorithm
# metric values: selection rate, demographic parity ratio, demographic parity difference
# mitigation algorithm: threshold optimizer (use demographic_parity for constraint)
# gender and age go through the same steps, so the shared steps live in the two helpers below

N_ITERATIONS = 30


def _record_dp_metrics(metric_frame, y_true, y_pred, sensitive,
                       first_group_rates, second_group_rates, ratios, differences):
    """Append one iteration of demographic parity metrics to the four given result lists.

    metric_frame is a MetricFrame holding a "Selection Rate" metric; its first by_group row is appended to
    first_group_rates and its second row to second_group_rates. The demographic parity ratio and difference
    of y_pred with respect to sensitive are appended to ratios and differences.
    """
    group_rates = metric_frame.by_group["Selection Rate"]

    # rows are read by position, as in the original analysis
    # assumes by_group is ordered False (female / young) then True (male / old) -- TODO confirm
    first_group_rates.append(group_rates.iloc[0])
    second_group_rates.append(group_rates.iloc[1])

    ratios.append(demographic_parity_ratio(y_true = y_true, y_pred = y_pred,
                                           sensitive_features = sensitive,
                                           method = "between_groups"))
    differences.append(demographic_parity_difference(y_true = y_true, y_pred = y_pred,
                                                     sensitive_features = sensitive,
                                                     method = "between_groups"))


def _fit_dp_threshold_optimizer(estimator, X, y, sensitive):
    """Create a ThresholdOptimizer with the demographic parity constraint, fit it, and return it."""
    optimizer = ThresholdOptimizer(estimator = estimator,
                                   constraints = "demographic_parity",
                                   predict_method = "predict_proba",
                                   prefit = False)
    optimizer.fit(X, y, sensitive_features = sensitive)
    return optimizer


# empty the result lists first, so re-running this cell does not stack new rows on top of an earlier run
for results in (dp_female_selection_rate_before, dp_male_selection_rate_before,
                dp_dpr_gender_before, dp_dpd_gender_before,
                dp_female_selection_rate_after, dp_male_selection_rate_after,
                dp_dpr_gender_after, dp_dpd_gender_after,
                dp_young_selection_rate_before, dp_old_selection_rate_before,
                dp_dpr_age_before, dp_dpd_age_before,
                dp_young_selection_rate_after, dp_old_selection_rate_after,
                dp_dpr_age_after, dp_dpd_age_after):
    results.clear()

# the same single metric is reported per group for both sensitive features
metrics_gender = {"Selection Rate" : selection_rate}
metrics_age = {"Selection Rate" : selection_rate}

for i in range(1, N_ITERATIONS + 1):
    # get best estimator from grid search cv
    best_estimator = grid_search(X, y, 5)

    # get y_pred values (predicted on the same X the estimator was fit on)
    y_pred = best_estimator.predict(X)

    print("ITERATION: ", i)

    # ---------- gender: True = Male, False = Female ----------

    # metrics based on gender before mitigation
    metric_frame_gender = MetricFrame(metrics = metrics_gender, y_true = y, y_pred = y_pred,
                                      sensitive_features = X["Gender"])
    _record_dp_metrics(metric_frame_gender, y, y_pred, X["Gender"],
                       dp_female_selection_rate_before, dp_male_selection_rate_before,
                       dp_dpr_gender_before, dp_dpd_gender_before)

    # threshold optimizer with demographic parity for gender: fit the model and get y_pred values
    dp_threshold_optimizer_gender = _fit_dp_threshold_optimizer(best_estimator, X, y, X["Gender"])
    dp_y_pred_optimized_gender = dp_threshold_optimizer_gender.predict(X, sensitive_features = X["Gender"])

    # metrics based on gender after mitigation
    metric_frame_gender_optimized = MetricFrame(metrics = metrics_gender, y_true = y,
                                                y_pred = dp_y_pred_optimized_gender,
                                                sensitive_features = X["Gender"])
    _record_dp_metrics(metric_frame_gender_optimized, y, dp_y_pred_optimized_gender, X["Gender"],
                       dp_female_selection_rate_after, dp_male_selection_rate_after,
                       dp_dpr_gender_after, dp_dpd_gender_after)

    # ---------- age: True = Old, False = Young ----------

    # metrics based on age before mitigation
    metric_frame_age = MetricFrame(metrics = metrics_age, y_true = y, y_pred = y_pred,
                                   sensitive_features = X["Age"])
    _record_dp_metrics(metric_frame_age, y, y_pred, X["Age"],
                       dp_young_selection_rate_before, dp_old_selection_rate_before,
                       dp_dpr_age_before, dp_dpd_age_before)

    # threshold optimizer with demographic parity for age: fit the model and get y_pred values
    dp_threshold_optimizer_age = _fit_dp_threshold_optimizer(best_estimator, X, y, X["Age"])
    dp_y_pred_optimized_age = dp_threshold_optimizer_age.predict(X, sensitive_features = X["Age"])

    # metrics based on age after mitigation
    metric_frame_age_optimized = MetricFrame(metrics = metrics_age, y_true = y,
                                             y_pred = dp_y_pred_optimized_age,
                                             sensitive_features = X["Age"])
    _record_dp_metrics(metric_frame_age_optimized, y, dp_y_pred_optimized_age, X["Age"],
                       dp_young_selection_rate_after, dp_old_selection_rate_after,
                       dp_dpr_age_after, dp_dpd_age_after)

Best Parameters:  {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best Score:  0.5577362597496827
ITERATION:  1
Best Parameters:  {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best Score:  0.5604208235080719
ITERATION:  2
Best Parameters:  {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Score:  0.5617721748594231
ITERATION:  3
Best Parameters:  {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 4}
Best Score:  0.5604208235080719
ITERATION:  4
Best Parameters:  {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best Score:  0.5590966805731906
ITERATION:  5
Best Parameters:  {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 4}
Best Score:  0.5590876111010339
ITERATION:  6
Best Parameters:  {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 2, 'min_s

In [93]:
# gather the per-iteration gender metric lists into one dataframe (one column per metric list)

dp_results_gender = dict([
    ("Female Selection Rate Before", dp_female_selection_rate_before),
    ("Male Selection Rate Before", dp_male_selection_rate_before),
    ("Female Selection Rate After", dp_female_selection_rate_after),
    ("Male Selection Rate After", dp_male_selection_rate_after),
    ("Demographic Parity Ratio Gender Before", dp_dpr_gender_before),
    ("Demographic Parity Ratio Gender After", dp_dpr_gender_after),
    ("Demographic Parity Difference Gender Before", dp_dpd_gender_before),
    ("Demographic Parity Difference Gender After", dp_dpd_gender_after),
])

dp_metric_results_gender = pd.DataFrame(data = dp_results_gender)
dp_metric_results_gender

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Gender Before,Demographic Parity Ratio Gender After,Demographic Parity Difference Gender Before,Demographic Parity Difference Gender After
0,0.495763,0.377953,0.377119,0.372047,0.762366,0.986552,0.11781,0.005071
1,0.487288,0.362205,0.360169,0.364173,0.743307,0.989006,0.125083,0.004004
2,0.491525,0.362205,0.394068,0.387795,0.736899,0.984083,0.129321,0.006273
3,0.495763,0.374016,0.34322,0.366142,0.754425,0.937397,0.121747,0.022921
4,0.470339,0.348425,0.347458,0.348425,0.740796,0.997223,0.121914,0.000968
5,0.491525,0.362205,0.368644,0.387795,0.736899,0.950615,0.129321,0.019151
6,0.491525,0.366142,0.334746,0.360236,0.744909,0.92924,0.125384,0.02549
7,0.495763,0.377953,0.394068,0.379921,0.762366,0.964101,0.11781,0.014147
8,0.495763,0.375984,0.364407,0.372047,0.758396,0.979464,0.119778,0.00764
9,0.495763,0.372047,0.385593,0.374016,0.750454,0.969975,0.123715,0.011577


In [94]:
# average every gender metric column and lay the averages out as a single-row dataframe

gender_metric_means = dp_metric_results_gender.mean()
dp_averages_gender = pd.DataFrame(gender_metric_means).transpose()
dp_averages_gender

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Gender Before,Demographic Parity Ratio Gender After,Demographic Parity Difference Gender Before,Demographic Parity Difference Gender After
0,0.481356,0.358596,0.368927,0.372507,0.74244,0.973957,0.12276,0.00977


In [95]:
# gather the per-iteration age metric lists into one dataframe (one column per metric list)

dp_results_age = dict([
    ("Young Selection Rate Before", dp_young_selection_rate_before),
    ("Old Selection Rate Before", dp_old_selection_rate_before),
    ("Young Selection Rate After", dp_young_selection_rate_after),
    ("Old Selection Rate After", dp_old_selection_rate_after),
    ("Demographic Parity Ratio Age Before", dp_dpr_age_before),
    ("Demographic Parity Ratio Age After", dp_dpr_age_after),
    ("Demographic Parity Difference Age Before", dp_dpd_age_before),
    ("Demographic Parity Difference Age After", dp_dpd_age_after),
])

dp_metric_results_age = pd.DataFrame(data = dp_results_age)
dp_metric_results_age

Unnamed: 0,Young Selection Rate Before,Old Selection Rate Before,Young Selection Rate After,Old Selection Rate After,Demographic Parity Ratio Age Before,Demographic Parity Ratio Age After,Demographic Parity Difference Age Before,Demographic Parity Difference Age After
0,0.313167,0.730769,0.615658,0.620879,0.428545,0.991591,0.417602,0.005221
1,0.311388,0.681319,0.677936,0.653846,0.457037,0.964466,0.369931,0.02409
2,0.311388,0.686813,0.66726,0.664835,0.453381,0.996366,0.375425,0.002425
3,0.313167,0.71978,0.645907,0.631868,0.435087,0.978264,0.406613,0.014039
4,0.290036,0.686813,0.672598,0.67033,0.422292,0.996628,0.396778,0.002268
5,0.311388,0.686813,0.66548,0.664835,0.453381,0.99903,0.375425,0.000645
6,0.311388,0.697802,0.653025,0.648352,0.446241,0.992844,0.386414,0.004673
7,0.313167,0.730769,0.660142,0.648352,0.428545,0.982139,0.417602,0.011791
8,0.313167,0.725275,0.663701,0.648352,0.431791,0.976873,0.412107,0.015349
9,0.313167,0.714286,0.635231,0.626374,0.438434,0.986056,0.401118,0.008858


In [96]:
# average every age metric column and lay the averages out as a single-row dataframe

age_metric_means = dp_metric_results_age.mean()
dp_averages_age = pd.DataFrame(age_metric_means).transpose()
dp_averages_age

Unnamed: 0,Young Selection Rate Before,Old Selection Rate Before,Young Selection Rate After,Old Selection Rate After,Demographic Parity Ratio Age Before,Demographic Parity Ratio Age After,Demographic Parity Difference Age Before,Demographic Parity Difference Age After
0,0.300652,0.696703,0.636833,0.633516,0.430299,0.98016,0.396051,0.012525


In [97]:
# write the per-iteration results and the averages to csv files without the index column
# order: gender results, gender averages, age results, age averages

dp_csv_outputs = [
    (dp_metric_results_gender, "../Data/demographic_parity_metric_results_by_gender_coswara_data.csv"),
    (dp_averages_gender, "../Data/demographic_parity_averages_for_gender_coswara_data.csv"),
    (dp_metric_results_age, "../Data/demographic_parity_metric_results_by_age_coswara_data.csv"),
    (dp_averages_age, "../Data/demographic_parity_averages_for_age_coswara_data.csv"),
]

for results_frame, csv_path in dp_csv_outputs:
    results_frame.to_csv(csv_path, index = False)