In [9]:
# import pandas and numpy

import pandas as pd
import numpy as np


# import sklearn

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


# fairlearn metrics

import fairlearn
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, demographic_parity_ratio, demographic_parity_difference
from fairlearn.metrics import false_negative_rate, equalized_odds_ratio, equalized_odds_difference


# fairlearn reductions

from fairlearn.reductions import DemographicParity, EqualizedOdds


# fairlearn postprocessing

from fairlearn.postprocessing import ThresholdOptimizer

In [10]:
# load in and display copd_covid data

copd_covid = pd.read_csv("../Model Data/copd_covid_model_data.csv")
copd_covid

Unnamed: 0,Age,Gender,Diagnosis,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
0,23,male,copd,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,...,-7.217814,-0.173979,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454
1,22,male,copd,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,...,-3.756811,-1.768963,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466
2,21,male,copd,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,...,11.844463,0.834329,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673
3,21,female,copd,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,...,-1.074582,-0.676747,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794
4,21,male,copd,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,...,6.575019,12.941100,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,23,male,covid,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,...,1.910902,1.742900,1.959242,1.119096,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586
402,45,female,covid,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,...,-4.420852,-4.611802,-2.373542,0.129342,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420
403,34,female,covid,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,...,-13.599506,-9.966225,-1.246019,3.316795,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312
404,7,male,covid,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,...,3.556974,-4.136533,-5.237600,-1.352637,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226


In [11]:
# use pandas to create dummy variables for the columns that have categorical value

one_hot_encoded_data = pd.get_dummies(copd_covid, columns = ["Gender", "Diagnosis"])

In [12]:
# display one_hot_encoded_data

one_hot_encoded_data

Unnamed: 0,Age,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,Gender_female,Gender_male,Diagnosis_copd,Diagnosis_covid
0,23,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,1.573281,-12.419935,...,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454,False,True,True,False
1,22,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,10.766811,6.502266,...,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466,False,True,True,False
2,21,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,8.851731,-5.593486,...,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673,False,True,True,False
3,21,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,1.649345,-3.921539,...,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794,True,False,True,False
4,21,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,3.443415,-3.188501,...,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,23,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,-17.811794,-15.321136,...,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586,False,True,False,True
402,45,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,7.768967,-5.434897,...,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420,True,False,False,True
403,34,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,-12.632499,-11.627846,...,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312,True,False,False,True
404,7,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,7.978167,-14.775313,...,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226,False,True,False,True


In [13]:
# remove extra columns that were created from pandas dummy variables

one_hot_encoded_data = one_hot_encoded_data.loc[:, ~one_hot_encoded_data.columns.isin(["Gender_female", 
                                                                                       "Diagnosis_covid"])]
one_hot_encoded_data

Unnamed: 0,Age,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,Gender_male,Diagnosis_copd
0,23,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,1.573281,-12.419935,...,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454,True,True
1,22,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,10.766811,6.502266,...,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466,True,True
2,21,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,8.851731,-5.593486,...,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673,True,True
3,21,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,1.649345,-3.921539,...,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794,False,True
4,21,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,3.443415,-3.188501,...,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,23,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,-17.811794,-15.321136,...,1.959242,1.119096,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586,True,False
402,45,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,7.768967,-5.434897,...,-2.373542,0.129342,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420,False,False
403,34,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,-12.632499,-11.627846,...,-1.246019,3.316795,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312,False,False
404,7,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,7.978167,-14.775313,...,-5.237600,-1.352637,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226,True,False


In [15]:
# for Gender, True = Male and False = Female
# for Diagnosis, True = COPD and False = COVID

one_hot_encoded_data.rename(columns = {"Gender_male" : "Gender", "Diagnosis_copd" : "Diagnosis"}, inplace = True)

In [16]:
# display one_hot_encoded_data

one_hot_encoded_data

Unnamed: 0,Age,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,Gender,Diagnosis
0,23,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,1.573281,-12.419935,...,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454,True,True
1,22,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,10.766811,6.502266,...,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466,True,True
2,21,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,8.851731,-5.593486,...,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673,True,True
3,21,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,1.649345,-3.921539,...,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794,False,True
4,21,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,3.443415,-3.188501,...,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,23,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,-17.811794,-15.321136,...,1.959242,1.119096,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586,True,False
402,45,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,7.768967,-5.434897,...,-2.373542,0.129342,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420,False,False
403,34,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,-12.632499,-11.627846,...,-1.246019,3.316795,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312,False,False
404,7,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,7.978167,-14.775313,...,-5.237600,-1.352637,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226,True,False


In [17]:
# X is the features, y is the target variable

X = one_hot_encoded_data.loc[:, one_hot_encoded_data.columns != "Diagnosis"]

y = one_hot_encoded_data["Diagnosis"]

In [69]:
# function to perform grid search cross validation and determine the optimal hyperparameters for the decision tree
# using 5 folds
# best_params are the best parameters
# best_score is the average performance
# to reduce overfitting, using random for splitter and limiting max_depth to either 1 or 2

def grid_search(X, y, cv):
    param_grid = {"criterion" : ["gini", "entropy"], "max_depth": [1], "min_samples_split": [2, 3, 4, 5], 
                  "min_samples_leaf": [2, 3, 4, 5], "splitter": ["random"]}
        
    decision_tree = DecisionTreeClassifier()
    
    grid_search_cv = GridSearchCV(decision_tree, param_grid, cv = cv)
    grid_search_cv.fit(X, y)
    
    print("Best Parameters: ", grid_search_cv.best_params_)
    print("Best Score: ", grid_search_cv.best_score_)
    
    # return best estimator to use for the decision tree
    return grid_search_cv.best_estimator_

In [74]:
# accuracy score for decision tree model
# approximately 76% accurate
# approximately 309/406 samples classified correctly

best_estimator = grid_search(X, y, 5)
y_pred = best_estimator.predict(X)

print(accuracy_score(y, y_pred, normalize = True))
print(accuracy_score(y, y_pred, normalize = False))

Best Parameters:  {'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 5, 'min_samples_split': 3, 'splitter': 'random'}
Best Score:  0.7782294489611563
0.7610837438423645
309


In [85]:
# lists to hold metric values before mitigation algorithm for each of the 30 iterations
# for EQUALIZED ODDS, only looking at false negative rate, equalized odds rate, and equalized odds
# difference

female_fnr_before = []
male_fnr_before = []

eor_before = []
eod_before = []

In [86]:
# lists to hold metric values after mitigation algorithm for each of the 30 iterations
# for EQUALIZED ODDS, only looking at false negative rate, equalized odds rate, and equalized odds
# difference

female_fnr_after = []
male_fnr_after = []

eor_after = []
eod_after = []

In [87]:
# run function 30 times
# get y_pred values 30 times
# get metric values 30 times before mitigation algorithm
# get metric values 30 times after mitigation algortihm
# metric values: false negative rate, equalized odds ratio, equalized odds difference
# mitigation algorithm: threshold optimizer (use equalized_odds for constraint)


for i in range(1, 31):
    # get best estimator from grid search cv
    best_estimator = grid_search(X, y, 5)
    
    # get y_pred values
    y_pred = best_estimator.predict(X)
    
    # metrics before mitigation
    # True = Male, False = Female
    print("ITERATION: ", i)
    
    metrics = {"False Negative Rate" : false_negative_rate}

    metric_frame = MetricFrame(metrics = metrics, y_true = y, y_pred = y_pred, sensitive_features = X["Gender"])

    # append to lists to hold metric values before mitigation algorithm for each of the 30 iterations
    female_fnr_before.append(metric_frame.by_group["False Negative Rate"].iloc[0])
    male_fnr_before.append(metric_frame.by_group["False Negative Rate"].iloc[1])

    try:
        eor_before.append(fairlearn.metrics.equalized_odds_ratio(y_true = y, y_pred = y_pred, 
                                                             sensitive_features = X["Gender"], 
                                                             method = "between_groups"))
    except ZeroDivisionError:
        eor_before.append(0.0)
    
    eod_before.append(fairlearn.metrics.equalized_odds_difference(y_true = y, y_pred = y_pred, 
                                                                  sensitive_features = X["Gender"], 
                                                                  method = "between_groups"))
    
    # threshold optimizer with equalized odds
    threshold_optimizer = ThresholdOptimizer(estimator = best_estimator, constraints = "equalized_odds", 
                                             predict_method = "predict_proba", prefit = False)
    
    # fit the model and get y_pred values
    threshold_optimizer.fit(X, y, sensitive_features = X["Gender"])
    y_pred_optimized = threshold_optimizer.predict(X, sensitive_features = X["Gender"])
    
    # metrics after mitigation
    # True = Male, False = Female
    metric_frame_optimized = MetricFrame(metrics = metrics, y_true = y, y_pred = y_pred_optimized, 
                                         sensitive_features = X["Gender"])

    # append to lists to hold metric values after mitigation algorithm for each of the 30 iterations
    female_fnr_after.append(metric_frame_optimized.by_group["False Negative Rate"].iloc[0])
    male_fnr_after.append(metric_frame_optimized.by_group["False Negative Rate"].iloc[1])

    try:
        eor_after.append(fairlearn.metrics.equalized_odds_ratio(y_true = y, y_pred = y_pred_optimized, 
                                                                sensitive_features = X["Gender"], 
                                                                method = "between_groups"))
    except ZeroDivisionError:
        eor_after.append(0.0)

    eod_after.append(fairlearn.metrics.equalized_odds_difference(y_true = y, y_pred = y_pred_optimized, 
                                                                 sensitive_features = X["Gender"], 
                                                                 method = "between_groups"))

Best Parameters:  {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 5, 'min_samples_split': 3, 'splitter': 'random'}
Best Score:  0.8028906955736224
ITERATION:  1
Best Parameters:  {'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'random'}
Best Score:  0.822704004817826
ITERATION:  2
Best Parameters:  {'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 5, 'min_samples_split': 2, 'splitter': 'random'}
Best Score:  0.7885275519421862
ITERATION:  3
Best Parameters:  {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 4, 'min_samples_split': 2, 'splitter': 'random'}
Best Score:  0.8522734116230051
ITERATION:  4
Best Parameters:  {'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 3, 'min_samples_split': 4, 'splitter': 'random'}
Best Score:  0.8153267088226439
ITERATION:  5
Best Parameters:  {'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 3, 'min_samples_split': 5, 'splitter': 'random'}
Best Scor

In [88]:
# convert results of metrics to a dataframe

results = {
    "Female False Negative Rate Before": female_fnr_before,
    "Male False Negative Rate Before": male_fnr_before,
    "Female False Negative Rate After": female_fnr_after,
    "Male False Negative Rate After": male_fnr_after,
    "Equalized Odds Ratio Before": eor_before,
    "Equalized Odds Ratio After": eor_after,
    "Equalized Odds Difference Before": eod_before,
    "Equalized Odds Difference After" : eod_after,
}

metric_results = pd.DataFrame(results)
metric_results

Unnamed: 0,Female False Negative Rate Before,Male False Negative Rate Before,Female False Negative Rate After,Male False Negative Rate After,Equalized Odds Ratio Before,Equalized Odds Ratio After,Equalized Odds Difference Before,Equalized Odds Difference After
0,0.0,0.0,0.0,0.006803,0.875677,0.953125,0.05299,0.02459
1,0.017857,0.034014,0.0,0.0,0.938706,0.985501,0.02713,0.004041
2,0.0,0.0,0.142857,0.170068,0.932274,0.958288,0.052182,0.027211
3,0.660714,0.503401,0.0,0.006803,0.683219,0.993197,0.157313,0.006803
4,0.0,0.0,0.0,0.0,0.994635,0.966549,0.004156,0.024128
5,0.053571,0.068027,0.0,0.0,0.776011,0.994976,0.113831,0.004387
6,0.0,0.006803,0.5,0.544218,0.710118,0.911565,0.232856,0.044218
7,0.0,0.0,0.0,0.0,0.88302,0.986076,0.069037,0.010044
8,0.0,0.0,0.0,0.0,0.727459,0.872951,0.122835,0.057262
9,0.285714,0.183673,0.0,0.0,0.723499,0.996944,0.102041,0.002655


In [89]:
# convert average of each metric for to a dataframe

averages = pd.DataFrame(metric_results.mean()).T
averages

Unnamed: 0,Female False Negative Rate Before,Male False Negative Rate Before,Female False Negative Rate After,Male False Negative Rate After,Equalized Odds Ratio Before,Equalized Odds Ratio After,Equalized Odds Difference Before,Equalized Odds Difference After
0,0.120238,0.078231,0.041667,0.045805,0.84168,0.960644,0.088591,0.024976


In [90]:
# save metric_results and averages dataframes as csv files

metric_results.to_csv("../Model Data/copd_covid_equalized_odds_metric_results.csv", index = False)
averages.to_csv("../Model Data/copd_covid_equalized_odds_averages.csv", index = False)