In [1]:
# import pandas and numpy

import pandas as pd
import numpy as np


# import sklearn

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


# fairlearn metrics

import fairlearn
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate, demographic_parity_ratio, demographic_parity_difference
from fairlearn.metrics import false_negative_rate, equalized_odds_ratio, equalized_odds_difference


# fairlearn reductions

from fairlearn.reductions import DemographicParity, EqualizedOdds


# fairlearn postprocessing

from fairlearn.postprocessing import ThresholdOptimizer

In [2]:
# load in preprocessed COVID data and display data
# the displayed frame has an Age, a Gender and a Covid Status column followed by numeric feature columns
# labelled 1 to 40
# the path is relative, so it is resolved against the directory the notebook is run from

covid = pd.read_csv("../COVID Data/preprocessed_covid_data.csv")
covid

Unnamed: 0,Age,Gender,Covid Status,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
0,27,female,positive,-629.68990,147.978450,-13.323263,23.272245,4.197239,-1.780490,10.516384,...,-6.526163,-2.510284,-2.633717,-3.726629,0.979626,3.089171,0.769896,-4.202895,-0.887942,4.535855
1,37,male,positive,-621.08905,187.245700,-33.129093,48.984450,2.386661,-1.714945,3.957516,...,1.291738,-0.938625,1.572304,4.985484,5.984903,6.893156,2.244276,-2.542755,-4.179516,3.730679
2,26,male,positive,-714.31006,163.695400,-4.930427,7.221747,0.298873,1.058984,-2.476460,...,6.417529,3.738028,-4.648451,-0.070442,5.708490,-2.166859,0.204799,7.862592,0.148901,-7.011946
3,23,male,positive,-628.45636,146.632160,-23.645126,7.047125,6.618930,1.317157,2.142397,...,3.893336,-2.958153,0.872498,0.688001,-2.796172,2.928323,1.826729,3.378086,5.445435,0.356165
4,24,male,positive,-552.36530,142.188900,-49.390152,20.165570,-22.979172,-16.892635,-23.567507,...,1.540499,-6.840363,-1.467400,-4.397162,-2.183991,5.988577,4.378221,4.218623,-5.520302,2.016752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1693,30,male,positive,-879.76390,95.579834,-53.097855,0.492984,14.325769,1.869082,14.210119,...,-2.148999,5.630102,2.391410,-0.614618,-1.801602,-1.525672,-2.467350,0.419104,-2.617154,-7.746522
1694,35,male,positive,-1055.14000,55.788525,-12.435507,-9.886213,8.294209,-0.254150,-3.330456,...,-0.327358,-0.215067,-0.544228,-0.458576,-0.381838,-0.529132,-0.497370,-0.709488,-0.889405,-0.544918
1695,25,female,positive,-1022.70560,92.306760,9.245488,-8.484243,-7.691916,-2.754213,8.891866,...,-0.009391,-0.443159,-1.231367,-0.268812,-0.296077,-0.543105,-0.586759,-0.536058,-0.361928,-1.008212
1696,24,male,positive,-1038.93240,63.736298,-14.272345,-10.701512,11.551987,-2.741023,-3.822017,...,-0.004053,0.243206,-0.361559,0.160220,-0.080098,0.098594,-0.347364,0.374165,-0.407011,0.118158


In [3]:
# since there are 203 values in the preprocessed COPD data, there needs to be a random sample of 203 values
# selected from the preprocessed COVID data for there to be class label balance; the two classes are COVID and COPD
# random_state is fixed so that the same rows are drawn every time the cell is run
# NOTE(review): 203 is hard-coded because the COPD data is only loaded in a later cell; keep it equal to the
# number of COPD rows

covid_sample = covid.sample(n = 203, random_state = 8)

In [4]:
# gender breakdown of the random COVID sample
# there are 142 males (70%) and 61 females (30%) in the random sample
# biased towards males

# count the rows of each gender straight from the boolean mask; `.count()[0]` looked up position 0 in a
# Series that is labelled by column name (deprecated positional fallback in pandas 2.x) and it counted the
# non-missing values of the first column instead of the rows
# the denominator is taken from the sample itself rather than the hard-coded 203
n_covid_sample = len(covid_sample)
males_covid = int((covid_sample["Gender"] == "male").sum())
females_covid = int((covid_sample["Gender"] == "female").sum())

print(f"Male: {males_covid} males or{males_covid / n_covid_sample : .2%} of random sample")
print(f"Female: {females_covid} females or{females_covid / n_covid_sample : .2%} of random sample")

Male: 142 males or 69.95% of random sample
Female: 61 females or 30.05% of random sample


In [5]:
# insert diagnosis column as the third column (right after Gender)

# every row of the COVID sample gets the label "covid"; a scalar is broadcast to all rows, so the number of
# rows does not have to be hard-coded
# the guard keeps the cell re-runnable: DataFrame.insert raises a ValueError when the column already exists
if "Diagnosis" not in covid_sample.columns:
    covid_sample.insert(2, "Diagnosis", "covid")

In [6]:
# filter out "Covid Status" column
# errors = "ignore" keeps the cell a no-op when the column has already been removed

covid_sample = covid_sample.drop(columns = ["Covid Status"], errors = "ignore")

In [7]:
# display covid_sample dataframe
# "Covid Status" should be gone and "Diagnosis" should sit right after "Gender"
# the row labels are still the ones from the full COVID data, because sampling keeps the original index

covid_sample

Unnamed: 0,Age,Gender,Diagnosis,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
108,21,female,covid,-758.71400,116.467285,-42.722450,40.243660,15.051302,-32.504140,29.035646,...,3.736219,5.898125,4.068265,-7.458551,-4.326640,-0.361382,-4.633030,-1.610763,3.785172,8.650267
1076,33,male,covid,-722.53580,144.314210,26.351463,37.554115,8.743655,-10.747301,-7.107974,...,6.453581,7.119597,10.830614,10.856103,9.916693,6.230020,2.270569,2.519269,-1.700830,2.298438
1473,48,male,covid,-847.09033,111.424220,-21.417720,14.283553,5.518419,-0.635077,16.328243,...,5.918134,4.533440,3.647399,1.894623,-0.030911,4.889910,6.870177,3.249243,-3.368932,-3.474739
1454,48,male,covid,-705.21950,154.211980,22.709944,42.839317,2.723271,-9.014034,15.664713,...,2.115530,5.916591,8.377067,8.119476,2.493041,-1.586517,5.354673,6.208771,-2.935931,0.558458
1585,24,male,covid,-177.89694,84.818756,-77.268170,34.560980,8.455174,-24.375180,14.958786,...,-3.762346,-1.878054,5.012243,-0.147536,-6.793571,-7.252930,-2.604796,-4.250966,-9.005527,-2.103924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1376,23,male,covid,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,...,1.910902,1.742900,1.959242,1.119096,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586
1568,45,female,covid,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,...,-4.420852,-4.611802,-2.373542,0.129342,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420
427,34,female,covid,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,...,-13.599506,-9.966225,-1.246019,3.316795,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312
145,7,male,covid,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,...,3.556974,-4.136533,-5.237600,-1.352637,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226


In [8]:
# load in the preprocessed COPD data and display data
# the displayed frame has an Age and a Gender column followed by numeric feature columns labelled 1 to 40
# the path is relative, so it is resolved against the directory the notebook is run from

copd = pd.read_csv("../COPD Data/preprocessed_copd_data.csv")
copd

Unnamed: 0,Age,Gender,1,2,3,4,5,6,7,8,...,31,32,33,34,35,36,37,38,39,40
0,23,male,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,1.573281,...,-7.217814,-0.173979,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454
1,22,male,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,10.766811,...,-3.756811,-1.768963,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466
2,21,male,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,8.851731,...,11.844463,0.834329,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673
3,21,female,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,1.649345,...,-1.074582,-0.676747,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794
4,21,male,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,3.443415,...,6.575019,12.941100,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,21,female,-563.63370,110.746630,-13.120099,10.522531,-2.103596,9.541611,8.948556,0.051391,...,-3.221737,2.499545,0.932840,0.243084,-0.745922,0.838056,-4.137422,-3.300201,-1.652556,0.991499
199,19,female,-545.49634,105.995530,21.250801,-6.327105,1.219058,-6.581512,7.257843,-2.845460,...,4.823594,3.547462,0.527079,-2.771817,-1.604831,-2.164563,-9.809355,-9.654636,-7.086133,-2.327887
200,19,female,-574.64550,132.846070,10.036890,19.587880,-4.848694,1.605638,-7.204495,-11.853018,...,-0.735855,3.289234,0.006554,-1.706333,8.197672,3.062819,-5.155450,-6.593777,-4.316953,1.142055
201,19,male,-397.94824,55.458410,1.049436,-9.037408,-28.105793,-1.602215,-17.166052,13.540288,...,-0.973857,-2.797350,-9.354880,-4.404431,-9.904248,8.117905,5.170504,1.867130,-4.732729,3.425074


In [9]:
# gender breakdown of the COPD data (the whole COPD data is used, it is not a random sample)
# there are 147 males (72%) and 56 females (28%)
# biased towards males

# count the rows of each gender straight from the boolean mask; `.count()[0]` looked up position 0 in a
# Series that is labelled by column name (deprecated positional fallback in pandas 2.x) and it counted the
# non-missing values of the first column instead of the rows
# the denominator is taken from the data itself rather than the hard-coded 203
n_copd = len(copd)
males_copd = int((copd["Gender"] == "male").sum())
females_copd = int((copd["Gender"] == "female").sum())

# the messages used to say "of random sample", which only applies to the COVID sample
print(f"Male: {males_copd} males or{males_copd / n_copd : .2%} of COPD data")
print(f"Female: {females_copd} females or{females_copd / n_copd : .2%} of COPD data")

Male: 147 males or 72.41% of random sample
Female: 56 females or 27.59% of random sample


In [10]:
# insert diagnosis column as the third column (right after Gender)

# every COPD row gets the label "copd"; a scalar is broadcast to all rows, so the number of rows does not
# have to be hard-coded
# the guard keeps the cell re-runnable: DataFrame.insert raises a ValueError when the column already exists
if "Diagnosis" not in copd.columns:
    copd.insert(2, "Diagnosis", "copd")

In [11]:
# display copd dataframe
# "Diagnosis" should now sit right after "Gender", matching the column layout of covid_sample

copd

Unnamed: 0,Age,Gender,Diagnosis,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
0,23,male,copd,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,...,-7.217814,-0.173979,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454
1,22,male,copd,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,...,-3.756811,-1.768963,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466
2,21,male,copd,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,...,11.844463,0.834329,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673
3,21,female,copd,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,...,-1.074582,-0.676747,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794
4,21,male,copd,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,...,6.575019,12.941100,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,21,female,copd,-563.63370,110.746630,-13.120099,10.522531,-2.103596,9.541611,8.948556,...,-3.221737,2.499545,0.932840,0.243084,-0.745922,0.838056,-4.137422,-3.300201,-1.652556,0.991499
199,19,female,copd,-545.49634,105.995530,21.250801,-6.327105,1.219058,-6.581512,7.257843,...,4.823594,3.547462,0.527079,-2.771817,-1.604831,-2.164563,-9.809355,-9.654636,-7.086133,-2.327887
200,19,female,copd,-574.64550,132.846070,10.036890,19.587880,-4.848694,1.605638,-7.204495,...,-0.735855,3.289234,0.006554,-1.706333,8.197672,3.062819,-5.155450,-6.593777,-4.316953,1.142055
201,19,male,copd,-397.94824,55.458410,1.049436,-9.037408,-28.105793,-1.602215,-17.166052,...,-0.973857,-2.797350,-9.354880,-4.404431,-9.904248,8.117905,5.170504,1.867130,-4.732729,3.425074


In [12]:
# stack the COPD rows on top of the COVID sample rows to get one dataframe
# both frames keep their own row labels here; the index is rebuilt in a later cell

data = pd.concat(objs = [copd, covid_sample], axis = "index")

In [13]:
# display data
# the COPD rows come first, followed by the COVID sample rows, each still carrying its original row label

data

Unnamed: 0,Age,Gender,Diagnosis,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
0,23,male,copd,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,...,-7.217814,-0.173979,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454
1,22,male,copd,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,...,-3.756811,-1.768963,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466
2,21,male,copd,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,...,11.844463,0.834329,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673
3,21,female,copd,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,...,-1.074582,-0.676747,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794
4,21,male,copd,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,...,6.575019,12.941100,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1376,23,male,covid,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,...,1.910902,1.742900,1.959242,1.119096,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586
1568,45,female,covid,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,...,-4.420852,-4.611802,-2.373542,0.129342,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420
427,34,female,covid,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,...,-13.599506,-9.966225,-1.246019,3.316795,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312
145,7,male,covid,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,...,3.556974,-4.136533,-5.237600,-1.352637,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226


In [14]:
# give the combined dataframe a fresh row index that counts up from 1
# the old row labels (a mix of COPD and COVID labels) are thrown away

data = data.reset_index(drop = True)
data.index = pd.RangeIndex(start = 1, stop = len(data) + 1)

In [15]:
# check that index starts at 1
# the first displayed row label should be 1 and the labels should be consecutive

data

Unnamed: 0,Age,Gender,Diagnosis,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
1,23,male,copd,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,...,-7.217814,-0.173979,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454
2,22,male,copd,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,...,-3.756811,-1.768963,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466
3,21,male,copd,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,...,11.844463,0.834329,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673
4,21,female,copd,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,...,-1.074582,-0.676747,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794
5,21,male,copd,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,...,6.575019,12.941100,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,23,male,covid,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,...,1.910902,1.742900,1.959242,1.119096,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586
403,45,female,covid,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,...,-4.420852,-4.611802,-2.373542,0.129342,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420
404,34,female,covid,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,...,-13.599506,-9.966225,-1.246019,3.316795,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312
405,7,male,covid,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,...,3.556974,-4.136533,-5.237600,-1.352637,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226


In [16]:
# save data as a csv file
# index = False means the row labels (the one-based index set above) are NOT written to the file
# NOTE(review): presumably the "../Model Data" folder has to exist already - confirm before a fresh run

data.to_csv("../Model Data/copd_covid_model_data.csv", index = False)

In [17]:
# load in and display copd_covid data
# this reads back the file written in the previous cell; because the index was not saved, the rows get a
# default index that starts at 0 again

copd_covid = pd.read_csv("../Model Data/copd_covid_model_data.csv")
copd_covid

Unnamed: 0,Age,Gender,Diagnosis,1,2,3,4,5,6,7,...,31,32,33,34,35,36,37,38,39,40
0,23,male,copd,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,...,-7.217814,-0.173979,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454
1,22,male,copd,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,...,-3.756811,-1.768963,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466
2,21,male,copd,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,...,11.844463,0.834329,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673
3,21,female,copd,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,...,-1.074582,-0.676747,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794
4,21,male,copd,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,...,6.575019,12.941100,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,23,male,covid,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,...,1.910902,1.742900,1.959242,1.119096,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586
402,45,female,covid,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,...,-4.420852,-4.611802,-2.373542,0.129342,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420
403,34,female,covid,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,...,-13.599506,-9.966225,-1.246019,3.316795,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312
404,7,male,covid,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,...,3.556974,-4.136533,-5.237600,-1.352637,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226


In [18]:
# use pandas to create dummy variables for the columns that have categorical value
# "Gender" becomes Gender_female / Gender_male and "Diagnosis" becomes Diagnosis_copd / Diagnosis_covid;
# the original two columns are replaced and the new columns are appended at the end

one_hot_encoded_data = pd.get_dummies(copd_covid, columns = ["Gender", "Diagnosis"])

In [19]:
# display one_hot_encoded_data
# the four dummy columns should appear after feature column 40

one_hot_encoded_data

Unnamed: 0,Age,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,Gender_female,Gender_male,Diagnosis_copd,Diagnosis_covid
0,23,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,1.573281,-12.419935,...,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454,False,True,True,False
1,22,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,10.766811,6.502266,...,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466,False,True,True,False
2,21,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,8.851731,-5.593486,...,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673,False,True,True,False
3,21,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,1.649345,-3.921539,...,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794,True,False,True,False
4,21,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,3.443415,-3.188501,...,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,23,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,-17.811794,-15.321136,...,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586,False,True,False,True
402,45,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,7.768967,-5.434897,...,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420,True,False,False,True
403,34,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,-12.632499,-11.627846,...,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312,True,False,False,True
404,7,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,7.978167,-14.775313,...,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226,False,True,False,True


In [20]:
# each two-valued category only needs one indicator column, so drop the redundant half of each dummy pair
# what is kept: Gender_male and Diagnosis_copd
# errors = "ignore" keeps the cell a no-op when the columns have already been removed

redundant_dummies = ["Gender_female", "Diagnosis_covid"]
one_hot_encoded_data = one_hot_encoded_data.drop(columns = redundant_dummies, errors = "ignore")
one_hot_encoded_data

Unnamed: 0,Age,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,Gender_male,Diagnosis_copd
0,23,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,1.573281,-12.419935,...,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454,True,True
1,22,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,10.766811,6.502266,...,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466,True,True
2,21,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,8.851731,-5.593486,...,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673,True,True
3,21,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,1.649345,-3.921539,...,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794,False,True
4,21,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,3.443415,-3.188501,...,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,23,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,-17.811794,-15.321136,...,1.959242,1.119096,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586,True,False
402,45,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,7.768967,-5.434897,...,-2.373542,0.129342,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420,False,False
403,34,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,-12.632499,-11.627846,...,-1.246019,3.316795,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312,False,False
404,7,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,7.978167,-14.775313,...,-5.237600,-1.352637,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226,True,False


In [22]:
# give the remaining indicator columns their plain names back
# for Gender, True = Male and False = Female
# for Diagnosis, True = COPD and False = COVID

indicator_names = {"Gender_male" : "Gender", "Diagnosis_copd" : "Diagnosis"}
one_hot_encoded_data = one_hot_encoded_data.rename(columns = indicator_names)

In [23]:
# display one_hot_encoded_data
# the last two columns should now be called "Gender" and "Diagnosis"

one_hot_encoded_data

Unnamed: 0,Age,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,Gender,Diagnosis
0,23,-394.65960,122.432880,-11.079241,-4.542910,8.423446,3.535940,0.911247,1.573281,-12.419935,...,0.653299,1.396321,-2.589551,-2.717702,1.800914,0.560695,2.303327,4.940454,True,True
1,22,-356.18326,-5.952755,9.319948,13.227663,-8.204271,15.134655,2.236810,10.766811,6.502266,...,6.931832,-1.811822,1.178949,1.024486,3.932535,-2.135365,-9.547361,-6.228466,True,True
2,21,-321.02686,21.646652,-2.530532,13.137980,-14.525568,20.054962,-8.855867,8.851731,-5.593486,...,1.950582,-7.954147,-3.649364,7.742988,-6.821154,1.028112,-7.124897,3.204673,True,True
3,21,-485.03717,122.165800,3.868416,25.681168,20.960140,16.672789,5.547796,1.649345,-3.921539,...,-0.238081,0.606901,1.911792,-1.528307,-3.482955,-2.710037,-8.462348,-2.317794,False,True
4,21,-508.93808,84.727740,-26.721167,20.100988,12.095397,16.023178,-3.501391,3.443415,-3.188501,...,-0.219004,0.528005,-4.395512,4.718255,-2.427250,-7.684154,-7.027141,0.850527,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,23,-734.92910,155.501340,38.251410,23.343193,2.969118,-7.418866,-15.685755,-17.811794,-15.321136,...,1.959242,1.119096,0.240030,1.527854,1.176233,3.344530,3.830796,6.302586,True,False
402,45,-204.93680,162.062040,15.908648,-5.323838,-11.134152,6.682496,13.697166,7.768967,-5.434897,...,-2.373542,0.129342,0.794681,-1.801611,-5.394828,-7.025907,-6.528414,-4.389420,False,False
403,34,-779.94710,142.525670,-22.714180,-2.086802,-25.052860,-39.164406,-9.531904,-12.632499,-11.627846,...,-1.246019,3.316795,2.629901,-7.667989,-5.140393,-7.611574,-5.569480,7.441312,False,False
404,7,-437.86990,175.707340,-37.727450,33.700745,6.129578,-15.033535,28.071596,7.978167,-14.775313,...,-5.237600,-1.352637,2.876499,-5.238875,-12.742783,-6.073374,-4.490694,-7.106226,True,False


In [24]:
# y is the target variable (Diagnosis: True = COPD, False = COVID)
# X is the features: every other column, which includes Age and the Gender indicator

y = one_hot_encoded_data["Diagnosis"]

X = one_hot_encoded_data.drop(columns = "Diagnosis")

In [25]:
# function to perform grid search cross validation and determine the optimal hyperparameters for the decision tree
# the number of folds is given by cv
# best_params are the best parameters
# best_score is the average performance 

def grid_search(X, y, cv, random_state = None):
    """Grid-search the decision tree hyperparameters with cross validation.

    The grid tries both split criteria and several values of min_samples_split and
    min_samples_leaf; max_depth is fixed at 1 and the splitter is always "random".
    The best parameters and the best score are printed.

    Parameters
    ----------
    X : features passed to GridSearchCV.fit
    y : target variable passed to GridSearchCV.fit
    cv : cross validation setting handed to GridSearchCV (the notebook passes 5)
    random_state : optional seed for the decision tree. Because the splitter is
        "random", two calls give different trees unless a seed is supplied. The
        default None keeps the previous unseeded behaviour.

    Returns
    -------
    The best_estimator_ of the fitted GridSearchCV.
    """
    param_grid = {"criterion" : ["gini", "entropy"], "max_depth": [1], "min_samples_split": [2, 3, 4, 5], 
                  "min_samples_leaf": [2, 3, 4, 5], "splitter": ["random"]}
    
    # the seed is set on the base estimator, GridSearchCV clones it for every candidate
    decision_tree = DecisionTreeClassifier(random_state = random_state)
    
    grid_search_cv = GridSearchCV(decision_tree, param_grid, cv = cv)
    grid_search_cv.fit(X, y)
    
    print("Best Parameters: ", grid_search_cv.best_params_)
    print("Best Score: ", grid_search_cv.best_score_)
    
    # return best estimator to use for the decision tree
    return grid_search_cv.best_estimator_

In [29]:
# accuracy score for decision tree model
# the predictions are made on the same X the tree was fitted on, so this is accuracy on the training data
# approximately 77% accurate
# approximately 314/406 samples classified correctly

best_estimator = grid_search(X, y, 5)
y_pred = best_estimator.predict(X)

# fraction of correct predictions, then the number of correct predictions
accuracy_fraction = accuracy_score(y, y_pred, normalize = True)
correct_count = accuracy_score(y, y_pred, normalize = False)

print(accuracy_fraction)
print(correct_count)

Best Parameters:  {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'random'}
Best Score:  0.7781090033122553
0.7733990147783252
314


In [30]:
# containers for the metric values BEFORE the mitigation algorithm, one entry per iteration (30 iterations)
# for DEMOGRAPHIC PARITY, only looking at selection rate, demographic parity ratio, and demographic parity
# difference

# selection rate per gender
female_sr_before, male_sr_before = [], []

# demographic parity ratio and difference
dpr_before, dpd_before = [], []

In [31]:
# containers for the metric values AFTER the mitigation algorithm, one entry per iteration (30 iterations)
# for DEMOGRAPHIC PARITY, only looking at selection rate, demographic parity ratio, and demographic parity
# difference

# selection rate per gender
female_sr_after, male_sr_after = [], []

# demographic parity ratio and difference
dpr_after, dpd_after = [], []

In [32]:
# run function 30 times
# get y_pred values 30 times
# get metric values 30 times before mitigation algorithm
# get metric values 30 times after mitigation algorithm
# metric values: selection rate, demographic parity ratio, demographic parity difference
# mitigation algorithm: threshold optimizer (use demographic_parity for constraint)
# all metrics are computed on the same X, y the models were fitted on


n_iterations = 30

# Gender is a model feature and also the sensitive feature
# True = Male, False = Female
gender = X["Gender"]

metrics = {"Selection Rate" : selection_rate}


def record_demographic_parity(y_pred, female_sr, male_sr, dpr, dpd):
    """Append the demographic parity metrics of y_pred to the given result lists.

    female_sr / male_sr receive the selection rate of each gender, dpr the demographic
    parity ratio and dpd the demographic parity difference. Returns the MetricFrame
    that was built for y_pred.
    """
    metric_frame = MetricFrame(metrics = metrics, y_true = y, y_pred = y_pred, sensitive_features = gender)

    # position 0 is taken as the False (female) group and position 1 as the True (male) group
    # NOTE(review): this relies on by_group listing False before True - confirm if the encoding changes
    selection_rates = metric_frame.by_group["Selection Rate"]
    female_sr.append(selection_rates.iloc[0])
    male_sr.append(selection_rates.iloc[1])

    dpr.append(demographic_parity_ratio(y_true = y, y_pred = y_pred, sensitive_features = gender,
                                        method = "between_groups"))
    dpd.append(demographic_parity_difference(y_true = y, y_pred = y_pred, sensitive_features = gender,
                                             method = "between_groups"))

    return metric_frame


# start from empty lists so that running this cell again does not pile extra rows on top of an earlier run
for result_list in (female_sr_before, male_sr_before, dpr_before, dpd_before,
                    female_sr_after, male_sr_after, dpr_after, dpd_after):
    result_list.clear()


for i in range(1, n_iterations + 1):
    # get best estimator from grid search cv
    best_estimator = grid_search(X, y, 5)
    
    # get y_pred values
    y_pred = best_estimator.predict(X)
    
    print("ITERATION: ", i)
    
    # metrics before mitigation
    metric_frame = record_demographic_parity(y_pred, female_sr_before, male_sr_before, dpr_before, dpd_before)
    
    # threshold optimizer with demographic parity
    # prefit = True makes the optimizer post-process the exact tree that produced the "before" metrics;
    # with prefit = False it would refit a copy of the tree, and since the tree splits at random the
    # refitted copy can be a different model than the one measured above
    threshold_optimizer = ThresholdOptimizer(estimator = best_estimator, constraints = "demographic_parity", 
                                             predict_method = "predict_proba", prefit = True)
    
    # fit the thresholds and get y_pred values
    # NOTE(review): ThresholdOptimizer predictions can be randomized; consider passing random_state to
    # predict for repeatable results - confirm against the fairlearn documentation
    threshold_optimizer.fit(X, y, sensitive_features = gender)
    y_pred_optimized = threshold_optimizer.predict(X, sensitive_features = gender)
    
    # metrics after mitigation
    metric_frame_optimized = record_demographic_parity(y_pred_optimized, female_sr_after, male_sr_after, 
                                                       dpr_after, dpd_after)

Best Parameters:  {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 3, 'min_samples_split': 2, 'splitter': 'random'}
Best Score:  0.8029208069858476
ITERATION:  1
Best Parameters:  {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 2, 'min_samples_split': 5, 'splitter': 'random'}
Best Score:  0.8005721168322794
ITERATION:  2
Best Parameters:  {'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 5, 'min_samples_split': 5, 'splitter': 'random'}
Best Score:  0.8052393857271906
ITERATION:  3
Best Parameters:  {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 5, 'min_samples_split': 5, 'splitter': 'random'}
Best Score:  0.8032520325203253
ITERATION:  4
Best Parameters:  {'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 5, 'min_samples_split': 2, 'splitter': 'random'}
Best Score:  0.8027702499247213
ITERATION:  5
Best Parameters:  {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 5, 'min_samples_split': 3, 'splitter': 'random'}
Best Score:  0

In [33]:
# collect the per-iteration metric lists into one dataframe, one row per iteration
# the column order below is the order the columns appear in the dataframe and in the saved csv

results = dict([
    ("Female Selection Rate Before", female_sr_before),
    ("Male Selection Rate Before", male_sr_before),
    ("Female Selection Rate After", female_sr_after),
    ("Male Selection Rate After", male_sr_after),
    ("Demographic Parity Ratio Before", dpr_before),
    ("Demographic Parity Ratio After", dpr_after),
    ("Demographic Parity Difference Before", dpd_before),
    ("Demographic Parity Difference After", dpd_after),
])

metric_results = pd.DataFrame(data = results)
metric_results

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Before,Demographic Parity Ratio After,Demographic Parity Difference Before,Demographic Parity Difference After
0,0.854701,0.84083,0.726496,0.695502,0.983772,0.957338,0.01387,0.030994
1,0.752137,0.740484,0.717949,0.726644,0.984508,0.988034,0.011652,0.008695
2,0.478632,0.633218,0.726496,0.709343,0.755873,0.976389,0.154586,0.017153
3,0.589744,0.529412,0.777778,0.778547,0.897698,0.999012,0.060332,0.000769
4,0.82906,0.743945,0.735043,0.740484,0.897335,0.992651,0.085115,0.005442
5,0.547009,0.50519,0.641026,0.650519,0.923551,0.985406,0.041818,0.009493
6,0.888889,0.768166,0.820513,0.820069,0.864187,0.999459,0.120723,0.000444
7,0.692308,0.678201,0.589744,0.595156,0.979623,0.990906,0.014107,0.005412
8,0.888889,0.875433,0.871795,0.882353,0.984862,0.988034,0.013456,0.010558
9,0.735043,0.792388,0.692308,0.695502,0.92763,0.995408,0.057345,0.003194


In [34]:
# convert the average of each metric to a one-row dataframe
# the column means form a Series; turning it into a frame and transposing puts the metrics back in columns

averages = metric_results.mean().to_frame().T
averages

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Before,Demographic Parity Ratio After,Demographic Parity Difference Before,Demographic Parity Difference After
0,0.676353,0.691465,0.714245,0.714994,0.926181,0.986423,0.048477,0.009002


In [36]:
# write the per-iteration metrics and their averages to csv files (row labels are not written)

csv_outputs = (
    (metric_results, "../Model Data/copd_covid_demographic_parity_metric_results.csv"),
    (averages, "../Model Data/copd_covid_demographic_parity_averages.csv"),
)

for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index = False)