In [2]:
# import pandas, scipy.stats, and numpy

import pandas as pd
import scipy.stats as stats
import numpy as np

In [3]:
# read in average data for gender

avg_gender_data = pd.read_csv("../Results/demographic_parity_averages_for_gender_coughvid_data.csv")

In [4]:
# display data

avg_gender_data

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Gender Before,Demographic Parity Ratio Gender After,Demographic Parity Difference Gender Before,Demographic Parity Difference Gender After
0,0.414615,0.31448,0.387436,0.386878,0.752724,0.98191,0.100136,0.006983


In [38]:
# average selection rate before mitigation for gender

avg_sr_gender_before = ((avg_gender_data["Female Selection Rate Before"] + 
                  avg_gender_data["Male Selection Rate Before"]) / 2)[0] * 100

print("Average Selection Rate Before Mitigation: ", avg_sr_gender_before)


# average selection rate after mitigation for gender

avg_sr_gender_after = ((avg_gender_data["Female Selection Rate After"] + 
                  avg_gender_data["Male Selection Rate After"]) / 2)[0] * 100

print("Average Selection Rate After Mitigation: ", avg_sr_gender_after)

Average Selection Rate Before Mitigation:  36.454751131221705
Average Selection Rate After Mitigation:  38.71568627450981


In [5]:
# percent improvement in the demographic parity ratio from before mitigation for gender to the demographic parity 
# ratio after mitigation for gender

gender_dpr_before = avg_gender_data["Demographic Parity Ratio Gender Before"]
gender_dpr_after = avg_gender_data["Demographic Parity Ratio Gender After"]

gender_dpr_percent_improvement = (gender_dpr_after - gender_dpr_before) / gender_dpr_before

In [6]:
# There is a 30.45% improvement in the demographic parity ratio from before mitigation for gender to the 
# demographic parity ratio after mitigation for gender

abs(gender_dpr_percent_improvement[0] * 100)

30.447438015348343

In [15]:
# percent improvement in the demographic parity difference from before mitigation for gender to the demographic 
# parity difference after mitigation for gender

gender_dpd_before = avg_gender_data["Demographic Parity Difference Gender Before"]
gender_dpd_after = avg_gender_data["Demographic Parity Difference Gender After"]

gender_dpd_percent_improvement = (gender_dpd_after - gender_dpd_before) / gender_dpd_before

In [14]:
# There is a 93.03% improvement in the demographic parity difference from before mitigation for gender to the 
# demographic parity difference after mitigation for gender

abs(gender_dpd_percent_improvement[0] * 100)

93.02605814128637

In [16]:
# read in gender data

gender_data = pd.read_csv("../Results/demographic_parity_metric_results_by_gender_coughvid_data.csv")

In [17]:
# display data

gender_data

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Gender Before,Demographic Parity Ratio Gender After,Demographic Parity Difference Gender Before,Demographic Parity Difference Gender After
0,0.342308,0.248869,0.357692,0.355204,0.727032,0.993042,0.093439,0.002489
1,0.338462,0.251131,0.346154,0.330317,0.741979,0.954248,0.08733,0.015837
2,0.426923,0.321267,0.434615,0.436652,0.752517,0.995337,0.105656,0.002036
3,0.503846,0.402715,0.430769,0.418552,0.799282,0.971639,0.101131,0.012217
4,0.346154,0.246606,0.357692,0.359729,0.712418,0.99434,0.099548,0.002036
5,0.496154,0.400452,0.426923,0.432127,0.807114,0.987958,0.095701,0.005204
6,0.507692,0.404977,0.415385,0.409502,0.797683,0.985839,0.102715,0.005882
7,0.515385,0.39819,0.419231,0.418552,0.772608,0.998381,0.117195,0.000679
8,0.496154,0.400452,0.407692,0.41629,0.807114,0.979348,0.095701,0.008597
9,0.342308,0.248869,0.353846,0.346154,0.727032,0.978261,0.093439,0.007692


In [18]:
# need to check variance of the two samples before performing the two sample t-test

# if the ratio of the sample with the larger variance to the sample with the smaller variance is less than 4, 
# we can state that the variances are approximately equal and use Student’s t-test. Otherwise, we have to use 
# Welch's t-test

print("Variance of 'Demographic Parity Difference Gender Before': ", 
      np.var(gender_data["Demographic Parity Difference Gender Before"]))

print("Variance of 'Demographic Parity Difference Gender After': ", 
      np.var(gender_data["Demographic Parity Difference Gender After"]))

print("Variance ratio: ", np.var(gender_data["Demographic Parity Difference Gender Before"]) / 
      np.var(gender_data["Demographic Parity Difference Gender After"]))

Variance of 'Demographic Parity Difference Gender Before':  6.987844911720336e-05
Variance of 'Demographic Parity Difference Gender After':  3.415754978171777e-05
Variance ratio:  2.0457687850492303


In [19]:
# two sample t-test (Student's t-test) to determine whether or not the difference between the mean of the 
# demographic parity difference before mitigation for gender is significantly different from the mean of the 
# demographic parity difference after mitigation for gender

# degrees of freedom = 30

dpd_gender_t_statistic, dpd_gender_p_value = stats.ttest_ind(gender_data["Demographic Parity Difference Gender Before"], 
                                                             gender_data["Demographic Parity Difference Gender After"], 
                                                             equal_var = True)

In [20]:
# The p-value is less than our alpha value of 0.05, so the mean of the demographic parity difference before 
# mitigation for gender is significantly different from the mean of the demographic parity difference after
# mitigation for gender

# The t-statistic, which is calculated as (mean of sample 1 - mean of sample 2) / standard error, is positive, as 
# the mean of the demographic parity difference before mitigation for gender is larger than the mean of the 
# demographic parity difference after mitigation for gender

if dpd_gender_p_value < 0.05:
    print("p-value: ", dpd_gender_p_value)
    print("t-statistic: ", dpd_gender_t_statistic)
    print("Degrees of freedom: 30")
    print("Reject the null hypothesis: The means are significantly different.")
else:
    print("p-value: ", dpd_gender_p_value)
    print("t-statistic: ", dpd_gender_t_statistic)
    print("Degrees of freedom: 30")
    print("Accept the null hypothesis: The means are NOT significantly different.")

p-value:  5.493468421744147e-49
t-statistic:  49.18140565652414
Degrees of freedom: 30
Reject the null hypothesis: The means are significantly different.


In [21]:
# read in average data for age

avg_age_data = pd.read_csv("../Results/demographic_parity_averages_for_age_coughvid_data.csv")

In [22]:
# display data

avg_age_data

Unnamed: 0,Young Selection Rate Before,Old Selection Rate Before,Young Selection Rate After,Old Selection Rate After,Demographic Parity Ratio Age Before,Demographic Parity Ratio Age After,Demographic Parity Difference Age Before,Demographic Parity Difference Age After
0,0.474405,0.196237,0.428912,0.433656,0.416671,0.963838,0.278168,0.016357


In [39]:
# average selection rate before mitigation for age

avg_sr_age_before = ((avg_age_data["Young Selection Rate Before"] + 
                  avg_age_data["Old Selection Rate Before"]) / 2)[0] * 100

print("Average Selection Rate Before Mitigation: ", avg_sr_age_before)


# average selection rate after mitigation for age

avg_sr_age_after = ((avg_age_data["Young Selection Rate After"] + 
                  avg_age_data["Old Selection Rate After"]) / 2)[0] * 100

print("Average Selection Rate After Mitigation: ", avg_sr_age_after)

Average Selection Rate Before Mitigation:  33.532066052227336
Average Selection Rate After Mitigation:  43.12837393021724


In [23]:
# percent improvement in the demographic parity ratio from before mitigation for age to the demographic parity 
# ratio after mitigation for age

age_dpr_before = avg_age_data["Demographic Parity Ratio Age Before"]
age_dpr_after = avg_age_data["Demographic Parity Ratio Age After"]

age_dpr_percent_improvement = (age_dpr_after - age_dpr_before) / age_dpr_before

In [24]:
# There is a 131.32% improvement in the demographic parity ratio from before mitigation for age to the demographic 
# parity ratio after mitigation for age

abs(age_dpr_percent_improvement[0] * 100)

131.31836662798352

In [25]:
# percent improvement in the demographic parity difference from before mitigation for age to the demographic parity
# difference after mitigation for age

age_dpd_before = avg_age_data["Demographic Parity Difference Age Before"]
age_dpd_after = avg_age_data["Demographic Parity Difference Age After"]

age_dpd_percent_improvement = (age_dpd_after - age_dpd_before) / age_dpd_before

In [27]:
# There is a 94.12% improvement in the demographic parity difference from before mitigation for age to the 
# demographic parity difference after mitigation for age

abs(age_dpd_percent_improvement[0] * 100)

94.11965407409599

In [33]:
# read in age data

age_data = pd.read_csv("../Results/demographic_parity_metric_results_by_age_coughvid_data.csv")

In [34]:
# display data

age_data

Unnamed: 0,Young Selection Rate Before,Old Selection Rate Before,Young Selection Rate After,Old Selection Rate After,Demographic Parity Ratio Age Before,Demographic Parity Ratio Age After,Demographic Parity Difference Age Before,Demographic Parity Difference Age After
0,0.377551,0.164516,0.382653,0.377419,0.435745,0.986323,0.213035,0.005234
1,0.380102,0.16129,0.382653,0.390323,0.424334,0.980351,0.218812,0.00767
2,0.484694,0.203226,0.482143,0.474194,0.419287,0.983513,0.281468,0.007949
3,0.612245,0.222581,0.466837,0.5,0.363548,0.933673,0.389664,0.033163
4,0.382653,0.158065,0.382653,0.36129,0.413075,0.944172,0.224589,0.021363
5,0.584184,0.248387,0.471939,0.493548,0.425187,0.956216,0.335797,0.02161
6,0.596939,0.248387,0.47449,0.506452,0.416101,0.936891,0.348552,0.031962
7,0.596939,0.245161,0.469388,0.441935,0.410698,0.941515,0.351777,0.027452
8,0.602041,0.225806,0.459184,0.454839,0.375068,0.990538,0.376234,0.004345
9,0.377551,0.164516,0.380102,0.387097,0.435745,0.98193,0.213035,0.006995


In [44]:
# need to check variance of the two samples before performing the two sample t-test

# if the ratio of the sample with the larger variance to the sample with the smaller variance is less than 4, we 
# can state that the variances are approximately equal and use Student’s t-test. Otherwise, we have to use Welch's 
# t-test

print("Variance of 'Demographic Parity Difference Age Before': ", 
      np.var(age_data["Demographic Parity Difference Age Before"]))

print("Variance of 'Demographic Parity Difference Age After': ", 
      np.var(age_data["Demographic Parity Difference Age After"]))

print("Variance ratio: ", np.var(age_data["Demographic Parity Difference Age Before"]) / 
      np.var(age_data["Demographic Parity Difference Age After"]))

Variance of 'Demographic Parity Difference Age Before':  0.0043841873853217314
Variance of 'Demographic Parity Difference Age After':  0.00016826403145796847
Variance ratio:  26.055404398276764


In [45]:
# two sample t-test (Welch's t-test) to determine whether or not the difference between the mean of the 
# demographic parity difference before mitigation for age is significantly different from the mean of the 
# demographic parity difference after mitigation for age

# degrees of freedom = 30

dpd_age_t_statistic, dpd_age_p_value = stats.ttest_ind(age_data["Demographic Parity Difference Age Before"], 
                                                       age_data["Demographic Parity Difference Age After"], 
                                                       equal_var = False)

In [46]:
# The p-value is less than our alpha value of 0.05, so the mean of the demographic parity difference before 
# mitigation for age is significantly different from the mean of the demographic parity difference after 
# mitigation for age

# The t-statistic, which is calculated as (mean of sample 1 - mean of sample 2) / standard error, is positive, as 
# the mean of the demographic parity difference before mitigation for age is larger than the mean of the 
# demographic parity difference after mitigation for age

if dpd_age_p_value < 0.05:
    print("p-value: ", dpd_age_p_value)
    print("t-statistic: ", dpd_age_t_statistic)
    print("Degrees of freedom: 30")
    print("Reject the null hypothesis: The means are significantly different.")
else:
    print("p-value: ", dpd_age_p_value)
    print("t-statistic: ", dpd_age_t_statistic)
    print("Degrees of freedom: 30")
    print("Accept the null hypothesis: The means are NOT significantly different.")

p-value:  6.482665308702066e-20
t-statistic:  20.89604745790458
Degrees of freedom: 30
Reject the null hypothesis: The means are significantly different.
