In [1]:
# import pandas, scipy.stats, and numpy

import pandas as pd
import scipy.stats as stats
import numpy as np

In [2]:
# Load the pre-computed averages of the demographic parity metrics.

avg_data = pd.read_csv(filepath_or_buffer="../Data/demographic_parity_averages.csv")

In [3]:
# Display the averages frame (the cell's last expression is rendered as its output).

avg_data

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Before,Demographic Parity Ratio After,Demographic Parity Difference Before,Demographic Parity Difference After
0,0.358333,0.743333,0.456667,0.441667,0.480311,0.899823,0.385,0.045


In [4]:
# Mean of the female and male selection rates before mitigation, taken from
# the row labelled 0 of the averages frame and expressed as a percentage.

avg_sr_before = (
    avg_data.at[0, "Female Selection Rate Before"]
    + avg_data.at[0, "Male Selection Rate Before"]
) / 2 * 100

print("Average Selection Rate Before Mitigation: ", avg_sr_before)


# Same calculation for the selection rates after mitigation.

avg_sr_after = (
    avg_data.at[0, "Female Selection Rate After"]
    + avg_data.at[0, "Male Selection Rate After"]
) / 2 * 100

print("Average Selection Rate After Mitigation: ", avg_sr_after)

Average Selection Rate Before Mitigation:  55.08333333333334
Average Selection Rate After Mitigation:  44.91666666666666


In [5]:
# Relative change in the demographic parity ratio caused by mitigation:
# (after - before) / before, computed element-wise on the two columns.

dpr_before = avg_data.loc[:, "Demographic Parity Ratio Before"]
dpr_after = avg_data.loc[:, "Demographic Parity Ratio After"]

dpr_percent_improvement = dpr_after.sub(dpr_before).div(dpr_before)

In [6]:
# Magnitude of the relative change in the demographic parity ratio, as a
# percentage: an 87.34% improvement from before mitigation to after mitigation.

abs(dpr_percent_improvement.loc[0] * 100)

87.3416529450848

In [7]:
# Relative change in the demographic parity difference caused by mitigation:
# (after - before) / before, computed element-wise on the two columns.

dpd_before = avg_data.loc[:, "Demographic Parity Difference Before"]
dpd_after = avg_data.loc[:, "Demographic Parity Difference After"]

dpd_percent_improvement = dpd_after.sub(dpd_before).div(dpd_before)

In [8]:
# Magnitude of the relative change in the demographic parity difference, as a
# percentage: an 88.31% improvement from before mitigation to after mitigation.
# The raw relative change is negative because the difference shrinks, so abs()
# is used to report its size.

abs(dpd_percent_improvement.loc[0] * 100)

88.31168831168831

In [9]:
# Load the per-run demographic parity metric results.

data = pd.read_csv(filepath_or_buffer="../Data/demographic_parity_metric_results.csv")

In [10]:
# Display the per-run results frame (the cell's last expression is rendered as its output).

data

Unnamed: 0,Female Selection Rate Before,Male Selection Rate Before,Female Selection Rate After,Male Selection Rate After,Demographic Parity Ratio Before,Demographic Parity Ratio After,Demographic Parity Difference Before,Demographic Parity Difference After
0,0.4,0.8,0.5,0.5,0.5,1.0,0.4,0.0
1,0.3,0.7,0.45,0.3,0.428571,0.666667,0.4,0.15
2,0.4,0.8,0.55,0.55,0.5,1.0,0.4,0.0
3,0.25,0.65,0.65,0.65,0.384615,1.0,0.4,0.0
4,0.3,0.7,0.45,0.45,0.428571,1.0,0.4,0.0
5,0.3,0.7,0.4,0.3,0.428571,0.75,0.4,0.1
6,0.3,0.75,0.45,0.45,0.4,1.0,0.45,0.0
7,0.4,0.8,0.3,0.4,0.5,0.75,0.4,0.1
8,0.4,0.8,0.35,0.4,0.5,0.875,0.4,0.05
9,0.3,0.65,0.4,0.35,0.461538,0.875,0.35,0.05


In [12]:
# Check the variances of the two samples before running the two-sample t-test.
#
# Rule of thumb: if (larger variance) / (smaller variance) is less than 4, the
# variances are treated as approximately equal and Student's t-test is used.
# Otherwise Welch's t-test has to be used.
#
# The ratio is therefore always formed as larger / smaller, regardless of which
# sample happens to have the larger variance. Dividing "After" by "Before"
# unconditionally would report a ratio below 1 whenever "Before" is the more
# variable sample, and the "< 4" check would then pass trivially.

# Compute each variance once and reuse it for both the printout and the ratio.
dpd_before_variance = np.var(data["Demographic Parity Difference Before"])
dpd_after_variance = np.var(data["Demographic Parity Difference After"])

print("Variance of 'Demographic Parity Difference Before': ",
      dpd_before_variance)

print("Variance of 'Demographic Parity Difference After': ",
      dpd_after_variance)

print("Variance ratio: ",
      max(dpd_before_variance, dpd_after_variance) /
      min(dpd_before_variance, dpd_after_variance))

Variance of 'Demographic Parity Difference Before':  0.0016916666666666666
Variance of 'Demographic Parity Difference After':  0.0018916666666666656
Variance ratio:  1.118226600985221


In [13]:
# Student's two-sample t-test (equal_var=True, i.e. pooled variance) comparing
# the mean demographic parity difference before mitigation with the mean
# demographic parity difference after mitigation.
#
# For the pooled test the degrees of freedom are n1 + n2 - 2, where n1 and n2
# are the numbers of observations in the two samples.

dpd_t_statistic, dpd_p_value = stats.ttest_ind(
    data.loc[:, "Demographic Parity Difference Before"],
    data.loc[:, "Demographic Parity Difference After"],
    equal_var=True,
)

In [14]:
# Report the outcome of the two-sample t-test on the demographic parity
# difference at a significance level of 0.05.
#
# If the p-value is below alpha, the mean demographic parity difference before
# mitigation is significantly different from the mean after mitigation.
#
# The t-statistic is (mean of sample 1 - mean of sample 2) / standard error, so
# it is positive when the mean before mitigation is larger than the mean after
# mitigation.

dpd_alpha = 0.05

# Degrees of freedom of the pooled (Student's) test: n1 + n2 - 2. Derived from
# the data instead of being hard-coded so it stays correct when the number of
# rows in the results file changes.
dpd_degrees_of_freedom = (
    data["Demographic Parity Difference Before"].count()
    + data["Demographic Parity Difference After"].count()
    - 2
)

# These three lines are the same for both outcomes, so print them once.
print("p-value: ", dpd_p_value)
print("t-statistic: ", dpd_t_statistic)
print("Degrees of freedom:", dpd_degrees_of_freedom)

if dpd_p_value < dpd_alpha:
    print("Reject the null hypothesis: The means are significantly different.")
else:
    # A non-significant result does not prove the null hypothesis; it only
    # means there is not enough evidence to reject it.
    print("Fail to reject the null hypothesis: The means are NOT significantly different.")

p-value:  1.785802290593625e-37
t-statistic:  30.586818866500217
Degrees of freedom: 28
Reject the null hypothesis: The means are significantly different.
