In [4]:
import pandas as pd
import numpy as np
from scipy.stats import kstest
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
#import shap
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error as MSE

In [60]:
# Load the dataset that already contains the candidate-evaluation predictions.
df = pd.read_csv('data/random_df_final.csv')

# Replace missing values with the literal string 'N/A' so that they appear as
# their own category in value_counts(). This is applied to every column, so a
# column that had missing entries ends up holding a mix of its original values
# and the string 'N/A'.
df = df.replace(np.nan, 'N/A')

# Categorical columns of the dataset, handy for ad-hoc inspection such as
# df[col].value_counts().
cat_cols = ['School Name', 'GPA', 'Degree', 'Location', 'Gender',
            'Veteran status', 'Work authorization', 'Disability', 'Ethnicity',
            'Role 1', 'Role 2', 'Role 3']

# Only the last expression of a cell is rendered, so the column index is the
# single thing displayed here.
df.columns

Index(['Applicant ID', 'School Name', 'GPA', 'Degree', 'Location', 'Gender',
       'Veteran status', 'Work authorization', 'Disability', 'Ethnicity',
       'Role 1', 'Start 1', 'End 1', 'Role 2', 'Start 2', 'End 2', 'Role 3',
       'Start 3', 'End 3', 'Resume score', 'Candidate evaluation'],
      dtype='object')

In [61]:
# Sensitive attributes that the fairness analysis is run on.
attributes = ['Gender', 'Work authorization', 'Disability', 'Ethnicity']

# Print the number of rows in each group of every sensitive attribute.
for attr in attributes:
    sensitive_values = df[attr]
    print(sensitive_values.value_counts())

F      501
N/A    500
M      499
Name: Gender, dtype: int64
1    770
0    730
Name: Work authorization, dtype: int64
1.0    508
0.0    506
N/A    486
Name: Disability, dtype: int64
4    324
0    307
3    297
2    291
1    281
Name: Ethnicity, dtype: int64


In [72]:
# define fairness metrics functions

def independence(y_hat, group, class_1, class_2):
  """
  Computes an independence metric between two specific groups: the ratio of
  the mean prediction of class_1 to the mean prediction of class_2.

  Args:
    y_hat (np.ndarray or pd.Series): Classifier predictions, one per sample.
    group (np.ndarray or pd.Series): Group label of each sample, aligned
      position-by-position with y_hat.
    class_1, class_2 : The two classes within that group we want to compare, ex. "M" and "F"
  Returns:
    float: mean(y_hat | group == class_1) / mean(y_hat | group == class_2).
      The result is inf (or nan for 0/0) when the mean prediction of class_2
      is 0, and nan when either class has no samples.
  """
  # np.where returns *positional* indices. Plain `series[positions]` is a label
  # lookup in pandas, which raises KeyError or picks the wrong rows whenever the
  # index is not 0..n-1, so select by position explicitly.
  by_position = y_hat.iloc if hasattr(y_hat, "iloc") else np.asarray(y_hat)

  group_A = by_position[np.where(group == class_1)[0]]
  group_B = by_position[np.where(group == class_2)[0]]

  ind = np.mean(group_A) / np.mean(group_B)

  return ind

def spd(sensitive_attribute, dataset, prediction_col, minority_class, majority_class):
    """
    Calculate the Statistical Parity Difference (SPD) between two groups of a
    sensitive attribute, based on predicted labels:

        SPD = mean(prediction | minority) - mean(prediction | majority)

    Parameters (note: minority_class comes BEFORE majority_class here, which is
    the opposite order from di()):
    - sensitive_attribute (str): Name of the column representing the sensitive attribute.
    - dataset (pd.DataFrame): The dataset containing the sensitive attribute and the predictions.
      It is only read, never modified.
    - prediction_col (str): Name of the column representing predicted labels for the outcome variable.
    - minority_class: Value representing the minority class in the sensitive attribute.
    - majority_class: Value representing the majority class in the sensitive attribute.

    Returns:
    - spd (float): Statistical Parity Difference between minority and majority classes.
      nan if either class has no rows.
    """
    # Boolean selection produces new objects and nothing below writes to
    # `dataset`, so a defensive deep copy of the whole frame is unnecessary.
    predictions = dataset[prediction_col]
    groups = dataset[sensitive_attribute]

    minority_mean = np.mean(predictions[groups == minority_class])
    majority_mean = np.mean(predictions[groups == majority_class])

    spd_val = minority_mean - majority_mean

    return spd_val

def di(sensitive_attribute, dataset, prediction_col, majority_class, minority_class):
    """
    Calculate the Disparate Impact (DI) between majority and minority classes
    based on predicted labels:

        DI = mean(prediction | minority) / mean(prediction | majority)

    Parameters (note: majority_class comes BEFORE minority_class here, which is
    the opposite order from spd()):
    - sensitive_attribute (str): Name of the column representing the sensitive attribute.
    - dataset (pd.DataFrame): The dataset containing the sensitive attribute and the predictions.
      It is only read, never modified.
    - prediction_col (str): Name of the column representing predicted labels for the outcome variable.
    - majority_class: Value representing the majority class in the sensitive attribute.
    - minority_class: Value representing the minority class in the sensitive attribute.

    Returns:
    - di (float): Disparate Impact between majority and minority classes.
      inf (or nan for 0/0) when the majority mean prediction is 0, and nan if
      either class has no rows.
    """
    # Boolean selection produces new objects and nothing below writes to
    # `dataset`, so a defensive deep copy of the whole frame is unnecessary.
    predictions = dataset[prediction_col]
    groups = dataset[sensitive_attribute]

    minority_mean = np.mean(predictions[groups == minority_class])
    majority_mean = np.mean(predictions[groups == majority_class])

    di_val = minority_mean / majority_mean

    return di_val

In [75]:
# Calculate the metrics for every pair of groups within each of the sensitive attributes

# result tuples are collected here and turned into a dataframe at the end
# (di() is not part of the resulting table)
scores = []
for attr in attributes:
  unique_groups = df[attr].unique()
  # pair each group only with the groups that come after it, so every
  # unordered pair is scored exactly once
  for i, group_i in enumerate(unique_groups):
    for group_j in unique_groups[i + 1:]:
      independence_score = independence(df['Candidate evaluation'], df[attr], group_i, group_j)
      spd_score = spd(attr, df, 'Candidate evaluation', group_i, group_j)
      row = (attr, group_i, group_j, independence_score, spd_score)
      scores.append(row)

scores_df = pd.DataFrame(
    scores,
    columns=['Sensitive attribute', 'Group_A', 'Group_B', 'AvsB_Independence', 'AvsB_SPD'],
)
print(scores_df)


   Sensitive attribute Group_A Group_B  AvsB_Independence  AvsB_SPD
0               Gender       F     N/A                inf  0.341317
1               Gender       F       M           0.621596 -0.207781
2               Gender     N/A       M           0.000000 -0.549098
3   Work authorization       0       1           0.925727 -0.022861
4           Disability     0.0     N/A           1.237091  0.060980
5           Disability     0.0     1.0           1.016581  0.005190
6           Disability     N/A     1.0           0.821751 -0.055790
7            Ethnicity       4       2           0.842014 -0.052119
8            Ethnicity       4       1           0.867284 -0.042507
9            Ethnicity       4       3           0.970588 -0.008418
10           Ethnicity       4       0           1.015212  0.004162
11           Ethnicity       2       1           1.030011  0.009612
12           Ethnicity       2       3           1.152699  0.043702
13           Ethnicity       2       0          

  ind = np.mean(group_A) / np.mean(group_B)


In [77]:
# Write the pairwise fairness scores to disk, leaving out the dataframe index.
scores_df.to_csv(path_or_buf='scores_df.csv', index=False)