<h1> AEQUITAS fairness analysis of the IT candidates dataset </h1>

<h3> Libraries </h3>

In [2]:
import sys
sys.path.append("../")

In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.width', 500)
from aequitas.engine import Aequitas, NpEncoder
import aequitas.tools.data_manip as dm
import aequitas.tools as tools
from aequitas.gateway import Gateway

<h3> Importing the dataset </h3>

In [5]:
# File name of the dataset and its path relative to this notebook.
dataset_name = "IT_candidates.csv"
dataset_directory = f"../datasets/{dataset_name}"  # a full file path, despite the variable name

# Read the CSV into a pandas DataFrame.
dataset = pd.read_csv(filepath_or_buffer=dataset_directory)

<h3> Dataset pre-processing </h3>

In [17]:
# Dataset already preprocessed for missing values.
# Drop the leading CSV column (presumably a saved row index -- TODO confirm).
# After the drop the first column is 'Age', so the guard makes this cell safe to
# re-run: without it, every additional execution would remove a real feature.
# The frame is reassigned rather than mutated with inplace=True.
if dataset.columns[0] != "Age":
    dataset = dataset.drop(columns=dataset.columns[0])

<h3> Examples of use of the AEQUITAS object </h3>

In [26]:
# Example 1: build the Aequitas object from an empty parameters dictionary
parameters = dict()
Aeq_dataset = Aequitas(dataset, parameters)

# Print the per-column summary (name, data type, suggested column type, values)
Aeq_dataset.structure(verbose=True)

Dataset:
        Column Name Data Type Column Type (suggestion)  Number_Values                                           Values
0               Age      text                   Binary              2                                       [<35, >35]
1     Accessibility      text                   Binary              2                                        [No, Yes]
2           EdLevel      text      Categorical/Ordinal              5  [Master, Undergraduate, PhD, Other, NoHigherEd]
3        Employment   float64                   Binary              2                                       [1.0, 0.0]
4            Gender      text      Categorical/Ordinal              3                          [Man, Woman, NonBinary]
..              ...       ...                      ...            ...                                              ...
124  Play Framework   float64                   Binary              2                                       [0.0, 1.0]
125         Phoenix   float64          

In [27]:
# Names of the first 13 columns of the dataset
dataset.columns[:13]

Index(['Age', 'Accessibility', 'EdLevel', 'Employment', 'Gender', 'MentalHealth', 'MainBranch', 'YearsCode', 'YearsCodePro', 'Country', 'PreviousSalary', 'ComputerSkills', 'Employed'], dtype='object')

In [28]:
# Example 2: basic parameters -- only the class attribute is named
parameters = {"class_attribute": {"name": "Employed"}}
Aeq_dataset = Aequitas(dataset, parameters)

# Print the descriptive statistics (here: the proportions of 'Employed')
Aeq_dataset.descriptive_stats(verbose=True)

Proportions: (Employed)
              0
True   0.536721
False  0.463279



In [38]:
# Example 3: sensitive attributes are declared, but no privileged groups are expected.
# Two tasks are possible on this data: a classification task over 'Employed' and a
# regression task over 'PreviousSalary' (the latter is still to do).
parameters = dict(
    class_attribute={"name": "Employed", "positive_value": "True"},
    sensitive_attributes=[{"name": column} for column in ("Gender", "Age")],
)
Aeq_dataset = Aequitas(dataset, parameters)

# Print proportions, outcome distribution by group and the association tests
Aeq_dataset.descriptive_stats(verbose=True)

Proportions: (Employed)
              0
True   0.536721
False  0.463279

Proportions: (Gender)
                  0
Man        0.933539
Woman      0.047794
NonBinary  0.018668

Proportions: (Age)
            0
<35  0.650949
>35  0.349051

Outcome distribution by group:
              True      False
Man        0.541235  0.458765
NonBinary  0.533966  0.466034
Woman      0.449643  0.550357

Outcome distribution by group:
        True      False
<35  0.547803  0.452197
>35  0.516056  0.483944


Association between Gender and Age.
Contingency Table:
Age          <35    >35
Gender                 
Man        44273  24189
NonBinary    895    474
Woman       2570    935

Chi-squared statistic: 109.99624855978593
Cramer's V: 0.03837502151091483
Degrees of Freedom: 2
p-value: 1.3020213636227068e-24
There is a statistically significant association between Gender and Age.

Association between Gender and Employed.
Contingency Table:
Employed   False  True 
Gender                 
Man        31408  3

In [39]:
# Mattias' addition: store the parameters through the project gateway.
# NOTE(review): the recorded output of this cell is a ConnectionError (connection
# refused on localhost:6060), i.e. the call needs a reachable server at `host`.
fs_only = False
gw = Gateway("demonstrator", host="http://localhost:6060/")

gw.save_element(
    Aeq_dataset.parameters,
    element_key="dataset",
    filesystem=fs_only,
)

ConnectionError: HTTPConnectionPool(host='localhost', port=6060): Max retries exceeded with url: /project/demonstrator/data/dataset/0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f9633143b10>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [40]:
# Next step: a more detailed fairness analysis.

# Split the dataset into a training sample and a test sample (random_state is pinned)
training_sample, test_sample = dm.split_dataset(
    dataset,
    ratio=0.3,
    random_state=42,
)

In [37]:
# Parameters shared by the training and test objects: class attribute 'Employed'
# (positive value 'True') and 'Gender' as the only sensitive attribute.
# NOTE(review): no "privileged_group" entry is set for 'Gender' here.
parameters = dict(
    class_attribute=dict(name="Employed", positive_value="True"),
    sensitive_attributes=[dict(name="Gender")],
)

# One Aequitas object per sample, both built from the same parameters dictionary
Aeq_training, Aeq_test = Aequitas(training_sample, parameters), Aequitas(test_sample, parameters)

In [45]:
# Inspect the Aeq_training object: print the column summary of the training
# sample, then compute its descriptive statistics
Aeq_training.structure(verbose=True)
Aeq_training.descriptive_stats()

Dataset:
        Column Name Data Type Column Type (suggestion)  Number_Values                                           Values
0               Age      text                   Binary              2                                       [<35, >35]
1     Accessibility      text                   Binary              2                                        [No, Yes]
2           EdLevel      text      Categorical/Ordinal              5  [Undergraduate, Master, Other, PhD, NoHigherEd]
3        Employment   float64                   Binary              2                                       [1.0, 0.0]
4            Gender      text      Categorical/Ordinal              3                          [Man, Woman, NonBinary]
..              ...       ...                      ...            ...                                              ...
124  Play Framework   float64                   Binary              2                                       [0.0, 1.0]
125         Phoenix   float64          

In [43]:
# You can use the following techniques without defining privileged groups. In that case the results will be displayed
# as if all values could be privileged.
Aeq_training.statistical_parity(verbose=True)

Probabilities:
            Man    Woman  NonBinary
True   0.539914  0.46303   0.529412
False  0.460086  0.53697   0.470588

Statistical/Demographic Parity:
Outcome:  True
                Man     Woman  NonBinary
Man        0.000000  0.076883   0.010502
Woman     -0.076883  0.000000  -0.066381
NonBinary -0.010502  0.066381   0.000000

Outcome:  False
                Man     Woman  NonBinary
Man        0.000000 -0.076883  -0.010502
Woman      0.076883  0.000000   0.066381
NonBinary  0.010502 -0.066381   0.000000




In [44]:
# Disparate impact on the training sample, printed for every pair of 'Gender' values
Aeq_training.disparate_impact(verbose=True)

Probabilities:
            Man    Woman  NonBinary
True   0.539914  0.46303   0.529412
False  0.460086  0.53697   0.470588

Disparate Impact:
Outcome:  True
                Man     Woman  NonBinary
Man        1.000000  0.857601   0.980549
Woman      1.166044  1.000000   1.143363
NonBinary  1.019837  0.874613   1.000000

Outcome:  False
                Man     Woman  NonBinary
Man        1.000000  1.167106   1.022826
Woman      0.856820  1.000000   0.876378
NonBinary  0.977684  1.141061   1.000000




In [15]:
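# # NOTE: leftover transform dictionary written for a different dataset (columns such as
# # income, race, capital-gain); it is kept commented out for reference only.
# # Define appropriate transformations for dataset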
# transform_dictionary = {
#     "income": {
#         "encode": "labeling",
#         "labels": {
#             "<=50K": 0,
#             ">50K": 1, 
#         }
#     },
#     "gender": {
#         "encode": "labeling",
#         "labels": {
#             "Female": 0,
#             "Male": 1, 
#         }
#     },
#     "race": {
#         "encode": "labeling",
#         "labels": {
#             "Minority": 0,
#             "White": 1, 
#         } 
#     },
#     "workclass": {
#         "encode": "labeling",
#         "scaling": "min-max"
#     },
#     "marital-status": {
#         "encode": "labeling",
#         "scaling": "min-max"
#     },
#     "occupation": {
#         "encode": "labeling", 
#         "scaling": "min-max"
#     },
#     "relationship": {
#         "encode": "labeling", 
#         "scaling": "min-max"
#     },
#     "native-country": {
#         "encode": "labeling", 
#         "scaling": "min-max"
#     },
#     "age":{
#         "scaling": "standard"
#     },
#     "educational-num":{
#         "scaling": "min-max"
#     },
#     "capital-gain":{
#         "scaling": "standard"
#     },
#     "capital-loss":{
#         "scaling": "standard"
#     },
#     "hours-per-week":{
#         "scaling": "standard"
#     }
# }

# # add transform instructions for techniques
# Aeq_training.transform_instructions(transform_dictionary)
# Aeq_test.transform_instructions(transform_dictionary)

In [52]:
# First 13 column names, followed by the column summary of the training object
print(dataset.columns[:13])
Aeq_training.structure(verbose=True)

Index(['Age', 'Accessibility', 'EdLevel', 'Employment', 'Gender', 'MentalHealth', 'MainBranch', 'YearsCode', 'YearsCodePro', 'Country', 'PreviousSalary', 'ComputerSkills', 'Employed'], dtype='object')
Dataset:
        Column Name Data Type Column Type (suggestion)  Number_Values                                           Values
0               Age      text                   Binary              2                                       [<35, >35]
1     Accessibility      text                   Binary              2                                        [No, Yes]
2           EdLevel      text      Categorical/Ordinal              5  [Undergraduate, Master, Other, PhD, NoHigherEd]
3        Employment   float64                   Binary              2                                       [1.0, 0.0]
4            Gender      text      Categorical/Ordinal              3                          [Man, Woman, NonBinary]
..              ...       ...                      ...            ...       

In [53]:
# Frequency of each distinct 'YearsCode' value, most frequent first
dataset["YearsCode"].value_counts()

YearsCode
10.0    6335
8.0     4596
7.0     4489
6.0     4374
5.0     4194
15.0    4137
12.0    3865
20.0    3691
9.0     3112
4.0     2844
11.0    2530
14.0    2373
13.0    2282
25.0    2138
3.0     1945
16.0    1863
30.0    1653
18.0    1642
17.0    1602
22.0    1376
2.0     1133
35.0     939
23.0     932
21.0     867
24.0     845
19.0     808
40.0     790
27.0     629
26.0     622
28.0     514
32.0     413
1.0      362
37.0     347
38.0     340
33.0     305
34.0     294
36.0     289
29.0     259
31.0     216
42.0     210
39.0     190
0.0      183
41.0     166
45.0     153
43.0     124
50.0     122
44.0     101
46.0      57
47.0      35
48.0      26
49.0      24
Name: count, dtype: int64

In [None]:
# Define appropriate transformations for the dataset.
# Only columns of the IT candidates dataset are listed. The entries carried over
# from the earlier template ("age", "educational-num", "capital-gain",
# "capital-loss", "hours-per-week") were removed: they are not among the columns
# listed for this dataset ('Age' is capitalised here and is already handled below).
transform_dictionary = {
    "Age": {
        "encode": "one-hot",
    },
    "Accessibility": {
        "encode": "one-hot"
    },
    "EdLevel": {
        "encode": "one-hot"
    },
    "Employment": {
        "encode": "one-hot"
    },
    "Gender": {
        "encode": "one-hot"
    },
    "MentalHealth": {
        "encode": "one-hot"
    },
    "MainBranch": {
        "encode": "one-hot"
    },
    # NOTE(review): YearsCode holds numeric values (see its value counts); confirm
    # that "labeling" is wanted in addition to min-max scaling.
    "YearsCode": {
        "encode": "labeling",
        "scaling": "min-max"
    },
    # TODO: instructions are still missing for YearsCodePro, Country,
    # PreviousSalary, ComputerSkills and the remaining columns.
}

# add transform instructions for techniques
Aeq_training.transform_instructions(transform_dictionary)
Aeq_test.transform_instructions(transform_dictionary)

In [16]:

# Mitigate bias on the training data with massaging
# (uniform_sampling or preferential_sampling can be used as well).
# The sensitive attribute is spelled 'Gender', matching the dataset column and the
# sensitive attribute declared in `parameters`; the lowercase 'gender' belonged to
# a different dataset.
Aeq_training_unbiased = Aeq_training.mitigation(method='massaging', sensitive_attribute='Gender')

# check statistical parity on the new unbiased object
Aeq_training_unbiased.statistical_parity(verbose=True)

Probabilities:
          Male    Female
>50K  0.237753  0.237881

Statistical/Demographic Parity:
Outcome:  >50K
      Male    Female
Male   0.0 -0.000128




In [17]:
# Classify the test sample with a model trained on the mitigated training data.

# Classifier configuration
classifier_type = "Decision_Tree"
classifier_params = dict(random_state=42, min_samples_leaf=10)

# Transform both objects' datasets to numeric values
Aeq_training_unbiased.transform()
Aeq_test.transform()

# Name of the class column, read from the parameters of the mitigated object
class_attribute = Aeq_training_unbiased.parameters["class_attribute"]["name"]

# Fit the classifier on the training sample
clf = tools.train_classifier(
    Aeq_training_unbiased.dataset,
    class_attribute,
    classifier_type,
    classifier_params,
)

# Evaluate on the test sample; only the first of the four returned values is kept
predicted_test_sample, _, _, _ = tools.test_classifier(
    clf,
    Aeq_test.dataset,
    class_attribute,
    verbose=True,
)

# Undo the transformation on the training and the test objects
Aeq_training_unbiased.inverse_transform()
Aeq_test.inverse_transform()

Classifier Accuracy: 0.80


In [18]:
# Define a new object for the predictions: start from a copy of the test object,
# replace its dataset with the predicted test sample, then inverse transform it
Aeq_predicted_test=Aeq_test.copy()
Aeq_predicted_test.set_dataset(predicted_test_sample)
Aeq_predicted_test.inverse_transform()

# check statistical parity on the new prediction test sample
Aeq_predicted_test.statistical_parity(verbose=True)

Probabilities:
          Male    Female
>50K  0.204854  0.221832

Statistical/Demographic Parity:
Outcome:  >50K
      Male    Female
Male   0.0 -0.016979




In [19]:
# Predicted values of the class attribute, taken from the predicted test object
prediction=np.array(Aeq_predicted_test.dataset[class_attribute])

# check equal opportunity / equal odds on the test sample against these predictions
Aeq_test.equal_opportunity(prediction,verbose=True)
Aeq_test.equal_odds(prediction,verbose=True)

Confusion Metrics:  (Positive_outcome='>50K')
          Female         Male
TP    404.000000  1461.000000
TN   3611.000000  6266.000000
FP    671.000000   548.000000
FN    160.000000  1532.000000
TPR     0.716312     0.488139
TNR     0.843298     0.919577
FPR     0.156702     0.080423
FNR     0.283688     0.511861
FDR     0.624186     0.272773
FOR     0.042429     0.196461
PPV     0.375814     0.727227
NPV     0.957571     0.803539
RPP     0.221832     0.204854
RNP     0.778168     0.795146
ACC     0.828518     0.787907
Equality of Opportunity:  (Positive_outcome='>50K')
        Female  Male
Male -0.228173   0.0

Confusion Metrics:  (Positive_outcome='>50K')
          Female         Male
TP    404.000000  1461.000000
TN   3611.000000  6266.000000
FP    671.000000   548.000000
FN    160.000000  1532.000000
TPR     0.716312     0.488139
TNR     0.843298     0.919577
FPR     0.156702     0.080423
FNR     0.283688     0.511861
FDR     0.624186     0.272773
FOR     0.042429     0.196461
PPV

In [20]:
# display the parameters of the mitigated training object
Aeq_training_unbiased.display()

# save the parameters to the remote server as version "unbiased"
gw.save_element(Aeq_training_unbiased.parameters, element_key="dataset", version="unbiased")

Aequitas Dataset parameters:
{
    "class_attribute": {
        "name": "income",
        "positive_value": ">50K"
    },
    "sensitive_attributes": [
        {
            "name": "gender",
            "privileged_group": "Male"
        }
    ],
    "Mitigation": "True",
    "Mitigation_technique": "massaging",
    "transform_dictionary": {
        "income": {
            "encode": "labeling",
            "labels": {
                "<=50K": 0,
                ">50K": 1
            }
        },
        "gender": {
            "encode": "labeling",
            "labels": {
                "Female": 0,
                "Male": 1
            }
        },
        "race": {
            "encode": "labeling",
            "labels": {
                "Minority": 0,
                "White": 1
            }
        },
        "workclass": {
            "encode": "labeling",
            "scaling": "min-max"
        },
        "marital-status": {
            "encode": "labeling",
            "scali

In [21]:
# display the parameters of the (unmitigated) training object
Aeq_training.display()

# save the parameters to the remote server as version "training";
# the object saved is the one displayed above, not the mitigated copy
gw.save_element(Aeq_training.parameters, element_key="dataset", version="training")

Aequitas Dataset parameters:
{
    "class_attribute": {
        "name": "income",
        "positive_value": ">50K"
    },
    "sensitive_attributes": [
        {
            "name": "gender",
            "privileged_group": "Male"
        }
    ],
    "Mitigation": "False",
    "proportions": {
        "income": {
            "<=50K": 0.7622042177308491,
            ">50K": 0.2377957822691509
        },
        "gender": {
            "Male": 0.668138875076779,
            "Female": 0.3318611249232209
        }
    },
    "outcome_distribution_by_group": {
        "income/gender": {
            "Female": {
                "<=50K": 0.8937951701040014,
                ">50K": 0.1062048298959986
            },
            "Male": {
                "<=50K": 0.6968436720220637,
                ">50K": 0.30315632797793635
            }
        }
    },
    "contingency": [
        {
            "attribute1": "gender",
            "attribute2": "income",
            "contingency_table": {
   

In [22]:
# display the parameters of the test object
Aeq_test.display()

# save the parameters to the remote server as version "test";
# the object saved is the one displayed above, not the mitigated training copy
gw.save_element(Aeq_test.parameters, element_key="dataset", version="test")

Aequitas Dataset parameters:
{
    "class_attribute": {
        "name": "income",
        "positive_value": ">50K"
    },
    "sensitive_attributes": [
        {
            "name": "gender",
            "privileged_group": "Male"
        }
    ],
    "Mitigation": "False",
    "transform_dictionary": {
        "income": {
            "encode": "labeling",
            "labels": {
                "<=50K": 0,
                ">50K": 1
            }
        },
        "gender": {
            "encode": "labeling",
            "labels": {
                "Female": 0,
                "Male": 1
            }
        },
        "race": {
            "encode": "labeling",
            "labels": {
                "Minority": 0,
                "White": 1
            }
        },
        "workclass": {
            "encode": "labeling",
            "scaling": "min-max"
        },
        "marital-status": {
            "encode": "labeling",
            "scaling": "min-max"
        },
        "occup