# Part 1 - Install the required packages



In [None]:
!pip install autoviz -q

In [None]:
!pip install pandas_profiling==2.9.0 -U -q

In [None]:
!pip install shap -q 

In [None]:
!pip install openml

# Part 2 - Import packages

In [None]:
import openml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport


# Part 3 - Get the OpenML dataset

This step imports the employee_salaries dataset from an existing OpenML task.

In [None]:
# Look up the OpenML task by id, then fetch the dataset that task refers to.
task = openml.tasks.get_task(task_id=295794)
data = openml.datasets.get_dataset(dataset_id=task.dataset_id)

In [None]:
# Pull the features (X) and the dataset's default target (y) as pandas objects.
X, y, categorical_indicator, attribute_names = data.get_data(
    target=data.default_target_attribute,
    dataset_format="dataframe",
)

# Combine the features and the target into one frame; the target is stored as "salary".
es = pd.DataFrame(X, columns=attribute_names).assign(salary=y)
es.head()

# Do initial EDA

Run a profile report to get an idea of all parameters/features

In [None]:
# Build a pandas_profiling report for `es`; as the cell's last expression it is rendered inline.
ProfileReport(es, explorative=True)

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
es.info()

# Complete the dataset by filling in NaN/missing values

Dropping rows with NaN values does introduce bias; however, the data is incomplete, and the analysis can only be done on a complete set of features.

In [None]:
# Build the cleaned frame from the raw `es`; drop() returns a new frame, so `es`
# is left untouched and this cell can be re-run safely.
es_new = es.drop(columns=['full_name', 'department'])

# A missing overtime figure is treated as zero overtime pay.
es_new['2016_overtime_pay'] = es_new['2016_overtime_pay'].fillna(0)

# Indicator column: 1 when an underfilled job title is present, 0 when it is missing.
# It must be computed before the missing titles are replaced with "None" below.
es_new['underfilled_job'] = es_new['underfilled_job_title'].notna().astype(int)
es_new['underfilled_job_title'] = es_new['underfilled_job_title'].fillna("None")

# Parse the hiring date and the hiring year into datetime values.
es_new['date_first_hired'] = pd.to_datetime(es_new['date_first_hired'])
es_new['year_first_hired'] = pd.to_datetime(es_new['year_first_hired'], format="%Y")

# dropna() returns a new frame rather than modifying in place, so the result has
# to be assigned back; otherwise the rows with missing values stay in es_new.
es_new = es_new.dropna()
es_new.head()

In [None]:
# Print the column names, dtypes and non-null counts of the cleaned frame.
es_new.info()

In [None]:
# Show the first five rows of the cleaned frame.
es_new.head(5)

# Encode gender and assignment_category

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder is reused: every fit_transform call refits it, so after the
# loop it holds the classes of the last column encoded (assignment_category).
enc = LabelEncoder()
for column in ('gender', 'assignment_category'):
    es_new[column] = enc.fit_transform(es_new[column])

# Binary label: 1 when salary is at or above the threshold, otherwise 0.
# NOTE(review): the origin of 91969.07 is not shown here - confirm how it was derived.
es_new['higher_salary'] = (es_new['salary'] >= 91969.07).astype(int)
es_new.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# (x-axis column, panel title) for each bar chart, in grid order.
panels = [
    ("gender", 'Probability of salary per gender type'),
    ("department_name", 'Probability of salary per department'),
    ("division", 'Probability of salary per division'),
    ("assignment_category", 'Probability of salary depending on full-time/part-time'),
    ("employee_position_title", 'Probability of salary per employment position'),
    ("2016_overtime_pay", 'Probability of salary per overtime pay'),
    ("year_first_hired", 'Probability of salary per year'),
]

fig = plt.figure(figsize=(20,10))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

# One salary bar plot per column, laid out on a 3x3 grid (slots 1-7 are used).
for slot, (column, title) in enumerate(panels, start=1):
    plt.subplot(3, 3, slot)
    sns.barplot(x=column, y='salary', data=es_new)
    plt.title(title)

From the above charts we can see that there is some bias in gender. However, the bias is more apparent in department_name, division and year_first_hired.

There may also be some correlation between overtime_pay and salary.

# Try AIF360

The AIF360 library is an AI Fairness library which is used to identify and mitigate bias.

In [None]:
# Install the AIF 360 library
!pip install aif360 -q
!pip install fairlearn -q
!pip install lime -q

In [None]:
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing import Reweighing
from sklearn.model_selection import train_test_split

In [None]:
# Columns copied as-is from the cleaned frame.
base_columns = [
    'gender',
    'assignment_category',
    'salary',
    'higher_salary',
    '2016_overtime_pay',
    '2016_gross_pay_received',
    'underfilled_job',
]

# One indicator column per department name.
department_dummies = pd.get_dummies(es_new['department_name'])

# Put both parts side by side and keep only rows without missing values.
features = pd.concat([es_new[base_columns], department_dummies], axis=1).dropna()
features

In [None]:
# Fixed seed so the shuffled split below (and every metric computed from it)
# gives the same result on each Restart & Run All.
SPLIT_SEED = 42

# Plain arrays of the non-salary columns and of salary; not used by the AIF360
# calls in this cell.
x = features.drop(columns=['salary']).values
y = features['salary'].values

# Wrap the feature frame for AIF360: `higher_salary` is the label and `gender`
# is the protected attribute.
df_aif = BinaryLabelDataset(df=features, label_names=['higher_salary'], protected_attribute_names=['gender'])

# NOTE(review): gender was label-encoded earlier; confirm which original value
# maps to 0 so the privileged/unprivileged assignment matches the intent.
privileged_groups = [{'gender': 0}]
unprivileged_groups = [{'gender': 1}]

# Split at the 50% and 80% marks into train / validation / test parts.
features_orig_trn, features_orig_val, features_orig_tst = df_aif.split(
    [0.5, 0.8], shuffle=True, seed=SPLIT_SEED)
print([part.features.shape for part in [features_orig_trn, features_orig_val, features_orig_tst]])


In [None]:
from IPython.display import Markdown, display

# Fairness metric on the untransformed training split.
metric_orig_trn = BinaryLabelDatasetMetric(
    features_orig_trn,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups,
)

display(Markdown("#### Original training dataset"))
print(f"Difference in mean outcomes between unprivileged and privileged groups = {metric_orig_trn.mean_difference():f}")

# Mitigate bias by Reweighing

In [None]:
# Configure the Reweighing preprocessor with the same group definitions that
# are used for the fairness metrics.
RW = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
# Fit on the training split and transform that same split in one step.
dataset_transf_train = RW.fit_transform(features_orig_trn)

# Compute fairness metric on transformed dataset

In [None]:
# Same fairness metric as before, now computed on the reweighed training split.
metric_transf_train = BinaryLabelDatasetMetric(
    dataset_transf_train,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups,
)

display(Markdown("#### Transformed training dataset"))
mean_diff_transf = metric_transf_train.mean_difference()
print(f"Difference in mean outcomes between unprivileged and privileged groups = {mean_diff_transf:f}")

**We selected gender as the first attribute for which to identify bias, mitigate it, and re-evaluate the metrics.  
The difference in mean outcomes before reweighing was 0.024380, and after reweighing it was -0.000000.**