In [1]:
import pandas as pd
import numpy as np
np.random.seed(7)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white", palette=[sns.color_palette('muted')[i] for i in [0,2]], 
        color_codes=True, context="talk")
from IPython import display
%matplotlib inline

import sklearn as sk
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf

print(f"sklearn: {sk.__version__}")
print(f"pandas: {pd.__version__}")
print(f"tensorflow: {tf.__version__}")

sklearn: 0.24.2
pandas: 1.1.5
tensorflow: 2.0.0


In [2]:
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 
                'marital_status', 'occupation', 'relationship', 'race', 'sex', 
                'capital_gain', 'capital_loss', 'hours_per_week', 'country', 'target']

input_data = (pd.read_csv('E:/canada syntex/Github/fair_classifier_ml/data/adult.data', names=column_names, 
                              na_values="?", sep=r'\s*,\s*', engine='python') # here seperator -- 0 or more whitespace then , then 0 or more whitespace --
                              .loc[lambda df: df['race'].isin(['White', 'Black'])])

input_data = input_data.dropna()

# labeling data
lb_make = LabelEncoder()
input_data['workclass'] = lb_make.fit_transform(input_data['workclass'])
input_data['education'] = lb_make.fit_transform(input_data['workclass'])
input_data['marital_status'] = lb_make.fit_transform(input_data['marital_status'])
input_data['occupation'] = lb_make.fit_transform(input_data['occupation'])
input_data['relationship'] = lb_make.fit_transform(input_data['relationship'])
input_data['country'] = lb_make.fit_transform(input_data['country'])
input_data['target'] = lb_make.fit_transform(input_data['target'])

print(input_data.shape)
input_data.head(10).T


(28750, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
age,39,50,38,53,28,37,49,52,31,42
workclass,5,4,2,2,2,2,2,4,2,2
fnlwgt,77516,83311,215646,234721,338409,284582,160187,209642,45781,159449
education,5,4,2,2,2,2,2,4,2,2
education_num,13,13,9,7,13,14,5,9,14,13
marital_status,4,2,0,2,2,2,3,2,4,2
occupation,0,3,5,5,9,3,7,3,9,3
relationship,1,0,1,0,5,5,1,0,1,0
race,White,White,White,Black,Black,White,Black,White,White,White
sex,Male,Male,Male,Male,Female,Female,Female,Male,Female,Male


## Race

### creating dataset for white race

In [3]:
white_data = input_data[(input_data["race"]=="White")]
sensitive_data = white_data.loc[:, ['race', 'sex']].drop(columns=['sex'])
# sensitive_data[sensitive_data["race"]=="White"] = 1
white_data = white_data.drop(columns=['target', 'race', 'sex'])

print(white_data.shape)
print(sensitive_data.shape)
white_data.head().T

(25933, 12)
(25933, 1)


Unnamed: 0,0,1,2,5,7
age,39,50,38,37,52
workclass,5,4,2,2,4
fnlwgt,77516,83311,215646,284582,209642
education,5,4,2,2,4
education_num,13,13,9,14,9
marital_status,4,2,0,2,2
occupation,0,3,5,3,3
relationship,1,0,1,5,0
capital_gain,2174,0,0,0,0
capital_loss,0,0,0,0,0


In [4]:
mut_race_white = mutual_info_classif(white_data, sensitive_data['race'])
print(type(mut_race_white))
print(mut_race_white)

<class 'numpy.ndarray'>
[1.54243628e-04 3.04631165e-03 0.00000000e+00 3.23911618e-03
 9.83303127e-04 2.35221532e-03 7.13376779e-04 5.01291790e-04
 1.92804535e-05 5.78413604e-05 1.38819265e-03 6.72887826e-03]


### creating dataset for black race

In [5]:
black_data = input_data[(input_data["race"]=="Black")]
sensitive_data = black_data.loc[:, ['race', 'sex']].drop(columns=['sex'])
black_data = black_data.drop(columns=['target', 'race', 'sex'])

print(black_data.shape)
print(sensitive_data.shape)
black_data.head().T

(2817, 12)
(2817, 1)


Unnamed: 0,3,4,6,10,13
age,53,28,49,37,32
workclass,2,2,2,2,2
fnlwgt,234721,338409,160187,280464,205019
education,2,2,2,2,2
education_num,7,13,5,10,12
marital_status,2,2,3,2,4
occupation,5,9,7,3,11
relationship,0,5,1,0,1
capital_gain,0,0,0,0,0
capital_loss,0,0,0,0,0


In [6]:
mut_race_black = mutual_info_classif(black_data, sensitive_data['race'])
print(mut_race_black)

[3.33066907e-16 3.54987575e-04 3.33066907e-16 5.32481363e-04
 1.77493788e-04 1.77493788e-04 3.54987575e-04 3.33066907e-16
 3.33066907e-16 3.33066907e-16 1.77493788e-04 7.09975151e-04]


## Sex

### Male

In [7]:
male_data = input_data[(input_data["sex"]=="Male")]
sensitive_data = male_data.loc[:, ['race', 'sex']].drop(columns=['race'])
male_data = male_data.drop(columns=['target', 'race', 'sex'])

print(male_data.shape)
print(sensitive_data.shape)
male_data.head().T

(19456, 12)
(19456, 1)


Unnamed: 0,0,1,2,3,7
age,39,50,38,53,52
workclass,5,4,2,2,4
fnlwgt,77516,83311,215646,234721,209642
education,5,4,2,2,4
education_num,13,13,9,7,9
marital_status,4,2,0,2,2
occupation,0,3,5,5,3
relationship,1,0,1,0,0
capital_gain,2174,0,0,0,0
capital_loss,0,0,0,0,0


In [8]:
mut_sex_male = mutual_info_classif(male_data, sensitive_data['sex'])
print(mut_sex_male)

[5.13980263e-05 1.97882401e-03 1.22124533e-15 2.28721217e-03
 9.76562500e-04 2.67269737e-03 3.85485197e-04 3.85485197e-04
 7.70970395e-05 1.22124533e-15 1.43914474e-03 4.98560855e-03]


### Female

In [9]:
female_data = input_data[(input_data["sex"]=="Female")]
sensitive_data = female_data.loc[:, ['race', 'sex']].drop(columns=['race'])
female_data = female_data.drop(columns=['target', 'race', 'sex'])

print(female_data.shape)
print(sensitive_data.shape)
female_data.head().T

(9294, 12)
(9294, 1)


Unnamed: 0,4,5,6,8,12
age,28,37,49,31,23
workclass,2,2,2,2,2
fnlwgt,338409,284582,160187,45781,122272
education,2,2,2,2,2
education_num,13,14,5,14,13
marital_status,2,2,3,4,4
occupation,9,3,7,9,0
relationship,5,5,1,1,3
capital_gain,0,0,0,14084,0
capital_loss,0,0,0,0,0


In [10]:
mut_sex_female = mutual_info_classif(female_data, sensitive_data['sex'])
print(mut_sex_female)

[0.         0.00096837 0.         0.00118356 0.00080697 0.00043039
 0.00037659 0.00026899 0.         0.         0.00043039 0.00252851]


## Feature selection

In [11]:
mut_race = mut_race_white * mut_race_black
mut_sex = mut_sex_male * mut_sex_female
mut = mut_race + mut_sex
print(mut)

[5.13734481e-20 2.99763004e-06 0.00000000e+00 4.43182020e-06
 9.62589025e-07 1.56779298e-06 3.98408625e-07 1.03691951e-07
 6.42168101e-21 1.92650430e-20 8.65782159e-07 1.73835125e-05]


In [12]:
mut_percent = (mut/np.sum(mut))*100
print(mut_percent)

[1.78931563e-13 1.04406196e+01 0.00000000e+00 1.54358437e+01
 3.35265716e+00 5.46055715e+00 1.38764051e+00 3.61154713e-01
 2.23664454e-14 6.70993361e-14 3.01548291e+00 6.05460442e+01]


In [13]:
for i in mut_percent:
    print(i)

1.7893156301064985e-13
10.440619594048515
0.0
15.435843718120537
3.352657162812929
5.460557148328273
1.3876405136102177
0.3611547125342888
2.2366445375944854e-14
6.709933612873611e-14
3.015482914711145
60.54604423583383


In [14]:
remove_features = []
features_name = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 
            'marital_status', 'occupation', 'relationship','capital_gain',
            'capital_loss', 'hours_per_week', 'country']

for i in range(len(features_name)):
    if mut_percent[i] < 0.1:
        remove_features.append(features_name[i])

print(remove_features)

['age', 'fnlwgt', 'capital_gain', 'capital_loss']


In [15]:
new_input_data = input_data.drop(columns = remove_features)
new_input_data.head()

Unnamed: 0,workclass,education,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,country,target
0,5,5,13,4,0,1,White,Male,40,36,0
1,4,4,13,2,3,0,White,Male,13,36,0
2,2,2,9,0,5,1,White,Male,40,36,0
3,2,2,7,2,5,0,Black,Male,40,36,0
4,2,2,13,2,9,5,Black,Female,40,3,0


In [16]:
sensitive_attribs = ['race', 'sex']
Z = (new_input_data.loc[:, sensitive_attribs]
        .assign(race=lambda df: (df['race'] == 'White').astype(int),
        sex=lambda df: (df['sex'] == 'Male').astype(int)))

y = new_input_data['target']

X = new_input_data.drop(columns=['target', 'race', 'sex']).pipe(pd.get_dummies, drop_first=True)

print(f"features X: {X.shape[0]} samples, {X.shape[1]} attributes")
print(f"targets y: {y.shape[0]} samples")
print(f"sensitives Z: {Z.shape[0]} samples, {Z.shape[1]} attributes")

features X: 28750 samples, 8 attributes
targets y: 28750 samples
sensitives Z: 28750 samples, 2 attributes


In [None]:
# for i, row in X.iterrows():
#     print(i, row)
#     break