In [51]:
import pandas as pd
import numpy as np
np.random.seed(7)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white", palette=[sns.color_palette('muted')[i] for i in [0,2]], 
        color_codes=True, context="talk")
from IPython import display
%matplotlib inline

import sklearn as sk
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf

print(f"sklearn: {sk.__version__}")
print(f"pandas: {pd.__version__}")
print(f"tensorflow: {tf.__version__}")

sklearn: 1.1.2
pandas: 1.4.3
tensorflow: 2.10.0


In [52]:
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 
                'marital_status', 'occupation', 'relationship', 'race', 'sex', 
                'capital_gain', 'capital_loss', 'hours_per_week', 'country', 'target']

input_data = (pd.read_csv('E:/canada syntex/Github/fair_classifier_ml/data/adult.data', names=column_names, 
                              na_values="?", sep=r'\s*,\s*', engine='python') # here seperator -- 0 or more whitespace then , then 0 or more whitespace --
                              .loc[lambda df: df['race'].isin(['White', 'Black'])])

input_data = input_data.dropna()

# labeling data
lb_make = LabelEncoder()
input_data['workclass'] = lb_make.fit_transform(input_data['workclass'])
input_data['education'] = lb_make.fit_transform(input_data['workclass'])
input_data['marital_status'] = lb_make.fit_transform(input_data['marital_status'])
input_data['occupation'] = lb_make.fit_transform(input_data['occupation'])
input_data['relationship'] = lb_make.fit_transform(input_data['relationship'])
input_data['country'] = lb_make.fit_transform(input_data['country'])
input_data['target'] = lb_make.fit_transform(input_data['target'])

print(input_data.shape)
input_data.head(10).T


(28750, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
age,39,50,38,53,28,37,49,52,31,42
workclass,5,4,2,2,2,2,2,4,2,2
fnlwgt,77516,83311,215646,234721,338409,284582,160187,209642,45781,159449
education,5,4,2,2,2,2,2,4,2,2
education_num,13,13,9,7,13,14,5,9,14,13
marital_status,4,2,0,2,2,2,3,2,4,2
occupation,0,3,5,5,9,3,7,3,9,3
relationship,1,0,1,0,5,5,1,0,1,0
race,White,White,White,Black,Black,White,Black,White,White,White
sex,Male,Male,Male,Male,Female,Female,Female,Male,Female,Male


## Race

### creating dataset for white race

In [69]:
white_data = input_data[(input_data["race"]=="White")]
sensitive_data = white_data.loc[:, ['race', 'sex']].drop(columns=['sex'])
# sensitive_data[sensitive_data["race"]=="White"] = 1
white_data = white_data.drop(columns=['target', 'race', 'sex'])

print(white_data.shape)
print(sensitive_data.shape)
white_data.head().T

(25933, 12)
(25933, 1)


Unnamed: 0,0,1,2,5,7
age,39,50,38,37,52
workclass,5,4,2,2,4
fnlwgt,77516,83311,215646,284582,209642
education,5,4,2,2,4
education_num,13,13,9,14,9
marital_status,4,2,0,2,2
occupation,0,3,5,3,3
relationship,1,0,1,5,0
capital_gain,2174,0,0,0,0
capital_loss,0,0,0,0,0


In [71]:
mut_race_white = mutual_info_classif(white_data, sensitive_data['race'])
print(type(mut_race_white))
print(mut_race_white)

<class 'numpy.ndarray'>
[1.15682721e-04 3.02703120e-03 0.00000000e+00 2.73782439e-03
 1.00258358e-03 2.39077623e-03 5.78413604e-04 7.90498593e-04
 1.92804535e-05 1.92804535e-05 1.33035129e-03 6.61319554e-03]


### creating dataset for black race

In [60]:
black_data = input_data[(input_data["race"]=="Black")]
sensitive_data = black_data.loc[:, ['race', 'sex']].drop(columns=['sex'])
black_data = black_data.drop(columns=['target', 'race', 'sex'])

print(black_data.shape)
print(sensitive_data.shape)
black_data.head().T

(2817, 12)
(2817, 1)


Unnamed: 0,3,4,6,10,13
age,53,28,49,37,32
workclass,2,2,2,2,2
fnlwgt,234721,338409,160187,280464,205019
education,2,2,2,2,2
education_num,7,13,5,10,12
marital_status,2,2,3,2,4
occupation,5,9,7,3,11
relationship,0,5,1,0,1
capital_gain,0,0,0,0,0
capital_loss,0,0,0,0,0


In [61]:
mut_race_black = mutual_info_classif(black_data, sensitive_data['race'])
print(mut_race_black)

[3.33066907e-16 3.54987575e-04 3.33066907e-16 5.32481363e-04
 1.77493788e-04 1.77493788e-04 3.54987575e-04 3.33066907e-16
 3.33066907e-16 3.33066907e-16 1.77493788e-04 7.09975151e-04]


## Sex

### Male

In [64]:
male_data = input_data[(input_data["sex"]=="Male")]
sensitive_data = male_data.loc[:, ['race', 'sex']].drop(columns=['race'])
male_data = male_data.drop(columns=['target', 'race', 'sex'])

print(male_data.shape)
print(sensitive_data.shape)
male_data.head().T

(19456, 12)
(19456, 1)


Unnamed: 0,0,1,2,3,7
age,39,50,38,53,52
workclass,5,4,2,2,4
fnlwgt,77516,83311,215646,234721,209642
education,5,4,2,2,4
education_num,13,13,9,7,9
marital_status,4,2,0,2,2
occupation,0,3,5,5,3
relationship,1,0,1,0,0
capital_gain,2174,0,0,0,0
capital_loss,0,0,0,0,0


In [65]:
mut_sex_male = mutual_info_classif(male_data, sensitive_data['sex'])
print(mut_sex_male)

[1.02796053e-04 2.31291118e-03 1.22124533e-15 2.33861020e-03
 5.13980263e-04 1.90172697e-03 1.79893092e-04 4.88281250e-04
 1.22124533e-15 2.56990132e-05 1.00226151e-03 5.34539474e-03]


### Female

In [66]:
female_data = input_data[(input_data["sex"]=="Female")]
sensitive_data = female_data.loc[:, ['race', 'sex']].drop(columns=['race'])
female_data = female_data.drop(columns=['target', 'race', 'sex'])

print(female_data.shape)
print(sensitive_data.shape)
female_data.head().T

(9294, 12)
(9294, 1)


Unnamed: 0,4,5,6,8,12
age,28,37,49,31,23
workclass,2,2,2,2,2
fnlwgt,338409,284582,160187,45781,122272
education,2,2,2,2,2
education_num,13,14,5,14,13
marital_status,2,2,3,4,4
occupation,9,3,7,9,0
relationship,5,5,1,1,3
capital_gain,0,0,0,14084,0
capital_loss,0,0,0,0,0


In [68]:
mut_sex_female = mutual_info_classif(female_data, sensitive_data['sex'])
print(mut_sex_female)

[0.         0.00145255 0.         0.00161394 0.00037659 0.00064558
 0.00026899 0.00091457 0.         0.         0.00043039 0.00188294]


In [None]:
# for i, row in X.iterrows():
#     print(i, row)
#     break