In [1]:
# Add the parent directory to the module search path; presumably this is what
# lets the local `aequitas` package imported in the next cell resolve -- TODO confirm.
import sys
sys.path.append("../")

In [2]:
# Numerical / tabular data libraries
import numpy as np
import pandas as pd
# aequitas modules used below: data manipulation (dm), descriptive statistics
# (dstats) and the metrics module (metrics)
import aequitas.tools.data_manip as dm
import aequitas.detection.descriptive_stats as dstats
import aequitas.detection.metrics as metrics
# Set pandas' text display width to 500 characters
pd.set_option('display.width', 500)

In [3]:
# Load the Census Income dataset from the repository's datasets folder.
dataset_name = "Census_Income_Dataset.csv"
# NOTE: despite its name, this holds the full path to the CSV file.
dataset_directory = f"../datasets/{dataset_name}"
dataset = pd.read_csv(dataset_directory)

In [4]:
# Dataset Pre-Processing

# remove fnlwgt column (per instructions)
dataset = dataset.drop('fnlwgt', axis=1)

# remove education column since there is an educational-num column
dataset = dataset.drop('education', axis=1)

# impute the missing values
num_data = dataset.shape[0]
col_names = dataset.columns

# "?" entries are treated as missing: turn them into NaN across the whole frame
# in one vectorized call. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0 and raises AttributeError there.
dataset = dataset.replace("?", np.nan)

# Fill the gaps of every column with that column's most frequent value
# (value_counts() sorts by descending frequency, so index[0] is the mode).
dataset = dataset.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [5]:
# Print a per-column summary of the dataset and keep the returned structure
dataset_struct = dstats.analyse_dataset(
    dataset,
    verbose=True,
)

Dataset:
        Column Name Data Type Column Type (suggestion)  Number_Values                                             Values
0               age     int64               Continuous             74                                                  -
1         workclass      text      Categorical/Ordinal              8  [Private, Local-gov, Self-emp-not-inc, Federal...
2   educational-num     int64      Categorical/Ordinal             16  [7, 9, 12, 10, 6, 15, 4, 13, 14, 16, 3, 11, 5,...
3    marital-status      text      Categorical/Ordinal              7  [Never-married, Married-civ-spouse, Widowed, D...
4        occupation      text      Categorical/Ordinal             14  [Machine-op-inspct, Farming-fishing, Protectiv...
5      relationship      text      Categorical/Ordinal              6  [Own-child, Husband, Not-in-family, Unmarried,...
6              race      text      Categorical/Ordinal              5  [Black, White, Asian-Pac-Islander, Other, Amer...
7            gender    

In [6]:
# Show how the class outcome ('income') is distributed over the dataset
pr1 = dstats.proportions(
    dataset,
    'income',
    verbose=True,
)

Proportions: (income)
              0
<=50K  0.760718
>50K   0.239282



In [7]:
# Show the proportions of two features of interest
# -- gender
pr2 = dstats.proportions(dataset, 'gender', verbose=True)
# -- race
pr3 = dstats.proportions(dataset, 'race', verbose=True)

Proportions: (gender)
               0
Male    0.668482
Female  0.331518

Proportions: (race)
                           0
White               0.855043
Black               0.095922
Asian-Pac-Islander  0.031100
Amer-Indian-Eskimo  0.009623
Other               0.008313



In [8]:
# compute the outcome (income) distribution within each gender group
outdist1 = dstats.outcome_distribution_by_group(dataset, 'income', 'gender', verbose=True)

# compute the outcome (income) distribution within each race group;
# stored under its own name so the gender result is not overwritten
outdist2 = dstats.outcome_distribution_by_group(dataset, 'income', 'race', verbose=True)

Outcome distribution by group:
           <=50K      >50K
Female  0.890749  0.109251
Male    0.696233  0.303767

Outcome distribution by group:
                       <=50K      >50K
Amer-Indian-Eskimo  0.882979  0.117021
Asian-Pac-Islander  0.730744  0.269256
Black               0.879189  0.120811
Other               0.876847  0.123153
White               0.746013  0.253987



In [9]:
# Collapse the race feature into two categories: White and Minority.
# Each inner list of `groups` is paired with the label at the same position.
groups = [
    ['White'],
    ['Black', 'Asian-Pac-Islander', 'Other', 'Amer-Indian-Eskimo'],
]
labels = ['White', 'Minority']
dataset["race"] = dm.merge_values(dataset["race"], groups, labels)
print("Unique values: ", dataset["race"].unique())

Unique values:  ['Minority' 'White']


In [10]:
# Measure the association between features (chi-squared test and Cramer's V)
rel = dstats.contingency(
    dataset,
    'income',
    ['gender', 'race'],
    verbose=True,
)


Association between gender and race.
Contingency Table:
race    Minority  White
gender                 
Female      3165  13027
Male        3915  28735

Chi-squared statistic: 497.9678182429906
Cramer's V: 0.10087228311688282
Degrees of Freedom: 1
p-value: 2.6310785315092373e-110
There is a statistically significant association between gender and race.

Association between gender and income.
Contingency Table:
income  <=50K  >50K
gender             
Female  14423  1769
Male    22732  9918

Chi-squared statistic: 2248.847679013691
Cramer's V: 0.21453154666546925
Degrees of Freedom: 1
p-value: 0.0
There is a statistically significant association between gender and income.

Association between race and income.
Contingency Table:
income    <=50K   >50K
race                  
Minority   6000   1080
White     31155  10607

Chi-squared statistic: 341.69392421526
Cramer's V: 0.08351988884542906
Degrees of Freedom: 1
p-value: 2.7287685643121156e-76
There is a statistically significant associat

In [11]:
# Mutual information between gender, race and the income outcome
mut = dstats.mutual_information(
    dataset,
    'income',
    ['gender', 'race'],
    verbose=True,
)

Mutual Information between gender and race: 0.004913186885676296
Mutual Information between gender and income: 0.02543127128464205
Mutual Information between race and income: 0.003810673439274867


In [12]:
# compute outcome probabilities per gender group
prob_gender = metrics.stats(dataset, 'income', 'gender', verbose=True)
# compute outcome probabilities per race group; kept in its own variable so the
# gender result is not overwritten under a misleading name
prob_race = metrics.stats(dataset, 'income', 'race', verbose=True)

Probabilities:
           Male    Female
<=50K  0.696233  0.890749
>50K   0.303767  0.109251

Probabilities:
       Minority     White
<=50K  0.847458  0.746013
>50K   0.152542  0.253987



In [13]:
# Statistical (demographic) parity for gender and race, taking '>50K' as the
# positive outcome and Male / White as the privileged groups.
positive_outcome = '>50K'
priviledged_gender = 'Male'
priviledged_race = 'White'

sp_gender = metrics.statistical_parity(
    dataset,
    'income',
    'gender',
    positive_outcome=positive_outcome,
    privileged_group=priviledged_gender,
    verbose=True,
)
sp_race = metrics.statistical_parity(
    dataset,
    'income',
    'race',
    positive_outcome=positive_outcome,
    privileged_group=priviledged_race,
    verbose=True,
)

Statistical/Demographic Parity:
Outcome:  >50K
      Male    Female
Male   0.0  0.194516


Statistical/Demographic Parity:
Outcome:  >50K
       Minority  White
White  0.101445    0.0




In [14]:
# compute disparate impact for gender and race
positive_outcome = '>50K'
priviledged_gender = 'Male'
priviledged_race = 'White'
# Results get their own `di_*` names: reusing `sp_gender` / `sp_race` here would
# overwrite the statistical parity results with disparate impact values.
di_gender = metrics.disparate_impact(
    dataset, 'income', 'gender',
    positive_outcome=positive_outcome,
    privileged_group=priviledged_gender,
    verbose=True,
)
di_race = metrics.disparate_impact(
    dataset, 'income', 'race',
    positive_outcome=positive_outcome,
    privileged_group=priviledged_race,
    verbose=True,
)

Disparate Impact:
Outcome:  >50K
      Male    Female
Male   1.0  0.359655


Disparate Impact:
Outcome:  >50K
       Minority  White
White  0.600592    1.0


