In [25]:
import pandas as pd
import numpy as np
import os
import itertools

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from data import construct_hier

from folktables import ACSDataSource, ACSEmployment, ACSIncome, ACSPublicCoverage

In [26]:
# Load the dataset; '?' entries in the CSV are parsed as missing values.
path = 'datasets/adult/adult_reconstruction.csv'
df = pd.read_csv(path, header=0, na_values='?')
# pd.to_numeric returns a new Series rather than converting in place, so the
# result has to be assigned back -- otherwise the conversion is discarded.
df['income'] = pd.to_numeric(df['income'])
# Binary target: True when income is at least 50000.
df['label'] = df['income'] >= 50000
# Features exclude the raw income and the label derived from it.
X, y = df.drop(['income', 'label'], axis=1), df['label']

In [27]:
# Partition the feature columns by dtype: object/bool columns are treated as
# categorical, int64/float64 columns as numerical.
cat_idx = X.select_dtypes(include=['object', 'bool']).columns
num_idx = X.select_dtypes(include=['int64', 'float64']).columns

# One (name, transformer, columns) entry per column family: one-hot encoding
# for the categorical columns, standard scaling for the numerical ones.
cat_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_idx)
num_step = ('num', StandardScaler(), num_idx)
steps = [cat_step, num_step]
col_transf = ColumnTransformer(steps)

In [33]:
# Group logic
# Mask selecting every row: a list with one True per entry of y.
ALL = [True] * len(y)

# Values of the 'education' column counted as high school or below.
EDU_HS = [
    '10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th',
    'Preschool', 'HS-grad',
]
# Values of the 'education' column counted as college level.
EDU_COL = [
    'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate',
    'Masters', 'Prof-school', 'Some-college',
]

# Age
# Three disjoint brackets: <= 35, (35, 50], and > 50.
age_group_names = ['Ya', 'Ma', 'Oa']
age_col = X['age']
young = np.array(age_col <= 35)
mid = np.array((age_col > 35) & (age_col <= 50))
old = np.array(age_col > 50)
age_groups = [young, mid, old]

# Education
# Boolean masks over the rows of X, one per coarse education level, built
# from the EDU_HS / EDU_COL value lists.
edu_hs = np.array(X['education'].isin(EDU_HS))
edu_col = np.array(X['education'].isin(EDU_COL))
edu_groups = [edu_hs, edu_col]
edu_group_names = ['HS', 'COL']
# The lists were previously named "smoker_*" although they hold education
# masks; the old names are kept as aliases so code that still refers to them
# keeps working.
smoker_groups = edu_groups
smoker_group_names = edu_group_names

# Race groups
# Short display names and the matching values of the 'race' column, in the
# same order.
race_group_names = ["AIE", "API", "B", "O", "W"]
race_values = ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White']
# One boolean mask per race value; race_groups[i] corresponds to
# race_group_names[i].
race_groups = [np.array(X['race'] == value) for value in race_values]
# Individual masks stay available under their own names.
race_aie, race_api, race_b, race_o, race_w = race_groups

# Sex groups
# Masks are derived from the 'gender' column; order matches sex_group_names.
sex_group_names = ['M', 'F']
is_male = np.array(X['gender'] == 'Male')
is_female = np.array(X['gender'] == 'Female')
sex_groups = [is_male, is_female]

# Levels passed to construct_hier: the all-rows mask, then the sex masks,
# then the race masks, with a parallel list of names for each level.
# NOTE(review): presumably construct_hier combines consecutive levels into
# intersected subgroups -- confirm against data.construct_hier.
hier_levels = [[ALL], sex_groups, race_groups]
hier_level_names = [["ALL"], sex_group_names, race_group_names]
groups, group_names, tree = construct_hier(hier_levels, hier_level_names)