In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import os
import numpy as np
import tensorflow as tf
from sklearn import model_selection
from sklearn.metrics import log_loss
import sklearn
from copy import deepcopy
import pandas as pd
from sklearn.utils import shuffle
from sklearn import model_selection

# Root folder that the data/, processed/ and models/ paths are joined onto in the
# later configuration cell (presumably the repository root -- confirm).
BASE_DIR = '../../../'
import sys
# Must run before the project import below: it puts BASE_DIR on the module search path.
sys.path.append(BASE_DIR)

# custom code
import utils.utils
# Experiment configuration; the random seed and per-dataset settings are read from it.
CONFIG = utils.utils.load_config("../../config.json")

Using TensorFlow backend.


In [4]:
# The dataset name is the name of the current working directory
# (i.e. the folder this notebook is run from).
DATASET = os.path.basename(os.getcwd())
RANDOM_SEED = CONFIG['random_seed']

# Settings that are specific to this dataset's experiment.
_dataset_config = CONFIG['experiment_configs'][DATASET]
EVAL_GROUPS = _dataset_config['eval_groups']
HYPER_TRAIN_SPLIT = _dataset_config["hyper_train_split"]
VAL_FULL_SPLIT = _dataset_config["val_full_split"]

print(RANDOM_SEED)
print(f"EVAL_GROUPS: {EVAL_GROUPS}")

# Raw input folder, plus per-seed output folders for processed data and models.
DATA_F = os.path.join(BASE_DIR, f'data/adult/')
PROCESSED_DIR = os.path.join(BASE_DIR, f'processed/{DATASET}/rs={RANDOM_SEED}')
MODELS_DIR = os.path.join(BASE_DIR, f'models/{DATASET}/rs={RANDOM_SEED}')

PROCESSED_SAVEPATH = utils.utils.get_savepath(
    PROCESSED_DIR, DATASET, ".pkl", eg=EVAL_GROUPS)

# Warn (but carry on) when a processed artifact already exists for this seed.
_already_processed = os.path.exists(PROCESSED_SAVEPATH)
if _already_processed:
    print(f"warning: processing has been done for rs={RANDOM_SEED}")

15
EVAL_GROUPS: ['gender_Male', 'gender_Female']


In [5]:
# Create the output folders up front; existing folders are left untouched.
for _output_dir in (PROCESSED_DIR, MODELS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [6]:
def get_adult_data(data_dir=None):
    """Load the Adult census CSVs and return aligned, one-hot encoded frames.

    Reads ``adult.data`` (train) and ``adult.test`` (test; its first line is
    skipped) from ``data_dir`` and then:

    * derives a binary ``label`` column (1 when ``income_bracket`` contains
      ``'>50K'``, else 0);
    * keeps only the categorical and continuous feature columns plus the label
      (``fnlwgt``, ``marital_status`` and ``income_bracket`` are dropped);
    * bucketizes the continuous columns (``age`` into 4 quantile buckets
      learned on the training data, the others into fixed bins);
    * one-hot encodes every feature column as 0/1 indicators;
    * adds ``private_workforce`` / ``non_private_workforce`` derived from
      ``workclass_Private``, renames the two hyphenated ``race_*`` columns and
      drops every column whose name contains ``'workclass'``.

    Note: ``pd.cut`` is used with its default ``include_lowest=False``, so a
    value equal to the lowest bin edge (e.g. the minimum training age) falls in
    no bucket and gets no indicator set. This is kept as-is so that the
    produced features do not change.

    Args:
        data_dir: Folder containing ``adult.data`` and ``adult.test``. When
            omitted, the notebook-level ``DATA_F`` is used.

    Returns:
        ``(train_df, test_df)``: two DataFrames with the same columns in the
        same order. A column that exists in only one of the two splits is
        added to the other one filled with zeros.
    """
    if data_dir is None:
        # Looked up at call time so the notebook-level constant stays the default.
        data_dir = DATA_F

    CATEGORICAL_COLUMNS = [
        'workclass', 'education', 'occupation', 'relationship',
        'race', 'gender', 'native_country',
    ]
    CONTINUOUS_COLUMNS = [
        'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'education_num',
    ]
    COLUMNS = [
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'gender',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
        'income_bracket',
    ]
    LABEL_COLUMN = 'label'
    # Fixed bin edges for the continuous columns that are not quantile-bucketed.
    FIXED_BINS = [
        ('capital_gain', [-1, 1, 4000, 10000, 100000]),
        ('capital_loss', [-1, 1, 1800, 1950, 4500]),
        ('hours_per_week', [0, 39, 41, 50, 100]),
        ('education_num', [0, 8, 9, 11, 16]),
    ]
    # Hyphen-free names for the two hyphenated race indicator columns.
    RACE_RENAMES = {
        "race_Amer-Indian-Eskimo": "race_Amer_Indian_Eskimo",
        "race_Asian-Pac-Islander": "race_Asian_Pac_Islander",
    }

    train_df_raw = pd.read_csv(
        os.path.join(data_dir, 'adult.data'),
        names=COLUMNS,
        skipinitialspace=True,
    )
    test_df_raw = pd.read_csv(
        os.path.join(data_dir, 'adult.test'),
        names=COLUMNS,
        skipinitialspace=True,
        skiprows=1,  # the first line of adult.test is not a data row
    )

    # Substring match so that both '>50K' and '>50K.' count as positive.
    train_df_raw[LABEL_COLUMN] = (train_df_raw['income_bracket'].apply(
        lambda x: '>50K' in x)).astype(int)
    test_df_raw[LABEL_COLUMN] = (test_df_raw['income_bracket'].apply(
        lambda x: '>50K' in x)).astype(int)

    # Preprocessing Features
    # Kept from the original code: this is a process-wide pandas setting that
    # later cells may rely on.
    pd.options.mode.chained_assignment = None  # default='warn'

    def fix_columns(input_train_df, input_test_df):
        """Give both frames the same columns, in the same order.

        The training frame's column order is the reference; columns that only
        exist in the test frame are appended in the order they appear there.
        Columns missing from either frame are filled with 0.
        """
        train_columns = set(input_train_df.columns)
        all_columns = list(input_train_df.columns) + [
            c for c in input_test_df.columns if c not in train_columns
        ]
        return (
            input_train_df.reindex(columns=all_columns, fill_value=0),
            input_test_df.reindex(columns=all_columns, fill_value=0),
        )

    def binarize_categorical_columns(
            input_train_df, input_test_df, categorical_columns=None):
        """One-hot encode ``categorical_columns`` in both frames and align them."""
        columns = [] if categorical_columns is None else list(categorical_columns)
        # pandas >= 2.0 defaults to bool indicators; pin the historical uint8
        # 0/1 encoding so the saved features do not depend on the pandas version.
        binarized_train_df = pd.get_dummies(
            input_train_df, columns=columns, dtype=np.uint8)
        binarized_test_df = pd.get_dummies(
            input_test_df, columns=columns, dtype=np.uint8)
        # Make sure the train and test dataframes have the same binarized columns.
        return fix_columns(binarized_train_df, binarized_test_df)

    def bucketize_continuous_column(input_train_df,
                                    input_test_df,
                                    continuous_column_name,
                                    num_quantiles=None,
                                    bins=None):
        """Replace a continuous column by its bucket index, in place, in both frames.

        Exactly one of ``num_quantiles`` (bin edges are the quantiles of the
        training column) or ``bins`` (explicit edges) is expected; when neither
        is given the frames are left unchanged.
        """
        assert (num_quantiles is None or bins is None)
        if num_quantiles is not None:
            # Learn the edges on the training data only, then reuse them for test.
            _, bins = pd.qcut(
                input_train_df[continuous_column_name],
                num_quantiles,
                retbins=True,
                labels=False)
        if bins is not None:
            for frame in (input_train_df, input_test_df):
                frame[continuous_column_name] = pd.cut(
                    frame[continuous_column_name], bins, labels=False)

    # Filter out all columns except the ones specified. Explicit copies, because
    # the bucketizing below writes into these frames in place.
    kept_columns = CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS + [LABEL_COLUMN]
    train_df = train_df_raw[kept_columns].copy()
    test_df = test_df_raw[kept_columns].copy()

    # Bucketize continuous columns.
    bucketize_continuous_column(train_df, test_df, 'age', num_quantiles=4)
    for column_name, bin_edges in FIXED_BINS:
        bucketize_continuous_column(
            train_df, test_df, column_name, bins=bin_edges)

    # The bucket indices are one-hot encoded too, just like the categoricals.
    train_df, test_df = binarize_categorical_columns(
        train_df, test_df,
        categorical_columns=CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS)

    # Two complementary indicators derived from the 'Private' workclass.
    train_df["private_workforce"] = train_df["workclass_Private"]
    train_df["non_private_workforce"] = 1 - train_df["workclass_Private"]
    test_df["private_workforce"] = test_df["workclass_Private"]
    test_df["non_private_workforce"] = 1 - test_df["workclass_Private"]

    train_df = train_df.rename(columns=RACE_RENAMES)
    test_df = test_df.rename(columns=RACE_RENAMES)

    # Drop the raw workclass indicators; only the two derived columns remain.
    cols = [c for c in train_df.columns if 'workclass' not in c]
    train_df = train_df[cols]
    test_df = test_df[cols]

    return train_df, test_df

In [9]:
# Build the encoded train and test frames; the next cell splits each of them further.
train_data, test_data = get_adult_data()

In [10]:
# Carve the hyper_train set out of the training data, stratified on the label.
train_df, hyper_train_df = model_selection.train_test_split(
    train_data,
    stratify=train_data['label'].values,
    test_size=HYPER_TRAIN_SPLIT,
    random_state=RANDOM_SEED,
)

del train_data

# Carve the validation set out of the test data, stratified jointly on the
# evaluation groups and the label.
test_df, val_full_df = model_selection.train_test_split(
    test_data,
    stratify=test_data[EVAL_GROUPS + ['label']].values,
    test_size=VAL_FULL_SPLIT,
    random_state=RANDOM_SEED,
)

del test_data

# There is no separate val / hyper_val in this setting: both names refer to
# the same frame.
val_df = hyper_val_df = val_full_df

del val_full_df

In [11]:
# Shapes of every split, in the order: train, hyper_train, val, hyper_val, test.
print(*(
    split.shape
    for split in (train_df, hyper_train_df, val_df, hyper_val_df, test_df)
))

(21815, 109) (10746, 109) (1629, 109) (1629, 109) (14652, 109)


In [12]:
# Write every split to PROCESSED_DIR as CSV, without the index column.
for _csv_name, _split_df in (
    ("train.csv", train_df),
    ("hyper_train.csv", hyper_train_df),
    ("val.csv", val_df),
    ("hyper_val.csv", hyper_val_df),
    ("test.csv", test_df),
):
    _split_df.to_csv(os.path.join(PROCESSED_DIR, _csv_name), index=False)