# Obtaining parsed folktables datasets

In [1]:
import sys
import logging
from pathlib import Path

import pandas as pd
import numpy as np
from folktables import ACSDataSource

**NOTE**: use `MAX_SENSITIVE_GROUPS=2` to generate datasets for binary-group experiments.

In [2]:
# Split proportions (train / test / validation); they must add up to 1
TRAIN_SIZE, TEST_SIZE, VALIDATION_SIZE = 0.6, 0.2, 0.2

# Number of largest sensitive groups whose samples are kept.
# Use the commented-out alternative below to keep samples from all groups.
MAX_SENSITIVE_GROUPS = 4
# MAX_SENSITIVE_GROUPS = None

# Random seed passed to the data splitting routine
SEED = 42

In [3]:
# Sanity check: the split fractions must add up to 1.
# Compared with a tolerance because sums of decimal fractions are not always
# exactly 1.0 in floating point (e.g. 0.7 + 0.2 + 0.1 == 0.9999999999999999).
assert np.isclose(TRAIN_SIZE + TEST_SIZE + (VALIDATION_SIZE or 0.0), 1.0), (
    "TRAIN_SIZE + TEST_SIZE + VALIDATION_SIZE must sum to 1"
)

**Change** these paths to point to wherever you want the data to be saved.

In [4]:
# Datasets are stored under "<home>/data/folktables"; the folder is created if missing
root_dir = Path("~").expanduser()
data_dir = root_dir.joinpath("data", "folktables")
data_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Data source for the 2018 ACS 1-Year person survey, rooted at `data_dir`
from folktables.load_acs import state_list

data_source = ACSDataSource(
    survey_year="2018",
    horizon="1-Year",
    survey="person",
    root_dir=str(data_dir),
)

In [6]:
# Load the records of every state in `state_list` (3236107 rows x 286 columns).
# `download=True` is needed when the data has not been downloaded yet.
acs_data = data_source.get_data(
    states=state_list,
    download=True,
)
acs_data.shape

(3236107, 286)

According to the dataset's datasheet, train/test splits should be stratified by state
(at least for ACSIncome, the remaining tasks seem ambiguous).

In [7]:
# Column holding the state code; used to stratify the data splits
STATE_COL = "ST"

# ACS columns that are treated as categorical when setting column dtypes
ACS_CATEGORICAL_COLS = {
    # -- demographics / origin
    "SEX",
    "RAC1P",    # race code
    "ANC",      # ancestry
    "POBP",     # place of birth code
    "CIT",      # citizenship status
    "NATIVITY",
    "ST",
    # -- household / family
    "MAR",      # marital status
    "RELP",     # relationship status
    "ESP",      # employment status of parents
    "FER",
    "GCL",
    # -- work / mobility
    "COW",      # class of worker
    "OCCP",     # occupation code
    "ESR",
    "MIG",      # mobility status
    "MIL",      # military service
    "JWTR",
    # -- disability
    "DIS",      # disability
    "DEAR",
    "DEYE",
    "DREM",
    # -- currently not treated as categorical
    # "PUMA",
    # "POWPUMA",
}

In [8]:
import logging
from copy import deepcopy
from typing import Tuple
from functools import reduce
from operator import or_

from sklearn.model_selection import train_test_split
from folktables import BasicProblem

def split_folktables_task(
        acs_data: pd.DataFrame,
        acs_task: BasicProblem,
        train_size: float,
        test_size: float,
        validation_size: float = None,
        max_sensitive_groups: int = None,
        stratify_by_state: bool = True,
        save_to_disk: Path = None,
        file_prefix: str = "",
        seed: int = 42,
    ) -> Tuple[pd.DataFrame, ...]:
    """Pre-process a folktables task and split it into train/test(/validation) sets.

    According to the dataset's datasheet, (at least) the ACSIncome
    task should be stratified by state.

    The returned DataFrames have the label as first column, the sensitive
    group as second column, and the remaining task features afterwards (in the
    order they are listed in ``acs_task.features``).

    Relies on the notebook-level constants ``STATE_COL`` and
    ``ACS_CATEGORICAL_COLS``.

    Parameters
    ----------
    acs_data : pd.DataFrame
        Raw ACS data, as passed on to ``acs_task.df_to_numpy``.
    acs_task : folktables.BasicProblem
        Task definition. It is never modified: when the state column has to be
        added for stratification, this is done on a deep copy.
    train_size : float
        Fraction of samples used for training.
    test_size : float
        Fraction of samples used for testing.
    validation_size : float, optional
        Fraction of samples used for validation; if None or 0, no validation
        split is produced.
    max_sensitive_groups : int, optional
        If the number of protected groups exceeds this, discard samples belonging to
        the groups with lowest relative size.
    stratify_by_state : bool, optional
        Whether to stratify splits by state.
    save_to_disk : Path, optional
        If given, each split is written as a CSV file to a sub-folder of this
        directory; the sub-folder name encodes the split sizes (and
        ``max_sensitive_groups``, when used).
    file_prefix : str, optional
        Prefix of the saved file names ("<prefix>.train.csv", ...).
    seed : int, optional
        Random state used for both splits.

    Returns
    -------
    (train_data, test_data, validation_data) : Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
        ``validation_data`` is only included when ``validation_size`` is truthy;
        otherwise the tuple is ``(train_data, test_data)``.
    """
    # Sanity checks
    assert all(
        val is None or 0 <= val <= 1
        for val in (train_size, test_size, validation_size)
    ), "split sizes must be fractions in [0, 1]"
    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0
    assert np.isclose(train_size + test_size + (validation_size or 0.0), 1.0), \
        "train_size + test_size + validation_size must sum to 1"

    # Add State to the feature columns so we can do stratified splits (will be removed later)
    # Only remove the state column later if we were the ones adding it
    remove_state_col_later = False
    if stratify_by_state and STATE_COL not in acs_task.features:
        acs_task = deepcopy(acs_task)  # we're gonna need to change this task object
        acs_task.features.append(STATE_COL)
        remove_state_col_later = True

    # Pre-process data + select task-specific features
    features, label, _ = acs_task.df_to_numpy(acs_data)

    # Make a DataFrame with all processed data
    df = pd.DataFrame(data=features, columns=acs_task.features)
    df[acs_task.target] = label

    # Correct column ordering (1st: label, 2nd: group, 3rd and onwards: features).
    # The order of `acs_task.features` is preserved (a `set` would make the
    # column order change from one interpreter run to the next).
    feature_cols = list(dict.fromkeys(
        col for col in acs_task.features if col != acs_task.group
    ))
    cols_order = [acs_task.target, acs_task.group] + feature_cols
    if remove_state_col_later:
        cols_order = [col for col in cols_order if col != STATE_COL]

    # Save state_col for stratified split (None when not stratifying)
    state_col_data = df[STATE_COL] if stratify_by_state else None

    # Enforce correct ordering in df
    df = df[cols_order]

    # Drop samples from sensitive groups with low relative size
    # (e.g., original paper has only White and Black races)
    if max_sensitive_groups is not None and max_sensitive_groups > 0:
        group_sizes = df.value_counts(acs_task.group, sort=True, ascending=False)
        big_groups = group_sizes.index.to_list()[:max_sensitive_groups]

        big_groups_filter = df[acs_task.group].isin(big_groups).to_numpy()

        # Keep only big groups; copy so the column assignments below act on an
        # independent DataFrame (avoids SettingWithCopyWarning)
        df = df.loc[big_groups_filter].copy()
        if state_col_data is not None:
            state_col_data = state_col_data.loc[big_groups_filter]

        # Group values must be sequential, and start at 0
        # (e.g., if we deleted group=2 but kept group=3, the latter should now have value 2)
        # NOTE(review): this condition does not trigger when max == nunique
        # (e.g. groups {1, 2}); such cases are only shifted to start at 0 by the
        # categorical-column step below. Left unchanged to keep the group
        # encoding of previously generated datasets -- confirm intended behavior.
        if df[acs_task.group].max() > df[acs_task.group].nunique():
            # Groups are numbered by decreasing size (largest group -> 0)
            map_to_sequential = {g: idx for idx, g in enumerate(big_groups)}
            df[acs_task.group] = df[acs_task.group].map(map_to_sequential)

            logging.warning(f"Using the following group value mapping: {map_to_sequential}")
            assert df[acs_task.group].max() == df[acs_task.group].nunique() - 1

    ## Try to enforce correct types
    # All columns should be encoded as integers, dtype=int
    types_dict = {
        col: int for col in df.columns
        if df.dtypes[col] != "object"
    }

    df = df.astype(types_dict)
    # ^ set int types right-away so that categories don't have floating points

    # Set categorical columns to start at value=0! (necessary for sensitive attributes)
    categorical_cols = sorted(ACS_CATEGORICAL_COLS & set(df.columns))
    for col in categorical_cols:
        df[col] = df[col] - df[col].min()

    # Set categorical columns to the correct dtype "category"
    types_dict.update({col: "category" for col in categorical_cols})

    # Plus the group is definitely categorical
    types_dict[acs_task.group] = "category"

    # And the target is definitely integer
    types_dict[acs_task.target] = int

    # Set df to correct types
    df = df.astype(types_dict)

    # ** Split data in train/test/validation **
    train_idx, other_idx = train_test_split(
        df.index,
        train_size=train_size,
        stratify=state_col_data,
        random_state=seed,
        shuffle=True)

    train_df, other_df = df.loc[train_idx], df.loc[other_idx]
    assert len(set(train_idx) & set(other_idx)) == 0

    # Split sizes are rounded to whole rows, so the achieved fractions can be
    # off by a couple of rows; scale the tolerance to the dataset size instead
    # of relying on np.isclose defaults (which reject small datasets).
    size_tol = max(1e-8, 2 / len(df))

    # Split validation
    if validation_size is not None and validation_size > 0:
        new_test_size = test_size / (test_size + validation_size)

        val_idx, test_idx = train_test_split(
            other_df.index,
            test_size=new_test_size,
            stratify=state_col_data.loc[other_idx] if stratify_by_state else None,
            random_state=seed,
            shuffle=True)

        val_df, test_df = other_df.loc[val_idx], other_df.loc[test_idx]
        assert len(train_idx) + len(val_idx) + len(test_idx) == len(df)
        assert np.isclose(len(val_df) / len(df), validation_size, atol=size_tol)

    else:
        test_idx = other_idx
        test_df = other_df

    assert np.isclose(len(train_df) / len(df), train_size, atol=size_tol)
    assert np.isclose(len(test_df) / len(df), test_size, atol=size_tol)

    # Optionally, save data to disk
    if save_to_disk:
        subfolder_name = f"train={train_size:.2}_test={test_size:.2}"
        if validation_size:
            subfolder_name = f"{subfolder_name}_validation={validation_size:.2}"
        if max_sensitive_groups is not None and max_sensitive_groups > 0:
            subfolder_name = f"{subfolder_name}_max-groups={max_sensitive_groups}"

        # Create folder
        save_to_disk = save_to_disk / subfolder_name
        save_to_disk.mkdir(parents=True, exist_ok=True)

        print(f"Saving data to folder '{str(save_to_disk)}' with prefix '{file_prefix}'.")
        train_df.to_csv(save_to_disk / f"{file_prefix}.train.csv", header=True, index_label="index")
        test_df.to_csv(save_to_disk / f"{file_prefix}.test.csv", header=True, index_label="index")

        if validation_size:
            val_df.to_csv(save_to_disk / f"{file_prefix}.validation.csv", header=True, index_label="index")

    return (train_df, test_df, val_df) if validation_size else (train_df, test_df)

In [None]:
import folktables
from tqdm.auto import tqdm

all_acs_tasks = [
    "ACSIncome",
    "ACSPublicCoverage",
    "ACSMobility",
    "ACSEmployment",
    "ACSTravelTime",
]

# Accuracy of a constant predictor, keyed by task name and then by data split
const_predictor_acc = {}

# Split every task, save the splits to disk, and record the constant-predictor accuracy
for task_name in tqdm(all_acs_tasks):

    # The task object is looked up by name on the folktables module
    task_obj = getattr(folktables, task_name)

    data = split_folktables_task(
        acs_data,
        task_obj,
        train_size=TRAIN_SIZE,
        test_size=TEST_SIZE,
        validation_size=VALIDATION_SIZE,
        max_sensitive_groups=MAX_SENSITIVE_GROUPS,
        stratify_by_state=True,
        save_to_disk=data_dir,
        file_prefix=task_name,
        seed=SEED,
    )

    # Mean of the target column in each returned split
    target_means = [split_df[task_obj.target].mean() for split_df in data]

    # Always predicting the most frequent class is correct for max(p, 1 - p) of the samples
    const_predictor_acc[task_name] = {
        split_name: max(target_mean, 1 - target_mean)
        for split_name, target_mean in zip(["train", "test", "validation"], target_means)
    }

## Log the constant classifier accuracy for each dataset and data type
The constant classifier always predicts the same class, either 1 or 0 (whichever has the highest prevalence in the dataset).

In [10]:
import json

# Pretty-print the constant-predictor accuracies as indented JSON
acc_report = json.dumps(const_predictor_acc, indent=2)
print(acc_report)

{
  "ACSIncome": {
    "train": 0.6276484057384071,
    "test": 0.6267948098880045,
    "validation": 0.6275487157120827
  },
  "ACSPublicCoverage": {
    "train": 0.705777716173447,
    "test": 0.7080615054796799,
    "validation": 0.7048998544597685
  },
  "ACSMobility": {
    "train": 0.7366843896876353,
    "test": 0.7369037614281713,
    "validation": 0.7358002054311933
  },
  "ACSEmployment": {
    "train": 0.5401004328892394,
    "test": 0.5415303116248027,
    "validation": 0.5397890715398488
  },
  "ACSTravelTime": {
    "train": 0.5622398192707292,
    "test": 0.5624768352647576,
    "validation": 0.5620214827548473
  }
}


---