In [1]:
import pandas as pd
import numpy as np

ACSIncome_categories = {
    "COW": {
        1.0: (
            "Employee of a private for-profit company or"
            "business, or of an individual, for wages,"
            "salary, or commissions"
        ),
        2.0: (
            "Employee of a private not-for-profit, tax-exempt,"
            "or charitable organization"
        ),
        3.0: "Local government employee (city, county, etc.)",
        4.0: "State government employee",
        5.0: "Federal government employee",
        6.0: (
            "Self-employed in own not incorporated business,"
            "professional practice, or farm"
        ),
        7.0: (
            "Self-employed in own incorporated business,"
            "professional practice or farm"
        ),
        8.0: "Working without pay in family business or farm",
        9.0: "Unemployed and last worked 5 years ago or earlier or never worked",
    },
    "SCHL": {
        1.0: "No schooling completed",
        2.0: "Nursery school, preschool",
        3.0: "Kindergarten",
        4.0: "Grade 1",
        5.0: "Grade 2",
        6.0: "Grade 3",
        7.0: "Grade 4",
        8.0: "Grade 5",
        9.0: "Grade 6",
        10.0: "Grade 7",
        11.0: "Grade 8",
        12.0: "Grade 9",
        13.0: "Grade 10",
        14.0: "Grade 11",
        15.0: "12th grade - no diploma",
        16.0: "Regular high school diploma",
        17.0: "GED or alternative credential",
        18.0: "Some college, but less than 1 year",
        19.0: "1 or more years of college credit, no degree",
        20.0: "Associate's degree",
        21.0: "Bachelor's degree",
        22.0: "Master's degree",
        23.0: "Professional degree beyond a bachelor's degree",
        24.0: "Doctorate degree",
    },
    "MAR": {
        1.0: "Married",
        2.0: "Widowed",
        3.0: "Divorced",
        4.0: "Separated",
        5.0: "Never married or under 15 years old",
    },
    "SEX": {1.0: "Male", 2.0: "Female"},
    "RAC1P": {
        1.0: "White alone",
        2.0: "Black or African American alone",
        3.0: "American Indian alone",
        4.0: "Alaska Native alone",
        5.0: (
            "American Indian and Alaska Native tribes specified;"
            "or American Indian or Alaska Native,"
            "not specified and no other"
        ),
        6.0: "Asian alone",
        7.0: "Native Hawaiian and Other Pacific Islander alone",
        8.0: "Some Other Race alone",
        9.0: "Two or More Races",
    },
}

def df_to_pandas(self, df, categories=None, dummies=False):
        """Filters and processes a DataFrame (received from ```ACSDataSource''').

        Args:
            df: pd.DataFrame (received from ```ACSDataSource''')
            categories: nested dict with columns of categorical features
                and their corresponding encodings (see examples folder)
            dummies: bool to indicate the creation of dummy variables for
                categorical features (see examples folder)

        Returns:
            pandas.DataFrame."""

        df = self._preprocess(df)

        variables = df[self.features]

        if categories:
            variables = variables.replace(categories)

        if dummies:
            variables = pd.get_dummies(variables)

        variables = pd.DataFrame(self._postprocess(variables.to_numpy()),
                                 columns=variables.columns)

        if self.target_transform is None:
            target = df[self.target]
        else:
            target = self.target_transform(df[self.target])

        target = pd.DataFrame(target).reset_index(drop=True)

        if self._group:
            group = self.group_transform(df[self.group])
            group = pd.DataFrame(group).reset_index(drop=True)
        else:
            group = pd.DataFrame(0, index=np.arange(len(target)), columns=["group"])

        return variables, target, group

In [2]:

from folktables import ACSDataSource, ACSIncome, ACSEmployment

data_source = ACSDataSource(survey_year='2017', horizon='1-Year', survey='person')
ca_data = data_source.get_data(download=True)  # CT 20,000 rows

features, labels, group = df_to_pandas(ACSIncome, ca_data, ACSIncome_categories)

Downloading data for 2017 1-Year person survey for AZ...
Downloading data for 2017 1-Year person survey for AR...
Downloading data for 2017 1-Year person survey for CA...
Downloading data for 2017 1-Year person survey for CO...
Downloading data for 2017 1-Year person survey for CT...
Downloading data for 2017 1-Year person survey for DE...
Downloading data for 2017 1-Year person survey for FL...
Downloading data for 2017 1-Year person survey for GA...
Downloading data for 2017 1-Year person survey for HI...
Downloading data for 2017 1-Year person survey for ID...
Downloading data for 2017 1-Year person survey for IL...
Downloading data for 2017 1-Year person survey for IN...
Downloading data for 2017 1-Year person survey for IA...
Downloading data for 2017 1-Year person survey for KS...
Downloading data for 2017 1-Year person survey for KY...
Downloading data for 2017 1-Year person survey for LA...
Downloading data for 2017 1-Year person survey for ME...
Downloading data for 2017 1-Yea

In [3]:
combined = pd.concat([features, labels], axis=1)
combined = combined.sample(n=100_000)
combined.to_csv('acs_income_all_2017_1y_100000.csv', index=False, sep=";")
combined

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
1613902,48,Employee of a private for-profit company orbus...,"1 or more years of college credit, no degree",Married,7340.0,55,0,45.0,Male,White alone,True
663190,79,Employee of a private for-profit company orbus...,Grade 8,Married,6330.0,301,0,32.0,Male,White alone,False
1212905,33,State government employee,Master's degree,Divorced,2320.0,24,2,40.0,Female,White alone,True
954827,38,Employee of a private for-profit company orbus...,Bachelor's degree,Married,4700.0,35,0,55.0,Male,Some Other Race alone,True
899205,40,Employee of a private for-profit company orbus...,Master's degree,Never married or under 15 years old,4700.0,25,0,45.0,Female,White alone,True
...,...,...,...,...,...,...,...,...,...,...,...
998283,42,Employee of a private for-profit company orbus...,Associate's degree,Never married or under 15 years old,4760.0,36,2,30.0,Female,White alone,False
343117,51,Employee of a private for-profit company orbus...,"1 or more years of college credit, no degree",Married,840.0,363,1,50.0,Female,White alone,True
1173223,36,Self-employed in own not incorporated business...,Bachelor's degree,Never married or under 15 years old,4760.0,39,4,10.0,Male,White alone,False
771557,22,Employee of a private for-profit company orbus...,Regular high school diploma,Never married or under 15 years old,4600.0,12,2,40.0,Female,White alone,False


In [4]:
test_acs = pd.read_csv("acs_income_all_2017_1y_500000.csv", delimiter=";")
test_acs

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,51,Employee of a private for-profit company orbus...,Associate's degree,Married,2540.0,303,1,24.0,Female,White alone,False
1,24,Employee of a private for-profit company orbus...,GED or alternative credential,Never married or under 15 years old,4850.0,6,2,30.0,Male,Asian alone,False
2,62,State government employee,Bachelor's degree,Married,2340.0,37,0,18.0,Female,White alone,False
3,29,Employee of a private for-profit company orbus...,Regular high school diploma,Never married or under 15 years old,6240.0,31,13,40.0,Male,White alone,False
4,28,Employee of a private for-profit company orbus...,"1 or more years of college credit, no degree",Divorced,9130.0,12,2,20.0,Male,White alone,False
...,...,...,...,...,...,...,...,...,...,...,...
499995,65,"Self-employed in own incorporated business,pro...",Doctorate degree,Divorced,1820.0,6,0,65.0,Male,White alone,True
499996,22,Federal government employee,"1 or more years of college credit, no degree",Married,700.0,36,1,40.0,Female,White alone,False
499997,54,Employee of a private for-profit company orbus...,"1 or more years of college credit, no degree",Divorced,4020.0,17,0,36.0,Male,White alone,False
499998,50,Employee of a private for-profit company orbus...,Bachelor's degree,Married,2200.0,34,1,40.0,Female,White alone,False


In [5]:
features

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P
0,31,Employee of a private for-profit company orbus...,Bachelor's degree,Never married or under 15 years old,350.0,13,0,42.0,Female,White alone
1,41,Employee of a private for-profit company orbus...,GED or alternative credential,Married,6260.0,13,0,42.0,Male,White alone
2,37,State government employee,Regular high school diploma,Divorced,230.0,1,0,30.0,Female,Black or African American alone
3,32,"Self-employed in own incorporated business,pro...",Regular high school diploma,Married,8140.0,303,0,60.0,Male,White alone
4,54,Employee of a private for-profit company orbus...,Associate's degree,Married,136.0,1,1,40.0,Female,Black or African American alone
...,...,...,...,...,...,...,...,...,...,...
1642968,20,State government employee,"1 or more years of college credit, no degree",Never married or under 15 years old,4640.0,72,17,60.0,Female,Black or African American alone
1642969,21,Federal government employee,"Some college, but less than 1 year",Never married or under 15 years old,7210.0,41,17,80.0,Male,White alone
1642970,26,State government employee,Grade 9,Never married or under 15 years old,4250.0,72,16,15.0,Male,Some Other Race alone
1642971,31,Employee of a private for-profit company orbus...,"Some college, but less than 1 year",Divorced,9610.0,72,17,40.0,Male,White alone


In [6]:
from sklearn.preprocessing import OneHotEncoder
from mlmq.monkeypatching._mlinspect_ndarray import MlinspectNdarray
test_combined = OneHotEncoder().fit_transform(MlinspectNdarray(test_acs[["COW", "SCHL", "SEX"]].to_numpy())).shape
test_combined

(500000, 34)

In [7]:
test_a = OneHotEncoder().fit_transform(test_acs[["COW"]]).shape
test_b = OneHotEncoder().fit_transform(test_acs[["SCHL"]]).shape
test_c = OneHotEncoder().fit_transform(test_acs[["SEX"]]).shape
(test_a[0], test_a[1] + test_b[1] + test_c[1])

(500000, 34)