In [3]:
import pandas as pd
import numpy as np

ACSIncome_categories = {
    "COW": {
        1.0: (
            "Employee of a private for-profit company or"
            "business, or of an individual, for wages,"
            "salary, or commissions"
        ),
        2.0: (
            "Employee of a private not-for-profit, tax-exempt,"
            "or charitable organization"
        ),
        3.0: "Local government employee (city, county, etc.)",
        4.0: "State government employee",
        5.0: "Federal government employee",
        6.0: (
            "Self-employed in own not incorporated business,"
            "professional practice, or farm"
        ),
        7.0: (
            "Self-employed in own incorporated business,"
            "professional practice or farm"
        ),
        8.0: "Working without pay in family business or farm",
        9.0: "Unemployed and last worked 5 years ago or earlier or never worked",
    },
    "SCHL": {
        1.0: "No schooling completed",
        2.0: "Nursery school, preschool",
        3.0: "Kindergarten",
        4.0: "Grade 1",
        5.0: "Grade 2",
        6.0: "Grade 3",
        7.0: "Grade 4",
        8.0: "Grade 5",
        9.0: "Grade 6",
        10.0: "Grade 7",
        11.0: "Grade 8",
        12.0: "Grade 9",
        13.0: "Grade 10",
        14.0: "Grade 11",
        15.0: "12th grade - no diploma",
        16.0: "Regular high school diploma",
        17.0: "GED or alternative credential",
        18.0: "Some college, but less than 1 year",
        19.0: "1 or more years of college credit, no degree",
        20.0: "Associate's degree",
        21.0: "Bachelor's degree",
        22.0: "Master's degree",
        23.0: "Professional degree beyond a bachelor's degree",
        24.0: "Doctorate degree",
    },
    "MAR": {
        1.0: "Married",
        2.0: "Widowed",
        3.0: "Divorced",
        4.0: "Separated",
        5.0: "Never married or under 15 years old",
    },
    "SEX": {1.0: "Male", 2.0: "Female"},
    "RAC1P": {
        1.0: "White alone",
        2.0: "Black or African American alone",
        3.0: "American Indian alone",
        4.0: "Alaska Native alone",
        5.0: (
            "American Indian and Alaska Native tribes specified;"
            "or American Indian or Alaska Native,"
            "not specified and no other"
        ),
        6.0: "Asian alone",
        7.0: "Native Hawaiian and Other Pacific Islander alone",
        8.0: "Some Other Race alone",
        9.0: "Two or More Races",
    },
}

def df_to_pandas(self, df, categories=None, dummies=False):
        """Filters and processes a DataFrame (received from ```ACSDataSource''').

        Args:
            df: pd.DataFrame (received from ```ACSDataSource''')
            categories: nested dict with columns of categorical features
                and their corresponding encodings (see examples folder)
            dummies: bool to indicate the creation of dummy variables for
                categorical features (see examples folder)

        Returns:
            pandas.DataFrame."""

        df = self._preprocess(df)

        variables = df[self.features]

        if categories:
            variables = variables.replace(categories)

        if dummies:
            variables = pd.get_dummies(variables)

        variables = pd.DataFrame(self._postprocess(variables.to_numpy()),
                                 columns=variables.columns)

        if self.target_transform is None:
            target = df[self.target]
        else:
            target = self.target_transform(df[self.target])

        target = pd.DataFrame(target).reset_index(drop=True)

        if self._group:
            group = self.group_transform(df[self.group])
            group = pd.DataFrame(group).reset_index(drop=True)
        else:
            group = pd.DataFrame(0, index=np.arange(len(target)), columns=["group"])

        return variables, target, group

In [4]:

from folktables import ACSDataSource, ACSIncome, ACSEmployment

data_source = ACSDataSource(survey_year='2017', horizon='5-Year', survey='person')
ca_data = data_source.get_data(states=["RI"], download=True)  # CT 20,000 rows

features, labels, group = df_to_pandas(ACSIncome, ca_data, ACSIncome_categories)

Downloading data for 2017 5-Year person survey for RI...


In [5]:
combined = pd.concat([features, labels], axis=1)
combined.to_csv('acs_income_RI_2017_5y.csv', index=False, sep=";")
combined

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,63,"Local government employee (city, county, etc.)",Bachelor's degree,Married,5120.0,129,0,55.0,Male,White alone,True
1,55,"Local government employee (city, county, etc.)",Associate's degree,Married,5860.0,44,1,19.0,Female,White alone,True
2,17,Employee of a private for-profit company orbus...,Grade 11,Never married or under 15 years old,4220.0,44,2,5.0,Male,White alone,False
3,48,Employee of a private for-profit company orbus...,Bachelor's degree,Married,4700.0,36,0,50.0,Male,Black or African American alone,True
4,45,Employee of a private for-profit company orbus...,Master's degree,Married,230.0,25,1,40.0,Female,Black or African American alone,True
...,...,...,...,...,...,...,...,...,...,...,...
29232,19,Employee of a private for-profit company orbus...,"Some college, but less than 1 year",Never married or under 15 years old,3955.0,44,17,22.0,Male,White alone,False
29233,20,Employee of a private for-profit company orbus...,Bachelor's degree,Never married or under 15 years old,4760.0,341,17,7.0,Female,Two or More Races,False
29234,19,Employee of a private for-profit company orbus...,"1 or more years of college credit, no degree",Never married or under 15 years old,4150.0,9,17,20.0,Female,White alone,False
29235,20,"Employee of a private not-for-profit, tax-exem...",Regular high school diploma,Never married or under 15 years old,726.0,416,17,7.0,Female,Black or African American alone,False


In [6]:
test_acs = pd.read_csv("acs_income_RI_2017_5y.csv", delimiter=";")
test_acs

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,63,"Local government employee (city, county, etc.)",Bachelor's degree,Married,5120.0,129,0,55.0,Male,White alone,True
1,55,"Local government employee (city, county, etc.)",Associate's degree,Married,5860.0,44,1,19.0,Female,White alone,True
2,17,Employee of a private for-profit company orbus...,Grade 11,Never married or under 15 years old,4220.0,44,2,5.0,Male,White alone,False
3,48,Employee of a private for-profit company orbus...,Bachelor's degree,Married,4700.0,36,0,50.0,Male,Black or African American alone,True
4,45,Employee of a private for-profit company orbus...,Master's degree,Married,230.0,25,1,40.0,Female,Black or African American alone,True
...,...,...,...,...,...,...,...,...,...,...,...
29232,19,Employee of a private for-profit company orbus...,"Some college, but less than 1 year",Never married or under 15 years old,3955.0,44,17,22.0,Male,White alone,False
29233,20,Employee of a private for-profit company orbus...,Bachelor's degree,Never married or under 15 years old,4760.0,341,17,7.0,Female,Two or More Races,False
29234,19,Employee of a private for-profit company orbus...,"1 or more years of college credit, no degree",Never married or under 15 years old,4150.0,9,17,20.0,Female,White alone,False
29235,20,"Employee of a private not-for-profit, tax-exem...",Regular high school diploma,Never married or under 15 years old,726.0,416,17,7.0,Female,Black or African American alone,False


In [8]:
features

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P
0,63,"Local government employee (city, county, etc.)",Bachelor's degree,Married,5120.0,129,0,55.0,Male,White alone
1,55,"Local government employee (city, county, etc.)",Associate's degree,Married,5860.0,44,1,19.0,Female,White alone
2,17,Employee of a private for-profit company orbus...,Grade 11,Never married or under 15 years old,4220.0,44,2,5.0,Male,White alone
3,48,Employee of a private for-profit company orbus...,Bachelor's degree,Married,4700.0,36,0,50.0,Male,Black or African American alone
4,45,Employee of a private for-profit company orbus...,Master's degree,Married,230.0,25,1,40.0,Female,Black or African American alone
...,...,...,...,...,...,...,...,...,...,...
29232,19,Employee of a private for-profit company orbus...,"Some college, but less than 1 year",Never married or under 15 years old,3955.0,44,17,22.0,Male,White alone
29233,20,Employee of a private for-profit company orbus...,Bachelor's degree,Never married or under 15 years old,4760.0,341,17,7.0,Female,Two or More Races
29234,19,Employee of a private for-profit company orbus...,"1 or more years of college credit, no degree",Never married or under 15 years old,4150.0,9,17,20.0,Female,White alone
29235,20,"Employee of a private not-for-profit, tax-exem...",Regular high school diploma,Never married or under 15 years old,726.0,416,17,7.0,Female,Black or African American alone


In [16]:
from sklearn.preprocessing import OneHotEncoder
from mlmq.monkeypatching._mlinspect_ndarray import MlinspectNdarray
test_combined = OneHotEncoder().fit_transform(MlinspectNdarray(test_acs[["COW", "SCHL", "SEX"]].to_numpy())).shape
test_combined

(29237, 34)

In [14]:
test_a = OneHotEncoder().fit_transform(test_acs[["COW"]]).shape
test_b = OneHotEncoder().fit_transform(test_acs[["SCHL"]]).shape
test_c = OneHotEncoder().fit_transform(test_acs[["SEX"]]).shape
(test_a[0], test_a[1] + test_b[1] + test_c[1])

(29237, 34)