In [12]:
import pandas as pd
import numpy as np
import folktables

In [13]:
# Run parameters: which ACS survey year / state to pull, and the name of the
# prediction target this notebook prepares.
year = '2018'
state = 'NY'
pred = "income"

# Destination of the processed CSV, and the local cache directory that
# folktables uses for the raw survey files.
output = f"../../data/ACS/income_{state}_{year}.csv"
root_dir = 'data'

# Fetch the 1-Year person-level survey for the selected state; download=True
# lets folktables retrieve the files when they are not already in root_dir.
data_source = folktables.ACSDataSource(
    survey_year=year,
    horizon='1-Year',
    survey='person',
    root_dir=root_dir,
)
acs_data = data_source.get_data(states=[state], download=True)

Downloading data for 2018 1-Year person survey for NY...


In [14]:
# Columns of the raw survey that are kept as model inputs.
income_features = [
    'AGEP',
    'COW',
    'SCHL',
    'MAR',
    'OCCP',
    'WAOB',
    'WKHP',
    'SEX',
    'RAC1P',
]

# Prediction task definition: PINCP is the target and RAC1P the group column.
# No target/group transform or preprocessing filter is passed here, so this
# definition relies on whatever defaults folktables.BasicProblem applies.
ACSIncome = folktables.BasicProblem(
    features=income_features,
    target='PINCP',
    group='RAC1P',
)

In [15]:
# Split the raw survey into the feature frame and the label frame; the third
# return value (group membership) is not needed in this notebook.
df, df_labels, _ = ACSIncome.df_to_pandas(acs_data)

# Attach the target as an "income" column (aligned on the row index) and keep
# only the rows where every feature and the target are present.
df = df.assign(income=df_labels["PINCP"]).dropna()
len(df)

103756

In [16]:
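# Feature legend (ACS PUMS variable names):
# AGEP: Age
# COW: Class of Worker
# SCHL: Educational Attainment
# MAR: Marital Status
# OCCP: Occupation Code
# WAOB: World Area of Birth
# WKHP: Usual Hours Worked Per Week
# SEX: Sex
# RAC1P: Recoded Detailed Race Code
# PINCP: Total Person's Income (prediction target)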

# https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2018.pdf

In [17]:
# Collapse the fine-grained SCHL codes into coarser buckets before encoding.
# Codes 2-11 are merged into a single "< high school" level (stored as 2) and
# codes 12-15 into "some high school, no diploma" (stored as 12). Code 1 and
# codes 16 and above are left untouched.
df.loc[df["SCHL"].isin([2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), "SCHL"] = 2 # group < high school
df.loc[df["SCHL"].isin([12, 13, 14, 15]), "SCHL"] = 12 # group some high school, no diploma

# One-hot encode the categorical features. dtype=int pins the indicator
# columns to 0/1: without it, pandas >= 2.0 emits bool columns, which would be
# written to the CSV as True/False and would not match the 0/1 encoding used
# for the OCCP_*, sex and race indicators built later in this notebook.
df = pd.get_dummies(df, columns=["COW", "SCHL", "MAR", "WAOB"], dtype=int)

In [18]:
# Half-open [low, high) ranges of the OCCP occupation code, one per coarse
# occupation group. Each key becomes the name of a 0/1 indicator column.
OCCP = {
    "OCCP_1": [0, 1000],       # Management, Business, and Financial
    "OCCP_2": [1000, 2000],    # Computer, Engineering, and Science
    "OCCP_3": [2000, 3000],    # Education, Legal, Community Service, Arts, and Media
    "OCCP_4": [3000, 3600],    # Healthcare Practitioners and Technical
    "OCCP_5": [3600, 4700],    # Service
    "OCCP_6": [4700, 5000],    # Sales
    "OCCP_7": [5000, 6000],    # Office and Administrative Support
    "OCCP_8": [6000, 6200],    # Farming, Fishing, and Forestry
    "OCCP_9": [6200, 7000],    # Construction
    "OCCP_10": [7000, 7700],   # Installation, Maintenance, and Repair
    "OCCP_11": [7700, 9000],   # Production
    "OCCP_12": [9000, 10000],  # Transportation
}

# Build one indicator column per group: 1 when the row's code falls inside the
# group's range (lower bound inclusive, upper bound exclusive), 0 otherwise.
occupation_codes = df["OCCP"]
for column, (low, high) in OCCP.items():
    in_range = (occupation_codes >= low) & (occupation_codes < high)
    df[column] = in_range.astype("int64")

# The raw code is fully represented by the indicators, so drop it.
df = df.drop(columns=["OCCP"])

In [19]:
# Reduce the two protected attributes to binary flags: 1 when the raw code
# equals 1, 0 for every other code. The raw columns are dropped afterwards.
# NOTE(review): code 1 presumably means "Male" for SEX and "White alone" for
# RAC1P - confirm against the PUMS data dictionary.
df = df.assign(
    sex=df["SEX"].eq(1).astype(int),
    race=df["RAC1P"].eq(1).astype(int),
).drop(columns=["SEX", "RAC1P"])

In [20]:
# Rescale each continuous feature to the [0, 1] interval with min-max
# normalization: (value - column minimum) / (column maximum - column minimum).
for continuous_col in ("AGEP", "WKHP"):
    col_min = df[continuous_col].min()
    col_range = df[continuous_col].max() - col_min
    df[continuous_col] = (df[continuous_col] - col_min) / col_range

In [21]:
# Preview the first rows of the fully processed frame as a sanity check.
df.head()

Unnamed: 0,AGEP,WKHP,income,COW_1.0,COW_2.0,COW_3.0,COW_4.0,COW_5.0,COW_6.0,COW_7.0,...,OCCP_5,OCCP_6,OCCP_7,OCCP_8,OCCP_9,OCCP_10,OCCP_11,OCCP_12,sex,race
0,0.126582,0.5,5200.0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0.063291,0.010204,1000.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
4,0.037975,0.030612,870.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0.075949,0.5,4000.0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6,0.037975,0.193878,2000.0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [22]:
# Write the processed frame to the configured output path; index=False keeps
# the row index out of the CSV.
df.to_csv(output, index=False)

In [23]:
# Final row count of the processed frame.
len(df)

103756