In [1]:
import os
import time

import pandas as pd
# Census API wrapper package:
#       https://github.com/datamade/census
from census import Census
from census.core import CensusException
from tqdm import tqdm_notebook

In [2]:
# Work from the project data directory so that the relative CSV paths used in
# this notebook ("zipcodes.csv", "16zpallagi.csv") resolve.
# The location is machine-specific: set the DATAPALOOZA_DIR environment
# variable to point at your own checkout instead of editing this cell. The
# original author's path is kept as the default.
os.chdir(os.environ.get("DATAPALOOZA_DIR",
                        "/media/wkg/storage/mcbi-datapalooza-2019"))

In [15]:
# Load the basic geographic zip code table
zip_data = pd.read_csv("zipcodes.csv", index_col=None)

# Remove pesky "Unnamed" column (any column whose name contains "Unnamed")
zip_data = zip_data.loc[:, ~zip_data.columns.str.contains("Unnamed")]

# Change zips to str and pad with 0s so every zip is 5 characters wide
zip_data["zip"] = zip_data["zip"].astype(str).str.zfill(5)

# Create Census object with API key. The key is a credential, so it is read
# from the environment rather than being stored in the notebook.
census_api_key = os.environ.get("CENSUS_API_KEY")
if not census_api_key:
    raise RuntimeError("Set the CENSUS_API_KEY environment variable to a "
                       "valid Census API key before running this notebook.")
cens = Census(census_api_key)

zip_data contains basic geographic zip code data for 44,336 zip codes in the US.

In [16]:
# Preview the first rows of the cleaned zip code table
zip_data.head()

Unnamed: 0,zip,city,state,latitude,longitude
0,210,Portsmouth,NH,43.005895,-71.013202
1,211,Portsmouth,NH,43.005895,-71.013202
2,212,Portsmouth,NH,43.005895,-71.013202
3,213,Portsmouth,NH,43.005895,-71.013202
4,214,Portsmouth,NH,43.005895,-71.013202


In [17]:
# Summarize the columns: dtypes and non-null counts
zip_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44336 entries, 0 to 44335
Data columns (total 5 columns):
zip          44336 non-null object
city         44336 non-null object
state        44336 non-null object
latitude     43689 non-null float64
longitude    43689 non-null float64
dtypes: float64(2), object(3)
memory usage: 1.7+ MB


The function get_census_val() retrieves data from the Census Bureau's 5-year American Community Survey (ACS), which provides accurate estimates for a huge number of variables for US zip codes. We use this data for a number of features in our model.

In [None]:
def get_census_val(cens_obj, variable, zipcode):
    """Get the value of a given ACS 5-year variable for a given zip code.

    Args:
        cens_obj: A Census object (census package).
        variable: The variable to get a value for. From:
            https://api.census.gov/data/2017/acs/acs5/variables.html
        zipcode: The zip code to get the variable's value for.

    Returns:
        The value of the variable for the zip code. Returns 0.0 if the API
        gives back no rows for the zip code - this may need to be tweaked and
        is not adequately abstracted. Returns None if the request failed
        (connection problem or an error reported by the census package), so
        the caller can retry that zip code later.

    Still needs to be tested with every variable change.
    """
    try:
        result = cens_obj.acs5.zipcode(variable, zipcode)
    except OSError:
        # OSError covers the builtin ConnectionError and also the exceptions
        # raised by requests (used by the census package), which derive from
        # IOError/OSError but NOT from the builtin ConnectionError.
        return None
    except CensusException:
        # Error reported by the census wrapper itself.
        return None

    # An empty (or missing) result means the API had no value for this zip.
    if not result:
        return 0.0
    return result[0].get(variable)

Next, we get total population values for each zip code.

In [None]:
# Put zip codes into a list of [zip, population] pairs for ease of processing.
# None marks a population that has not been fetched yet. The list is built
# only once: re-running this cell must keep the values already retrieved
# (run `del zips` first to start over from scratch).
if "zips" not in globals():
    zips = [[zipcode, None] for zipcode in zip_data["zip"]]

# Get populations for zip codes if value is None. I did it this way to be able
# to non-redundantly call the API in batches in case of the common
# ConnectionError: get_census_val() returns None on failure, so re-running
# this cell only retries the zip codes that are still missing.
for zipcode in tqdm_notebook(zips):
    if zipcode[1] is None:
        zipcode[1] = get_census_val(cens, "B01003_001E", zipcode[0])

In [None]:
# TODO: code for combining everything goes here once the data has been retrieved

The IRS's Individual Income Tax Statistics organized by zip code for 2016 is available at:

https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2016-zip-code-data-soi

Using this data, we add the count of returns in the provided income ranges for every zip code to the zip_data dataframe:

In [18]:
# Load the IRS zip-code-level income statistics
income = pd.read_csv("16zpallagi.csv", index_col=None)

# Keep only the zip code, the income bracket id (agi_stub) and the number of
# returns (N1)
agis = income[["zipcode", "agi_stub", "N1"]]

# One row per zip code, one column per income bracket.
# NOTE(review): pivot_table aggregates with the mean by default; this only
# equals the raw count if each (zipcode, agi_stub) pair occurs once - confirm.
agis = pd.pivot_table(agis, values="N1", index="zipcode", columns="agi_stub")
agis = agis.reset_index()

# Drop the zipcode 0 row explicitly. The original code dropped the first row
# by position (agis[1:]), which removes a real zip code whenever zipcode 0 is
# not present. Presumably zipcode 0 is the IRS totals row - verify against the
# IRS file documentation.
agis = agis.loc[agis["zipcode"] != 0].copy()

# Change zips to str and pad with 0s so they match zip_data["zip"]
agis["zipcode"] = agis["zipcode"].astype(str).str.zfill(5)

agis = agis.rename(index=str, columns={"zipcode": "zip",
                                       1: "num_tax_returns_0-25k",
                                       2: "num_tax_returns_25k-50k",
                                       3: "num_tax_returns_50k-75k",
                                       4: "num_tax_returns_75k-100k",
                                       5: "num_tax_returns_100k-200k",
                                       6: "num_tax_returns_200k-inf"})

# If this cell is re-run, zip_data already carries the income columns; drop
# them first so the merge does not create duplicated _x/_y columns.
already_merged = [col for col in agis.columns
                  if col != "zip" and col in zip_data.columns]
zip_data = zip_data.drop(columns=already_merged)

# Keep only the zip codes present in both tables
zip_data = pd.merge(zip_data, agis, how="inner", on="zip")

Some random samples from the dataframe after the addition of income counts.

In [19]:
# Show 10 sampled rows; random_state is fixed so the same rows are drawn on
# every run
zip_data.sample(n=10, random_state=2)

Unnamed: 0,zip,city,state,latitude,longitude,num_tax_returns_0-25k,num_tax_returns_25k-50k,num_tax_returns_50k-75k,num_tax_returns_75k-100k,num_tax_returns_100k-200k,num_tax_returns_200k-inf
1534,6111,Newington,CT,41.688899,-72.73101,4320.0,3340.0,2870.0,2110.0,3100.0,470.0
3709,14202,Buffalo,NY,42.886357,-78.8779,650.0,440.0,190.0,120.0,170.0,180.0
4265,15623,Claridge,PA,40.366748,-79.61645,140.0,100.0,60.0,40.0,50.0,0.0
28304,95326,Hughson,CA,37.59471,-120.86419,1530.0,1050.0,650.0,450.0,680.0,160.0
286,1952,Salisbury,MA,42.85048,-70.86153,1530.0,1060.0,690.0,500.0,750.0,160.0
18574,59858,Philipsburg,MT,46.293656,-113.36273,290.0,180.0,100.0,70.0,80.0,20.0
787,3824,Durham,NH,43.128085,-70.96035,1000.0,390.0,300.0,290.0,820.0,410.0
21832,69043,Stratton,NE,40.150605,-101.23375,80.0,60.0,30.0,30.0,0.0,0.0
18625,60026,Glenview Nas,IL,41.811929,-87.68732,1590.0,860.0,670.0,510.0,1420.0,1700.0
4090,15213,Pittsburgh,PA,40.443269,-79.95487,2550.0,1370.0,590.0,300.0,410.0,400.0
