In [32]:
import os
from dotenv import load_dotenv
# python-dotenv: load variables from a .env file (if one is found) into the
# process environment. The Census API key is read from os.environ in a later cell.
load_dotenv()

True

In [33]:
import pandas as pd
import geopandas as gpd
from census import Census
import us

# The API key is taken from the environment rather than hard-coded;
# indexing os.environ raises KeyError if CENSUS_KEY is not set.
CENSUS_KEY = os.environ['CENSUS_KEY']
# Census API client; used further down to request ACS 5-year estimates.
c = Census(CENSUS_KEY)

In [34]:
# Maps the IRS SOI county-file column names we keep to the snake_case names
# used in the rest of the notebook. Insertion order is also the column order
# of the trimmed IRS frame built from these keys.
IRS_COLUMNS = {
    # geography identifiers
    'STATEFIPS': 'statefips',
    'STATE': 'state_name',
    'COUNTYFIPS': 'countyfips',
    'COUNTYNAME': 'county_name',
    # overall return / individual counts
    'N1': 'total_returns',
    'N2': 'total_individuals',
    # volunteer-prepared returns (VITA / TCE)
    'TOTAL_VITA': 'total_volunteer_returns',
    'VITA': 'total_vita_returns',
    'TCE': 'total_tce_returns',
    'VITA_EIC': 'total_vita_eic_returns',
    # earned income credit
    'N59660': 'total_eic_returns',
    'A59660': 'eic_amount',
}

# Maps ACS table B19001 (household income) variables to readable column names.
# Each income bracket has an estimate (`...E` -> `total_*`) and a margin of
# error (`...M` -> `me_*`); the `me_*` suffix always mirrors the `total_*` one.
ACS_COLUMNS = {
    'NAME': 'name',
    'B19001_001E': 'total', 'B19001_001M': 'me_total',
    # was 'me_less_than_1000' (missing a zero), which did not match its estimate column
    'B19001_002E': 'total_less_than_10000', 'B19001_002M': 'me_less_than_10000',
    'B19001_003E': 'total_10000_to_14999', 'B19001_003M': 'me_10000_to_14999',
    'B19001_004E': 'total_15000_to_19999', 'B19001_004M': 'me_15000_to_19999',
    'B19001_005E': 'total_20000_to_24999', 'B19001_005M': 'me_20000_to_24999',
    'B19001_006E': 'total_25000_to_29999', 'B19001_006M': 'me_25000_to_29999',
    'B19001_007E': 'total_30000_to_34999', 'B19001_007M': 'me_30000_to_34999',
    'B19001_008E': 'total_35000_to_39999', 'B19001_008M': 'me_35000_to_39999',
    'B19001_009E': 'total_40000_to_44999', 'B19001_009M': 'me_40000_to_44999',
    'B19001_010E': 'total_45000_to_49999', 'B19001_010M': 'me_45000_to_49999',
    'B19001_011E': 'total_50000_to_59999', 'B19001_011M': 'me_50000_to_59999'
}

## IRS Data

The [IRS SOI individual income tax dataset](https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-return-form-1040-statistics) we're using contains information about individual income tax at the county level. Included in the dataset are several columns we'll be looking at:

IRS column name | column renamed to | definition
:---------------|:------------------|:-----------
`N1`            | `total_returns`   | number of returns
`N2`            | `total_individuals` | number of individuals
`TOTAL_VITA`    | `total_volunteer_returns` | number of returns prepared by IRS-certified volunteers for taxpayers with limited income, persons with disabilities, limited English speaking taxpayers, current and former members of the military, and taxpayers who are 60 years of age and older.
`VITA`          | `total_vita_returns` | Number of volunteer income tax assistance (VITA) prepared returns
`TCE`           | `total_tce_returns`   | Number of tax counseling for the elderly (TCE) prepared returns
`VITA_EIC`      | `total_vita_eic_returns` | Number of volunteer prepared returns with Earned Income Credit
`N59660`        | `total_eic_returns` | Number of returns with earned income credit
`A59660`        | `eic_amount`        | Earned income credit amount (includes both the refundable and non-refundable portions)

In [35]:
# Read the IRS SOI county-level file untouched; trimming and renaming of
# columns happens in the next step so the raw frame stays available.
irs_county_csv = "./data/IRS_SOI 2019_county.csv"
raw_irs_county_df = pd.read_csv(irs_county_csv)

In [36]:
# Keep only the columns named in IRS_COLUMNS (in that order) and give them
# their readable names; both steps return new frames, so the raw data is untouched.
irs_county_df = (
    raw_irs_county_df
    .loc[:, list(IRS_COLUMNS)]
    .rename(columns=IRS_COLUMNS)
)

In [37]:
# Preview the first 10 rows to sanity-check the column selection and renames.
irs_county_df.head(10)

Unnamed: 0,statefips,state_name,countyfips,county_name,total_returns,total_individuals,total_volunteer_returns,total_vita_returns,total_tce_returns,total_vita_eic_returns,total_eic_returns,eic_amount
0,1,AL,0,Alabama,2130240,4128580,35180,25260,9920,6150,488640,1351717
1,1,AL,1,Autauga County,25230,50890,310,160,150,0,5060,13607
2,1,AL,3,Baldwin County,105840,201400,1500,600,900,90,17680,44510
3,1,AL,5,Barbour County,9490,18190,120,120,0,30,3060,9201
4,1,AL,7,Bibb County,8230,16840,0,0,0,0,2070,5737
5,1,AL,9,Blount County,23210,48770,100,50,50,0,4540,11600
6,1,AL,11,Bullock County,3880,7420,0,0,0,0,1470,4532
7,1,AL,13,Butler County,8190,15970,20,20,0,0,2840,8708
8,1,AL,15,Calhoun County,48800,93080,1470,1400,70,340,11650,30371
9,1,AL,17,Chambers County,14060,26420,60,60,0,0,4450,12726


Getting rid of state-wide rows, which have a county FIPS of 0.

In [38]:
# State-wide summary rows carry a county FIPS of 0; keep everything else.
is_county_row = irs_county_df['countyfips'].ne(0)
irs_county_df = irs_county_df.loc[is_county_row].copy()

## Census Data

We'll be joining the IRS data above to Census ACS 5-year estimates. Specifically, we'll be getting the following information from the "HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2019 INFLATION-ADJUSTED DOLLARS)" concept in the ACS:

ACS column name | column renamed to | definition
:---------------|:------------------|:-----------
`NAME`          | `name`            | geography name
`B19001_001E`   | `total`           | [estimated total](https://api.census.gov/data/2019/acs/acs5/variables/B19001_001E.json)
`B19001_002E`   | `total_less_than_10000` | estimated total - less than \$10,000
`B19001_003E`   | `total_10000_to_14999` | estimated total - \$10,000 to \$14,999
`B19001_004E`   | `total_15000_to_19999` | estimated total - \$15,000 to \$19,999
`B19001_005E`   | `total_20000_to_24999` | estimated total - \$20,000 to \$24,999
`B19001_006E`   | `total_25000_to_29999` | estimated total - \$25,000 to \$29,999
`B19001_007E`   | `total_30000_to_34999` | estimated total - \$30,000 to \$34,999
`B19001_008E`   | `total_35000_to_39999` | estimated total - \$35,000 to \$39,999
`B19001_009E`   | `total_40000_to_44999` | estimated total - \$40,000 to \$44,999
`B19001_010E`   | `total_45000_to_49999` | estimated total - \$45,000 to \$49,999
`B19001_011E`   | `total_50000_to_59999` | estimated total - \$50,000 to \$59,999

Per the Census, "The data on income were derived from answers to Questions 43 and 44 in the 2019 American
Community Survey (ACS), which were asked of the population 15 years old and over.
'Total income' is the sum of the amounts reported separately for wage or salary income; net
self-employment income; interest, dividends, or net rental or royalty income or income from
estates and trusts; Social Security or Railroad Retirement income; Supplemental Security
Income (SSI); public assistance or welfare payments; retirement, survivor, or disability
pensions; and all other income." [Census](https://www2.census.gov/programs-surveys/acs/tech_docs/subject_definitions/2019_ACSSubjectDefinitions.pdf)

We are looking at household income, which is the sum of each of these forms of income for every member of the household, whether or not they were related to the householder. [Census, pg 86](https://www2.census.gov/programs-surveys/acs/tech_docs/subject_definitions/2019_ACSSubjectDefinitions.pdf) This may get us artificially high incomes for unrelated households, but the number of households is much closer to the number of returns in a random selection of counties.

In [39]:
# Pin the ACS vintage explicitly: without `year`, the client falls back to a
# default that depends on the installed `census` package version, which may not
# be the 2019 5-year estimates this notebook describes and joins to 2019 IRS data.
ACS_YEAR = 2019

# One record per county in every state; each record also carries the
# `state` and `county` FIPS codes that the API returns for the geography.
acs_records = c.acs5.get(
    list(ACS_COLUMNS),
    geo={'for': 'county:*', 'in': 'state:*'},
    year=ACS_YEAR,
)
census_df = pd.DataFrame.from_records(acs_records).rename(columns=ACS_COLUMNS)

Preparing for merge.

In [40]:
# Count rows per (statefips, countyfips) pair to inspect the IRS side of the
# merge key; a size of 1 everywhere means the key is unique.
irs_county_df.groupby(['statefips', 'countyfips']).size()

statefips  countyfips
1          1             1
           3             1
           5             1
           7             1
           9             1
                        ..
56         37            1
           39            1
           41            1
           43            1
           45            1
Length: 3142, dtype: int64

In [41]:
# Same check on the Census side: rows per (state, county) pair. Note the codes
# here are zero-padded strings, unlike the integer FIPS codes in the IRS frame.
census_df.groupby(['state', 'county']).size()

state  county
01     001       1
       003       1
       005       1
       007       1
       009       1
                ..
72     145       1
       147       1
       149       1
       151       1
       153       1
Length: 3220, dtype: int64

In [42]:
# Census FIPS codes arrive zero-padded ('01', '001'); strip the padding so they
# line up with the unpadded codes on the IRS side.
leading_zeros = "^0+"
for fips_col in ('state', 'county'):
    census_df[fips_col] = census_df[fips_col].str.replace(leading_zeros, "", regex=True)

The IRS data doesn't cover Puerto Rico (state FIPS 72), so drop those rows from the Census data.

In [43]:
# Drop Puerto Rico (state FIPS 72). `state` still holds strings at this point
# (the int64 cast comes afterwards), so comparing against the integer 72 was
# always True and removed nothing. Compare as text so the filter works
# whichever dtype the column currently has.
census_df = census_df[census_df['state'].astype(str) != '72'].copy()

In [44]:
# Put the join keys on both frames into the same integer dtype so the merge
# compares numbers with numbers.
fips_dtype = 'int64'
census_df = census_df.astype({'state': fips_dtype, 'county': fips_dtype})
irs_county_df = irs_county_df.astype({'statefips': fips_dtype, 'countyfips': fips_dtype})

## Merging IRS and ACS data

In [45]:
# Inner join on the (state FIPS, county FIPS) pair: only counties present in
# both the IRS and the Census frames survive.
irs_join_keys = ['statefips', 'countyfips']
census_join_keys = ['state', 'county']
census_irs_df = irs_county_df.merge(
    census_df,
    how="inner",
    left_on=irs_join_keys,
    right_on=census_join_keys,
)

In [46]:
# Spot-check 15 merged rows. The fixed random_state makes the displayed sample
# the same on every Restart & Run All instead of changing with each execution.
census_irs_df.sample(15, random_state=0)

Unnamed: 0,statefips,state_name,countyfips,county_name,total_returns,total_individuals,total_volunteer_returns,total_vita_returns,total_tce_returns,total_vita_eic_returns,...,total_35000_to_39999,me_35000_to_39999,total_40000_to_44999,me_40000_to_44999,total_45000_to_49999,me_45000_to_49999,total_50000_to_59999,me_50000_to_59999,state,county
1829,36,NY,7,Broome County,88540,154880,3050,780,2270,230,...,3838.0,356.0,3405.0,348.0,3365.0,360.0,6183.0,479.0,36,7
990,20,KS,209,Wyandotte County,74040,142580,1410,670,740,150,...,3461.0,335.0,2883.0,306.0,3759.0,370.0,5157.0,371.0,20,209
1407,28,MS,17,Chickasaw County,7590,14660,20,20,0,0,...,452.0,153.0,335.0,150.0,321.0,113.0,338.0,107.0,28,17
543,13,GA,319,Wilkinson County,3900,7490,0,0,0,0,...,110.0,41.0,88.0,42.0,178.0,64.0,227.0,62.0,13,319
1220,25,MA,11,Franklin County,36060,60880,340,270,70,130,...,1194.0,180.0,1347.0,206.0,1347.0,200.0,2461.0,273.0,25,11
1171,22,LA,121,West Baton Rouge Par,12500,24320,180,180,0,40,...,405.0,251.0,216.0,131.0,367.0,135.0,954.0,242.0,22,121
851,19,IA,129,Mills County,6740,13650,40,40,0,0,...,172.0,49.0,160.0,47.0,281.0,80.0,343.0,90.0,19,129
1646,30,MT,101,Toole County,2260,4020,0,0,0,0,...,90.0,52.0,74.0,44.0,106.0,51.0,137.0,59.0,30,101
2115,39,OH,149,Shelby County,24280,44570,500,30,470,0,...,716.0,143.0,1002.0,198.0,822.0,165.0,1479.0,200.0,39,149
2994,54,WV,9,Brooke County,10640,19150,330,50,280,0,...,401.0,128.0,537.0,164.0,495.0,133.0,709.0,160.0,54,9


In [47]:
# Household-count estimate columns for every B19001 bracket below $60,000.
under_60k_columns = [
    'total_less_than_10000',
    'total_10000_to_14999',
    'total_15000_to_19999',
    'total_20000_to_24999',
    'total_25000_to_29999',
    'total_30000_to_34999',
    'total_35000_to_39999',
    'total_40000_to_44999',
    'total_45000_to_49999',
    'total_50000_to_59999',
]

# Add the bracket columns left to right, starting from the first one, exactly
# as a chain of `+` would (so a missing value in any bracket yields NaN).
first_bracket, *other_brackets = under_60k_columns
census_irs_df['total_under_60k'] = sum(
    (census_irs_df[col] for col in other_brackets),
    census_irs_df[first_bracket],
)

In [48]:
# Write the merged county-level table to disk; index=False leaves the row
# index out of the CSV.
census_irs_df.to_csv("./data/census_irs_county.csv", index=False)