Using the Census API to get zip-code level population and median income data for NYC zip codes.

In [1]:
import json
import requests
from census import Census
from us import states
from us import states
import pandas as pd


In [2]:
# load keys
with open('keys.json') as f:
    keys = json.load(f)

# create census object
c = Census(keys['census_api_key'])

In [3]:
# define a function to get data from a particular table
def get_from_census(table_id, year):
    response = requests.get(f"https://api.census.gov/data/{year}/acs/acs5?get=NAME,group({table_id})&for=zip%20code%20tabulation%20area:*&key={keys['census_api_key']}")
    data = response.json()
    df = pd.DataFrame(data, columns=data[0]).drop(0)
    return df

Percenatge below poverty

In [4]:
# get poverty data for 2021
poverty_2021 = get_from_census("B17017", "2021")

# clean it up
poverty_2021 = poverty_2021[['zip code tabulation area', 'B17017_002E', 'B17017_001E']]
poverty_2021.rename(columns={
    'zip code tabulation area': 'zip',
})

Unnamed: 0,zip,B17017_002E,B17017_001E
1,00601,3520,5397
2,00602,6300,12858
3,00603,10047,19295
4,00606,957,1968
5,00610,4222,8934
...,...,...,...
33770,99923,0,13
33771,99925,63,345
33772,99926,55,457
33773,99927,0,11


In [5]:
# rename columns
poverty_2021.rename(columns={'B17017_002E': 'households_below_poverty', 'B17017_001E': 'total_households', 'zip code tabulation area': 'zip'}, inplace=True)



In [6]:
poverty_2021

Unnamed: 0,zip,households_below_poverty,total_households
1,00601,3520,5397
2,00602,6300,12858
3,00603,10047,19295
4,00606,957,1968
5,00610,4222,8934
...,...,...,...
33770,99923,0,13
33771,99925,63,345
33772,99926,55,457
33773,99927,0,11


In [7]:
poverty_2021['pct_below_poverty_2021'] = poverty_2021['households_below_poverty'].astype(int) / poverty_2021['total_households'].astype(int) * 100
poverty_2021

Unnamed: 0,zip,households_below_poverty,total_households,pct_below_poverty_2021
1,00601,3520,5397,65.221419
2,00602,6300,12858,48.996734
3,00603,10047,19295,52.070485
4,00606,957,1968,48.628049
5,00610,4222,8934,47.257667
...,...,...,...,...
33770,99923,0,13,0.000000
33771,99925,63,345,18.260870
33772,99926,55,457,12.035011
33773,99927,0,11,0.000000


In [8]:
# get poverty data for 2020 
poverty_2020 = get_from_census("B17017", "2020")

# clean it up
poverty_2020 = poverty_2020[['zip code tabulation area', 'B17017_002E', 'B17017_001E']]
poverty_2020.rename(columns={
    'zip code tabulation area': 'zip',
})

Unnamed: 0,zip,B17017_002E,B17017_001E
1,29590,333,986
2,93306,4006,21974
3,93660,288,949
4,93110,749,6385
5,93212,1074,4014
...,...,...,...
33116,16623,29,216
33117,16627,138,880
33118,16634,14,146
33119,16640,49,299


In [9]:
poverty_2020.rename(columns={'B17017_002E': 'households_below_poverty', 'B17017_001E': 'total_households', 'zip code tabulation area': 'zip'}, inplace=True)


In [10]:
poverty_2020['pct_below_poverty_2020'] = poverty_2020['households_below_poverty'].astype(int) / poverty_2020['total_households'].astype(int) * 100
poverty_2020

Unnamed: 0,zip,households_below_poverty,total_households,pct_below_poverty_2020
1,29590,333,986,33.772819
2,93306,4006,21974,18.230636
3,93660,288,949,30.347734
4,93110,749,6385,11.730619
5,93212,1074,4014,26.756353
...,...,...,...,...
33116,16623,29,216,13.425926
33117,16627,138,880,15.681818
33118,16634,14,146,9.589041
33119,16640,49,299,16.387960


In [11]:
pct_below_poverty = pd.merge(poverty_2020, poverty_2021, on='zip', how='outer')
pct_below_poverty

Unnamed: 0,zip,households_below_poverty_x,total_households_x,pct_below_poverty_2020,households_below_poverty_y,total_households_y,pct_below_poverty_2021
0,29590,333,986,33.772819,309,908,34.030837
1,93306,4006,21974,18.230636,3918,21818,17.957650
2,93660,288,949,30.347734,291,868,33.525346
3,93110,749,6385,11.730619,670,6474,10.349089
4,93212,1074,4014,26.756353,1112,4230,26.288416
...,...,...,...,...,...,...,...
33966,99635,,,,0,0,
33967,99675,,,,6,11,54.545455
33968,99707,,,,0,0,
33969,99725,,,,57,314,18.152866


In [12]:
# drop columns

pct_below_poverty.drop(columns=['households_below_poverty_x', 'total_households_x', 'households_below_poverty_y', 'total_households_y'], inplace=True)



In [13]:
# rename columns

pct_below_poverty.rename(columns={'pct_below_poverty_2020_x': 'pct_below_poverty_2020', 'pct_below_poverty_2020_y': 'pct_below_poverty_2021'}, inplace=True)
pct_below_poverty

Unnamed: 0,zip,pct_below_poverty_2020,pct_below_poverty_2021
0,29590,33.772819,34.030837
1,93306,18.230636,17.957650
2,93660,30.347734,33.525346
3,93110,11.730619,10.349089
4,93212,26.756353,26.288416
...,...,...,...
33966,99635,,
33967,99675,,54.545455
33968,99707,,
33969,99725,,18.152866


In [14]:
# save to csv

# pct_below_poverty.to_csv('pct_below_poverty.csv', index=False)

Median income 

In [15]:
# get income data for 2021
median_income_2021 = get_from_census("B19013", "2021")

# clean it up
median_income_2021 = median_income_2021[['zip code tabulation area', 'B19013_001E']]
median_income_2021.rename(columns={
    'zip code tabulation area': 'zip',
})

Unnamed: 0,zip,B19013_001E
1,00601,15292
2,00602,18716
3,00603,16789
4,00606,18835
5,00610,21239
...,...,...
33770,99923,-666666666
33771,99925,70625
33772,99926,58229
33773,99927,-666666666


In [16]:
# create a column for the year
median_income_2021['year'] = 2021
median_income_2021

Unnamed: 0,zip code tabulation area,B19013_001E,year
1,00601,15292,2021
2,00602,18716,2021
3,00603,16789,2021
4,00606,18835,2021
5,00610,21239,2021
...,...,...,...
33770,99923,-666666666,2021
33771,99925,70625,2021
33772,99926,58229,2021
33773,99927,-666666666,2021


In [17]:
# rename zip code tabulation area to zip    
median_income_2021.rename(columns={'zip code tabulation area': 'zip', 'B19013_001E': 'med_inc_2021'}, inplace=True)
median_income_2021

Unnamed: 0,zip,med_inc_2021,year
1,00601,15292,2021
2,00602,18716,2021
3,00603,16789,2021
4,00606,18835,2021
5,00610,21239,2021
...,...,...,...
33770,99923,-666666666,2021
33771,99925,70625,2021
33772,99926,58229,2021
33773,99927,-666666666,2021


In [18]:
# get income data for 2020
median_income_2020 = get_from_census("B19013", "2020")

# clean it up
median_income_2020 = median_income_2020[['zip code tabulation area', 'B19013_001E']]
median_income_2020.rename(columns={
    'zip code tabulation area': 'zip',
})

Unnamed: 0,zip,B19013_001E
1,29590,30985
2,93306,54450
3,93660,39625
4,93110,93264
5,93212,42983
...,...,...
33116,16623,51667
33117,16627,45000
33118,16634,51500
33119,16640,55982


In [19]:
# create a column for the year
median_income_2020['year'] = 2020
median_income_2020

Unnamed: 0,zip code tabulation area,B19013_001E,year
1,29590,30985,2020
2,93306,54450,2020
3,93660,39625,2020
4,93110,93264,2020
5,93212,42983,2020
...,...,...,...
33116,16623,51667,2020
33117,16627,45000,2020
33118,16634,51500,2020
33119,16640,55982,2020


In [20]:
# rename zip code tabulation area to zip    
median_income_2020.rename(columns={'zip code tabulation area': 'zip', 'B19013_001E': 'med_inc_2020'}, inplace=True)
median_income_2020

Unnamed: 0,zip,med_inc_2020,year
1,29590,30985,2020
2,93306,54450,2020
3,93660,39625,2020
4,93110,93264,2020
5,93212,42983,2020
...,...,...,...
33116,16623,51667,2020
33117,16627,45000,2020
33118,16634,51500,2020
33119,16640,55982,2020


In [21]:
median_income = pd.merge(median_income_2020, median_income_2021, on='zip', how='outer')
median_income

Unnamed: 0,zip,med_inc_2020,year_x,med_inc_2021,year_y
0,29590,30985,2020.0,52679,2021.0
1,93306,54450,2020.0,60857,2021.0
2,93660,39625,2020.0,40000,2021.0
3,93110,93264,2020.0,99261,2021.0
4,93212,42983,2020.0,46312,2021.0
...,...,...,...,...,...
33966,99635,,,-666666666,2021.0
33967,99675,,,34375,2021.0
33968,99707,,,-666666666,2021.0
33969,99725,,,45931,2021.0


In [22]:
# drop columns year_x and year_y

median_income.drop(columns=['year_x', 'year_y'], inplace=True)
median_income

Unnamed: 0,zip,med_inc_2020,med_inc_2021
0,29590,30985,52679
1,93306,54450,60857
2,93660,39625,40000
3,93110,93264,99261
4,93212,42983,46312
...,...,...,...
33966,99635,,-666666666
33967,99675,,34375
33968,99707,,-666666666
33969,99725,,45931


Population

In [23]:
# get population data for 2021
population_2021 = get_from_census("B01003", "2021")

# clean it up
population_2021 = population_2021\
        [['zip code tabulation area', 'B01003_001E']]\
        .rename(columns={'zip code tabulation area': 'zip'})

population_2021

Unnamed: 0,zip,B01003_001E
1,00601,17126
2,00602,37895
3,00603,49136
4,00606,5751
5,00610,26153
...,...,...
33770,99923,13
33771,99925,917
33772,99926,1445
33773,99927,11


In [24]:
#rename columns
population_2021.rename(columns={'B01003_001E': 'pop_2021'}, inplace=True)
population_2021

Unnamed: 0,zip,pop_2021
1,00601,17126
2,00602,37895
3,00603,49136
4,00606,5751
5,00610,26153
...,...,...
33770,99923,13
33771,99925,917
33772,99926,1445
33773,99927,11


In [25]:
# get population data for 2020
population_2020 = get_from_census("B01003", "2020")

# clean it up
population_2020 = population_2020\
        [['zip code tabulation area', 'B01003_001E']]\
        .rename(columns={'zip code tabulation area': 'zip'})

population_2020

Unnamed: 0,zip,B01003_001E
1,29590,3543
2,93306,74296
3,93660,4082
4,93110,15777
5,93212,22596
...,...,...
33116,16623,552
33117,16627,2118
33118,16634,315
33119,16640,707


In [26]:
#rename columns
population_2020.rename(columns={'B01003_001E': 'pop_2020'}, inplace=True)
population_2020

Unnamed: 0,zip,pop_2020
1,29590,3543
2,93306,74296
3,93660,4082
4,93110,15777
5,93212,22596
...,...,...
33116,16623,552
33117,16627,2118
33118,16634,315
33119,16640,707


In [27]:
population = pd.merge(population_2020, population_2021, on='zip', how='outer')
population

Unnamed: 0,zip,pop_2020,pop_2021
0,29590,3543,3560
1,93306,74296,74518
2,93660,4082,3826
3,93110,15777,16345
4,93212,22596,23148
...,...,...,...
33966,99635,,107
33967,99675,,58
33968,99707,,0
33969,99725,,611


In [28]:
# merge population and median income data into one dataframe

census_data = pd.merge(population, median_income, on='zip', how='inner')
census_data


Unnamed: 0,zip,pop_2020,pop_2021,med_inc_2020,med_inc_2021
0,29590,3543,3560,30985,52679
1,93306,74296,74518,54450,60857
2,93660,4082,3826,39625,40000
3,93110,15777,16345,93264,99261
4,93212,22596,23148,42983,46312
...,...,...,...,...,...
33966,99635,,107,,-666666666
33967,99675,,58,,34375
33968,99707,,0,,-666666666
33969,99725,,611,,45931


Merge with zip code csv

In [31]:
# read csv
zip_codes = pd.read_csv('nyc_zip_codes_cleaned.csv')
zip_codes

Unnamed: 0,zip,borough
0,10001,Manhattan
1,10002,Manhattan
2,10003,Manhattan
3,10004,Manhattan
4,10005,Manhattan
...,...,...
475,10309,Staten
476,10310,Staten
477,10311,Staten
478,10312,Staten


In [32]:
zip_codes.dtypes

zip         int64
borough    object
dtype: object

In [33]:
census_data.dtypes


zip             object
pop_2020        object
pop_2021        object
med_inc_2020    object
med_inc_2021    object
dtype: object

In [34]:
# convert zip codes to int

census_data['zip'] = census_data['zip'].astype(int)
census_data.dtypes


zip              int64
pop_2020        object
pop_2021        object
med_inc_2020    object
med_inc_2021    object
dtype: object

In [35]:
# left join zip codes with census data

census = pd.merge(zip_codes, census_data, on='zip', how='left')
census

Unnamed: 0,zip,borough,pop_2020,pop_2021,med_inc_2020,med_inc_2021
0,10001,Manhattan,25026,26966,96787,101409
1,10002,Manhattan,74363,76807,35607,37093
2,10003,Manhattan,54671,54447,129981,137533
3,10004,Manhattan,3310,4795,204949,216017
4,10005,Manhattan,8664,8637,184681,197188
...,...,...,...,...,...,...
475,10309,Staten,33896,35832,102730,107500
476,10310,Staten,24168,25976,86895,96161
477,10311,Staten,0,0,-666666666,-666666666
478,10312,Staten,61114,63935,96785,100875


In [36]:
# data for 2022 is not available yet, so we will use 2021 data for 2022
census['pop_2022'] = census['pop_2021']
census['med_inc_2022'] = census['med_inc_2021']
census

Unnamed: 0,zip,borough,pop_2020,pop_2021,med_inc_2020,med_inc_2021,pop_2022,med_inc_2022
0,10001,Manhattan,25026,26966,96787,101409,26966,101409
1,10002,Manhattan,74363,76807,35607,37093,76807,37093
2,10003,Manhattan,54671,54447,129981,137533,54447,137533
3,10004,Manhattan,3310,4795,204949,216017,4795,216017
4,10005,Manhattan,8664,8637,184681,197188,8637,197188
...,...,...,...,...,...,...,...,...
475,10309,Staten,33896,35832,102730,107500,35832,107500
476,10310,Staten,24168,25976,86895,96161,25976,96161
477,10311,Staten,0,0,-666666666,-666666666,0,-666666666
478,10312,Staten,61114,63935,96785,100875,63935,100875


In [38]:
# how many unique zip codes are there?

census['zip'].nunique()

240

In [None]:
# save to csv
# census.to_csv('census_data.csv', index=False)

In [37]:
# look only zip 10467

census[census['zip'] == 10467]

Unnamed: 0,zip,borough,pop_2020,pop_2021,med_inc_2020,med_inc_2021,pop_2022,med_inc_2022
125,10467,Bronx,100867,102209,40639,42639,102209,42639
256,10467,Bronx,100867,102209,40639,42639,102209,42639
