In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census
from pprint import pprint

# Census API Key
from config import census_api_key
c = Census(census_api_key, year=2017)

In [2]:
# Create a dictionary of the census codes we are interested in, plus a more user friendly name.
# Codes pulled from documentation on Census website.
# This dict is so I do not have to remember that "B01002_001E" is the code for "Median Age"
census_codes_csv = "Resources/census_codes.csv"
census_codes_df = pd.read_csv(census_codes_csv)
census_codes_df.head()

census_dict = dict(zip(census_codes_df["Census API Code"], census_codes_df["Field Name"]))

# dictionary to rename columns after pulling from census
rename_dict = census_dict.copy()
rename_dict["zip code tabulation area"] = "Zip Code"

# This is a tuple of codes we want from census api.  This was format api seemed to want.
census_codes = tuple(census_dict.keys())

#pprint(rename_dict)
#pprint(census_codes)

In [3]:
# Run Census Search to retrieve data on all zip codes (2017 ACS5 Census)

# Retrieve fields as defined by code above, for all zip codes
census_data = c.acs5.get(census_codes, {'for': 'zip code tabulation area:*'})

# Convert to DataFrame
census_pd = pd.DataFrame(census_data)

# Rename ugly old mainframe-ish looking census fields to our more human readable column names
census_pd = census_pd.rename(columns=rename_dict)

#census_pd = pd.read_csv("Resources/Census_Data_Demo.csv")
# Visualize
print(len(census_pd))
census_pd.head()

33120


Unnamed: 0,Median Age,Population,State Born,Other State Born,Foreign Born,Household Income,Name,Zip Code
0,26.9,345.0,291.0,54.0,0.0,71806.0,ZCTA5 84306,84306
1,58.9,421.0,245.0,167.0,9.0,-666666666.0,ZCTA5 84775,84775
2,57.2,211.0,157.0,54.0,0.0,-666666666.0,ZCTA5 84762,84762
3,51.3,203.0,130.0,73.0,0.0,33958.0,ZCTA5 84772,84772
4,31.6,224.0,129.0,95.0,0.0,48000.0,ZCTA5 84781,84781


In [4]:
# Looking at my home zip code to confirm pulled data looks accurate
test_df = census_pd[(census_pd["Zip Code"] == "08820")]
test_df

Unnamed: 0,Median Age,Population,State Born,Other State Born,Foreign Born,Household Income,Name,Zip Code
31434,42.1,40080.0,12230.0,4720.0,22848.0,120911.0,ZCTA5 08820,8820


In [5]:
# Calculate U.S. Born from those born in state + those born in another state
us_born_series = census_pd["State Born"] + census_pd["Other State Born"]
new_loc = census_pd.columns.get_loc("Other State Born") + 1
census_pd.insert(loc=new_loc, column="U.S. Born", value=us_born_series)

# Calculate Foreign Born percentage by dividing foreign born by total population
foreign_perc_series = census_pd["Foreign Born"] / census_pd["Population"]
new_loc = census_pd.columns.get_loc("Foreign Born") + 1
census_pd.insert(loc=new_loc, column="Foreign Percentage", value=foreign_perc_series)

In [6]:
# Looking at my home zip code to confirm pulled data looks accurate
test_df = census_pd[(census_pd["Zip Code"] == "08820")]
test_df

Unnamed: 0,Median Age,Population,State Born,Other State Born,U.S. Born,Foreign Born,Foreign Percentage,Household Income,Name,Zip Code
31434,42.1,40080.0,12230.0,4720.0,16950.0,22848.0,0.57006,120911.0,ZCTA5 08820,8820


In [7]:
# drop fields no longer needed
census_pd = census_pd.drop(columns=["State Born", "Other State Born", "Name"])
census_pd.head()

Unnamed: 0,Median Age,Population,U.S. Born,Foreign Born,Foreign Percentage,Household Income,Zip Code
0,26.9,345.0,345.0,0.0,0.0,71806.0,84306
1,58.9,421.0,412.0,9.0,0.021378,-666666666.0,84775
2,57.2,211.0,211.0,0.0,0.0,-666666666.0,84762
3,51.3,203.0,203.0,0.0,0.0,33958.0,84772
4,31.6,224.0,224.0,0.0,0.0,48000.0,84781


In [8]:
# Get rid of pandas default index and make zip code the index
# this will make things easier when we start adding in extended data about zip codes that was not on census
census_pd.set_index("Zip Code", inplace=True)
census_pd.head()

Unnamed: 0_level_0,Median Age,Population,U.S. Born,Foreign Born,Foreign Percentage,Household Income
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
84306,26.9,345.0,345.0,0.0,0.0,71806.0
84775,58.9,421.0,412.0,9.0,0.021378,-666666666.0
84762,57.2,211.0,211.0,0.0,0.0,-666666666.0
84772,51.3,203.0,203.0,0.0,0.0,33958.0
84781,31.6,224.0,224.0,0.0,0.0,48000.0


In [9]:
# re-sort by new zip code index
census_sorted_df = census_pd.sort_index()
census_sorted_df.head()

Unnamed: 0_level_0,Median Age,Population,U.S. Born,Foreign Born,Foreign Percentage,Household Income
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
601,38.9,17599.0,298.0,53.0,0.003012,11757.0
602,40.9,39209.0,2557.0,400.0,0.010202,16190.0
603,40.4,50135.0,4162.0,688.0,0.013723,16645.0
606,42.8,6304.0,125.0,164.0,0.026015,13387.0
610,41.4,27590.0,1231.0,105.0,0.003806,18741.0


In [10]:
# List of NJ zip codes pulled from google search
nj_zipcode_df = pd.read_csv("Resources/NJ_ZipCodes.csv", dtype=str)
nj_zipcode_df.set_index("Zip Code", inplace=True)
print(len(nj_zipcode_df))
print(len(census_sorted_df))
nj_zipcode_df.head()

739
33120


Unnamed: 0_level_0,ZipCode_Int,City,County,Zip Code Map
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7001,7001,Avenel,Middlesex,View Map
7002,7002,Bayonne,Hudson,View Map
7003,7003,Bloomfield,Essex,View Map
7004,7004,Fairfield,Essex,View Map
7005,7005,Boonton,Morris,View Map


In [11]:
# Drop fields we do not care about from zip code df
nj_zipcode_df = nj_zipcode_df.drop(columns=["ZipCode_Int", "Zip Code Map"])
nj_zipcode_df.head()

Unnamed: 0_level_0,City,County
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1
7001,Avenel,Middlesex
7002,Bayonne,Hudson
7003,Bloomfield,Essex
7004,Fairfield,Essex
7005,Boonton,Morris


In [12]:
# join full census df with nj zip code df, using inner join
# this will reduce to just census data for NJ
census_nj_df = census_sorted_df.join(nj_zipcode_df, how="inner")
census_nj_df.head()

Unnamed: 0_level_0,Median Age,Population,U.S. Born,Foreign Born,Foreign Percentage,Household Income,City,County
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7001,40.7,18244.0,12141.0,5645.0,0.309417,68426.0,Avenel,Middlesex
7002,38.6,66719.0,44833.0,19665.0,0.294744,56701.0,Bayonne,Hudson
7003,37.7,48892.0,34427.0,12653.0,0.258795,74961.0,Bloomfield,Essex
7004,46.2,7584.0,6333.0,1188.0,0.156646,107417.0,Fairfield,Essex
7005,42.1,15350.0,12866.0,2333.0,0.151987,109888.0,Boonton,Morris


In [13]:
# List of latitude and longitude for each NJ zip code
# Pulled as a csv file from public.opendatasoft.com

zip_code_xref_df = pd.read_csv("Resources/NJ_Zip_Lat_Long.csv", converters={'Zip Code': str})
zip_code_xref_df.set_index("Zip Code", inplace=True)
zip_code_xref_df = zip_code_xref_df.drop(columns=["City", "State", "Timezone", "Daylight savings time flag", "geopoint", "Unnamed: 8"])
zip_code_xref_df.head()

Unnamed: 0_level_0,Latitude,Longitude
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1
7001,40.578996,-74.27987
7002,40.666552,-74.11768
7003,40.803,-74.18895
7004,40.879049,-74.29378
7005,40.912798,-74.41516


In [14]:
# Join latitude/logitude data with current census_df
# We will need coordinates in the google places api part of project

census_nj_coord_df = census_nj_df.join(zip_code_xref_df, how="inner")
census_nj_coord_df.head()

Unnamed: 0_level_0,Median Age,Population,U.S. Born,Foreign Born,Foreign Percentage,Household Income,City,County,Latitude,Longitude
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7001,40.7,18244.0,12141.0,5645.0,0.309417,68426.0,Avenel,Middlesex,40.578996,-74.27987
7002,38.6,66719.0,44833.0,19665.0,0.294744,56701.0,Bayonne,Hudson,40.666552,-74.11768
7003,37.7,48892.0,34427.0,12653.0,0.258795,74961.0,Bloomfield,Essex,40.803,-74.18895
7004,46.2,7584.0,6333.0,1188.0,0.156646,107417.0,Fairfield,Essex,40.879049,-74.29378
7005,42.1,15350.0,12866.0,2333.0,0.151987,109888.0,Boonton,Morris,40.912798,-74.41516


In [15]:
# Looking at my home zip code to confirm pulled data looks accurate
test_df = pd.DataFrame([census_nj_coord_df.loc["08820"]])
test_df

Unnamed: 0,Median Age,Population,U.S. Born,Foreign Born,Foreign Percentage,Household Income,City,County,Latitude,Longitude
8820,42.1,40080.0,16950.0,22848.0,0.57006,120911.0,Edison,Middlesex,40.575503,-74.35781


In [16]:
census_nj_coord_df.to_csv("Resources/NJ_Census_Coord_Data.csv")