Import Dependencies 

In [2]:
# !pip install lxml
# !pip install beautifulsoup4 html5lib

In [3]:
# Dependencies
from census import Census
from config import (census_key, gkey)
import gmaps
import numpy as np
import pandas as pd
import requests
import time
from us import states
from scipy.stats import linregress
from matplotlib import pyplot as plt


# Census API Key
c = Census(census_key, year=2013)

In [12]:
# Run Census Search to retrieve data on all zip codes (2013 ACS5 Census)
# See: https://github.com/CommerceDataService/census-wrapper for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
zipcode_census_data = c.acs5.get(("B01003_001E", "B17001_002E","B17001_003E","B17001_017E",
                          "B17001A_002E","B17001B_002E", "B17001C_002E" ,"B17001D_002E" ,
                          "B17001E_002E" ,"B17001G_002E" ,"B17001I_002E" ,"B17012_002E",
                          "B17012_009E","B17012_014E" ,"B23025_002E" ,"B23025_007E" ,
                          "B23025_004E" ,"B23025_005E"), {
                         'for': 'zip code tabulation area:*'})

# Convert to DataFrame
zipcode_census_pd = pd.DataFrame(zipcode_census_data)

#reorder names
zipcode_census_pd = zipcode_census_pd.rename(columns={"B01003_001E": "Population",
                                      "B17001_002E": "Poverty Count","B17001_003E": "poverty_male",
                                                      "B17001_017E": "poverty_female","B17001A_002E": "poverty_white_alone",
                                                      "B17001B_002E": "poverty_black_alone",
                                                      "B17001C_002E": "population_american_indian_alone",
                                                      "B17001D_002E": "poverty_asian_alone",
                                                      "B17001E_002E": "poverty_native_hawaiian_alone",
                                                      "B17001G_002E": "poverty_two_or_more_races",
                                                      "B17001I_002E": "poverty_hispanic_origin",
                                                      "B17012_002E": "poverty_family",
                                                      "B17012_009E": "poverty_family_single_male",
                                                      "B17012_014E": "poverty_family_single_female",
                                                      "B23025_002E": "employment_labor_force",
                                                      "B23025_007E": "employment_not_labor_force"	,
                                                      "B23025_004E": "employment_employed",
                                                      "B23025_005E": "employment_unemployed",
                                      "zip code tabulation area": "Zipcode"})


# Add in Poverty Rate (Poverty Count / Population)
zipcode_census_pd["Poverty Rate"] = 100 * \
    zipcode_census_pd["Poverty Count"].astype(
        int) / zipcode_census_pd["Population"].astype(int)

# Final DataFrame
# census_pd = census_pd[["Zipcode", "Population", "Poverty Rate"]]

# # Visualize
print(len(zipcode_census_pd))
zipcode_census_pd.head()

33120


Unnamed: 0,Population,Poverty Count,poverty_male,poverty_female,poverty_white_alone,poverty_black_alone,population_american_indian_alone,poverty_asian_alone,poverty_native_hawaiian_alone,poverty_two_or_more_races,poverty_hispanic_origin,poverty_family,poverty_family_single_male,poverty_family_single_female,employment_labor_force,employment_not_labor_force,employment_employed,employment_unemployed,Zipcode,Poverty Rate
0,22121.0,2412.0,1156.0,1256.0,1638.0,90.0,28.0,19.0,0.0,195.0,1143.0,576.0,114.0,325.0,12160.0,5281.0,10888.0,1258.0,1832,10.903666
1,8295.0,191.0,88.0,103.0,191.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,0.0,14.0,4516.0,1877.0,4196.0,320.0,1833,2.302592
2,6675.0,146.0,53.0,93.0,94.0,0.0,0.0,52.0,0.0,0.0,0.0,40.0,0.0,26.0,3895.0,1504.0,3662.0,233.0,1834,2.187266
3,13527.0,1153.0,560.0,593.0,762.0,218.0,0.0,46.0,0.0,23.0,331.0,235.0,54.0,143.0,7988.0,2749.0,7536.0,452.0,1835,8.523693
4,4547.0,1798.0,696.0,1102.0,515.0,96.0,40.0,10.0,0.0,34.0,1622.0,334.0,19.0,193.0,1738.0,1810.0,1483.0,255.0,1840,39.542556


In [4]:
# Get California Data Only
    #Web scrape to get data for california zipcodes that includes county and whether 
tables = pd.read_html('Resources/zipcode_site.html')
zipcode_df = tables[2]



In [5]:
    #clean zipcode_df to get rid of p.o. boxes and rename columns
column_names = list(zipcode_df.iloc[0,:])
zipcode_df.columns = column_names
    #delete first row (redundant)
zipcode_df = zipcode_df.drop(zipcode_df.index[0])

In [6]:
#reset index 
zipcode_df=zipcode_df.reset_index()
#rename column for merge
zipcode_df = zipcode_df.rename(columns={"ZIP Code":"Zipcode"})

In [7]:
# zipcode_df.head()
zipcode_df['Zipcode']=zipcode_df['Zipcode'].str.slice(9)
zipcode_df.head()

Unnamed: 0,index,Zipcode,City,County,Type
0,1,90001,Los Angeles,Los Angeles,Standard
1,2,90002,Los Angeles,Los Angeles,Standard
2,3,90003,Los Angeles,Los Angeles,Standard
3,4,90004,Los Angeles,Los Angeles,Standard
4,5,90005,Los Angeles,Los Angeles,Standard


In [8]:
set(zipcode_df['Type'])

{'P.O. Box', 'Standard', 'Unique'}

In [9]:
zipcode_df['Type'].value.counts()

AttributeError: 'Series' object has no attribute 'value'

In [10]:
#drop all p.o. boxes 
zipcode_df = zipcode_df[~zipcode_df.Type.str.contains("P.O. Box|Unique")]
len(zipcode_df)

1583

In [13]:
#merge data frames to get california data only 
california_data = pd.merge(zipcode_census_pd, zipcode_df, on='Zipcode', how ='inner')
print(type(california_data))
print(len(california_data))
california_data.columns

<class 'pandas.core.frame.DataFrame'>
1551


Index(['Population', 'Poverty Count', 'poverty_male', 'poverty_female',
       'poverty_white_alone', 'poverty_black_alone',
       'population_american_indian_alone', 'poverty_asian_alone',
       'poverty_native_hawaiian_alone', 'poverty_two_or_more_races',
       'poverty_hispanic_origin', 'poverty_family',
       'poverty_family_single_male', 'poverty_family_single_female',
       'employment_labor_force', 'employment_not_labor_force',
       'employment_employed', 'employment_unemployed', 'Zipcode',
       'Poverty Rate', 'index', 'City', 'County', 'Type'],
      dtype='object')

In [14]:
california_data['Type'].value_counts()

Standard    1551
Name: Type, dtype: int64

In [15]:
california_data.to_csv("../resources/CACountiesZips.csv", encoding="utf-8", index=False)