In [None]:
! pip install census
! pip install lxml
! pip install beautifulsoup4 html5lib
!jupyter nbextension enable --py --sys-prefix widgetsnbextension
!pip install gmaps
!jupyter nbextension enable --py --sys-prefix gmaps
!pip install geopandas
!pip install pyshp
!pip install shapely
!pip install plotly==4.11.0
!pip install plotly-geo

In [None]:
# Set up Dependencies
from census import Census
from config import (census_key, gkey)
import gmaps
import numpy as np
import pandas as pd
import requests
import time
from us import states
from scipy.stats import linregress
from matplotlib import pyplot as plt

# Build the Census API client used by the queries below. It is created with
# the API key imported from `config` and with year=2017.
# NOTE(review): year=2017 is presumably the default data vintage for every
# query made through `c` -- confirm against the census-wrapper docs.
c = Census(census_key, year=2017)

In [None]:
# Documentation: Census American Community Survey (ACS) 5-Year Data
#   Library documentation: https://github.com/CommerceDataService/census-wrapper
#   Variable labels:       https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b

# Run a Census search to retrieve the variables below for every
# zip code tabulation area (2017 ACS5 Census).
zipcode_census_data = c.acs5.get(
    (
        "B01003_001E", "B17001_002E", "B17001_003E", "B17001_017E",
        "B17001A_002E", "B17001B_002E", "B17001C_002E", "B17001D_002E",
        "B17001E_002E", "B17001G_002E", "B17001I_002E", "B17012_002E",
        "B17012_009E", "B17012_014E", "B23025_002E", "B23025_007E",
        "B23025_004E", "B23025_005E",
    ),
    {'for': 'zip code tabulation area:*'},
)

# Convert the JSON records to a DataFrame.
zipcode_census_pd = pd.DataFrame(zipcode_census_data)

# Rename the raw variable codes to readable column names.
# NOTE(review): "B17001C_002E" is named population_* while its B17001x
# siblings are named poverty_*; the name is kept unchanged because later
# code may rely on it -- confirm whether it should be poverty_*.
zipcode_census_pd = zipcode_census_pd.rename(columns={
    "B01003_001E": "Population",
    "B17001_002E": "Poverty Count",
    "B17001_003E": "poverty_male",
    "B17001_017E": "poverty_female",
    "B17001A_002E": "poverty_white_alone",
    "B17001B_002E": "poverty_black_alone",
    "B17001C_002E": "population_american_indian_alone",
    "B17001D_002E": "poverty_asian_alone",
    "B17001E_002E": "poverty_native_hawaiian_alone",
    "B17001G_002E": "poverty_two_or_more_races",
    "B17001I_002E": "poverty_hispanic_origin",
    "B17012_002E": "poverty_family",
    "B17012_009E": "poverty_family_single_male",
    "B17012_014E": "poverty_family_single_female",
    "B23025_002E": "employment_labor_force",
    "B23025_007E": "employment_not_labor_force",
    "B23025_004E": "employment_employed",
    "B23025_005E": "employment_unemployed",
    "zip code tabulation area": "Zipcode",
})

# Drop every row that has a missing value in any column.
# dropna() returns a new frame, so the result has to be assigned back;
# `thresh` is not passed because recent pandas versions reject `how` and
# `thresh` being given together.
zipcode_census_pd = zipcode_census_pd.dropna(axis=0, how='any')

# Calculate and add the Poverty Rate as a percentage:
# 100 * Poverty Count / Population.
zipcode_census_pd["Poverty Rate"] = 100 * \
    zipcode_census_pd["Poverty Count"].astype(
        int) / zipcode_census_pd["Population"].astype(int)

# Visualize: row count and a preview of the first rows.
print(len(zipcode_census_pd))
zipcode_census_pd.head()

In [None]:
# Get California data only.
# Parse the HTML tables in the saved zip-code page and keep the one at
# position 2 as the California zip-code listing.
# NOTE(review): the original note says this table includes the county for each
# zip code; its 'Type' column is used further down to drop P.O. Box / Unique
# entries. The position (2) is tied to the layout of the saved page.
tables = pd.read_html('Resources/zipcode_site.html')
zipcode_df = tables[2]

In [None]:
# Use the first row of zipcode_df as the column headers.
# (P.O. boxes are not removed in this cell; only the header row is handled.)
column_names = list(zipcode_df.iloc[0])
zipcode_df.columns = column_names
# The first row is now redundant as data, so drop it by its index label.
zipcode_df = zipcode_df.drop(index=zipcode_df.index[0])

In [None]:
# Reset the index and rename "ZIP Code" to "Zipcode" so the column name
# matches the key used for the merge with the census frame.
# reset_index() is called without drop=True, so the previous index is kept
# as an extra column.
zipcode_df = zipcode_df.reset_index().rename(columns={"ZIP Code": "Zipcode"})

In [None]:
# Keep only the text from character position 9 onward in each 'Zipcode' value.
# NOTE(review): presumably this strips a fixed-length label in front of the
# zip code -- confirm against the scraped table. Re-running this cell slices
# the values again.
zipcode_df['Zipcode'] = zipcode_df['Zipcode'].str[9:]
zipcode_df

In [None]:
# Show the distinct values found in the 'Type' column.
set(zipcode_df['Type'])

In [None]:
# Count how many rows carry each 'Type' value.
zipcode_df['Type'].value_counts()

In [None]:
# Drop every row whose 'Type' contains "P.O. Box" or "Unique".
# The dots are escaped so they match literal periods instead of any character.
# na=False makes a missing 'Type' count as "no match", so the mask is purely
# boolean and can be inverted and used for indexing; such rows are kept.
is_po_box_or_unique = zipcode_df['Type'].str.contains(r"P\.O\. Box|Unique", na=False)
zipcode_df = zipcode_df[~is_po_box_or_unique]
len(zipcode_df)

In [None]:
# Inner-join the census figures with the zip-code listing on 'Zipcode' to get
# California data only: a zip code is kept only if it appears in both frames.
california_data = zipcode_census_pd.merge(zipcode_df, how='inner', on='Zipcode')
print(type(california_data))
print(len(california_data))
california_data.columns

In [None]:
# NOTE(review): the first expression below is not the last one in the cell,
# so under default notebook display settings it shows nothing; it only
# evaluates the 'Type' column lookup. Consider removing it.
california_data['Type']
# Display the merged California frame.
california_data

In [None]:

# Count how many merged California rows carry each 'Type' value.
california_data['Type'].value_counts()

In [None]:
#export data to CSV

#california_data.to_csv("Resources/california_census_data.csv", encoding="utf-8", index=False)