In [1]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine
from census import Census

# Import Census API Key and postgresql database password
from config import (api_key, password)

### Use Census API to get social economic data

* See: https://github.com/CommerceDataService/census-wrapper for library documentation
* See: https://www.census.gov/data/developers/data-sets/acs-5year.html for more details about ACS5

In [2]:
# Connect to Census data in 2018 (the latest year available up-to-date)
c = Census(api_key, year=2018)

* See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
* Current metrics of interest:
    * "B19013_001E": Median household income in the past 12 months
    * "B01003_001E": Total population
    * "B01002_001E": Median age
    * "B19301_001E": Per capita income in the past 12 months
    * "B17001_002E": Number of persons whose income in the past 12 months is below the poverty level
    * "B23025_005E": Number of unemployed, age 16 or older, in the civilian labor force?

**Are we interested in any other social economic metrics? like education or language? For example,
    <br>
    "B15003_002E": The number of persons age 25 and over who completed no schooling
    <br>
    "B16001_002E": Speak_only_English**

**Should we have a seperate table for just different social economic metric labels (the json file is available in the above link)? --> data normalisation**

In [3]:
# Run Census Search to retrieve data on all zip codes (2018 ACS5 Census)
census_data = c.acs5.get(("B19013_001E",
                          "B01003_001E",
                          "B01002_001E",
                          "B19301_001E",
                          "B17001_002E",
                          "B23025_005E"),
                         {'for': 'zip code tabulation area:*'})

https://www2.census.gov/data/api-documentation/how-to-download-all-zip-code-tabulation-areas-from-the-census-api.pdf?

In [4]:
# Convert to DataFrame
census_df = pd.DataFrame(census_data)

In [5]:
# Column Reordering and Renaming
census_df = census_df.rename(columns={
    "B19013_001E": "Median Household Income",
    "B01003_001E": "Population",
    "B01002_001E": "Median Age",
    "B19301_001E": "Per Capita Income",
    "B17001_002E": "Poverty Count",
    "B23025_005E": "Unemployment Count",
    "zip code tabulation area": "ZCTA"})

census_df[["ZCTA", "Population", "Median Household Income", "Per Capita Income",
           "Poverty Count", "Unemployment Count", "Median Age"]].head()

Unnamed: 0,ZCTA,Population,Median Household Income,Per Capita Income,Poverty Count,Unemployment Count,Median Age
0,601,17242.0,13092.0,6999.0,10772.0,2316.0,40.5
1,602,38442.0,16358.0,9277.0,19611.0,1927.0,42.3
2,603,48814.0,16603.0,11307.0,24337.0,3124.0,41.1
3,606,6437.0,12832.0,5943.0,4163.0,230.0,43.3
4,610,27073.0,19309.0,10220.0,11724.0,1290.0,42.1


In [6]:
census_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 0 to 33119
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Median Household Income  33085 non-null  float64
 1   Population               33120 non-null  float64
 2   Median Age               33120 non-null  float64
 3   Per Capita Income        32776 non-null  float64
 4   Poverty Count            33085 non-null  float64
 5   Unemployment Count       33085 non-null  float64
 6   ZCTA                     33120 non-null  object 
dtypes: float64(6), object(1)
memory usage: 1.8+ MB


Look like we have lots of **missing data**

In [7]:
census_df["ZCTA"].nunique()

33120

In [8]:
odd1 = census_df["Median Age"].min()

In [9]:
odd_median_age = census_df[census_df["Median Age"]==odd1]
odd_median_age

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
110,-666666666.0,0.0,-666666666.0,-666666666.0,0.0,0.0,00950
111,-666666666.0,0.0,-666666666.0,-666666666.0,0.0,0.0,00951
167,-666666666.0,9.0,-666666666.0,,0.0,0.0,01066
206,-666666666.0,0.0,-666666666.0,-666666666.0,0.0,0.0,01199
514,-666666666.0,25.0,-666666666.0,,13.0,0.0,02203
...,...,...,...,...,...,...,...
33080,-666666666.0,15.0,-666666666.0,,0.0,0.0,99923
33099,,22.0,-666666666.0,,,,87539
33100,,0.0,-666666666.0,,,,87554
33106,,32.0,-666666666.0,,,,87581


In [10]:
odd_median_age["Median Household Income"].unique()

array([-6.66666666e+08,             nan])

In [11]:
odd_median_age["Per Capita Income"].unique()

array([-6.66666666e+08,             nan])

In [12]:
census_df = census_df[census_df["Median Age"] != odd1]

In [13]:
census_df["Median Age"].min()

2.4

In [14]:
census_df["Per Capita Income"].min()

42.0

In [15]:
census_df[census_df["Median Household Income"] == odd1]

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
42,-666666666.0,69.0,73.6,6880.0,24.0,0.0,00694
86,-666666666.0,348.0,64.0,14278.0,46.0,0.0,00786
107,-666666666.0,79.0,32.8,16956.0,0.0,26.0,00934
108,-666666666.0,812.0,77.3,7132.0,0.0,0.0,00936
117,-666666666.0,2221.0,32.8,3069.0,23.0,6.0,00960
...,...,...,...,...,...,...,...
33027,-666666666.0,14.0,60.0,25536.0,3.0,0.0,99757
33037,-666666666.0,60.0,17.5,18652.0,14.0,3.0,99767
33044,-666666666.0,11.0,25.8,6964.0,9.0,0.0,99774
33058,-666666666.0,10.0,50.5,68700.0,0.0,2.0,99790


In [16]:
census_df.describe()

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count
count,32525.0,32556.0,32556.0,32456.0,32525.0,32525.0
mean,-33436120.0,10021.744164,42.471406,29912.923743,1406.16206,298.733528
std,145639300.0,14682.914843,8.976003,13871.061533,2681.205129,521.522857
min,-666666700.0,5.0,2.4,42.0,0.0,0.0
25%,41205.0,767.0,36.9,22190.75,81.0,14.0
50%,52902.0,2935.0,41.9,27249.5,333.0,71.0
75%,67316.0,13701.5,47.3,33996.0,1456.0,353.0
max,250001.0,122814.0,98.3,461279.0,35874.0,9120.0


In [17]:
census_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32556 entries, 0 to 33119
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Median Household Income  32525 non-null  float64
 1   Population               32556 non-null  float64
 2   Median Age               32556 non-null  float64
 3   Per Capita Income        32456 non-null  float64
 4   Poverty Count            32525 non-null  float64
 5   Unemployment Count       32525 non-null  float64
 6   ZCTA                     32556 non-null  object 
dtypes: float64(6), object(1)
memory usage: 2.0+ MB


In [18]:
census_df[census_df["Poverty Count"].isnull()]

Unnamed: 0,Median Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,ZCTA
33085,,133.0,25.8,,,,87533
33086,,255.0,47.5,,,,87012
33087,,84.0,57.7,,,,87064
33088,,162.0,38.1,,,,87046
33089,,1341.0,37.8,,,,87548
33090,,775.0,48.6,,,,87575
33091,,1110.0,41.5,,,,87530
33092,,385.0,57.0,,,,87577
33093,,761.0,30.6,,,,87527
33094,,518.0,41.9,,,,87549


In [19]:
type(np.NaN)

float

In [None]:
# Save as a csv
# Note to avoid any issues later, use encoding="utf-8"
census_df.to_csv("data/census_data.csv", encoding="utf-8", index=False)

### Extract CSVs into DataFrame

https://data.world/datafiniti/fast-food-restaurants-across-america

In [None]:
csv_file_one = "data/datafiniti-fast-food-restaurants-across-america/Datafiniti_Fast_Food_Restaurants.csv"
df_one = pd.read_csv(csv_file_one)
df_one.head()

In [None]:
df_one["keys"].nunique()

In [None]:
df_one.info()

https://www.reddit.com/r/explainlikeimfive/comments/6284le/eli5_what_are_those_numbers_after_the_dash_in_zip/

In [None]:
df_one["postalCode"] = df_one["postalCode"].apply(lambda x: x.split("-")[0])

In [None]:
df_one["postalCode"] = df_one["postalCode"].astype(int)

In [None]:
df_one.info()

**Look like we can link the two datasets using zip_code**

### Zip code to ZCTA

Why https://en.wikipedia.org/wiki/ZIP_Code_Tabulation_Area
<br> https://acsdatacommunity.prb.org/acs-data-products--resources/american-factfinder/f/3/t/427
<br> https://atcoordinates.info/2020/05/11/the-trouble-with-zip-codes-solutions-for-data-analysis-and-mapping/
<br>Source: https://www.udsmapper.org/zcta-crosswalk.cfm

In [None]:
zip_to_zcta_df = pd.read_excel("data/zip_to_zcta_2019.xlsx")

In [None]:
zip_to_zcta_df.head()

In [None]:
## Add 0 to ....
for index, value in zip_to_zcta.iterrows():
    

In [None]:
zip_to_zcta_df.info()

In [None]:
zip_to_zcta_df["ZIP_CODE"].nunique()

In [None]:
zip_to_zcta_df["ZCTA"].nunique()

In [None]:
df_one_zcta = pd.merge(df_one, zip_to_zcta_df, left_on="postalCode", right_on="ZIP_CODE")
df_one_zcta.head()

In [None]:
df_one_zcta.columns

In [None]:
final_restaurant_df = df_one_zcta[['id', 'name', 'address', 'latitude', 'longitude', 'postalCode', 'ZIP_CODE', 'ZCTA']].copy()

In [None]:
final_restaurant_df.info()

In [None]:
final_restaurant_df["ZCTA"] = final_restaurant_df["ZCTA"].astype(int)

In [None]:
final_restaurant_df.info()

In [None]:
final_restaurant_df["diff"] = final_restaurant_df["ZCTA"] - final_restaurant_df["postalCode"]

In [None]:
final_restaurant_df[final_restaurant_df["diff"]!=0]

### Transform No 1 DataFrame

### Transform No 2 DataFrame

### Create database connection

### Load DataFrames into database