In [1]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine
from census import Census

# Import Census API Key and postgresql database password
from config import (api_key, password)

### Use the Census API to get socioeconomic data

* See: https://github.com/CommerceDataService/census-wrapper for library documentation
* See: https://www.census.gov/data/developers/data-sets/acs-5year.html for more details about ACS5

In [2]:
# Create the Census API client for the 2018 data vintage
# (the most recent year available when this notebook was written).
c = Census(api_key, year=2018)

* See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
* Current metrics of interest:
    * "B19013_001E": Median household income in the past 12 months
    * "B01003_001E": Total population
    * "B01002_001E": Median age
    * "B19301_001E": Per capita income in the past 12 months
    * "B17001_002E": Number of persons whose income in the past 12 months is below the poverty level
    * "B23025_005E": Number of unemployed persons, age 16 or older, in the civilian labor force

**Are we interested in any other socioeconomic metrics, such as education or language? For example:
    <br>
    "B15003_002E": The number of persons age 25 and over who completed no schooling
    <br>
    "B16001_002E": Speak only English**

**Should we have a separate table just for the socioeconomic metric labels (the JSON file is available at the link above)? --> data normalisation**

In [3]:
# Query the 2018 ACS 5-year estimates for every ZIP Code Tabulation Area (ZCTA).
acs5_variables = (
    "B19013_001E",  # median household income
    "B01003_001E",  # total population
    "B01002_001E",  # median age
    "B19301_001E",  # per capita income
    "B17001_002E",  # persons below the poverty level
    "B23025_005E",  # unemployed persons in the civilian labor force
)
zcta_geography = {"for": "zip code tabulation area:*"}

census_data = c.acs5.get(acs5_variables, zcta_geography)

https://www2.census.gov/data/api-documentation/how-to-download-all-zip-code-tabulation-areas-from-the-census-api.pdf?

In [4]:
# Convert the API response into a DataFrame.
# NOTE(review): ACS estimates may carry large negative sentinel values
# (e.g. -666666666) for "not available" -- confirm and convert them to NaN
# before analysis.
census_pd = pd.DataFrame(census_data)

In [5]:
# Replace the ACS variable codes with human-readable column names.
column_labels = {
    "B19013_001E": "Median Household Income",
    "B01003_001E": "Population",
    "B01002_001E": "Median Age",
    "B19301_001E": "Per Capita Income",
    "B17001_002E": "Poverty Count",
    "B23025_005E": "Unemployment Count",
    "zip code tabulation area": "ZCTA",
}
census_pd = census_pd.rename(columns=column_labels)

# Preview the first rows in a reader-friendly column order. This reordering is
# for display only: census_pd itself keeps its original column order.
display_order = [
    "ZCTA",
    "Population",
    "Median Household Income",
    "Per Capita Income",
    "Poverty Count",
    "Unemployment Count",
    "Median Age",
]
census_pd[display_order].head()

Unnamed: 0,ZCTA,Population,Median Household Income,Per Capita Income,Poverty Count,Unemployment Count,Median Age
0,601,17242.0,13092.0,6999.0,10772.0,2316.0,40.5
1,602,38442.0,16358.0,9277.0,19611.0,1927.0,42.3
2,603,48814.0,16603.0,11307.0,24337.0,3124.0,41.1
3,606,6437.0,12832.0,5943.0,4163.0,230.0,43.3
4,610,27073.0,19309.0,10220.0,11724.0,1290.0,42.1


In [6]:
# Summarise dtypes and non-null counts per column to spot missing values.
census_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 0 to 33119
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Median Household Income  33085 non-null  float64
 1   Population               33120 non-null  float64
 2   Median Age               33120 non-null  float64
 3   Per Capita Income        32776 non-null  float64
 4   Poverty Count            33085 non-null  float64
 5   Unemployment Count       33085 non-null  float64
 6   ZCTA                     33120 non-null  object 
dtypes: float64(6), object(1)
memory usage: 1.8+ MB


Looks like we have **missing data** in several columns: the non-null counts above fall short of 33,120 for four of them.

In [7]:
# Count distinct ZCTA values to check that each ZCTA appears only once.
census_pd["ZCTA"].nunique()

33120

In [8]:
# Persist the census table as CSV. UTF-8 is set explicitly to avoid encoding
# issues later; index=False keeps the row index out of the file.
census_csv_path = "data/census_data.csv"
census_pd.to_csv(census_csv_path, index=False, encoding="utf-8")

### Extract CSVs into DataFrame

https://data.world/datafiniti/fast-food-restaurants-across-america

In [9]:
# Load the Datafiniti fast-food restaurant listing and preview the first rows.
csv_file_one = "data/datafiniti-fast-food-restaurants-across-america/Datafiniti_Fast_Food_Restaurants.csv"
df_one = pd.read_csv(filepath_or_buffer=csv_file_one)
df_one.head(n=5)

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,city,country,keys,latitude,longitude,name,postalCode,province,sourceURLs,websites
0,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,American Restaurant and Fast Food Restaurant,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
1,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,Fast Food Restaurants,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
2,AVwcopQoByjofQCxgfVa,2016-03-29T05:06:36Z,2018-06-26T02:59:52Z,206 Wears Valley Rd,Fast Food Restaurant,Pigeon Forge,US,us/tn/pigeonforge/206wearsvalleyrd/-864103396,35.803788,-83.580553,Taco Bell,37863,TN,https://www.yellowpages.com/pigeon-forge-tn/mi...,"http://www.tacobell.com,https://locations.taco..."
3,AVweXN5RByjofQCxxilK,2017-01-03T07:46:11Z,2018-06-26T02:59:51Z,3652 Parkway,Fast Food,Pigeon Forge,US,us/tn/pigeonforge/3652parkway/93075755,35.782339,-83.551408,Arby's,37863,TN,http://www.yellowbook.com/profile/arbys_163389...,"http://www.arbys.com,https://locations.arbys.c..."
4,AWQ6MUvo3-Khe5l_j3SG,2018-06-26T02:59:43Z,2018-06-26T02:59:43Z,2118 Mt Zion Parkway,Fast Food Restaurant,Morrow,US,us/ga/morrow/2118mtzionparkway/1305117222,33.562738,-84.321143,Steak 'n Shake,30260,GA,https://foursquare.com/v/steak-n-shake/4bcf77a...,http://www.steaknshake.com/locations/23851-ste...


In [10]:
# Inspect dtypes and non-null counts of the raw restaurant data.
df_one.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           10000 non-null  object 
 1   dateAdded    10000 non-null  object 
 2   dateUpdated  10000 non-null  object 
 3   address      10000 non-null  object 
 4   categories   10000 non-null  object 
 5   city         10000 non-null  object 
 6   country      10000 non-null  object 
 7   keys         10000 non-null  object 
 8   latitude     10000 non-null  float64
 9   longitude    10000 non-null  float64
 10  name         10000 non-null  object 
 11  postalCode   10000 non-null  object 
 12  province     10000 non-null  object 
 13  sourceURLs   10000 non-null  object 
 14  websites     10000 non-null  object 
dtypes: float64(2), object(13)
memory usage: 1.1+ MB


https://www.reddit.com/r/explainlikeimfive/comments/6284le/eli5_what_are_those_numbers_after_the_dash_in_zip/

In [11]:
# Keep only the five-digit ZIP by dropping any "+4" suffix after the dash
# (e.g. "12345-6789" -> "12345").
# Casting to str first keeps this cell re-runnable after postalCode has been
# converted to int in the next cell; the vectorised .str accessor replaces the
# per-row lambda.
df_one["postalCode"] = df_one["postalCode"].astype(str).str.split("-").str[0]

In [12]:
# Store the five-digit ZIP as an integer so it can be matched against the
# integer ZIP_CODE column of the ZIP -> ZCTA crosswalk.
df_one = df_one.astype({"postalCode": int})

In [13]:
# Confirm that postalCode is now an integer column.
df_one.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           10000 non-null  object 
 1   dateAdded    10000 non-null  object 
 2   dateUpdated  10000 non-null  object 
 3   address      10000 non-null  object 
 4   categories   10000 non-null  object 
 5   city         10000 non-null  object 
 6   country      10000 non-null  object 
 7   keys         10000 non-null  object 
 8   latitude     10000 non-null  float64
 9   longitude    10000 non-null  float64
 10  name         10000 non-null  object 
 11  postalCode   10000 non-null  int32  
 12  province     10000 non-null  object 
 13  sourceURLs   10000 non-null  object 
 14  websites     10000 non-null  object 
dtypes: float64(2), int32(1), object(12)
memory usage: 1.1+ MB


**Looks like we can link the two datasets using the ZIP code**

### Zip code to ZCTA

Why ZIP codes need to be mapped to ZCTAs: https://en.wikipedia.org/wiki/ZIP_Code_Tabulation_Area
<br> https://acsdatacommunity.prb.org/acs-data-products--resources/american-factfinder/f/3/t/427
<br> https://atcoordinates.info/2020/05/11/the-trouble-with-zip-codes-solutions-for-data-analysis-and-mapping/
<br>Source: https://www.udsmapper.org/zcta-crosswalk.cfm

In [14]:
# Load the 2019 ZIP code -> ZCTA crosswalk table.
crosswalk_path = "data/zip_to_zcta_2019.xlsx"
zip_to_zcta_df = pd.read_excel(crosswalk_path)

In [15]:
# Preview the first rows of the crosswalk.
zip_to_zcta_df.head()

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
0,501,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
1,544,Holtsville,NY,Post Office or large volume customer,11742,Spatial join to ZCTA
2,601,Adjuntas,PR,Zip Code Area,601,Zip Matches ZCTA
3,602,Aguada,PR,Zip Code Area,602,Zip Matches ZCTA
4,603,Aguadilla,PR,Zip Code Area,603,Zip Matches ZCTA


In [None]:
# Zero-pad ZIP codes to five characters. ZIP_CODE is read as an integer, so
# codes that start with 0 (e.g. 00501 Holtsville, NY) show up as 501.
# A vectorised string operation is used instead of a row-by-row loop. The
# result is kept in a separate Series so that ZIP_CODE itself stays numeric
# for the merge with the integer postalCode column.
zip_code_padded = zip_to_zcta_df["ZIP_CODE"].astype(str).str.zfill(5)
zip_code_padded.head()

In [16]:
# Inspect dtypes and non-null counts of the crosswalk.
zip_to_zcta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41107 entries, 0 to 41106
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ZIP_CODE       41107 non-null  int64 
 1   PO_NAME        41107 non-null  object
 2   STATE          41106 non-null  object
 3   ZIP_TYPE       41107 non-null  object
 4   ZCTA           41107 non-null  object
 5   zip_join_type  41107 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.9+ MB


In [17]:
# Count distinct ZIP codes; compare with the row count to detect duplicates.
zip_to_zcta_df["ZIP_CODE"].nunique()

41106

In [18]:
# Count distinct ZCTA values in the crosswalk.
zip_to_zcta_df["ZCTA"].nunique()

33167

In [19]:
# Attach a ZCTA to each restaurant by matching its five-digit ZIP code
# against the crosswalk table.
df_one_zcta = pd.merge(
    df_one,
    zip_to_zcta_df,
    how="inner",
    left_on="postalCode",
    right_on="ZIP_CODE",
)

# An inner join silently drops restaurants whose ZIP is missing from the
# crosswalk and duplicates those whose ZIP occurs more than once in it
# (ZIP_CODE is not unique there), so fail loudly if the row count changes.
assert len(df_one_zcta) == len(df_one), (
    f"ZIP -> ZCTA merge changed the row count: {len(df_one)} -> {len(df_one_zcta)}"
)
df_one_zcta.head()

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,city,country,keys,latitude,longitude,...,postalCode,province,sourceURLs,websites,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
0,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,American Restaurant and Fast Food Restaurant,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,...,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...,70301,Thibodaux,LA,Zip Code Area,70301,Zip Matches ZCTA
1,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,Fast Food Restaurants,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,...,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...,70301,Thibodaux,LA,Zip Code Area,70301,Zip Matches ZCTA
2,AVwdXkZ4_7pvs4fz5KBg,2015-11-19T22:28:14Z,2018-06-22T18:31:19Z,204 N Canal Blvd,Fast Food,Thibodaux,US,us/la/thibodaux/204ncanalblvd/718051523,29.800178,-90.817667,...,70301,LA,https://www.allmenus.com/la/thibodaux/131709-q...,http://www.quiznos.com/,70301,Thibodaux,LA,Zip Code Area,70301,Zip Matches ZCTA
3,AVweZiIVkufWRAb5Cj9J,2015-11-06T10:29:56Z,2018-04-14T12:20:08Z,1020 S Acadia Rd,Fast Food Restaurant,Thibodaux,US,us/la/thibodaux/1020sacadiard/-1161002137,29.78119,-90.80801,...,70301,LA,http://www.yellowpages.com/thibodaux-la/mip/mc...,"http://mcdonalds.com,http://www.mcdonalds.com/...",70301,Thibodaux,LA,Zip Code Area,70301,Zip Matches ZCTA
4,AVweZiIVkufWRAb5Cj9J,2015-11-06T10:29:56Z,2018-04-14T12:20:08Z,1020 S Acadia Rd,Fast Food Restaurants,Thibodaux,US,us/la/thibodaux/1020sacadiard/-1161002137,29.78119,-90.80801,...,70301,LA,http://www.yellowpages.com/thibodaux-la/mip/mc...,"http://mcdonalds.com,http://www.mcdonalds.com/...",70301,Thibodaux,LA,Zip Code Area,70301,Zip Matches ZCTA


In [20]:
# List the merged frame's columns before selecting the ones to keep.
df_one_zcta.columns

Index(['id', 'dateAdded', 'dateUpdated', 'address', 'categories', 'city',
       'country', 'keys', 'latitude', 'longitude', 'name', 'postalCode',
       'province', 'sourceURLs', 'websites', 'ZIP_CODE', 'PO_NAME', 'STATE',
       'ZIP_TYPE', 'ZCTA', 'zip_join_type'],
      dtype='object')

In [21]:
# Keep only the identifying, location and ZIP/ZCTA columns. The explicit copy
# makes the result independent of df_one_zcta, so columns can be added or
# recast later without affecting the merged frame.
restaurant_columns = [
    "id",
    "name",
    "address",
    "latitude",
    "longitude",
    "postalCode",
    "ZIP_CODE",
    "ZCTA",
]
final_restaurant_df = df_one_zcta.loc[:, restaurant_columns].copy()

In [22]:
# Inspect dtypes of the selected columns; ZCTA is still an object column here.
final_restaurant_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          10000 non-null  object 
 1   name        10000 non-null  object 
 2   address     10000 non-null  object 
 3   latitude    10000 non-null  float64
 4   longitude   10000 non-null  float64
 5   postalCode  10000 non-null  int32  
 6   ZIP_CODE    10000 non-null  int64  
 7   ZCTA        10000 non-null  object 
dtypes: float64(2), int32(1), int64(1), object(4)
memory usage: 664.1+ KB


In [23]:
# Cast ZCTA to an integer so it can be compared numerically with postalCode.
final_restaurant_df = final_restaurant_df.astype({"ZCTA": int})

In [24]:
# Confirm that ZCTA is now an integer column.
final_restaurant_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          10000 non-null  object 
 1   name        10000 non-null  object 
 2   address     10000 non-null  object 
 3   latitude    10000 non-null  float64
 4   longitude   10000 non-null  float64
 5   postalCode  10000 non-null  int32  
 6   ZIP_CODE    10000 non-null  int64  
 7   ZCTA        10000 non-null  int32  
dtypes: float64(2), int32(2), int64(1), object(3)
memory usage: 625.0+ KB


In [25]:
# Difference between the assigned ZCTA and the restaurant's own ZIP code;
# a non-zero value means the ZIP was mapped to a different ZCTA.
zcta_values = final_restaurant_df["ZCTA"]
zip_values = final_restaurant_df["postalCode"]
final_restaurant_df["diff"] = zcta_values.sub(zip_values)

In [26]:
# Show the restaurants whose ZCTA differs from their ZIP code.
zcta_differs = final_restaurant_df["diff"].ne(0)
final_restaurant_df.loc[zcta_differs]

Unnamed: 0,id,name,address,latitude,longitude,postalCode,ZIP_CODE,ZCTA,diff
3211,AVwcg46g_7pvs4fzv5ve,B.GOOD,255 Washington St,42.357762,-71.058363,2201,2201,2203,2
3560,AVwdYPaQkufWRAb55R-g,Charley's Grilled Subs,1000 Airport Blvd,40.496025,-80.256707,15231,15231,15108,-123
4980,AVwdaFx6kufWRAb55lPk,Arby's,4303 Prospect Ave,39.901745,-88.95457,62524,62524,62526,2
5829,AVwcgRCHByjofQCxe9SZ,Chick-fil-A,800 S Gay St,35.962623,-83.916672,37929,37929,37902,-27
7973,AWOnmxYc3-Khe5l_ivie,Quiznos,1 Medical Dr,35.905896,-79.051529,27599,27599,27514,-85
8852,AVwdNgii_7pvs4fz3cVO,Church's Chicken,6425 Burnet Rd,30.339992,-97.738675,73301,73301,78704,5403
8853,AVwdNgii_7pvs4fz3cVO,Church's Chicken,6425 Burnet Rd,30.339992,-97.738675,73301,73301,78704,5403
8872,AVwebb94ByjofQCxyG9e,McDonald's,250 Summit Park Dr,40.45116,-80.185857,15275,15275,15108,-167
8873,AVzA9lTq3D1zeR_xBCPk,McDonald's,100 Davis Blvd,40.449249,-80.173662,15275,15275,15108,-167
8914,AVwcvJAHkufWRAb5yd3H,Arby's,787 Erie Blvd W,43.047896,-76.170271,13201,13201,13202,1


### Transform No 1 DataFrame

### Transform No 2 DataFrame

### Create database connection

### Load DataFrames into database