In [1]:
# Dependencies
import pandas as pd
from sqlalchemy import create_engine
from census import Census

# Import Census API Key and postgresql database password
from config import (api_key, password)

### Use the Census API to get socioeconomic data

* See: https://github.com/CommerceDataService/census-wrapper for library documentation

In [2]:
# Create a Census API client for the 2018 data
# (the most recent year available when this notebook was written)
c = Census(api_key, year=2018)

* See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
* Current metrics of interest:
    * "B19013_001E": Median household income in the past 12 months
    * "B01003_001E": Total population
    * "B01002_001E": Median age
    * "B19301_001E": Per capita income in the past 12 months
    * "B17001_002E": Number of persons whose income in the past 12 months is below the poverty level
    * "B23025_005E": Number of unemployed persons, age 16 or older, in the civilian labor force

**Are we interested in any other socioeconomic metrics, such as education or language? For example:
    <br>
    "B15003_002E": Number of persons age 25 and over who completed no schooling
    <br>
    "B16001_002E": Number of persons who speak only English**

**Should we have a separate table just for the socioeconomic metric labels (the JSON file is available at the link above)? --> data normalisation**

In [4]:
# Query the ACS 5-year estimates (2018 ACS5) for every ZIP Code Tabulation Area.
acs_variables = (
    "B19013_001E",  # median household income
    "B01003_001E",  # total population
    "B01002_001E",  # median age
    "B19301_001E",  # per capita income
    "B17001_002E",  # persons below the poverty level
    "B23025_005E",  # unemployed persons in the civilian labor force
)
# The "*" wildcard asks for all zip code tabulation areas in one request.
zcta_filter = {'for': 'zip code tabulation area:*'}
census_data = c.acs5.get(acs_variables, zcta_filter)

* See: https://www2.census.gov/data/api-documentation/how-to-download-all-zip-code-tabulation-areas-from-the-census-api.pdf for how to download all ZIP Code Tabulation Areas from the Census API

In [5]:
# Convert the API response into a DataFrame
census_pd = pd.DataFrame(census_data)

In [6]:
# Rename the Census variable codes to readable column labels.
# Only the names change here; the column order is left untouched.
readable_labels = {
    "B19013_001E": "Household Income",
    "B01003_001E": "Population",
    "B01002_001E": "Median Age",
    "B19301_001E": "Per Capita Income",
    "B17001_002E": "Poverty Count",
    "B23025_005E": "Unemployment Count",
    "zip code tabulation area": "zip_code",
}
census_pd = census_pd.rename(columns=readable_labels)

census_pd.head()

Unnamed: 0,Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,zip_code
0,13092.0,17242.0,40.5,6999.0,10772.0,2316.0,601
1,16358.0,38442.0,42.3,9277.0,19611.0,1927.0,602
2,16603.0,48814.0,41.1,11307.0,24337.0,3124.0,603
3,12832.0,6437.0,43.3,5943.0,4163.0,230.0,606
4,19309.0,27073.0,42.1,10220.0,11724.0,1290.0,610


In [7]:
# Check column dtypes and non-null counts to spot missing values
census_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33120 entries, 0 to 33119
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Household Income    33085 non-null  float64
 1   Population          33120 non-null  float64
 2   Median Age          33120 non-null  float64
 3   Per Capita Income   32776 non-null  float64
 4   Poverty Count       33085 non-null  float64
 5   Unemployment Count  33085 non-null  float64
 6   zip_code            33120 non-null  object 
dtypes: float64(6), object(1)
memory usage: 1.8+ MB


It looks like some columns have **missing data** (e.g. Per Capita Income has only 32,776 non-null values out of 33,120 rows)

In [None]:
### Something to consider (proposal only -- nothing in this cell is executed)
## NOTE: the count columns are float64 and contain missing values, so the
## proposal below divides them directly; casting with .astype(int) first
## would raise on the NaN entries.

## Add in Poverty Rate (Poverty Count / Population)
# census_pd["Poverty Rate"] = 100 * \
#     census_pd["Poverty Count"] / census_pd["Population"]

## Add in Unemployment Rate (Unemployment Count / Population)
## NOTE: this is the unemployed share of the total population, not the
## conventional unemployment rate (which divides by the civilian labor force).
# census_pd["Unemployment Rate"] = 100 * \
#     census_pd["Unemployment Count"] / census_pd["Population"]

## Final DataFrame
## NOTE: "Name" was listed here originally, but census_pd has no such column
## (it is not among the variables requested from the API), so it is left out.
# census_pd = census_pd[["zip_code", "Population", "Median Age", "Household Income",
#                        "Per Capita Income", "Poverty Count", "Poverty Rate", "Unemployment Rate"]]

## Visualize
# print(len(census_pd))

In [12]:
# Save the census DataFrame as a CSV
# encoding="utf-8" is set explicitly to avoid encoding issues later;
# index=False keeps the row index out of the file
census_pd.to_csv("data/census_data.csv", encoding="utf-8", index=False)

### Extract CSVs into DataFrame

In [8]:
# Load the first Datafiniti fast-food restaurant CSV and preview the first rows
csv_file_one = "data/datafiniti-fast-food-restaurants-across-america/Datafiniti_Fast_Food_Restaurants.csv"
df_one = pd.read_csv(csv_file_one)
df_one.head()

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,city,country,keys,latitude,longitude,name,postalCode,province,sourceURLs,websites
0,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,American Restaurant and Fast Food Restaurant,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
1,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,Fast Food Restaurants,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
2,AVwcopQoByjofQCxgfVa,2016-03-29T05:06:36Z,2018-06-26T02:59:52Z,206 Wears Valley Rd,Fast Food Restaurant,Pigeon Forge,US,us/tn/pigeonforge/206wearsvalleyrd/-864103396,35.803788,-83.580553,Taco Bell,37863,TN,https://www.yellowpages.com/pigeon-forge-tn/mi...,"http://www.tacobell.com,https://locations.taco..."
3,AVweXN5RByjofQCxxilK,2017-01-03T07:46:11Z,2018-06-26T02:59:51Z,3652 Parkway,Fast Food,Pigeon Forge,US,us/tn/pigeonforge/3652parkway/93075755,35.782339,-83.551408,Arby's,37863,TN,http://www.yellowbook.com/profile/arbys_163389...,"http://www.arbys.com,https://locations.arbys.c..."
4,AWQ6MUvo3-Khe5l_j3SG,2018-06-26T02:59:43Z,2018-06-26T02:59:43Z,2118 Mt Zion Parkway,Fast Food Restaurant,Morrow,US,us/ga/morrow/2118mtzionparkway/1305117222,33.562738,-84.321143,Steak 'n Shake,30260,GA,https://foursquare.com/v/steak-n-shake/4bcf77a...,http://www.steaknshake.com/locations/23851-ste...


In [9]:
# Check column dtypes and non-null counts of the first restaurant file
df_one.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           10000 non-null  object 
 1   dateAdded    10000 non-null  object 
 2   dateUpdated  10000 non-null  object 
 3   address      10000 non-null  object 
 4   categories   10000 non-null  object 
 5   city         10000 non-null  object 
 6   country      10000 non-null  object 
 7   keys         10000 non-null  object 
 8   latitude     10000 non-null  float64
 9   longitude    10000 non-null  float64
 10  name         10000 non-null  object 
 11  postalCode   10000 non-null  object 
 12  province     10000 non-null  object 
 13  sourceURLs   10000 non-null  object 
 14  websites     10000 non-null  object 
dtypes: float64(2), object(13)
memory usage: 1.1+ MB


In [10]:
# Sanity check: select the census row(s) for ZIP code "70301"
census_pd.loc[census_pd["zip_code"] == "70301"]

Unnamed: 0,Household Income,Population,Median Age,Per Capita Income,Poverty Count,Unemployment Count,zip_code
24170,49838.0,45298.0,34.8,28546.0,7134.0,1942.0,70301


In [11]:
# Sanity check: select the restaurant row(s) for postal code "70301"
df_one.loc[df_one["postalCode"] == "70301"]

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,city,country,keys,latitude,longitude,name,postalCode,province,sourceURLs,websites
0,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,American Restaurant and Fast Food Restaurant,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
1,AVwcmSyZIN2L1WUfmxyw,2015-10-19T23:47:58Z,2018-06-26T03:00:14Z,800 N Canal Blvd,Fast Food Restaurants,Thibodaux,US,us/la/thibodaux/800ncanalblvd/1780593795,29.814697,-90.814742,SONIC Drive In,70301,LA,https://foursquare.com/v/sonic-drive-in/4b7361...,https://locations.sonicdrivein.com/la/thibodau...
571,AVwdXkZ4_7pvs4fz5KBg,2015-11-19T22:28:14Z,2018-06-22T18:31:19Z,204 N Canal Blvd,Fast Food,Thibodaux,US,us/la/thibodaux/204ncanalblvd/718051523,29.800178,-90.817667,Quiznos Sub,70301,LA,https://www.allmenus.com/la/thibodaux/131709-q...,http://www.quiznos.com/
9632,AVweZiIVkufWRAb5Cj9J,2015-11-06T10:29:56Z,2018-04-14T12:20:08Z,1020 S Acadia Rd,Fast Food Restaurant,Thibodaux,US,us/la/thibodaux/1020sacadiard/-1161002137,29.78119,-90.80801,McDonald's,70301,LA,http://www.yellowpages.com/thibodaux-la/mip/mc...,"http://mcdonalds.com,http://www.mcdonalds.com/..."
9633,AVweZiIVkufWRAb5Cj9J,2015-11-06T10:29:56Z,2018-04-14T12:20:08Z,1020 S Acadia Rd,Fast Food Restaurants,Thibodaux,US,us/la/thibodaux/1020sacadiard/-1161002137,29.78119,-90.80801,McDonald's,70301,LA,http://www.yellowpages.com/thibodaux-la/mip/mc...,"http://mcdonalds.com,http://www.mcdonalds.com/..."
9637,AVwd4iikIN2L1WUf0DLF,2016-05-06T14:22:06Z,2018-04-14T12:19:54Z,612 N Canal Blvd,Fast Food Restaurants,Thibodaux,US,us/la/thibodaux/612ncanalblvd/601946506,29.81164,-90.81551,Wingstop Restaurant,70301,LA,https://www.yellowpages.com/thibodaux-la/mip/w...,http://www.wingstop.com


**It looks like we can link the two datasets on ZIP code (`zip_code` in the census data, `postalCode` in the restaurant data)**

In [None]:
# Load the second Datafiniti restaurant CSV (the "Jun19" file) and preview the first rows
# NOTE(review): the columns of this file are not shown anywhere yet -- confirm that
# it has a postal-code column stored as text before joining it to the census data.
csv_file_two = "data/datafiniti-fast-food-restaurants-across-america/Datafiniti_Fast_Food_Restaurants_Jun19.csv"
df_two = pd.read_csv(csv_file_two)
df_two.head()

In [None]:
# Check column dtypes and non-null counts of the second restaurant file
df_two.info()

In [None]:
# Load the third restaurant CSV and preview the first rows.
# postalCode is read as text so ZIP codes keep any leading zeros and stay
# comparable with string ZIP codes such as "70301"; without an explicit dtype
# an all-numeric column would be parsed as integers.
csv_file_three = "data/datafiniti-fast-food-restaurants-across-america/FastFoodRestaurants.csv"
df_three = pd.read_csv(csv_file_three, dtype={"postalCode": str})
df_three.head()

In [None]:
# Check column dtypes and non-null counts of the third restaurant file
df_three.info()

In [None]:
# Number of rows for each distinct "province" value
df_three["province"].value_counts()

In [None]:
# Number of distinct "province" values
df_three["province"].nunique()

In [None]:
# Number of distinct postal codes
df_three["postalCode"].nunique()

In [None]:
# Number of rows for each postal code
df_three["postalCode"].value_counts()

In [None]:
# Select every row whose restaurant name is exactly "Carl's Jr."
df_three.loc[df_three["name"] == "Carl's Jr."]

In [None]:
# Raw Census API endpoint: 2018 ACS 5-year, NAME plus the whole B19013 group,
# for all zip code tabulation areas.
# NOTE(review): `url` is not used by any other cell visible here.
url = "https://api.census.gov/data/2018/acs/acs5?get=NAME,group(B19013)&for=zip%20code%20tabulation%20area:*"

### Transform No 1 DataFrame

### Transform No 2 DataFrame

### Create database connection

### Load DataFrames into database