In [1]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

### Resources
 1. [Restaurants in Wake County Data Info](https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872)
 2. [Wake County Open Data](https://data-wake.opendata.arcgis.com/search?tags=restaurants)
 3. [Food Inspection Violations Data Info](https://data.wakegov.com/datasets/Wake::food-inspection-violations/about)
 4. [Wake County Yelp Initiative](https://ash.harvard.edu/news/wake-county-yelp-initiative)

In [2]:
def getRestaurantsDf(page_size=1000, timeout=30):
    """Download every 'Restaurant' facility record from the Wake County ArcGIS service.

    The service is queried page by page until it reports no further records,
    so the result does not depend on a hard-coded number of requests.

    Parameters
    ----------
    page_size : int, default 1000
        Number of records requested per page (``resultRecordCount``). The
        original code notes the service's maxRecordCount is 1000.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned feature, built from each feature's ``attributes``.

    Raises
    ------
    ValueError
        If ``page_size`` is not positive.
    requests.HTTPError
        If the service answers with an HTTP error status.
    RuntimeError
        If the response body carries no ``features`` list (for example an
        ArcGIS error payload delivered with HTTP 200).
    """
    if page_size <= 0:
        raise ValueError('page_size must be a positive integer')

    # Source restaurants data in wake county from wakegov
    base_url = "https://maps.wakegov.com/arcgis/rest/services/Inspections/RestaurantInspectionsOpenData/MapServer/0/query?f=json&where=(FACILITYTYPE%20IN%20('Restaurant'))&outFields=*"

    rows = []
    offset = 0
    while True:
        # An explicit sort keeps page boundaries stable between requests;
        # NOTE(review): unordered paging is the presumed cause of repeated
        # OBJECTIDs across pages - confirm against the service.
        page_url = (
            f"{base_url}&orderByFields=OBJECTID"
            f"&resultOffset={offset}&resultRecordCount={page_size}"
        )
        response = requests.get(url=page_url, timeout=timeout)
        response.raise_for_status()
        payload = response.json()

        if 'features' not in payload:
            raise RuntimeError(
                f"Unexpected response from restaurant service: {payload.get('error', payload)}"
            )

        features = payload['features']
        if not features:
            break

        rows.extend(feature['attributes'] for feature in features)
        offset += len(features)

        # Keep going while the server says more records exist, or while the
        # page came back full (a following empty page then ends the loop).
        has_more = payload.get('exceededTransferLimit', False) or len(features) >= page_size
        if not has_more:
            break

    df = pd.DataFrame(rows)
    print('restaurants df shape:', df.shape)
    return df

# Download the restaurant records; the preprocessing cell further down works on a copy of this frame.
restaurants_df_raw = getRestaurantsDf()

restaurants df shape: (2386, 15)


In [3]:
def preprocess_restaurants(df, missing_threshold=0.25):
    """Return a copy of ``df`` without sparse, constant or irrelevant columns.

    Steps:
      1. Drop every column whose fraction of missing values is strictly
         greater than ``missing_threshold``.
      2. Drop ``FACILITYTYPE`` (single-valued) and ``PHONENUMBER`` (irrelevant).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw restaurants frame. It is not modified; ``DataFrame.drop`` returns
        new frames.
    missing_threshold : float, default 0.25
        Maximum tolerated fraction of missing values per column.

    Returns
    -------
    pandas.DataFrame
        The reduced frame.
    """
    # Drop columns w/ > 25% missing values (strictly more, matching the message)
    missing_fraction = df.isna().mean()
    missing_value_columns = df.columns[missing_fraction > missing_threshold]
    print(f'Dropping columns with more than {missing_threshold:.0%} missing values:', missing_value_columns)
    df = df.drop(columns=missing_value_columns)

    # Drop the single value column and the irrelevant column.
    # errors='ignore' avoids a KeyError when one of them was already removed
    # by the missing-value filter above or is absent from the input.
    df = df.drop(columns=['FACILITYTYPE', 'PHONENUMBER'], errors='ignore')

    print(df.isna().mean())
    return df

In [4]:
# Validate shape & data
# https://data-wake.opendata.arcgis.com/datasets/Wake::restaurants-in-wake-county/explore?filters=eyJGQUNJTElUWVRZUEUiOlsiUmVzdGF1cmFudCJdfQ%3D%3D&location=35.794181%2C-78.605600%2C3.98&showTable=true

print('\nDisplay Raw Data Info------------------------------\n')
# DataFrame.info() prints its report and returns None, so call it directly;
# wrapping it in display() only adds a stray "None" to the output.
restaurants_df_raw.info()
print('\n---------------------------------------------------\n')
# Number of distinct values per column.
display({column: len(restaurants_df_raw[column].unique()) for column in restaurants_df_raw.columns})

# validation (testing): the query filters on FACILITYTYPE, so every row must be a restaurant.
# Checked on the raw frame because preprocessing drops this column.
assert (restaurants_df_raw['FACILITYTYPE'] == 'Restaurant').all(), 'non-restaurant facility found in raw data'

# Preprocess (on a copy, so the raw frame stays available for re-runs)
print('\nPreprocessing--------------------------------------\n')
restaurants_df = preprocess_restaurants(restaurants_df_raw.copy())
print('\nDisplay--------------------------------------------\n')
display(restaurants_df.head())


Display Raw Data Info------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2386 entries, 0 to 2385
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   OBJECTID            2386 non-null   int64  
 1   HSISID              2386 non-null   object 
 2   NAME                2386 non-null   object 
 3   ADDRESS1            2386 non-null   object 
 4   ADDRESS2            341 non-null    object 
 5   CITY                2386 non-null   object 
 6   STATE               2386 non-null   object 
 7   POSTALCODE          2386 non-null   object 
 8   PHONENUMBER         2300 non-null   object 
 9   RESTAURANTOPENDATE  2386 non-null   int64  
 10  FACILITYTYPE        2386 non-null   object 
 11  PERMITID            2386 non-null   int64  
 12  X                   2386 non-null   float64
 13  Y                   2386 non-null   float64
 14  GEOCODESTATUS       2386 non-null   object 
dtypes

None


---------------------------------------------------



{'OBJECTID': 1833,
 'HSISID': 1833,
 'NAME': 1751,
 'ADDRESS1': 1774,
 'ADDRESS2': 177,
 'CITY': 31,
 'STATE': 1,
 'POSTALCODE': 276,
 'PHONENUMBER': 1748,
 'RESTAURANTOPENDATE': 1401,
 'FACILITYTYPE': 1,
 'PERMITID': 1833,
 'X': 1319,
 'Y': 1319,
 'GEOCODESTATUS': 3}


Preprocessing--------------------------------------

Dropping columns with more than 25% missing values: Index(['ADDRESS2'], dtype='object')
OBJECTID              0.0
HSISID                0.0
NAME                  0.0
ADDRESS1              0.0
CITY                  0.0
STATE                 0.0
POSTALCODE            0.0
RESTAURANTOPENDATE    0.0
PERMITID              0.0
X                     0.0
Y                     0.0
GEOCODESTATUS         0.0
dtype: float64

Display--------------------------------------------



Unnamed: 0,OBJECTID,HSISID,NAME,ADDRESS1,CITY,STATE,POSTALCODE,RESTAURANTOPENDATE,PERMITID,X,Y,GEOCODESTATUS
0,1856045,4092011394,McDonald's # 13334,6213 FALLS OF NEUSE RD,RALEIGH,NC,27609-3505,754531200000,11566,-78.621232,35.86745,M
1,1856046,4092011309,Goodberry's Creamery,1407 Garner Station Blv,RALEIGH,NC,27603,740707200000,11564,-78.651632,35.724843,M
2,1856047,4092013106,Sawmill Tap Room,7701 LEAD MINE RD,RALEIGH,NC,27615-4829,947808000000,11575,-78.660325,35.885009,M
3,1856048,4092013101,CAPITAL CITY CHOP HOUSE,151 AIRGATE DR,MORRISVILLE,NC,27560-8495,949449600000,11578,-78.814789,35.863449,M
4,1856049,4092013060,PNC 329,1400 EDWARDS MILL RD,RALEIGH,NC,27607-3624,941155200000,11580,-78.722936,35.803242,M
