In [14]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

### Resources
 1. [Restaurants in Wake County Data Info](https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872)
 2. [Wake County Open Data](https://data-wake.opendata.arcgis.com/search?tags=restaurants)
 3. [Food Inspection Violations Data Info](https://data.wakegov.com/datasets/Wake::food-inspection-violations/about)
 4. [Wake County Yelp Initiative](https://ash.harvard.edu/news/wake-county-yelp-initiative)

In [15]:
# GeoJSON export of the "Restaurants in Wake County" open-data layer.
RESTAURANTS_GEOJSON_URL = 'https://opendata.arcgis.com/datasets/124c2187da8c41c59bde04fa67eb2872_0.geojson'


def getRestaurantsDf(url=RESTAURANTS_GEOJSON_URL, timeout=60):
    """Download the Wake County facilities GeoJSON and return it as a DataFrame.

    Each GeoJSON feature's ``properties`` dict becomes one row. The result is
    unfiltered: it holds every feature in the response, so callers must filter
    on the columns they care about.

    Parameters
    ----------
    url : str, optional
        GeoJSON endpoint to download. Defaults to the Wake County open-data
        export. Per the original author's note, the alternative ArcGIS REST
        ``MapServer/0/query`` endpoint caps responses at 1000 records and
        would need pagination.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per feature, one column per property key.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url=url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page
    # as JSON and hitting an unrelated exception further down.
    response.raise_for_status()

    features = response.json()['features']
    df = pd.DataFrame([feature['properties'] for feature in features])
    print('restaurants df shape:', df.shape)
    return df

# Fetch the raw table once and keep it under a stage-suffixed name;
# the preprocessing cell further down works on a copy of this frame.
restaurants_df_raw = getRestaurantsDf()
# Preview the first rows to sanity-check the columns that came back.
display(restaurants_df_raw.head())

restaurants df shape: (3637, 15)


Unnamed: 0,OBJECTID,HSISID,NAME,ADDRESS1,ADDRESS2,CITY,STATE,POSTALCODE,PHONENUMBER,RESTAURANTOPENDATE,FACILITYTYPE,PERMITID,X,Y,GEOCODESTATUS
0,1857933,4092017187,THE 19TH HOLE,1527 TRYON RD,,RALEIGH,NC,27603,(919) 772-9987,2016-04-01T04:00:00Z,Restaurant,2,-78.667555,35.735219,M
1,1857934,4092017181,COSTCO WHOLESALE #1206 FOOD COURT,1021 PINE PLAZA DR,,APEX,NC,27523,(919) 331-6035,2016-03-14T04:00:00Z,Restaurant,6,-78.827267,35.746416,M
2,1857935,4092017178,RALEIGH RAW,7 W HARGETT ST,,RALEIGH,NC,27601,(919) 400-0944,2016-03-14T04:00:00Z,Restaurant,7,-78.63958,35.778101,M
3,1857936,4092018368,D's Bottle Shop,13200 Falls of Neuse RD,STE 115,RALEIGH,NC,27614,(919) 435-7034,2020-05-20T04:00:00Z,Restaurant,8,-78.55982,35.940756,M
4,1858145,4092017322,BADA BEAN COFFEE & THINGS,938 GATEWAY COMMONS CIR,,WAKE FOREST,NC,27587,(984) 235-4608,2016-09-16T04:00:00Z,Restaurant,9,-78.495178,35.967329,M


In [16]:
def preprocess_restaurants(df):
    """Return a cleaned, restaurants-only copy of the raw facilities table.

    Steps, in order:

    1. Drop fully duplicated rows.
    2. Drop every column whose fraction of missing values is more than 25%.
    3. Keep only rows whose ``FACILITYTYPE`` equals ``'Restaurant'``, then
       drop ``FACILITYTYPE`` since it is constant after the filter.
    4. Drop ``PHONENUMBER``, which is not used in the analysis.

    The input frame is never modified; a new DataFrame is returned. The
    original row index is preserved (it is not reset).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw facilities table. Must contain a ``FACILITYTYPE`` column.

    Returns
    -------
    pandas.DataFrame
        The filtered table without ``FACILITYTYPE``, ``PHONENUMBER`` and
        any mostly-missing columns.

    Raises
    ------
    KeyError
        If ``FACILITYTYPE`` is absent when the restaurant filter runs.
    """
    # Non-mutating de-duplication: the previous ``inplace=True`` modified the
    # caller's DataFrame as a side effect.
    df = df.drop_duplicates()

    # Strictly "more than 25%", matching the message printed below
    # (the previous ``>=`` also dropped columns at exactly 25%).
    missing_value_columns = df.columns[df.isna().mean() > 0.25]
    print('Dropping columns with more than 25% missing values:', missing_value_columns)
    df = df.drop(missing_value_columns, axis=1)

    # Keep only restaurants; the column is single-valued afterwards.
    df = df[df['FACILITYTYPE'] == 'Restaurant']
    df = df.drop('FACILITYTYPE', axis=1)

    # PHONENUMBER may already have been removed by the missing-value step,
    # so tolerate its absence instead of raising KeyError.
    df = df.drop('PHONENUMBER', axis=1, errors='ignore')

    # Report the remaining per-column missing fraction as a sanity check.
    print(df.isna().mean())
    return df

In [17]:
# Validate shape & data against the portal's table view of the same layer:
# https://data-wake.opendata.arcgis.com/datasets/Wake::restaurants-in-wake-county/explore?filters=eyJGQUNJTElUWVRZUEUiOlsiUmVzdGF1cmFudCJdfQ%3D%3D&location=35.794181%2C-78.605600%2C3.98&showTable=true

print('\nDisplay Raw Data Info------------------------------\n')
# DataFrame.info() prints its report and returns None, so call it directly;
# wrapping it in display() only added a stray "None" to the output.
restaurants_df_raw.info()
print('\n---------------------------------------------------\n')
# Number of distinct values per column (a missing value counts as one value).
display({column: len(restaurants_df_raw[column].unique()) for column in restaurants_df_raw.columns})

# Preprocess (on a copy, so the raw frame stays available for re-runs)
print('\nPreprocessing--------------------------------------\n')
restaurants_df = preprocess_restaurants(restaurants_df_raw.copy())
print(restaurants_df.shape)

# validation (testing): every de-duplicated 'Restaurant' row survives
# preprocessing, and the now-constant FACILITYTYPE column has been removed.
expected_restaurant_rows = (restaurants_df_raw.drop_duplicates()['FACILITYTYPE'] == 'Restaurant').sum()
assert len(restaurants_df) == expected_restaurant_rows, 'unexpected row count after preprocessing'
assert 'FACILITYTYPE' not in restaurants_df.columns

print('\nDisplay--------------------------------------------\n')
display(restaurants_df.head())


Display Raw Data Info------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3637 entries, 0 to 3636
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   OBJECTID            3637 non-null   int64  
 1   HSISID              3637 non-null   object 
 2   NAME                3637 non-null   object 
 3   ADDRESS1            3637 non-null   object 
 4   ADDRESS2            480 non-null    object 
 5   CITY                3637 non-null   object 
 6   STATE               3637 non-null   object 
 7   POSTALCODE          3637 non-null   object 
 8   PHONENUMBER         3483 non-null   object 
 9   RESTAURANTOPENDATE  3637 non-null   object 
 10  FACILITYTYPE        3637 non-null   object 
 11  PERMITID            3637 non-null   int64  
 12  X                   3637 non-null   float64
 13  Y                   3637 non-null   float64
 14  GEOCODESTATUS       3637 non-null   object 
dtypes

None


---------------------------------------------------



{'OBJECTID': 3637,
 'HSISID': 3637,
 'NAME': 3503,
 'ADDRESS1': 3163,
 'ADDRESS2': 297,
 'CITY': 45,
 'STATE': 1,
 'POSTALCODE': 565,
 'PHONENUMBER': 3122,
 'RESTAURANTOPENDATE': 2250,
 'FACILITYTYPE': 10,
 'PERMITID': 3637,
 'X': 2150,
 'Y': 2150,
 'GEOCODESTATUS': 3}


Preprocessing--------------------------------------

Dropping columns with more than 25% missing values: Index(['ADDRESS2'], dtype='object')
OBJECTID              0.0
HSISID                0.0
NAME                  0.0
ADDRESS1              0.0
CITY                  0.0
STATE                 0.0
POSTALCODE            0.0
RESTAURANTOPENDATE    0.0
PERMITID              0.0
X                     0.0
Y                     0.0
GEOCODESTATUS         0.0
dtype: float64
(2386, 12)

Display--------------------------------------------



Unnamed: 0,OBJECTID,HSISID,NAME,ADDRESS1,CITY,STATE,POSTALCODE,RESTAURANTOPENDATE,PERMITID,X,Y,GEOCODESTATUS
0,1857933,4092017187,THE 19TH HOLE,1527 TRYON RD,RALEIGH,NC,27603,2016-04-01T04:00:00Z,2,-78.667555,35.735219,M
1,1857934,4092017181,COSTCO WHOLESALE #1206 FOOD COURT,1021 PINE PLAZA DR,APEX,NC,27523,2016-03-14T04:00:00Z,6,-78.827267,35.746416,M
2,1857935,4092017178,RALEIGH RAW,7 W HARGETT ST,RALEIGH,NC,27601,2016-03-14T04:00:00Z,7,-78.63958,35.778101,M
3,1857936,4092018368,D's Bottle Shop,13200 Falls of Neuse RD,RALEIGH,NC,27614,2020-05-20T04:00:00Z,8,-78.55982,35.940756,M
4,1858145,4092017322,BADA BEAN COFFEE & THINGS,938 GATEWAY COMMONS CIR,WAKE FOREST,NC,27587,2016-09-16T04:00:00Z,9,-78.495178,35.967329,M
