In [76]:
!pip install phonenumbers



In [124]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

import urllib.request
from pathlib import Path
import os

import phonenumbers

### Resources
 1. [Restaurants in Wake County Data Info](https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872)
 2. [Wake County Open Data](https://data-wake.opendata.arcgis.com/search?tags=restaurants)
 3. [Food Inspection Violations Data Info](https://data.wakegov.com/datasets/Wake::food-inspection-violations/about)
 4. [Wake County Yelp Initiative](https://ash.harvard.edu/news/wake-county-yelp-initiative)

### Fetch Restaurants from wakegov

In [125]:
def getRestaurantsDf(forceFetch=False):
        path = Path()
        key = 'restaurants.csv'
        filename = path/key
        
        # If the file does already exist in the directory, download it
        if os.path.exists(filename) and not forceFetch:
            print('Using pre-fetched restaurants data')
            df = pd.read_csv('restaurants.csv')
            print('restaurants df shape:', df.shape)
            return df
    
        # Source restaurants data in wake county from wakegov
        # base_url = "https://maps.wakegov.com/arcgis/rest/services/Inspections/RestaurantInspectionsOpenData/MapServer/0/query?f=json&where=(FACILITYTYPE%20IN%20('Restaurant'))&outFields=*"
        else:
            print('Fetching restaurants data...')
            val = 'https://opendata.arcgis.com/datasets/124c2187da8c41c59bde04fa67eb2872_0.geojson'

            # Sending get request and saving the response as response object
            # extracting data in json 
            r = requests.get(url = val)
            rows = []
            data = r.json()['features']
            for d in data:
                rows.append(d['properties'])
            df = pd.DataFrame(rows)
            print('restaurants df shape:', df.shape)
            df.to_csv('restaurants.csv', index=False)
            print('Done')
            return df

restaurants_df_raw = getRestaurantsDf(forceFetch=False)
display(restaurants_df_raw.head())

Using pre-fetched restaurants data
restaurants df shape: (3641, 15)


Unnamed: 0,OBJECTID,HSISID,NAME,ADDRESS1,ADDRESS2,CITY,STATE,POSTALCODE,PHONENUMBER,RESTAURANTOPENDATE,FACILITYTYPE,PERMITID,X,Y,GEOCODESTATUS
0,1891530,4092016487,PEACE CHINA,13220 Strickland RD,Ste 167,RALEIGH,NC,27613,(919) 676-9968,2013-08-14T04:00:00Z,Restaurant,2,-78.725938,35.908783,M
1,1891531,4092018622,Northside Bistro & Cocktails,832 SPRING FOREST RD,,RALEIGH,NC,27609,(919) 890-5225,2021-05-13T04:00:00Z,Restaurant,22,-78.622635,35.866275,M
2,1891532,4092016155,DAILY PLANET CAFE,11 W JONES ST,STE 1509,RALEIGH,NC,27601,(919) 707-8060,2012-04-12T04:00:00Z,Restaurant,26,-78.639431,35.782205,M
3,1891533,4092016161,HIBACHI 88,3416 POOLE RD,,RALEIGH,NC,27610,(919) 231-1688,2012-04-18T04:00:00Z,Restaurant,28,-78.579533,35.767246,M
4,1891534,4092017180,BOND BROTHERS BEER COMPANY,202 E CEDAR ST,,CARY,NC,27511,(919) 459-2670,2016-03-11T05:00:00Z,Restaurant,29,-78.778021,35.787986,M


### Preprocessing

In [155]:
# normalize to E164 format
def normalize_phone(phone):
    try:
        phone = phone.split("ext")[0].strip()
        z = phonenumbers.parse(phone, 'US')
        retVal = phonenumbers.format_number(z, phonenumbers.PhoneNumberFormat.E164)
        return retVal
    except:
        return phone

In [158]:
def preprocess_restaurants(df):
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    
    # Drop columns w/ > 25% missing values
    missing_value_columns = df.columns[df.isna().mean() >= 0.25]
    print('Dropping columns with more than 25% missing values:', missing_value_columns)
    df = df.drop(missing_value_columns, axis=1)
    
    # get only restaurants
    df = df[df['FACILITYTYPE'] == 'Restaurant']
    
    # Drop single value column
    df = df.drop('FACILITYTYPE', axis=1)

    # Drop irrelevant columns
    df = df.drop('STATE', axis=1) 
    
    # Convert rest open date
    df['RESTAURANTOPENDATE'] = pd.to_datetime(df['RESTAURANTOPENDATE']).dt.date
    
    #normalize phone numbers 
    df['PHONENUMBER'] = df['PHONENUMBER'].apply(lambda x: normalize_phone(x))
    df['PHONENUMBER'] = df['PHONENUMBER'].apply(lambda a: str(a).split('.')[0])
    df['PHONENUMBER'] = df['PHONENUMBER'].astype(str)
    print(df.isna().mean())
    return df

In [159]:
# Validate shape & data
# https://data-wake.opendata.arcgis.com/datasets/Wake::restaurants-in-wake-county/explore?filters=eyJGQUNJTElUWVRZUEUiOlsiUmVzdGF1cmFudCJdfQ%3D%3D&location=35.794181%2C-78.605600%2C3.98&showTable=true

print('\nDisplay Raw Data Info------------------------------\n')
display(restaurants_df_raw.info())
print('\n---------------------------------------------------\n')
display({column: len(restaurants_df_raw[column].unique()) for column in restaurants_df_raw.columns})

# validation (testing)
# assert restaurants_df.all(restaurants_df['FACILITYTYPE'] == 'Restaurant')

# Preprocess
print('\nPreprocessing--------------------------------------\n')
restaurants_df = preprocess_restaurants(restaurants_df_raw.copy())
print(restaurants_df.shape)


Display Raw Data Info------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3641 entries, 0 to 3640
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   OBJECTID            3641 non-null   int64  
 1   HSISID              3641 non-null   int64  
 2   NAME                3641 non-null   object 
 3   ADDRESS1            3641 non-null   object 
 4   ADDRESS2            485 non-null    object 
 5   CITY                3641 non-null   object 
 6   STATE               3641 non-null   object 
 7   POSTALCODE          3641 non-null   object 
 8   PHONENUMBER         3487 non-null   object 
 9   RESTAURANTOPENDATE  3641 non-null   object 
 10  FACILITYTYPE        3641 non-null   object 
 11  PERMITID            3641 non-null   int64  
 12  X                   3641 non-null   float64
 13  Y                   3641 non-null   float64
 14  GEOCODESTATUS       3641 non-null   object 
dtypes

None


---------------------------------------------------



{'OBJECTID': 3641,
 'HSISID': 3641,
 'NAME': 3507,
 'ADDRESS1': 3164,
 'ADDRESS2': 298,
 'CITY': 45,
 'STATE': 1,
 'POSTALCODE': 565,
 'PHONENUMBER': 3127,
 'RESTAURANTOPENDATE': 2250,
 'FACILITYTYPE': 10,
 'PERMITID': 3641,
 'X': 2154,
 'Y': 2154,
 'GEOCODESTATUS': 3}


Preprocessing--------------------------------------

Dropping columns with more than 25% missing values: Index(['ADDRESS2'], dtype='object')
OBJECTID              0.0
HSISID                0.0
NAME                  0.0
ADDRESS1              0.0
CITY                  0.0
POSTALCODE            0.0
PHONENUMBER           0.0
RESTAURANTOPENDATE    0.0
PERMITID              0.0
X                     0.0
Y                     0.0
GEOCODESTATUS         0.0
dtype: float64
(2385, 12)


In [160]:
restaurants_df['POSTALCODE'].nunique()

358

In [161]:
restaurants_df['PHONENUMBER'].nunique()

2260

In [162]:
restaurants_df.dtypes

OBJECTID                int64
HSISID                  int64
NAME                   object
ADDRESS1               object
CITY                   object
POSTALCODE             object
PHONENUMBER            object
RESTAURANTOPENDATE     object
PERMITID                int64
X                     float64
Y                     float64
GEOCODESTATUS          object
dtype: object

In [163]:
display(restaurants_df.head(20))

Unnamed: 0,OBJECTID,HSISID,NAME,ADDRESS1,CITY,POSTALCODE,PHONENUMBER,RESTAURANTOPENDATE,PERMITID,X,Y,GEOCODESTATUS
0,1891530,4092016487,PEACE CHINA,13220 Strickland RD,RALEIGH,27613,19196769968,2013-08-14,2,-78.725938,35.908783,M
1,1891531,4092018622,Northside Bistro & Cocktails,832 SPRING FOREST RD,RALEIGH,27609,19198905225,2021-05-13,22,-78.622635,35.866275,M
2,1891532,4092016155,DAILY PLANET CAFE,11 W JONES ST,RALEIGH,27601,19197078060,2012-04-12,26,-78.639431,35.782205,M
3,1891533,4092016161,HIBACHI 88,3416 POOLE RD,RALEIGH,27610,19192311688,2012-04-18,28,-78.579533,35.767246,M
4,1891534,4092017180,BOND BROTHERS BEER COMPANY,202 E CEDAR ST,CARY,27511,19194592670,2016-03-11,29,-78.778021,35.787986,M
5,1891535,4092018352,Prime Barbeque,403 KNIGHTDALE STATION RUN,KNIGHTDALE,27545,19193738076,2020-04-22,30,-78.473566,35.797225,M
6,1891536,4092017187,THE 19TH HOLE,1527 TRYON RD,RALEIGH,27603,19197729987,2016-04-01,34,-78.667555,35.735219,M
7,1891537,4092017181,COSTCO WHOLESALE #1206 FOOD COURT,1021 PINE PLAZA DR,APEX,27523,19193316035,2016-03-14,38,-78.827267,35.746416,M
8,1891538,4092017178,RALEIGH RAW,7 W HARGETT ST,RALEIGH,27601,19194000944,2016-03-14,39,-78.63958,35.778101,M
9,1891539,4092018368,D's Bottle Shop,13200 Falls of Neuse RD,RALEIGH,27614,19194357034,2020-05-20,40,-78.55982,35.940756,M


In [164]:
restaurants_df.to_csv('preprocessed_restaurants.csv', index=False)

In [166]:
rests = pd.read_csv('preprocessed_restaurants.csv', dtype={'PHONENUMBER': str})
rests.head()

Unnamed: 0,OBJECTID,HSISID,NAME,ADDRESS1,CITY,POSTALCODE,PHONENUMBER,RESTAURANTOPENDATE,PERMITID,X,Y,GEOCODESTATUS
0,1891530,4092016487,PEACE CHINA,13220 Strickland RD,RALEIGH,27613,19196769968,2013-08-14,2,-78.725938,35.908783,M
1,1891531,4092018622,Northside Bistro & Cocktails,832 SPRING FOREST RD,RALEIGH,27609,19198905225,2021-05-13,22,-78.622635,35.866275,M
2,1891532,4092016155,DAILY PLANET CAFE,11 W JONES ST,RALEIGH,27601,19197078060,2012-04-12,26,-78.639431,35.782205,M
3,1891533,4092016161,HIBACHI 88,3416 POOLE RD,RALEIGH,27610,19192311688,2012-04-18,28,-78.579533,35.767246,M
4,1891534,4092017180,BOND BROTHERS BEER COMPANY,202 E CEDAR ST,CARY,27511,19194592670,2016-03-11,29,-78.778021,35.787986,M
