## Wake County - Restaurant Food Inspections Analysis

In [45]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

#### Resources
 1. <a>  https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872 </a>
 2. <a>  https://data-wake.opendata.arcgis.com/datasets/Wake::restaurants-in-wake-county/api </a>

### Sourcing Data

In [47]:
def getRestaurantsDf():
        # Source restaurants data in wake county from wakegov
        base_url = "https://maps.wakegov.com/arcgis/rest/services/Inspections/RestaurantInspectionsOpenData/MapServer/0/query?f=json&where=(FACILITYTYPE%20IN%20('Restaurant'))&outFields=*"

        # keep pagination in mind (maxRecordCount = 1000, so grabbing 1000 at a time)
        reqs = []
        reqs.append(base_url)
        reqs.append(base_url+'&resultOffset=1000&resultRecordCount=1000')
        reqs.append(base_url+'&resultOffset=2000&resultRecordCount=1000')
        reqs.append(base_url+'&resultOffset=3000&resultRecordCount=1000')

        # Sending get request and saving the response as response object
        # extracting data in json 
        rows = []
        for val in reqs:
            r = requests.get(url = val)
            data = r.json()['features']
            for d in data:
                rows.append(d['attributes'])
        df = pd.DataFrame(rows)
        print('restaurants df shape:', df.shape)
        return df

restaurants_df_raw = getRestaurantsDf()

restaurants df shape: (2386, 15)


In [52]:
def preprocess_restaurants(df):
    
    # Drop columns w/ > 25% missing values
    missing_value_columns = df.columns[df.isna().mean() >= 0.25]
    print('Dropping columns with more than 25% missing values:', missing_value_columns)
    df = df.drop(missing_value_columns, axis=1)
    # Drop single value column
    df = df.drop('FACILITYTYPE', axis=1)
    
    # Drop irrelevant columns
    df = df.drop('PHONENUMBER', axis=1)
    print(df.isna().mean())
    return df

In [53]:
# Validate shape & data
# https://data-wake.opendata.arcgis.com/datasets/Wake::restaurants-in-wake-county/explore?filters=eyJGQUNJTElUWVRZUEUiOlsiUmVzdGF1cmFudCJdfQ%3D%3D&location=35.794181%2C-78.605600%2C3.98&showTable=true

print(restaurants_df_raw.info())
print({column: len(restaurants_df_raw[column].unique()) for column in restaurants_df_raw.columns})

# validation (testing)
# assert restaurants_df.all(restaurants_df['FACILITYTYPE'] == 'Restaurant')

# Preprocess
restaurants_df = preprocess_restaurants(restaurants_df_raw.copy())

display(restaurants_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2386 entries, 0 to 2385
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   OBJECTID            2386 non-null   int64  
 1   HSISID              2386 non-null   object 
 2   NAME                2386 non-null   object 
 3   ADDRESS1            2386 non-null   object 
 4   ADDRESS2            341 non-null    object 
 5   CITY                2386 non-null   object 
 6   STATE               2386 non-null   object 
 7   POSTALCODE          2386 non-null   object 
 8   PHONENUMBER         2300 non-null   object 
 9   RESTAURANTOPENDATE  2386 non-null   int64  
 10  FACILITYTYPE        2386 non-null   object 
 11  PERMITID            2386 non-null   int64  
 12  X                   2386 non-null   float64
 13  Y                   2386 non-null   float64
 14  GEOCODESTATUS       2386 non-null   object 
dtypes: float64(2), int64(3), object(10)
memory usage: 279.7

Unnamed: 0,OBJECTID,HSISID,NAME,ADDRESS1,CITY,STATE,POSTALCODE,RESTAURANTOPENDATE,PERMITID,X,Y,GEOCODESTATUS
0,1856045,4092011394,McDonald's # 13334,6213 FALLS OF NEUSE RD,RALEIGH,NC,27609-3505,754531200000,11566,-78.621232,35.86745,M
1,1856046,4092011309,Goodberry's Creamery,1407 Garner Station Blv,RALEIGH,NC,27603,740707200000,11564,-78.651632,35.724843,M
2,1856047,4092013106,Sawmill Tap Room,7701 LEAD MINE RD,RALEIGH,NC,27615-4829,947808000000,11575,-78.660325,35.885009,M
3,1856048,4092013101,CAPITAL CITY CHOP HOUSE,151 AIRGATE DR,MORRISVILLE,NC,27560-8495,949449600000,11578,-78.814789,35.863449,M
4,1856049,4092013060,PNC 329,1400 EDWARDS MILL RD,RALEIGH,NC,27607-3624,941155200000,11580,-78.722936,35.803242,M


In [9]:
# https://data-wake.opendata.arcgis.com/datasets/food-inspections/explore
# https://www.arcgis.com/home/item.html?id=ebe3ae7f76954fad81411612d7c4fb17
def getFoodInspectionsDf(): 
    # https://maps.wakegov.com/arcgis/rest/services/Inspections/RestaurantInspectionsOpenData/MapServer/1/query?outFields=*&where=1%3D1
    
    # Sending get request and saving the response as response object
    # extracting data in json
    val = 'https://opendata.arcgis.com/datasets/ebe3ae7f76954fad81411612d7c4fb17_1.geojson'
    rows = []
    r = requests.get(url = val)
    data = r.json()
    #print(data)
    for d in data:
        data = r.json()['features']
        for d in data:
            rows.append(d['properties'])
    return pd.DataFrame(rows)

food_inspections_df = getFoodInspectionsDf()

In [10]:
print(food_inspections_df.shape)
food_inspections_df.head()

(189384, 8)


Unnamed: 0,OBJECTID,HSISID,SCORE,DATE_,DESCRIPTION,TYPE,INSPECTOR,PERMITID
0,21475168,4092017542,94.5,2017-04-07T00:00:00Z,"Inspection conducted by Joshua Volkan, supervi...",Inspection,Anne-Kathrin Bartoli,325
1,21475185,4092017542,92.0,2017-11-08T00:00:00Z,manager owns two deep chest freezers that are ...,Inspection,Laura McNeill,325
2,21475186,4092017542,95.0,2018-03-23T00:00:00Z,,Inspection,Laura McNeill,325
3,21475187,4092017542,93.5,2018-09-07T00:00:00Z,"*NOTICE* EFFECTIVE JANUARY 1, 2019, THE NC FOO...",Inspection,Laura McNeill,325
4,21475188,4092017542,93.0,2019-04-04T00:00:00Z,"*NOTICE* AS OF JANUARY 1, 2019, THE NC FOOD CO...",Inspection,Joanne Rutkofske,325


In [2]:
def getViolationsDf():
    val = 'https://opendata.arcgis.com/datasets/9b04d0c39abd4e049cbd4656a0a04ba3_2.geojson'
    rows = []
    r = requests.get(url = val)
    data = r.json()
    #print(data)
    for d in data:
        data = r.json()['features']
        for d in data:
            rows.append(d['properties'])
    return pd.DataFrame(rows)

violations_df = getViolationsDf()

In [4]:
display(violations_df.head())
print(violations_df.shape)

Unnamed: 0,OBJECTID,HSISID,INSPECTDATE,CATEGORY,STATECODE,CRITICAL,QUESTIONNO,VIOLATIONCODE,SEVERITY,SHORTDESC,INSPECTEDBY,COMMENTS,POINTVALUE,OBSERVATIONTYPE,VIOLATIONTYPE,CDCRISKFACTOR,CDCDATAITEM,PERMITID
0,181856751,4092015060,2012-10-05T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Karla Crowder,3-201.11 Bags of beef unable to be identified ...,1.0,Out,CDI,,Food shall be obtained from sources that compl...,2230
1,181856752,4092015776,2012-12-14T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Christy Klaus,3-201.11-Packaged food shall be labeled as spe...,1.0,Out,VR,,Food shall be obtained from sources that compl...,12266
2,181856753,4092016116,2013-01-23T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,David Adcock,,0.0,In,,,Food shall be obtained from sources that compl...,576
3,181856754,4092015419,2013-01-24T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Angela Myers,3-201.11 LG Asafoetida powder is not approved ...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,1636
4,181856755,4092015740,2013-03-19T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Karla Crowder,3-201.11 Provide documentation (receipts) for ...,0.0,In,,,Food shall be obtained from sources that compl...,22708


(1673972, 18)
