## Imports

In [1]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

import urllib.request
from pathlib import Path
import os

### Resources
 1. [Restaurants in Wake County Data Info](https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872)
 2. [Wake County Open Data](https://data-wake.opendata.arcgis.com/search?tags=restaurants)
 3. [Food Inspection Violations Data Info](https://data.wakegov.com/datasets/Wake::food-inspection-violations/about)
 4. [Wake County Yelp Initiative](https://ash.harvard.edu/news/wake-county-yelp-initiative)

### Fetch food inspections from Wake County Open Data (wakegov)

In [2]:
# https://data-wake.opendata.arcgis.com/datasets/food-inspections/explore
# https://www.arcgis.com/home/item.html?id=ebe3ae7f76954fad81411612d7c4fb17
def getFoodInspectionsDf(forceFetch=False):
    """Return the food-inspections table as a DataFrame, using a CSV cache.

    The cache is ``inspections.csv`` in the current working directory. When
    it exists and ``forceFetch`` is false, it is read back with
    ``pd.read_csv``. Otherwise the GeoJSON export is downloaded, the
    ``properties`` dict of each entry in its ``features`` list becomes one
    row, and the resulting frame is written to the cache before returning.

    Parameters
    ----------
    forceFetch : bool, default False
        Download again even when the cache file already exists.

    Returns
    -------
    pandas.DataFrame
        One row per GeoJSON feature (fresh download) or the cached CSV
        contents.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    """
    cache_file = Path() / 'inspections.csv'

    # Reuse the local copy unless the caller explicitly asks for a refresh.
    if cache_file.exists() and not forceFetch:
        print('Using pre-fetched inspections data')
        return pd.read_csv(cache_file)

    print('Fetching inspections data...')
    url = 'https://opendata.arcgis.com/datasets/ebe3ae7f76954fad81411612d7c4fb17_1.geojson'
    # A timeout keeps "Run All" from hanging forever on a stalled connection.
    response = requests.get(url=url, timeout=120)
    response.raise_for_status()

    # Walk the feature list exactly once. Looping over the top-level GeoJSON
    # keys and re-reading the features inside that loop would append every
    # row once per key.
    features = response.json()['features']
    inspections = pd.DataFrame([feature['properties'] for feature in features])

    inspections.to_csv(cache_file, index=False)
    print('Done')
    return inspections

# Load the raw inspections; pass forceFetch=True to re-download instead of using the cached CSV
food_inspections_raw = getFoodInspectionsDf(forceFetch=False)

Using pre-fetched inspections data


In [3]:
# Preview the first rows of the raw inspections frame
food_inspections_raw.head()

Unnamed: 0,OBJECTID,HSISID,SCORE,DATE_,DESCRIPTION,TYPE,INSPECTOR,PERMITID
0,22236467,4092017542,94.5,2017-04-07T04:00:00Z,"Inspection conducted by Joshua Volkan, supervi...",Inspection,Anne-Kathrin Bartoli,367
1,22236468,4092017542,92.0,2017-11-08T05:00:00Z,manager owns two deep chest freezers that are ...,Inspection,Laura McNeill,367
2,22236469,4092017542,95.0,2018-03-23T04:00:00Z,,Inspection,Laura McNeill,367
3,22236470,4092017542,93.5,2018-09-07T04:00:00Z,"*NOTICE* EFFECTIVE JANUARY 1, 2019, THE NC FOO...",Inspection,Laura McNeill,367
4,22236471,4092017542,93.0,2019-04-04T04:00:00Z,"*NOTICE* AS OF JANUARY 1, 2019, THE NC FOOD CO...",Inspection,Joanne Rutkofske,367


In [4]:
def preprocess_inspections(df, since='2018-01-01'):
    """Return a cleaned copy of the inspections frame.

    Steps, in order:

    1. drop fully duplicated rows;
    2. parse ``DATE_`` with ``pd.to_datetime`` and keep only the calendar
       date (``datetime.date`` objects, so the column dtype is ``object``);
    3. rename ``DATE_`` to ``DATE``;
    4. keep rows whose ``DATE`` is on or after ``since`` and reset the index.

    The input frame is not modified, so the function can be re-run on the
    same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Inspections data containing a ``DATE_`` column.
    since : str or datetime-like, default '2018-01-01'
        Earliest inspection date to keep (inclusive). Anything accepted by
        ``pd.to_datetime`` works.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh ``RangeIndex``.
    """
    cutoff = pd.to_datetime(since).date()

    cleaned = (
        df.drop_duplicates()
        # assign() builds a new frame, leaving the caller's DATE_ column as is
        .assign(DATE_=lambda frame: pd.to_datetime(frame['DATE_']).dt.date)
        .rename(columns={'DATE_': 'DATE'})
    )

    # Older inspections are out of scope for this analysis.
    return cleaned[cleaned['DATE'] >= cutoff].reset_index(drop=True)

### Preprocessing

In [5]:
# Preprocess a copy so the raw frame stays untouched, then persist the cleaned data
df = preprocess_inspections(food_inspections_raw.copy())
df.to_csv('preprocessed_inspections.csv', index=False)

In [6]:
# Size and first rows of the preprocessed inspections
print(df.shape)
df.head()

(27343, 8)


Unnamed: 0,OBJECTID,HSISID,SCORE,DATE,DESCRIPTION,TYPE,INSPECTOR,PERMITID
0,22236469,4092017542,95.0,2018-03-23,,Inspection,Laura McNeill,367
1,22236470,4092017542,93.5,2018-09-07,"*NOTICE* EFFECTIVE JANUARY 1, 2019, THE NC FOO...",Inspection,Laura McNeill,367
2,22236471,4092017542,93.0,2019-04-04,"*NOTICE* AS OF JANUARY 1, 2019, THE NC FOOD CO...",Inspection,Joanne Rutkofske,367
3,22236472,4092017542,93.5,2019-10-07,Follow-Up: 10/17/2019,Inspection,Naterra McQueen,367
4,22236473,4092017542,92.5,2020-05-19,"*NOTICE* AS OF JANUARY 1, 2019, THE NC FOOD CO...",Inspection,Naterra McQueen,367


In [7]:
# DataFrame.info() prints its summary and returns None, so call it directly;
# wrapping it in display() only adds a stray "None" to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27343 entries, 0 to 27342
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   OBJECTID     27343 non-null  int64  
 1   HSISID       27343 non-null  int64  
 2   SCORE        27343 non-null  float64
 3   DATE         27343 non-null  object 
 4   DESCRIPTION  22605 non-null  object 
 5   TYPE         27343 non-null  object 
 6   INSPECTOR    27343 non-null  object 
 7   PERMITID     27343 non-null  int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 1.7+ MB


None

In [8]:
# Number of missing values in each column
df.isna().sum()

OBJECTID          0
HSISID            0
SCORE             0
DATE              0
DESCRIPTION    4738
TYPE              0
INSPECTOR         0
PERMITID          0
dtype: int64

In [9]:
# Distinct values per column; dropna=False counts NaN as a value,
# which is what len(Series.unique()) does.
display({name: df[name].nunique(dropna=False) for name in df.columns})

{'OBJECTID': 27343,
 'HSISID': 3878,
 'SCORE': 52,
 'DATE': 1077,
 'DESCRIPTION': 7851,
 'TYPE': 2,
 'INSPECTOR': 54,
 'PERMITID': 3878}

In [10]:
# Column dtypes; DATE is object because preprocessing stores plain date values via .dt.date
df.dtypes

OBJECTID         int64
HSISID           int64
SCORE          float64
DATE            object
DESCRIPTION     object
TYPE            object
INSPECTOR       object
PERMITID         int64
dtype: object