In [1]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

import urllib.request
from pathlib import Path
import os

### Resources
 1. [Restaurants in Wake County Data Info](https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872)
 2. [Wake County Open Data](https://data-wake.opendata.arcgis.com/search?tags=restaurants)
 3. [Food Inspection Violations Data Info](https://data.wakegov.com/datasets/Wake::food-inspection-violations/about)
 4. [Wake County Yelp Initiative](https://ash.harvard.edu/news/wake-county-yelp-initiative)

## Fetching restaurant violations from wakegov

In [13]:
## The first run downloads the full dataset and takes a while; later runs reuse the cached CSV ##
def getViolationsDf(forceFetch=False):
    """Return the Wake County food-inspection violations as a DataFrame.

    The data is cached as ``violations.csv`` in the current working
    directory. When that file exists and ``forceFetch`` is false it is read
    back from disk; otherwise the GeoJSON export is downloaded, flattened to
    one row per feature, written to the cache file, and returned.

    Parameters
    ----------
    forceFetch : bool, default False
        Download the data again even if the cache file already exists.

    Returns
    -------
    pandas.DataFrame
        One row per GeoJSON feature, with the feature's ``properties`` as
        columns.

    Raises
    ------
    requests.HTTPError
        If the download responds with an HTTP error status.
    KeyError
        If the downloaded document has no ``features`` entry, or a feature
        has no ``properties`` entry.
    """
    key = 'violations.csv'
    filename = Path()/key

    # Reuse the cached file when it exists, unless a re-download was requested
    if os.path.exists(filename) and not forceFetch:
        print('Using pre-fetched violations data')
        df = pd.read_csv(filename, dtype={'OBJECTID': 'int64',
         'HSISID': 'int64',
         'INSPECTDATE': 'object',
         'CATEGORY': 'object',
         'STATECODE': 'object',
         'CRITICAL': 'object',
         'QUESTIONNO': 'int64',
         'VIOLATIONCODE': 'object',
         'SEVERITY': 'object',
         'SHORTDESC': 'object',
         'INSPECTEDBY': 'object',
         'COMMENTS': 'object',
         'POINTVALUE': 'float64',
         'OBSERVATIONTYPE': 'object',
         'VIOLATIONTYPE': 'object',
         'CDCDATAITEM': 'object',
         'PERMITID': 'int64'}) #specify dtype to save some memory/time
        print('violations df shape:', df.shape)
        return df

    print('Fetching restaurant violations data...')
    val = 'https://opendata.arcgis.com/datasets/9b04d0c39abd4e049cbd4656a0a04ba3_2.geojson'
    r = requests.get(url = val)
    # Fail loudly on an HTTP error instead of parsing (and caching) an error page
    r.raise_for_status()
    # Parse the body once and walk only the feature list. Iterating over the
    # top-level GeoJSON dict would repeat every feature once per top-level key.
    features = r.json()['features']
    rows = [feature['properties'] for feature in features]
    df = pd.DataFrame(rows)
    print('violations df shape:', df.shape)
    df.to_csv(filename, index=False)
    print('Done')
    return df

# Load the raw violations table: read from the cached violations.csv when present, otherwise downloaded.
violations_df = getViolationsDf()

Using pre-fetched violations data
violations df shape: (1681260, 18)


In [14]:
def preprocess_violations(df, since='2019-01-01'):
    """Clean the raw violations table and keep only recent inspections.

    The input frame is never modified; every step works on a new frame.

    Steps: drop exact duplicate rows, drop the ``CDCRISKFACTOR`` column if it
    is present, convert ``INSPECTDATE`` to plain ``datetime.date`` values,
    and keep the rows whose inspection date is on or after ``since``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw violations data with an ``INSPECTDATE`` column that
        ``pandas.to_datetime`` can parse.
    since : str or datetime-like, default '2019-01-01'
        Earliest inspection date (inclusive) to keep.

    Returns
    -------
    pandas.DataFrame
        The cleaned, filtered frame. Rows keep their original index labels.
    """
    # Inclusive lower bound, as a plain date so it compares with the
    # date objects stored in INSPECTDATE below
    cutoff = pd.to_datetime(since).date()

    cleaned = (
        df
        # non-inplace: the caller's frame must stay untouched
        .drop_duplicates()
        # drop columns with high missing values; tolerate input that lacks it
        .drop(columns=['CDCRISKFACTOR'], errors='ignore')
        # convert date to datetime & only get date
        .assign(INSPECTDATE=lambda frame: pd.to_datetime(frame['INSPECTDATE']).dt.date)
    )

    # filter only since the cutoff - we don't want data before that
    return cleaned[cleaned['INSPECTDATE'] >= cutoff]

In [15]:
# Preview the first rows of the raw violations table, then print its (rows, columns) shape.
display(violations_df.head())
print(violations_df.shape)

Unnamed: 0,OBJECTID,HSISID,INSPECTDATE,CATEGORY,STATECODE,CRITICAL,QUESTIONNO,VIOLATIONCODE,SEVERITY,SHORTDESC,INSPECTEDBY,COMMENTS,POINTVALUE,OBSERVATIONTYPE,VIOLATIONTYPE,CDCRISKFACTOR,CDCDATAITEM,PERMITID
0,186468679,4092015776,2012-12-14T05:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Christy Klaus,3-201.11-Packaged food shall be labeled as spe...,1.0,Out,VR,,Food shall be obtained from sources that compl...,15958
1,186468680,4092016116,2013-01-23T05:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,David Adcock,,0.0,In,,,Food shall be obtained from sources that compl...,630
2,186468681,4092015419,2013-01-24T05:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Angela Myers,3-201.11 LG Asafoetida powder is not approved ...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,6156
3,186468682,4092014324,2013-03-04T05:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Melissa Harrison,Pf - 3-201.11 - Several packages of dried good...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,7357
4,186468683,4092015740,2013-03-19T04:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Karla Crowder,3-201.11 Provide documentation (receipts) for ...,0.0,In,,,Food shall be obtained from sources that compl...,22013


(1681260, 18)


## Preprocessing

In [16]:
# Build the cleaned working frame `df`; the raw `violations_df` stays available for comparison.
df = preprocess_violations(violations_df.copy()) # pass in copy for immutability

In [17]:
# Fraction of missing values per column, then the cleaned frame's shape and a preview of its first rows.
display(df.isna().mean())
print(df.shape)
display(df.head())

OBJECTID           0.000000
HSISID             0.000000
INSPECTDATE        0.000000
CATEGORY           0.000000
STATECODE          0.000000
CRITICAL           0.046545
QUESTIONNO         0.000000
VIOLATIONCODE      0.000000
SEVERITY           0.046545
SHORTDESC          0.000000
INSPECTEDBY        0.000000
COMMENTS           0.000911
POINTVALUE         0.000000
OBSERVATIONTYPE    0.000182
VIOLATIONTYPE      0.422547
CDCDATAITEM        0.013387
PERMITID           0.000000
dtype: float64

(126244, 17)


Unnamed: 0,OBJECTID,HSISID,INSPECTDATE,CATEGORY,STATECODE,CRITICAL,QUESTIONNO,VIOLATIONCODE,SEVERITY,SHORTDESC,INSPECTEDBY,COMMENTS,POINTVALUE,OBSERVATIONTYPE,VIOLATIONTYPE,CDCDATAITEM,PERMITID
26,186468705,4092025252,2020-01-24,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,David Adcock,3-201.11; Some of the lamb was purchased from ...,1.0,Out,,Food shall be obtained from sources that compl...,18067
27,186468706,4092030492,2021-06-14,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,David Adcock,3-201.11;(B)Employees stated that when they ne...,1.0,Out,,Food shall be obtained from sources that compl...,15779
36,186468715,4092110158,2019-02-20,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Kaitlyn Yow,3-202.11;,0.0,N/O,,Refrigerated food shall be at a temperature of...,11926
37,186468716,4092010218,2019-05-24,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Jackson Hooton,3-202.11; Priority; Box of diced tomatoes was ...,0.0,Out,CDI,Refrigerated food shall be at a temperature of...,4038
38,186468717,4092014259,2019-09-23,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Laura McNeill,3-202.11; upon arrival the manager had receive...,0.0,Out,,Refrigerated food shall be at a temperature of...,14777


In [18]:
# Number of distinct values per column; Series.unique() keeps NaN, so a missing value counts as one.
display({column: len(df[column].unique()) for column in df.columns})

{'OBJECTID': 126244,
 'HSISID': 4525,
 'INSPECTDATE': 788,
 'CATEGORY': 25,
 'STATECODE': 19,
 'CRITICAL': 3,
 'QUESTIONNO': 56,
 'VIOLATIONCODE': 322,
 'SEVERITY': 4,
 'SHORTDESC': 92,
 'INSPECTEDBY': 51,
 'COMMENTS': 116940,
 'POINTVALUE': 7,
 'OBSERVATIONTYPE': 6,
 'VIOLATIONTYPE': 4,
 'CDCDATAITEM': 272,
 'PERMITID': 4526}

In [19]:
# Map each column name to its dtype name (e.g. 'int64', 'object') as a plain dict.
df.dtypes.apply(lambda x: x.name).to_dict()

{'OBJECTID': 'int64',
 'HSISID': 'int64',
 'INSPECTDATE': 'object',
 'CATEGORY': 'object',
 'STATECODE': 'object',
 'CRITICAL': 'object',
 'QUESTIONNO': 'int64',
 'VIOLATIONCODE': 'object',
 'SEVERITY': 'object',
 'SHORTDESC': 'object',
 'INSPECTEDBY': 'object',
 'COMMENTS': 'object',
 'POINTVALUE': 'float64',
 'OBSERVATIONTYPE': 'object',
 'VIOLATIONTYPE': 'object',
 'CDCDATAITEM': 'object',
 'PERMITID': 'int64'}