In [2]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

import urllib.request
from pathlib import Path
import os

### Resources
 1. [Restaurants in Wake County Data Info](https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872)
 2. [Wake County Open Data](https://data-wake.opendata.arcgis.com/search?tags=restaurants)
 3. [Food Inspection Violations Data Info](https://data.wakegov.com/datasets/Wake::food-inspection-violations/about)
 4. [Wake County Yelp Initiative](https://ash.harvard.edu/news/wake-county-yelp-initiative)

## Fetching restaurant violations from wakegov

In [3]:
## the download takes a while; the result is cached in violations.csv so it normally runs only once ##
def getViolationsDf(forceFetch=False):
    """Load the restaurant violations table, downloading it only when needed.

    The table is cached as ``violations.csv`` in the current working
    directory. When that file exists (and ``forceFetch`` is false) it is read
    back with explicit dtypes; otherwise the GeoJSON export is downloaded, the
    ``properties`` of every feature become one row, and the result is written
    to the cache file.

    Parameters
    ----------
    forceFetch : bool, default False
        When True, ignore any cached CSV and download the data again.

    Returns
    -------
    pandas.DataFrame
        One row per feature of the downloaded collection.

    Raises
    ------
    requests.HTTPError
        If the server answers the download with an error status.
    """
    cache_file = Path() / 'violations.csv'

    # Reuse the cached CSV when it exists, unless a fresh download is forced.
    if cache_file.exists() and not forceFetch:
        print('Using pre-fetched violations data')
        df = pd.read_csv(cache_file, dtype={'OBJECTID': 'int64',
         'HSISID': 'int64',
         'INSPECTDATE': 'object',
         'CATEGORY': 'object',
         'STATECODE': 'object',
         'CRITICAL': 'object',
         'QUESTIONNO': 'int64',
         'VIOLATIONCODE': 'object',
         'SEVERITY': 'object',
         'SHORTDESC': 'object',
         'INSPECTEDBY': 'object',
         'COMMENTS': 'object',
         'POINTVALUE': 'float64',
         'OBSERVATIONTYPE': 'object',
         'VIOLATIONTYPE': 'object',
         'CDCDATAITEM': 'object',
         'PERMITID': 'int64'}) #specify dtype to save some memory/time
        print('violations df shape:', df.shape)
        return df

    # No usable cache: download the data and store it for the next run.
    print('Fetching restaurant violations data...')
    url = 'https://opendata.arcgis.com/datasets/9b04d0c39abd4e049cbd4656a0a04ba3_2.geojson'
    response = requests.get(url=url)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()

    # Parse the payload once and walk the feature list once. Looping over the
    # parsed top-level object itself would iterate its keys and append every
    # feature once per key, duplicating all rows.
    features = response.json()['features']
    df = pd.DataFrame([feature['properties'] for feature in features])
    print('violations df shape:', df.shape)
    df.to_csv(cache_file, index=False)
    print('Done')
    return df

# Load the raw violations table (reads violations.csv when present, otherwise downloads it).
violations_df = getViolationsDf()

Using pre-fetched violations data
violations df shape: (1685520, 18)


In [4]:
def preprocess_violations(df, start_date='2019-01-01'):
    """Return a cleaned copy of the raw violations table.

    Steps, in order: drop exact duplicate rows, drop the ``CDCRISKFACTOR``
    column, convert ``INSPECTDATE`` to plain ``datetime.date`` values, and keep
    only rows inspected on or after ``start_date``.

    The input frame is never modified, so callers do not need to pass a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw violations table; must contain ``INSPECTDATE`` and
        ``CDCRISKFACTOR`` columns.
    start_date : str or date-like, default '2019-01-01'
        Earliest inspection date (inclusive) to keep; anything accepted by
        ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        New frame with the filtered, cleaned rows (original index labels kept).
    """
    cutoff = pd.to_datetime(start_date).date()

    # drop duplicates (returns a new frame instead of mutating the caller's)
    # and drop columns with high missing values
    cleaned = df.drop_duplicates().drop('CDCRISKFACTOR', axis=1)

    # convert date to datetime & only get date
    cleaned = cleaned.assign(
        INSPECTDATE=pd.to_datetime(cleaned['INSPECTDATE']).dt.date
    )

    # filter only from the cutoff onwards - we don't want data before that
    return cleaned[cleaned['INSPECTDATE'] >= cutoff]

In [5]:
# Preview the first rows of the raw violations table, then report its (rows, columns) shape.
display(violations_df.head())
print(violations_df.shape)

Unnamed: 0,OBJECTID,HSISID,INSPECTDATE,CATEGORY,STATECODE,CRITICAL,QUESTIONNO,VIOLATIONCODE,SEVERITY,SHORTDESC,INSPECTEDBY,COMMENTS,POINTVALUE,OBSERVATIONTYPE,VIOLATIONTYPE,CDCRISKFACTOR,CDCDATAITEM,PERMITID
0,188572555,4092015776,2012-12-14T05:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Christy Klaus,3-201.11-Packaged food shall be labeled as spe...,1.0,Out,VR,,Food shall be obtained from sources that compl...,14516
1,188572556,4092040137,2013-03-18T04:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Lisa McCoy,Chicken kabobs are not approved to be on this ...,1.0,Out,,,Food shall be obtained from sources that compl...,20186
2,188572557,4092015740,2013-03-19T04:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Karla Crowder,3-201.11 Provide documentation (receipts) for ...,0.0,In,,,Food shall be obtained from sources that compl...,11367
3,188572558,4092016206,2013-03-27T04:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Melissa Harrison,Pf - 3-201.11 - Habash Shawerma Spices from Ha...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,577
4,188572559,4092014578,2013-04-23T04:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Melissa Harrison,Pf - 3-201.11 - Packaged frozen banana popsicl...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,2036


(1685520, 18)


## Preprocessing

In [6]:
# Clean the raw violations table and persist the result as preprocessed_violations.csv.
df = preprocess_violations(violations_df.copy()) # pass in copy for immutability
df.to_csv('preprocessed_violations.csv', index=False)

In [7]:
# Fraction of missing values per column in the preprocessed frame.
display(df.isna().mean())
# (rows, columns) of the preprocessed frame.
print(df.shape)
# First rows of the preprocessed frame.
display(df.head())

OBJECTID           0.000000
HSISID             0.000000
INSPECTDATE        0.000000
CATEGORY           0.000000
STATECODE          0.000000
CRITICAL           0.046407
QUESTIONNO         0.000000
VIOLATIONCODE      0.000000
SEVERITY           0.046407
SHORTDESC          0.000000
INSPECTEDBY        0.000000
COMMENTS           0.000927
POINTVALUE         0.000000
OBSERVATIONTYPE    0.000181
VIOLATIONTYPE      0.422790
CDCDATAITEM        0.014618
PERMITID           0.000000
dtype: float64

(127309, 17)


Unnamed: 0,OBJECTID,HSISID,INSPECTDATE,CATEGORY,STATECODE,CRITICAL,QUESTIONNO,VIOLATIONCODE,SEVERITY,SHORTDESC,INSPECTEDBY,COMMENTS,POINTVALUE,OBSERVATIONTYPE,VIOLATIONTYPE,CDCDATAITEM,PERMITID
15,188572810,4092017322,2020-07-10,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Lauren Harden,3-201.11; PIC states that bakery items in disp...,0.0,Out,,Food shall be obtained from sources that compl...,41
26,188572821,4092110158,2019-02-20,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Kaitlyn Yow,3-202.11;,0.0,N/O,,Refrigerated food shall be at a temperature of...,11426
27,188572822,4092014259,2019-09-23,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Laura McNeill,3-202.11; upon arrival the manager had receive...,0.0,Out,,Refrigerated food shall be at a temperature of...,11599
28,188572823,4092014045,2020-10-13,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Ursula Gadomski,3-202.11; Priority; Shredded cabbage was recei...,1.0,Out,CDI,Refrigerated food shall be at a temperature of...,12939
29,188572824,4092050030,2021-01-21,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Laura McNeill,"3-202.11; Ground beef wrap, rice bowl, and min...",1.0,Out,CDI,Refrigerated food shall be at a temperature of...,19


In [8]:
# Number of distinct values in each column (a missing value counts as one distinct value).
display({name: len(values.unique()) for name, values in df.items()})

{'OBJECTID': 127309,
 'HSISID': 4534,
 'INSPECTDATE': 793,
 'CATEGORY': 25,
 'STATECODE': 19,
 'CRITICAL': 3,
 'QUESTIONNO': 56,
 'VIOLATIONCODE': 323,
 'SEVERITY': 4,
 'SHORTDESC': 92,
 'INSPECTEDBY': 51,
 'COMMENTS': 117913,
 'POINTVALUE': 7,
 'OBSERVATIONTYPE': 6,
 'VIOLATIONTYPE': 4,
 'CDCDATAITEM': 273,
 'PERMITID': 4535}

In [9]:
# Map every column name to the name of its dtype.
{name: dtype.name for name, dtype in df.dtypes.items()}

{'OBJECTID': 'int64',
 'HSISID': 'int64',
 'INSPECTDATE': 'object',
 'CATEGORY': 'object',
 'STATECODE': 'object',
 'CRITICAL': 'object',
 'QUESTIONNO': 'int64',
 'VIOLATIONCODE': 'object',
 'SEVERITY': 'object',
 'SHORTDESC': 'object',
 'INSPECTEDBY': 'object',
 'COMMENTS': 'object',
 'POINTVALUE': 'float64',
 'OBSERVATIONTYPE': 'object',
 'VIOLATIONTYPE': 'object',
 'CDCDATAITEM': 'object',
 'PERMITID': 'int64'}