## Imports 

In [1]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

import urllib.request
from pathlib import Path
import os

### Resources
 1. [Restaurants in Wake County Data Info](https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872)
 2. [Wake County Open Data](https://data-wake.opendata.arcgis.com/search?tags=restaurants)
 3. [Food Inspection Violations Data Info](https://data.wakegov.com/datasets/Wake::food-inspection-violations/about)
 4. [Wake County Yelp Initiative](https://ash.harvard.edu/news/wake-county-yelp-initiative)

## Fetching restaurant violations from wakegov

In [2]:
## the first run downloads the data and takes a while; later runs reuse the cached violations.csv ##
def getViolationsDf(forceFetch=False):
    """Load the food-inspection violations data as a DataFrame.

    The data is cached as ``violations.csv`` in the current working directory.
    When that file exists and ``forceFetch`` is false it is read back from
    disk; otherwise the GeoJSON export is downloaded, flattened to one row per
    feature, written to the cache file and returned.

    Parameters
    ----------
    forceFetch : bool, default False
        When True, ignore any cached CSV and download the data again.

    Returns
    -------
    pandas.DataFrame
        One row per feature, built from each feature's ``properties`` dict.

    Raises
    ------
    requests.HTTPError
        If the download responds with an HTTP error status.
    KeyError
        If the downloaded payload has no ``features`` entry.
    """
    path = Path()
    key = 'violations.csv'
    filename = path/key

    # Reuse the cached CSV when it exists, unless a re-download was requested
    if os.path.exists(filename) and not forceFetch:
        print('Using pre-fetched violations data...')
        df = pd.read_csv(filename, dtype={'OBJECTID': 'int64',
         'HSISID': 'int64',
         'INSPECTDATE': 'object',
         'CATEGORY': 'object',
         'STATECODE': 'object',
         'CRITICAL': 'object',
         'QUESTIONNO': 'int64',
         'VIOLATIONCODE': 'object',
         'SEVERITY': 'object',
         'SHORTDESC': 'object',
         'INSPECTEDBY': 'object',
         'COMMENTS': 'object',
         'POINTVALUE': 'float64',
         'OBSERVATIONTYPE': 'object',
         'VIOLATIONTYPE': 'object',
         'CDCDATAITEM': 'object',
         'PERMITID': 'int64'}) # specify dtype to save some memory/time
        print('violations df shape:', df.shape)
        return df

    print('Fetching restaurant violations data...')
    val = 'https://opendata.arcgis.com/datasets/9b04d0c39abd4e049cbd4656a0a04ba3_2.geojson'
    r = requests.get(url=val)
    # Fail loudly on an HTTP error instead of trying to parse an error page
    r.raise_for_status()

    # Walk the feature list exactly once. Looping over the top-level JSON
    # object as well would append every feature once per top-level key and
    # leave duplicated rows in the cached CSV.
    features = r.json()['features']
    rows = [feature['properties'] for feature in features]

    df = pd.DataFrame(rows)
    print('violations df shape:', df.shape)
    df.to_csv(filename, index=False)
    print('Done')
    return df

# Load the raw violations (read from violations.csv when it exists, downloaded otherwise)
violations_df = getViolationsDf()

Using pre-fetched violations data...
violations df shape: (1685520, 18)


In [3]:
def preprocess_violations(df, since='2018-01-01'):
    """Return a cleaned copy of the raw violations frame.

    Steps, in order:

    1. drop fully duplicated rows;
    2. drop ``CDCRISKFACTOR`` (high share of missing values), ``STATECODE``
       (irrelevant) and ``PERMITID`` (unique per restaurant);
    3. upper-case ``OBSERVATIONTYPE`` and replace missing values with
       ``'MISSING'``;
    4. convert ``INSPECTDATE`` to ``datetime.date`` objects;
    5. keep only rows inspected on or after ``since``.

    The input frame is never modified. Every step builds a new frame, so the
    caller does not need to pass a defensive copy (doing so is still harmless).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw violations; must contain the columns named above.
    since : str or date-like, default '2018-01-01'
        Earliest inspection date (inclusive) to keep; anything accepted by
        ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, keeping their original index labels.

    Raises
    ------
    KeyError
        If one of the columns to drop or transform is missing.
    """
    cutoff = pd.to_datetime(since).date()

    # Non-mutating drops: an in-place drop_duplicates would alter the caller's frame
    cleaned = (
        df.drop_duplicates()
          .drop(columns=['CDCRISKFACTOR', 'STATECODE', 'PERMITID'])
    )

    # Build the normalized columns as new Series and assign them back.
    # A chained `df[col].fillna(..., inplace=True)` acts on a temporary object
    # and does nothing once pandas Copy-on-Write is active.
    cleaned = cleaned.assign(
        OBSERVATIONTYPE=cleaned['OBSERVATIONTYPE'].str.upper().fillna('MISSING'),
        INSPECTDATE=pd.to_datetime(cleaned['INSPECTDATE']).dt.date,
    )

    # filter only since the cutoff - we don't want data before that
    return cleaned[cleaned['INSPECTDATE'] >= cutoff]

In [4]:
# Column dtypes and non-null counts of the raw frame
violations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1685520 entries, 0 to 1685519
Data columns (total 18 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   OBJECTID         1685520 non-null  int64  
 1   HSISID           1685520 non-null  int64  
 2   INSPECTDATE      1685520 non-null  object 
 3   CATEGORY         1685520 non-null  object 
 4   STATECODE        1685520 non-null  object 
 5   CRITICAL         1607772 non-null  object 
 6   QUESTIONNO       1685520 non-null  int64  
 7   VIOLATIONCODE    1685520 non-null  object 
 8   SEVERITY         1607772 non-null  object 
 9   SHORTDESC        1685520 non-null  object 
 10  INSPECTEDBY      1685208 non-null  object 
 11  COMMENTS         1682996 non-null  object 
 12  POINTVALUE       1685520 non-null  float64
 13  OBSERVATIONTYPE  1684252 non-null  object 
 14  VIOLATIONTYPE    897580 non-null   object 
 15  CDCRISKFACTOR    0 non-null        float64
 16  CDCDATAITEM      1

In [7]:
# Columns still to look at:
# critical(handled), severity, inspected by, comments, obs type, violation type, cdcdataitem
display(violations_df.head())
print(violations_df.shape)

# (label, column) pairs; the labels are printed verbatim, so some carry a colon and some do not
unique_value_report = [
    ('VIOLATIONTYPE:', 'VIOLATIONTYPE'),
    ('OBSERVATIONTYPE:', 'OBSERVATIONTYPE'),
    ('CRITICAL:', 'CRITICAL'),
    ('CATEGORY:', 'CATEGORY'),
    ('SEVERITY', 'SEVERITY'),
    ('QUESTIONNO', 'QUESTIONNO'),
    ('POINTVALUE', 'POINTVALUE'),
]
for label, column in unique_value_report:
    print(label, violations_df[column].unique())

# Frequency of each CRITICAL value, counting missing entries too
print('CRITICAL COUNTS:', violations_df['CRITICAL'].value_counts(dropna=False))

Unnamed: 0,OBJECTID,HSISID,INSPECTDATE,CATEGORY,STATECODE,CRITICAL,QUESTIONNO,VIOLATIONCODE,SEVERITY,SHORTDESC,INSPECTEDBY,COMMENTS,POINTVALUE,OBSERVATIONTYPE,VIOLATIONTYPE,CDCRISKFACTOR,CDCDATAITEM,PERMITID
0,188572555,4092015776,2012-12-14T05:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Christy Klaus,3-201.11-Packaged food shall be labeled as spe...,1.0,Out,VR,,Food shall be obtained from sources that compl...,14516
1,188572556,4092040137,2013-03-18T04:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Lisa McCoy,Chicken kabobs are not approved to be on this ...,1.0,Out,,,Food shall be obtained from sources that compl...,20186
2,188572557,4092015740,2013-03-19T04:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Karla Crowder,3-201.11 Provide documentation (receipts) for ...,0.0,In,,,Food shall be obtained from sources that compl...,11367
3,188572558,4092016206,2013-03-27T04:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Melissa Harrison,Pf - 3-201.11 - Habash Shawerma Spices from Ha...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,577
4,188572559,4092014578,2013-04-23T04:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Melissa Harrison,Pf - 3-201.11 - Packaged frozen banana popsicl...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,2036


(1685520, 18)
VIOLATIONTYPE: ['VR' nan 'CDI' 'R']
OBSERVATIONTYPE: ['Out' 'In' 'N/O' nan 'OUT' 'IN']
CRITICAL: [nan 'No' 'Yes']
CATEGORY: ['Approved Source' 'Chemical' 'Conformance with Approved Procedures'
 'Consumer Advisory' 'Employee Health'
 'EQUIPMENT AND UTENSILS; PROTECTION FROM CONTAMINATION'
 'Food Identification' 'FOOD PROTECTION' 'Food Temperature Control'
 'Good Hygienic Practices' 'Highly Susceptible Populations'
 'LAUNDRY AND LINENS'
 'MANAGEMENT AND PERSONNEL; EMPLOYEE HEALTH; GOOD HYGIENE PRACTICES; PREVENTING CONTAMINATION BY HANDS'
 'PHYSICAL FACILITIES' 'Physical Facilities'
 'Potentially Hazardous Food Time/Temperature'
 'PREMISES, STORAGE, POISONOUS OR TOXIC MATERIALS'
 'Preventing Contamination by Hands' 'Prevention of Food Contamination'
 'Proper Use of Utensils' 'Protection from Contamination'
 'Safe Food and Water' 'Supervision' 'Utensils and Equipment'
 'WATER, PLUMBING AND WASTE']
SEVERITY [nan 'Core' 'Priority' 'Priority Foundation']
QUESTIONNO [ 9 10 11 12

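Abbreviations used in the `VIOLATIONTYPE` and `OBSERVATIONTYPE` columns:

1. CDI = Corrected during inspection

2. NA = Not Applicable

3. NO = Not Observed (appears as `N/O` in the data)

4. R = Repeat Violation

5. VR = Verification Required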

## Preprocessing & Validation

In [6]:
# Clean a copy of the raw frame so `violations_df` itself stays as loaded,
# then write the cleaned rows to preprocessed_violations.csv (without the index)
df = preprocess_violations(violations_df.copy()) # pass in copy for immutability
df.to_csv('preprocessed_violations.csv', index=False)

In [7]:
# Check the cleaned frame: missing values per column, overall shape, first rows
display(df.isna().sum())
print(df.shape)
display(df.head())

OBJECTID               0
HSISID                 0
INSPECTDATE            0
CATEGORY               0
CRITICAL            7780
QUESTIONNO             0
VIOLATIONCODE          0
SEVERITY            7780
SHORTDESC              0
INSPECTEDBY            0
COMMENTS             150
POINTVALUE             0
OBSERVATIONTYPE        0
VIOLATIONTYPE      76409
CDCDATAITEM         2141
dtype: int64

(178102, 15)


Unnamed: 0,OBJECTID,HSISID,INSPECTDATE,CATEGORY,CRITICAL,QUESTIONNO,VIOLATIONCODE,SEVERITY,SHORTDESC,INSPECTEDBY,COMMENTS,POINTVALUE,OBSERVATIONTYPE,VIOLATIONTYPE,CDCDATAITEM
15,188572810,4092017322,2020-07-10,Approved Source,,9,3-201.11,,Food obtained from approved source,Lauren Harden,3-201.11; PIC states that bakery items in disp...,0.0,OUT,,Food shall be obtained from sources that compl...
24,188572819,4092110520,2018-03-05,Approved Source,,10,3-202.11,,Food received at proper temperature,Laura McNeill,3-202.11; pans of reheated beef steak received...,1.0,OUT,CDI,Refrigerated food shall be at a temperature of...
25,188572820,4092017143,2018-08-20,Approved Source,,10,3-202.11,,Food received at proper temperature,Jennifer Brown,3-202.11;Measured raw shell eggs at 49-50F upo...,0.0,OUT,CDI,Refrigerated food shall be at a temperature of...
26,188572821,4092110158,2019-02-20,Approved Source,,10,3-202.11,,Food received at proper temperature,Kaitlyn Yow,3-202.11;,0.0,N/O,,Refrigerated food shall be at a temperature of...
27,188572822,4092014259,2019-09-23,Approved Source,,10,3-202.11,,Food received at proper temperature,Laura McNeill,3-202.11; upon arrival the manager had receive...,0.0,OUT,,Refrigerated food shall be at a temperature of...


In [8]:
# Confirm the observation types were normalized (upper-case, 'MISSING' for gaps)
observation_types = df['OBSERVATIONTYPE'].unique()
print('OBSERVATIONTYPE:', observation_types)

# Distinct values per column; a missing value counts as one distinct value
distinct_counts = {}
for column in df.columns:
    distinct_counts[column] = df[column].nunique(dropna=False)
display(distinct_counts)

OBSERVATIONTYPE: ['OUT' 'N/O' 'IN' 'MISSING']


{'OBJECTID': 178102,
 'HSISID': 4850,
 'INSPECTDATE': 1085,
 'CATEGORY': 25,
 'CRITICAL': 3,
 'QUESTIONNO': 56,
 'VIOLATIONCODE': 327,
 'SEVERITY': 4,
 'SHORTDESC': 92,
 'INSPECTEDBY': 58,
 'COMMENTS': 164766,
 'POINTVALUE': 7,
 'OBSERVATIONTYPE': 4,
 'VIOLATIONTYPE': 4,
 'CDCDATAITEM': 277}

In [9]:
# Column name -> dtype name, as a plain dict of strings
df.dtypes.apply(lambda x: x.name).to_dict()

{'OBJECTID': 'int64',
 'HSISID': 'int64',
 'INSPECTDATE': 'object',
 'CATEGORY': 'object',
 'CRITICAL': 'object',
 'QUESTIONNO': 'int64',
 'VIOLATIONCODE': 'object',
 'SEVERITY': 'object',
 'SHORTDESC': 'object',
 'INSPECTEDBY': 'object',
 'COMMENTS': 'object',
 'POINTVALUE': 'float64',
 'OBSERVATIONTYPE': 'object',
 'VIOLATIONTYPE': 'object',
 'CDCDATAITEM': 'object'}