In [1]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

### Resources
 1. [Restaurants in Wake County Data Info](https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872)
 2. [Wake County Open Data](https://data-wake.opendata.arcgis.com/search?tags=restaurants)
 3. [Food Inspection Violations Data Info](https://data.wakegov.com/datasets/Wake::food-inspection-violations/about)
 4. [Wake County Yelp Initiative](https://ash.harvard.edu/news/wake-county-yelp-initiative)

## Fetching restaurant violations from wakegov

In [2]:
## only run this once, this takes a while ##
def getViolationsDf():
    val = 'https://opendata.arcgis.com/datasets/9b04d0c39abd4e049cbd4656a0a04ba3_2.geojson'
    rows = []
    r = requests.get(url = val)
    data = r.json()
    #print(data)
    for d in data:
        data = r.json()['features']
        for d in data:
            rows.append(d['properties'])
    return pd.DataFrame(rows)

violations_df = getViolationsDf()

In [21]:
def preprocess_violations(df):
    
    # drop duplicates from dataframe
    df.drop_duplicates(inplace=True)
    
    # drop columns with high missing values
    df = df.drop('CDCRISKFACTOR', axis=1)
    
    # convert date to datetime & only get date
    df['INSPECTDATE'] = pd.to_datetime(df['INSPECTDATE']).dt.date
    
    # filter only since beginning of 2019 - we don't want data before that
    df = df[df['INSPECTDATE'] >= pd.to_datetime('2019-01-01').date()]
    return df

In [22]:
display(violations_df.head())
print(violations_df.shape)

Unnamed: 0,OBJECTID,HSISID,INSPECTDATE,CATEGORY,STATECODE,CRITICAL,QUESTIONNO,VIOLATIONCODE,SEVERITY,SHORTDESC,INSPECTEDBY,COMMENTS,POINTVALUE,OBSERVATIONTYPE,VIOLATIONTYPE,CDCRISKFACTOR,CDCDATAITEM,PERMITID
0,182275244,4092015776,2012-12-14T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Christy Klaus,3-201.11-Packaged food shall be labeled as spe...,1.0,Out,VR,,Food shall be obtained from sources that compl...,16279
1,182275245,4092015419,2013-01-24T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Angela Myers,3-201.11 LG Asafoetida powder is not approved ...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,19659
2,182275246,4092014324,2013-03-04T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Melissa Harrison,Pf - 3-201.11 - Several packages of dried good...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,6479
3,182275247,4092015740,2013-03-19T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Karla Crowder,3-201.11 Provide documentation (receipts) for ...,0.0,In,,,Food shall be obtained from sources that compl...,10225
4,182275248,4092016206,2013-03-27T00:00:00Z,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Melissa Harrison,Pf - 3-201.11 - Habash Shawerma Spices from Ha...,0.0,Out,CDI,,Food shall be obtained from sources that compl...,958


(1674472, 18)


## Preprocessing

In [18]:
df = preprocess_violations(violations_df.copy()) # pass in copy for immutability

In [23]:
display(df.isna().mean())
print(df.shape)
display(df.head())

OBJECTID           0.000000
HSISID             0.000000
INSPECTDATE        0.000000
CATEGORY           0.000000
STATECODE          0.000000
CRITICAL           0.046609
QUESTIONNO         0.000000
VIOLATIONCODE      0.000000
SEVERITY           0.046609
SHORTDESC          0.000000
INSPECTEDBY        0.000000
COMMENTS           0.000923
POINTVALUE         0.000000
OBSERVATIONTYPE    0.000000
VIOLATIONTYPE      0.422041
CDCDATAITEM        0.011249
PERMITID           0.000000
dtype: float64

(124547, 17)


Unnamed: 0,OBJECTID,HSISID,INSPECTDATE,CATEGORY,STATECODE,CRITICAL,QUESTIONNO,VIOLATIONCODE,SEVERITY,SHORTDESC,INSPECTEDBY,COMMENTS,POINTVALUE,OBSERVATIONTYPE,VIOLATIONTYPE,CDCDATAITEM,PERMITID
27,182275543,4092017322,2020-07-10,Approved Source,".2653,.2655",,9,3-201.11,,Food obtained from approved source,Lauren Harden,3-201.11; PIC states that bakery items in disp...,0.0,Out,,Food shall be obtained from sources that compl...,17
37,182275553,4092010218,2019-05-24,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Jackson Hooton,3-202.11; Priority; Box of diced tomatoes was ...,0.0,Out,CDI,Refrigerated food shall be at a temperature of...,6516
38,182275554,4092010922,2020-09-16,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Jackson Hooton,3-202.11; Priority Foundation; A bag of cut le...,1.0,Out,CDI,Refrigerated food shall be at a temperature of...,3455
39,182275555,4092010922,2021-01-07,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Jackson Hooton,3-202.11; Priority; Ground beef that the owner...,1.0,Out,CDI,Refrigerated food shall be at a temperature of...,3455
40,182275556,4092050030,2021-01-21,Approved Source,".2653,.2655",,10,3-202.11,,Food received at proper temperature,Laura McNeill,"3-202.11; Ground beef wrap, rice bowl, and min...",1.0,Out,CDI,Refrigerated food shall be at a temperature of...,468


In [20]:
display({column: len(df[column].unique()) for column in df.columns})

{'OBJECTID': 124547,
 'HSISID': 4513,
 'INSPECTDATE': 777,
 'CATEGORY': 25,
 'STATECODE': 19,
 'CRITICAL': 3,
 'QUESTIONNO': 56,
 'VIOLATIONCODE': 322,
 'SEVERITY': 4,
 'SHORTDESC': 90,
 'INSPECTEDBY': 51,
 'COMMENTS': 115402,
 'POINTVALUE': 7,
 'OBSERVATIONTYPE': 6,
 'VIOLATIONTYPE': 4,
 'CDCDATAITEM': 271,
 'PERMITID': 4514}