In [10]:
# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

### Resources
 1. [Restaurants in Wake County Data Info](https://www.arcgis.com/home/item.html?id=124c2187da8c41c59bde04fa67eb2872)
 2. [Wake County Open Data](https://data-wake.opendata.arcgis.com/search?tags=restaurants)
 3. [Food Inspection Violations Data Info](https://data.wakegov.com/datasets/Wake::food-inspection-violations/about)
 4. [Wake County Yelp Initiative](https://ash.harvard.edu/news/wake-county-yelp-initiative)

In [11]:
# https://data-wake.opendata.arcgis.com/datasets/food-inspections/explore
# https://www.arcgis.com/home/item.html?id=ebe3ae7f76954fad81411612d7c4fb17
def getFoodInspectionsDf(): 
    # https://maps.wakegov.com/arcgis/rest/services/Inspections/RestaurantInspectionsOpenData/MapServer/1/query?outFields=*&where=1%3D1
    
    # Sending get request and saving the response as response object
    # extracting data in json
    val = 'https://opendata.arcgis.com/datasets/ebe3ae7f76954fad81411612d7c4fb17_1.geojson'
    rows = []
    r = requests.get(url = val)
    data = r.json()
    #print(data)
    for d in data:
        data = r.json()['features']
        for d in data:
            rows.append(d['properties'])
    return pd.DataFrame(rows)

food_inspections_raw = getFoodInspectionsDf()

In [20]:
food_inspections_raw.head()

Unnamed: 0,OBJECTID,HSISID,SCORE,DATE_,DESCRIPTION,TYPE,INSPECTOR,PERMITID
0,21522530,4092017542,94.5,2017-04-07T00:00:00Z,"Inspection conducted by Joshua Volkan, supervi...",Inspection,Anne-Kathrin Bartoli,1
1,21522531,4092017542,92.0,2017-11-08T00:00:00Z,manager owns two deep chest freezers that are ...,Inspection,Laura McNeill,1
2,21522532,4092017542,95.0,2018-03-23T00:00:00Z,,Inspection,Laura McNeill,1
3,21522533,4092017542,93.5,2018-09-07T00:00:00Z,"*NOTICE* EFFECTIVE JANUARY 1, 2019, THE NC FOO...",Inspection,Laura McNeill,1
4,21522534,4092017542,93.0,2019-04-04T00:00:00Z,"*NOTICE* AS OF JANUARY 1, 2019, THE NC FOOD CO...",Inspection,Joanne Rutkofske,1


In [42]:
def preprocess_inspections(df):
    # convert date to datetime & only get date
    df['DATE_'] = pd.to_datetime(df['DATE_']).dt.date
    df.rename(columns={"DATE_": "DATE"}, inplace=True)
    
    # drop duplicates
    df.drop_duplicates(inplace=True)
    return df

In [43]:
df = preprocess_inspections(food_inspections_raw.copy())

In [45]:
print(df.shape)
df.head(30)

(47346, 8)


Unnamed: 0,OBJECTID,HSISID,SCORE,DATE,DESCRIPTION,TYPE,INSPECTOR,PERMITID
0,21522530,4092017542,94.5,2017-04-07,"Inspection conducted by Joshua Volkan, supervi...",Inspection,Anne-Kathrin Bartoli,1
1,21522531,4092017542,92.0,2017-11-08,manager owns two deep chest freezers that are ...,Inspection,Laura McNeill,1
2,21522532,4092017542,95.0,2018-03-23,,Inspection,Laura McNeill,1
3,21522533,4092017542,93.5,2018-09-07,"*NOTICE* EFFECTIVE JANUARY 1, 2019, THE NC FOO...",Inspection,Laura McNeill,1
4,21522534,4092017542,93.0,2019-04-04,"*NOTICE* AS OF JANUARY 1, 2019, THE NC FOOD CO...",Inspection,Joanne Rutkofske,1
5,21522535,4092017542,93.5,2019-10-07,Follow-Up: 10/17/2019,Inspection,Naterra McQueen,1
6,21522536,4092017542,92.5,2020-05-19,"*NOTICE* AS OF JANUARY 1, 2019, THE NC FOOD CO...",Inspection,Naterra McQueen,1
7,21522537,4092017542,94.0,2020-10-09,PIC cannot sign due to COVID-19 concerns.,Inspection,Nicole Millard,1
8,21522538,4092017542,94.0,2021-03-24,PIC cannot sign due to COVID-19 concerns.,Inspection,Nicole Millard,1
9,21522539,4092017542,92.0,2021-07-20,Bathrooms for customers were locked today but ...,Inspection,David Adcock,1


In [28]:
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189384 entries, 0 to 189383
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   OBJECTID     189384 non-null  int64  
 1   HSISID       189384 non-null  object 
 2   SCORE        189384 non-null  float64
 3   DATE         189384 non-null  object 
 4   DESCRIPTION  129556 non-null  object 
 5   TYPE         189384 non-null  object 
 6   INSPECTOR    189384 non-null  object 
 7   PERMITID     189384 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 11.6+ MB


None

In [29]:
df.isna().mean()

OBJECTID       0.000000
HSISID         0.000000
SCORE          0.000000
DATE           0.000000
DESCRIPTION    0.315908
TYPE           0.000000
INSPECTOR      0.000000
PERMITID       0.000000
dtype: float64

In [30]:
display({column: len(df[column].unique()) for column in df.columns})

{'OBJECTID': 47346,
 'HSISID': 3863,
 'SCORE': 53,
 'DATE': 2477,
 'DESCRIPTION': 14375,
 'TYPE': 2,
 'INSPECTOR': 67,
 'PERMITID': 3863}

In [36]:
df.dtypes

OBJECTID         int64
HSISID          object
SCORE          float64
DATE            object
DESCRIPTION     object
TYPE            object
INSPECTOR       object
PERMITID         int64
dtype: object