In [1]:
from IPython import display
import requests
import json
from sodapy import Socrata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
import time
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth',100)
pd.set_option('display.max_columns',50)

from sklearn.preprocessing import OneHotEncoder


# Introduction

According to the CDC, more than 48 million Americans per year become sick from food, and an estimated 75% of the outbreaks came from food prepared by caterers, delis, and restaurants. In most cities, health inspections are generally random, which can increase time spent on spot checks at clean restaurants that have been following the rules closely — and missed opportunities to improve health and hygiene at places with more pressing food safety issues.

The goal for this project is to leverage public citizen generated data from social media to narrow the search for critical health and safety violations in New York City. As the City of New York manages  an open data portal, everyone can access historical hygiene inspections and violation records. By combine these two data source this project aims to determine which words, phrases, ratings, and patterns among restaurants lead to critical health and safety violations. This model can assist city health inspectors do their job better by prioritizing the kitchens most likely to be in violation of code.

# Obtain

This project requires data pulled from two different sources, the City of New York and Yelp. To obtain the data we will call the API keys.

## NYC Open Data API

In [29]:
doh = pd.read_csv('data/DOHMH_New_York_City_Restaurant_Inspection_Results.csv')
doh

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,50008319,POULETTE,Manhattan,790,9 AVENUE,10019.0,2129569488,Chicken,01/28/2019,Violations were cited in the following area(s).,04D,"Food worker does not wash hands thoroughly after using the toilet, coughing, sneezing, smoking, ...",Critical,22,,,03/24/2022,Cycle Inspection / Initial Inspection,40.765040,-73.987795,104.0,3.0,13300.0,1025235.0,1.010430e+09,MN15
1,41678734,TINY'S DINER,Bronx,3603,RIVERDALE AVENUE,10463.0,7187087600,American,09/23/2019,Violations were cited in the following area(s).,04M,Live roaches present in facility's food and/or non-food areas.,Critical,24,,,03/24/2022,Cycle Inspection / Initial Inspection,40.886691,-73.907056,208.0,11.0,29500.0,2084186.0,2.057960e+09,BX29
2,40795021,TAO RESTAURANT,Manhattan,42,EAST 58 STREET,10022.0,2128882288,Asian/Asian Fusion,03/11/2020,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructed. Unacceptable material used. Non-food contact su...,Not Critical,9,,,03/24/2022,Cycle Inspection / Initial Inspection,40.762786,-73.971486,105.0,4.0,11202.0,1036073.0,1.012930e+09,MN17
3,40795021,TAO RESTAURANT,Manhattan,42,EAST 58 STREET,10022.0,2128882288,Asian/Asian Fusion,03/11/2020,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructed. Unacceptable material used. Non-food contact su...,Not Critical,9,,,03/24/2022,Cycle Inspection / Initial Inspection,40.762786,-73.971486,105.0,4.0,11202.0,1036073.0,1.012930e+09,MN17
4,40538234,"DUNKIN',' BASKIN ROBBINS",Queens,9925,HORACE HARDING EXPRESSWAY,11368.0,7182719222,Donuts,07/29/2021,Violations were cited in the following area(s).,06E,"Sanitized equipment or utensil, including in-use food dispensing utensil, improperly used or sto...",Critical,10,A,07/29/2021,03/24/2022,Cycle Inspection / Initial Inspection,40.736185,-73.858052,404.0,21.0,43702.0,4047915.0,4.019480e+09,QN25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186222,41556790,FIVE GUYS FAMOUS BURGERS AND FRIES,Manhattan,2847,BROADFWAY,,2126787701,Hamburgers,03/01/2019,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructed. Unacceptable material used. Non-food contact su...,Not Critical,3,A,03/01/2019,03/24/2022,Cycle Inspection / Initial Inspection,0.000000,0.000000,,,,,1.000000e+00,
186223,50032876,TANNER SMITH'S,Manhattan,204,WEST 55 STREET,10019.0,9175172283,American,06/07/2019,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or conditions conducive to attracting vermin to the premise...,Not Critical,12,A,06/07/2019,03/24/2022,Cycle Inspection / Initial Inspection,40.764364,-73.981362,105.0,4.0,13700.0,1024857.0,1.010260e+09,MN17
186224,50003842,T- 45,Manhattan,135,WEST 45 STREET,10036.0,6466403775,American,07/26/2018,Violations were cited in the following area(s).,10B,Plumbing not properly installed or maintained; anti-siphonage or backflow prevention device not ...,Not Critical,49,,,03/24/2022,Cycle Inspection / Initial Inspection,40.757244,-73.983815,105.0,4.0,11900.0,1089796.0,1.009980e+09,MN17
186225,41640824,RICURAS ECUADORIAN BAKERY,Bronx,1576,WATSON AVENUE,10472.0,7184508363,Spanish,07/20/2018,Violations were cited in the following area(s).,06F,Wiping cloths soiled or not stored in sanitizing solution.,Critical,12,A,07/20/2018,03/24/2022,Cycle Inspection / Initial Inspection,40.826352,-73.876066,209.0,18.0,5001.0,2023533.0,2.037160e+09,BX55


In [3]:
# How many unique restaurants are in this dataset?
doh['CAMIS'].nunique()

19792

Health code violations found during an inspections carries a point value, and a restaurant’s score corresponds to a letter grade. A lower point score, leads to a better letter grade:

"A" grade: 0 to 13 points for sanitary violations
"B" grade: 14 to 27 points for sanitary violations
"C" grade: 28 or more points for sanitary violations

In [4]:
doh['A'] = doh['SCORE'] < 14
doh['B'] = (doh['SCORE'] > 13) & (doh['SCORE'] < 28)
doh['C'] = doh['SCORE'] > 27

In [5]:
doh

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA,A,B,C
0,50008319,POULETTE,Manhattan,790,9 AVENUE,10019.0,2129569488,Chicken,01/28/2019,Violations were cited in the following area(s).,04D,"Food worker does not wash hands thoroughly after using the toilet, coughing, sneezing, smoking, ...",Critical,22,,,03/24/2022,Cycle Inspection / Initial Inspection,40.765040,-73.987795,104.0,3.0,13300.0,1025235.0,1.010430e+09,MN15,False,True,False
1,41678734,TINY'S DINER,Bronx,3603,RIVERDALE AVENUE,10463.0,7187087600,American,09/23/2019,Violations were cited in the following area(s).,04M,Live roaches present in facility's food and/or non-food areas.,Critical,24,,,03/24/2022,Cycle Inspection / Initial Inspection,40.886691,-73.907056,208.0,11.0,29500.0,2084186.0,2.057960e+09,BX29,False,True,False
2,40795021,TAO RESTAURANT,Manhattan,42,EAST 58 STREET,10022.0,2128882288,Asian/Asian Fusion,03/11/2020,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructed. Unacceptable material used. Non-food contact su...,Not Critical,9,,,03/24/2022,Cycle Inspection / Initial Inspection,40.762786,-73.971486,105.0,4.0,11202.0,1036073.0,1.012930e+09,MN17,True,False,False
3,40795021,TAO RESTAURANT,Manhattan,42,EAST 58 STREET,10022.0,2128882288,Asian/Asian Fusion,03/11/2020,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructed. Unacceptable material used. Non-food contact su...,Not Critical,9,,,03/24/2022,Cycle Inspection / Initial Inspection,40.762786,-73.971486,105.0,4.0,11202.0,1036073.0,1.012930e+09,MN17,True,False,False
4,40538234,"DUNKIN',' BASKIN ROBBINS",Queens,9925,HORACE HARDING EXPRESSWAY,11368.0,7182719222,Donuts,07/29/2021,Violations were cited in the following area(s).,06E,"Sanitized equipment or utensil, including in-use food dispensing utensil, improperly used or sto...",Critical,10,A,07/29/2021,03/24/2022,Cycle Inspection / Initial Inspection,40.736185,-73.858052,404.0,21.0,43702.0,4047915.0,4.019480e+09,QN25,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186222,41556790,FIVE GUYS FAMOUS BURGERS AND FRIES,Manhattan,2847,BROADFWAY,,2126787701,Hamburgers,03/01/2019,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructed. Unacceptable material used. Non-food contact su...,Not Critical,3,A,03/01/2019,03/24/2022,Cycle Inspection / Initial Inspection,0.000000,0.000000,,,,,1.000000e+00,,True,False,False
186223,50032876,TANNER SMITH'S,Manhattan,204,WEST 55 STREET,10019.0,9175172283,American,06/07/2019,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or conditions conducive to attracting vermin to the premise...,Not Critical,12,A,06/07/2019,03/24/2022,Cycle Inspection / Initial Inspection,40.764364,-73.981362,105.0,4.0,13700.0,1024857.0,1.010260e+09,MN17,True,False,False
186224,50003842,T- 45,Manhattan,135,WEST 45 STREET,10036.0,6466403775,American,07/26/2018,Violations were cited in the following area(s).,10B,Plumbing not properly installed or maintained; anti-siphonage or backflow prevention device not ...,Not Critical,49,,,03/24/2022,Cycle Inspection / Initial Inspection,40.757244,-73.983815,105.0,4.0,11900.0,1089796.0,1.009980e+09,MN17,False,False,True
186225,41640824,RICURAS ECUADORIAN BAKERY,Bronx,1576,WATSON AVENUE,10472.0,7184508363,Spanish,07/20/2018,Violations were cited in the following area(s).,06F,Wiping cloths soiled or not stored in sanitizing solution.,Critical,12,A,07/20/2018,03/24/2022,Cycle Inspection / Initial Inspection,40.826352,-73.876066,209.0,18.0,5001.0,2023533.0,2.037160e+09,BX55,True,False,False


In [30]:
doh['PHONE'] = '+1'+doh['PHONE']

In [33]:
top_doh = doh.head()
top_doh['PHONE'].iloc[1]

'+17187087600'

In [14]:
# pass_fail = doh.groupby(['CAMIS'])['PASS', 'FAIL'].sum()

In [15]:
# pass_fail[pass_fail['FAIL'] > 0]

Of the 19,792 unique restaurants, 12,221 did not pass the intitial inspections at least once.

In [21]:
doh_grades = doh.groupby(['CAMIS'])['PHONE','BORO','BUILDING','STREET','ZIPCODE',
                                    'PHONE','CUISINE DESCRIPTION',
                                    'Latitude','Longitude','Community Board',
                                    'Council District','Census Tract',
                                    'A', 'B','C'].sum()

In [22]:
doh_grades

Unnamed: 0_level_0,ZIPCODE,Latitude,Longitude,Community Board,Council District,Census Tract,A,B,C
CAMIS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
30075445,83696.0,326.785850,-590.847775,1688.0,104.0,201600.0,1,7,0
30112340,56125.0,203.313258,-369.810407,1545.0,200.0,163500.0,5,0,0
30191841,50095.0,203.836629,-369.921552,520.0,15.0,69500.0,5,0,0
40356018,44896.0,162.319681,-295.928361,1252.0,188.0,139200.0,4,0,0
40356483,78638.0,284.340781,-517.348926,2226.0,322.0,490000.0,3,4,0
...,...,...,...,...,...,...,...,...,...
50115169,10002.0,40.714841,-73.991700,103.0,1.0,1600.0,1,0,0
50116155,33699.0,122.051758,-221.752145,948.0,123.0,111900.0,3,0,0
50117350,30054.0,122.257596,-221.968545,315.0,9.0,32700.0,3,0,0
50117434,79604.0,285.289075,-517.196049,2821.0,175.0,228900.0,0,0,7


In [18]:
doh_grades[doh_grades['B'] > 0]

Unnamed: 0_level_0,ZIPCODE,Latitude,Longitude,Community Board,Council District,Census Tract,A,B,C
CAMIS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
30075445,83696.0,326.785850,-590.847775,1688.0,104.0,201600.0,1,7,0
40356483,101106.0,365.581004,-665.162905,2862.0,414.0,630000.0,5,4,0
40362264,70175.0,285.548344,-517.773967,749.0,42.0,126700.0,3,4,0
40362274,70084.0,285.080205,-517.982347,714.0,7.0,38507.0,1,6,0
40362715,230115.0,936.235858,-1702.198016,2323.0,23.0,16100.0,7,11,5
...,...,...,...,...,...,...,...,...,...
50111805,30042.0,122.195939,-222.011203,306.0,9.0,20100.0,0,3,0
50113198,40008.0,162.854183,-295.984590,412.0,4.0,3200.0,0,4,0
50113951,41820.0,163.249463,-295.613108,808.0,32.0,33200.0,0,4,0
50113984,30081.0,122.429129,-221.880270,327.0,21.0,60900.0,0,3,0


Of the 19,792 unique restaurants, 9,978 failed an initial cycle inspection at least once.

In [19]:
doh.duplicated().sum()

11502

In [20]:
doh.drop_duplicates(inplace=True)

In [None]:
doh.shape

In [None]:
doh['SCORE'].isna().sum()

In [None]:
doh['SCORE'].hist(bins=113, figsize=(12,8));

In [None]:
doh['SCORE'].value_counts()

In [None]:
doh['SCORE'].mean()

In [None]:
doh['SCORE'].median()

In [None]:
doh['SCORE'].mode()

In [None]:
doh['GRADE'].isna().sum()

In [None]:
doh['GRADE'].value_counts()

In [None]:
doh.info()

In [None]:
doh['INSPECTION DATE'] =  pd.to_datetime(doh['INSPECTION DATE'])

In [None]:
# doh.pivot(columns=['CAMIS','INSPECTION DATE'],values='SCORE')

The New York Health Department inspects the approximately 27,000 restaurants within the city to monitor their compliance with food safety regulations. Inspectors observe how food is prepared, served and stored and whether restaurant workers are practicing good hygiene. They check food temperatures, equipment maintenance and pest control measures.

Calling API

In [None]:
dohmh_df['score'].hist(bins='auto', figsize=(12,8));

In [None]:
# # Take a look at duplicated records
# duplicates = df1.duplicated(subset=['camis'], keep=False)
# df1.loc[duplicates.loc[duplicates==True].index].sort_values(by='camis')

In [None]:
# # Unauthenticated client only works with public data sets. Note 'None'
# # in place of application token, and no username or password:
# client = Socrata("data.cityofnewyork.us", None)



# # Example authenticated client (needed for non-public datasets):
# client = Socrata(data.cityofnewyork.us,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")

# # First 2000 results, returned as JSON from API / converted to Python list of
# # dictionaries by sodapy.
# results = client.get("43nn-pn8j", limit=2000)

# # Convert to pandas DataFrame
# dohmh_df = pd.DataFrame.from_records(results)

## Yelp API

In [8]:
with open('/Users/Rob/.secret/yelp_api.json') as f:
    creds = json.load(f)

In [9]:
creds.keys()

dict_keys(['api_key'])

In [10]:
#Business Search      
# url = 'https://api.yelp.com/v3/businesses/search'

#Business Match       
# url = 'https://api.yelp.com/v3/businesses/matches'

#Phone Search         
url = 'https://api.yelp.com/v3/businesses/search/phone'

#Business Details     
# url = 'https://api.yelp.com/v3/businesses/{id}'

#Business Reviews     
# url = 'https://api.yelp.com/v3/businesses/{id}/reviews'

In [None]:
# #Business Search  
# headers = {
#     'Authorization': 'Bearer ' + creds['api_key']
#           }
# location = 'New York NY'
# SEARCH_LIMIT = 50

# url_params = {
#     'limit': SEARCH_LIMIT,
#     'location' : location.replace(' ','+'),
#     'offset': 0
#              }


# response = requests.get(url, headers=headers, params=url_params)
# print(response.status_code)

In [26]:
phones = list(top_doh['PHONE'])

['+2129569488', '+7187087600', '+2128882288', '+2128882288', '+7182719222']

In [34]:
#Phone Search  
headers = {
    'Authorization': 'Bearer ' + creds['api_key']
          }

url_params = {
    'phone': top_doh['PHONE'].iloc[1]
             }


response = requests.get(url, headers=headers, params=url_params)
print(response.status_code)

200


In [35]:
response_json = response.json()
# View the keys
response_json.keys()

dict_keys(['businesses', 'total'])

In [47]:
# Functionize the Yelp API call
def get_results(url='https://api.yelp.com/v3/businesses/search/phone',
                cred=None,fpath='/Users/Rob/.secret/yelp_api.json',phones=None):
    
    
    if cred is None:
    
        with open(fpath) as f:
            cred = json.load(f)

    headers = {
        'Authorization': 'Bearer ' + cred['api_key']
    }

    url_params = {
        'phone': phones
    }


    response = requests.get(url, headers=headers, params=url_params)
    return response.json()

In [50]:
response = get_results(phones)

NameError: name 'phones' is not defined

In [49]:
response['total']

KeyError: 'total'

In [None]:
total = response['total']

Yelp has information for 25,800 businesses in NYC.

In [39]:
# Retrieve the value from response_json
businesses = response_json.get('businesses',{})
# View the first 2 records
businesses

[{'id': 'k9M1t_n2MnutuEpavf1ezQ',
  'alias': 'tinys-diner-bronx',
  'name': "Tiny's Diner",
  'image_url': 'https://s3-media0.fl.yelpcdn.com/bphoto/XX8BU3GnD8HAmIeT4lMIZA/o.jpg',
  'is_closed': False,
  'url': 'https://www.yelp.com/biz/tinys-diner-bronx?adjust_creative=82uXkAt1Tiw7u9_h33zr1A&utm_campaign=yelp_api_v3&utm_medium=api_v3_phone_search&utm_source=82uXkAt1Tiw7u9_h33zr1A',
  'review_count': 101,
  'categories': [{'alias': 'diners', 'title': 'Diners'}],
  'rating': 3.0,
  'coordinates': {'latitude': 40.8867411222787,
   'longitude': -73.9072045235889},
  'transactions': ['pickup', 'delivery'],
  'price': '$$',
  'location': {'address1': '3603 Riverdale Ave',
   'address2': '',
   'address3': '',
   'city': 'Bronx',
   'zip_code': '10463',
   'country': 'US',
   'state': 'NY',
   'display_address': ['3603 Riverdale Ave', 'Bronx, NY 10463']},
  'phone': '+17187087600',
  'display_phone': '(718) 708-7600'}]

In [None]:
type(businesses)

In [None]:
# def prepare_data(data_list):
#     """
#     This function takes in a list of dictionaries and prepares it
#     for analysis
#     """
    
#     # Make a new list to hold results
#     results = []
    
#     for business_data in data_list:
    
#         # Make a new dictionary to hold prepared data for this business
#         prepared_data = {}
        
#         # Extract name, review_count, rating, and price key-value pairs
#         # from business_data and add to prepared_data
#         # If a key is not present in business_data, add it to prepared_data
#         # with an associated value of None
#         for key in ("name", "review_count", "rating", "price"):
#             prepared_data[key] = business_data.get(key, None)
    
#         # Parse and add latitude and longitude columns
#         coordinates = business_data["coordinates"]
#         prepared_data["latitude"] = coordinates["latitude"]
#         prepared_data["longitude"] = coordinates["longitude"]
        
#         # Add to list if all values are present
#         if all(prepared_data.values()):
#             results.append(prepared_data)
    
#     return results
    
# # Test out function
# prepared_businesses = prepare_data(businesses)
# prepared_businesses[:5]

### Loop Through Pagination

In [None]:
def get_offsets(total):
    """
    Get a list of offsets needed to get all pages
    of data up until the total
    """
    return list(range(0, total, 50))

In [None]:
url_params

In [None]:
# full_dataset = []

# for offset in get_offsets(total):
    
#     url_params['offset'] = offset
    
#     response = requests.get(url, headers=headers, params=url_params)
    
#     time.sleep(1)
    
#     response_json = response.json()
    
#     businesses = response_json.get('businesses')
# #    
#     prepared_business = prepare_data(businesses)
    
#     full_dataset.extend(prepared_businesses)

# len(full_dataset)

In [None]:
# Loop to retrieve all the results from the original request for all NYC businesses
offset=0
response = get_results('New York,NY',offset=offset)

n_per_page = len(response['businesses'])
total_result = response['total']

results = response_json.get('businesses',{})

while offset < total_result:
    offset+=n_per_page
    res = get_results('New York,NY',offset=offset)
    results.extend(res['businesses'])
results[:5]

In [None]:
res

In [None]:
len(results)

In [None]:
df = pd.DataFrame(results)
df.to_csv('yelp_results2.csv',index=False)
df = pd.read_csv('yelp_results2.csv')
df

In [None]:
# response.json().keys()

In [None]:
# yelp_df = pd.DataFrame(response.json()['businesses'])
# # yelp_df.head(3)

In [None]:
# lat = []
# long = []

# for _,business in yelp_df.iterrows():
#     lat.append(business['coordinates']['latitude'])
#     long.append(business['coordinates']['longitude'])

# yelp_df['lat'] = lat
# yelp_df['long'] = long

# Data Understanding

For this project there will be two sources and types of data used:

* Historical health and hygiene inspections recorded by New York City Department of Health and Mental Hygiene (DOHMH) public health inspectors
* User generated Yelp business ratings and reviews

## Understanding NYC DOHMH Data

This dataset contains over 330,000 records, let's explore its contents.

In [None]:
dohmh_df['inspection_date'] =  pd.to_datetime(dohmh_inspections['inspection_date'])

In [None]:
dohmh_df['inspection_date'].min()

In [None]:
dohmh_df['inspection_date'].max()

Inspections in this dataset range from May 2008 up to present day.

In [None]:
# dohmh_df.set_index('inspection_date')

In [None]:
dohmh_df['boro'].value_counts()

In [None]:
dohmh_df['score'].value_counts()

In [None]:
dohmh_df['score'].isna().sum()

In [None]:
dohmh_df['grade'].value_counts()

In [None]:
dohmh_df['grade'].value_counts(normalize=True)

In [None]:
dohmh_df['grade'].isna().sum()

In [None]:
dohmh_df['critical_flag'].value_counts()

Critical violations are those most likely to contribute to foodborne illness

In [None]:
dohmh_df['critical_flag'].value_counts(normalize=True)

In [None]:
dohmh_df['inspection_type'].value_counts()

In [None]:
dohmh_inspections = dohmh_df[(dohmh_df['inspection_type'] ==
                              'Cycle Inspection / Initial Inspection') | (dohmh_df['inspection_type'] ==
                              'Cycle Inspection / Re-inspection')]

In [None]:
dohmh_inspections['critical_flag'].value_counts()

In [None]:
# Drop records with NA critical_flag

dohmh_ohe = dohmh_inspections[dohmh_inspections['critical_flag'] != 'Not Applicable']

In [None]:
dohmh_ohe = pd.get_dummies(dohmh_inspections,columns=['critical_flag'])

In [None]:
dohmh_ohe.head(2)

In [None]:
critical_flags = dohmh_ohe.groupby(['camis'])['critical_flag_Critical'].count()

In [None]:
critical_flags[critical_flagss]

In [None]:
critical_flags['critical_flag_Not Critical'].sum()

In [None]:
non_critical_flags = dohmh_ohe.groupby(['camis'])

In [None]:
dohmh_inspections.duplicated().sum()

In [None]:
dohmh_inspections.loc[dohmh_inspections.duplicated(keep='first'),:]

In [None]:
dohmh_inspections.drop_duplicates().shape

In [None]:
dohmh_inspections.duplicated(subset=['camis']).sum()

In [None]:
dohmh_inspections['camis'].nunique()

There are approximately 28,000 restaurants that have been inspected by the DOHMH. Let's check out how many have ever been flagged for a critical violation.

## Mapping with Follium