In [1]:
import os
import warnings

from IPython import display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# pandas display limits: up to 100 characters per cell, up to 50 columns.
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = 50


In [29]:
import requests
import json

In [36]:
from sodapy import Socrata

In [31]:
# Yelp business-search endpoint
url = 'https://api.yelp.com/v3/businesses/search'

In [32]:
# Bare GET: no Authorization header and no query parameters are supplied here.
business_search = requests.get(url)

In [33]:
# Check which class requests.get returned
type(business_search)

requests.models.Response

In [34]:
# HTTP status code returned for the request above
business_search.status_code

400

In [None]:
TABLE OF CONTENTS
Click to jump to matching Markdown Header.


Introduction
OBTAIN
SCRUB
EXPLORE
MODEL
INTERPRET
Conclusions/Recommendations


# Introduction

According to the Centers for Disease Control, more than 48 million Americans per year become sick from food, and an estimated 75% of the outbreaks came from food prepared by caterers, delis, and restaurants. In most cities, health inspections are generally random, which can increase the time spent on spot checks at clean restaurants that have been following the rules closely — and lead to missed opportunities to improve health and hygiene at places with more pressing food safety issues.

The goal for this project is to leverage public, citizen-generated data from social media to narrow the search for critical health and safety violations in New York City. As the City of New York manages an open data portal, everyone can access historical hygiene inspections and violation records. By combining these two data sources, this project aims to determine which words, phrases, ratings, and patterns among restaurants lead to critical health and safety violations. This model can help city health inspectors do their jobs better by prioritizing the kitchens most likely to be in violation of code.

# Obtain

This project requires data pulled from two different sources, the City of New York and Yelp. To obtain the data, we will query each source's API.

## NYC Open Data API

In [39]:
# Connect anonymously: passing None as the application token (and omitting
# username/password) is sufficient for public datasets.
client = Socrata("data.cityofnewyork.us", app_token=None)

# For non-public datasets, authenticate instead, for example:
# client = Socrata("data.cityofnewyork.us",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# Request the first 2000 results of dataset "43nn-pn8j"; sodapy converts the
# JSON returned by the API into a Python list of dictionaries.
results = client.get("43nn-pn8j", limit=2000)

# Build a pandas DataFrame from those records
dohmh_df = pd.DataFrame.from_records(data=results)



In [40]:
# Render the DataFrame built from the API records (pandas abbreviates long frames)
dohmh_df

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,violation_code,violation_description,critical_flag,score,grade,grade_date,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
0,41707327,DRAGON STATE KITCHEN,Bronx,4124A,WHITE PLAINS ROAD,10466,7185152329,Chinese,2018-01-23T00:00:00.000,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or conditions conducive to attracting vermin to the premise...,Not Critical,9,A,2018-01-23T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Re-inspection,40.891433609009,-73.858445915336,212,12,042200,2063647,2048440035,BX44
1,50043322,THE CRUS-Z FAMILY RESTAURANT,Queens,8709,ROOSEVELT AVE,11372,3473372628,Mexican,2018-09-11T00:00:00.000,Violations were cited in the following area(s).,02G,Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) exce...,Critical,49,,,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.748143103699,-73.879106766814,403,21,027900,4036273,4014750042,QN28
2,50000613,SUSHI DOJO,Manhattan,110,1 AVENUE,10009,6466929398,Japanese,2022-03-16T00:00:00.000,Violations were cited in the following area(s).,06C,"Food not protected from potential source of contamination during storage, preparation, transport...",Critical,31,,,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.726725405316,-73.985903892131,103,02,003200,1005773,1004340006,MN22
3,50073108,AREPALICIOUS,Queens,13720,CROSSBAY BLVD,11417,7184806049,Bakery Products/Desserts,2019-05-01T00:00:00.000,Violations were cited in the following area(s).,06D,"Food contact surface not properly washed, rinsed and sanitized after each use and following any ...",Critical,11,A,2019-05-01T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.671360998626,-73.843006327716,410,32,005800,4458765,4114090010,QN56
4,41326216,NONNA'S OLD FASHIONED PIZZERIA,Staten Island,27,BROWER COURT,10308,7182278844,Pizza,2019-03-19T00:00:00.000,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140º F.,Critical,12,A,2019-03-19T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.550965887755,-74.151186174889,503,51,014604,5070227,5054330010,SI54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,50099170,OMAR'S,Manhattan,302,BROOME STREET,10002,2122035552,French,2020-03-05T00:00:00.000,Violations were cited in the following area(s).,02G,Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) exce...,Critical,11,N,,2022-03-21T06:00:38.000,Pre-permit (Operational) / Initial Inspection,40.718727756026,-73.992211458718,103,01,001800,1066637,1004197501,MN27
1996,50099170,OMAR'S,Manhattan,302,BROOME STREET,10002,2122035552,French,2020-03-05T00:00:00.000,Violations were cited in the following area(s).,02G,Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) exce...,Critical,11,N,,2022-03-21T06:00:38.000,Pre-permit (Operational) / Initial Inspection,40.718727756026,-73.992211458718,103,01,001800,1066637,1004197501,MN27
1997,41225718,ROMAN'S,Brooklyn,243,DEKALB AVENUE,11205,7186225300,American,2018-11-16T00:00:00.000,Violations were cited in the following area(s).,04N,Filth flies or food/refuse/sewage-associated (FRSA) flies present in facility’s food and/or non-...,Critical,12,A,2018-11-16T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Re-inspection,40.689483919583,-73.969497792791,302,35,018300,3058587,3020920066,BK68
1998,40546082,DON PEPI PIZZA,Manhattan,000,PENN STATION,10121,2129674385,Pizza,2019-03-05T00:00:00.000,No violations were recorded at the time of this inspection.,,"Single service item reused, improperly stored, dispensed; not used when required.",Not Critical,0,,,2022-03-21T06:00:44.000,Cycle Inspection / Initial Inspection,40.750059234828,-73.992106724456,105,03,010100,1083026,1007810002,MN17


## Yelp API

In [None]:
# Fetch the reviews of a single Yelp business and pretty-print the JSON reply.
#
# Endpoints for reference:
#Business Search      URL -- 'https://api.yelp.com/v3/businesses/search'
#Business Match       URL -- 'https://api.yelp.com/v3/businesses/matches'
#Phone Search         URL -- 'https://api.yelp.com/v3/businesses/search/phone'

#Business Details     URL -- 'https://api.yelp.com/v3/businesses/{id}'
#Business Reviews     URL -- 'https://api.yelp.com/v3/businesses/{id}/reviews'


# Define a business ID
business_id = '4AErMBEoNzbk7Q8g45kKaQ'
unix_time = 1546047836  # not referenced anywhere in this cell

# Define my API Key, My Endpoint, and My Header.
# The key is read from the YELP_API_KEY environment variable so it never has
# to be pasted into the notebook; the placeholder is kept as the fallback.
API_KEY = os.environ.get('YELP_API_KEY', 'YOUR API KEY')
ENDPOINT = 'https://api.yelp.com/v3/businesses/{}/reviews'.format(business_id)
HEADERS = {'Authorization': 'bearer %s' % API_KEY}

# Define my parameters of the search.
# The reviews endpoint identifies the business through the URL path, so no
# query parameters are sent. PARAMETERS must still be defined because the
# request below references it (it was previously undefined -> NameError).
PARAMETERS = {}

# BUSINESS SEARCH PARAMETERS - EXAMPLE
#PARAMETERS = {'term': 'food',
#              'limit': 50,
#              'offset': 50,
#              'radius': 10000,
#              'location': 'San Diego'}

# BUSINESS MATCH PARAMETERS - EXAMPLE
#PARAMETERS = {'name': 'Peets Coffee & Tea',
#              'address1': '7845 Highland Village Pl',
#              'city': 'San Diego',
#              'state': 'CA',
#              'country': 'US'}

# Make a request to the Yelp API.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url = ENDPOINT,
                        params = PARAMETERS,
                        headers = HEADERS,
                        timeout = 10)

# Convert the JSON string into Python objects
business_data = response.json()

# print the response
print(json.dumps(business_data, indent = 3))

# Data Understanding

For this project there will be two sources and types of data used:

* Historical health and hygiene inspections recorded by New York City Department of Health and Mental Hygiene (DOHMH) public health inspectors
* User-generated Yelp business ratings and reviews

The full DOHMH dataset contains 337,943 records; the sample loaded above holds only the first 2,000 of them. Let's explore further.

In [42]:
# Number of records per borough
dohmh_df['boro'].value_counts()

Manhattan        802
Brooklyn         467
Queens           445
Bronx            216
Staten Island     70
Name: boro, dtype: int64

In [43]:
# Number of records per inspection action
dohmh_df['action'].value_counts()

Violations were cited in the following area(s).                                                                                       1852
Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addressed.      85
No violations were recorded at the time of this inspection.                                                                             31
Establishment re-opened by DOHMH.                                                                                                       18
Name: action, dtype: int64

In [45]:
# Number of records per critical_flag value
dohmh_df['critical_flag'].value_counts()

Critical          1078
Not Critical       897
Not Applicable      25
Name: critical_flag, dtype: int64

In [46]:
# Share of records per critical_flag value (normalize=True returns fractions)
dohmh_df['critical_flag'].value_counts(normalize=True)

Critical          0.5390
Not Critical      0.4485
Not Applicable    0.0125
Name: critical_flag, dtype: float64

In [47]:
# Share of records per inspection type. value_counts skips missing values by
# default, so the fractions are relative to the non-null rows only.
dohmh_df['inspection_type'].value_counts(normalize=True)

Cycle Inspection / Initial Inspection                0.566465
Cycle Inspection / Re-inspection                     0.234642
Pre-permit (Operational) / Initial Inspection        0.097684
Pre-permit (Operational) / Re-inspection             0.037764
Administrative Miscellaneous / Initial Inspection    0.015609
Cycle Inspection / Reopening Inspection              0.008056
Pre-permit (Non-operational) / Initial Inspection    0.007553
Smoke-Free Air Act / Initial Inspection              0.007049
Administrative Miscellaneous / Re-inspection         0.005035
Trans Fat / Initial Inspection                       0.004532
Pre-permit (Operational) / Compliance Inspection     0.004532
Cycle Inspection / Compliance Inspection             0.003021
Inter-Agency Task Force / Initial Inspection         0.002518
Pre-permit (Operational) / Reopening Inspection      0.001511
Smoke-Free Air Act / Re-inspection                   0.001007
Trans Fat / Re-inspection                            0.001007
Calorie 

In [None]:
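# # Python program to convert
# # JSON file to CSV
# # NOTE(review): inactive scratch code kept for reference. As written it would
# # not run if uncommented: the body of the `with` block is not indented, and
# # the file is loaded into `yelp_data` while the next statement reads `data`.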


# import json
# import csv


# # Opening JSON file and loading the data
# # into the variable data
# with open('data/yelp_academic_dataset_review.json') as json_file:
# yelp_data = json.load(json_file)

# employee_data = data['emp_details']

# # now we will open a file for writing
# data_file = open('data_file.csv', 'w')

# # create the csv writer object
# csv_writer = csv.writer(data_file)

# # Counter variable used for writing
# # headers to the CSV file
# count = 0

# for emp in employee_data:
# 	if count == 0:

# 		# Writing headers of CSV file
# 		header = emp.keys()
# 		csv_writer.writerow(header)
# 		count += 1

# 	# Writing data of CSV file
# 	csv_writer.writerow(emp.values())

# data_file.close()
