In [3]:
from IPython import display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth',100)
pd.set_option('display.max_columns',50)


In [16]:
import requests
import json

In [36]:
from sodapy import Socrata

In [31]:
url = 'https://api.yelp.com/v3/businesses/search'

In [32]:
business_search = requests.get(url)

In [33]:
type(business_search)

requests.models.Response

In [34]:
business_search.status_code

400

In [None]:
TABLE OF CONTENTS
Click to jump to matching Markdown Header.


Introduction
OBTAIN
SCRUB
EXPLORE
MODEL
iNTERPRET
Conclusions/Recommendations


# Introduction

According to the Centers for Disease Control, more than 48 million Americans per year become sick from food, and an estimated 75% of the outbreaks came from food prepared by caterers, delis, and restaurants. In most cities, health inspections are generally random, which can increase time spent on spot checks at clean restaurants that have been following the rules closely — and missed opportunities to improve health and hygiene at places with more pressing food safety issues.

The goal for this project is to leverage public citizen generated data from social media to narrow the search for critical health and safety violations in New York City. As the City of New York manages  an open data portal, everyone can access historical hygiene inspections and violation records. By combine these two data source this project aims to determine which words, phrases, ratings, and patterns among restaurants lead to critical health and safety violations. This model can assist city health inspectors do their job better by prioritizing the kitchens most likely to be in violation of code.

# Obtain

This project requires data pulled from two different sources, the City of New York and Yelp. To obtain the data we will call the API keys.

## NYC Open Data API

Calling API

In [24]:
url = 'https://data.cityofnewyork.us/resource/43nn-pn8j.json?$limit=40'

In [25]:
nyc_response = requests.get(url)

In [26]:
nyc_response.ok

True

In [31]:
nyc_data = nyc_response.json()
nyc_data

[{'camis': '41707327',
  'dba': 'DRAGON STATE KITCHEN',
  'boro': 'Bronx',
  'building': '4124A',
  'street': 'WHITE PLAINS ROAD',
  'zipcode': '10466',
  'phone': '7185152329',
  'cuisine_description': 'Chinese',
  'inspection_date': '2018-01-23T00:00:00.000',
  'action': 'Violations were cited in the following area(s).',
  'violation_code': '08A',
  'violation_description': 'Facility not vermin proof. Harborage or conditions conducive to attracting vermin to the premises and/or allowing vermin to exist.',
  'critical_flag': 'Not Critical',
  'score': '9',
  'grade': 'A',
  'grade_date': '2018-01-23T00:00:00.000',
  'record_date': '2022-03-21T06:00:38.000',
  'inspection_type': 'Cycle Inspection / Re-inspection',
  'latitude': '40.891433609009',
  'longitude': '-73.858445915336',
  'community_board': '212',
  'council_district': '12',
  'census_tract': '042200',
  'bin': '2063647',
  'bbl': '2048440035',
  'nta': 'BX44'},
 {'camis': '50043322',
  'dba': 'THE CRUS-Z FAMILY RESTAURANT',

In [36]:
type(nyc_data)

list

In [23]:
df1 = pd.read_csv('https://data.cityofnewyork.us/resource/43nn-pn8j.csv?$limit=400000')

In [13]:
df1

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,violation_code,violation_description,critical_flag,score,grade,grade_date,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
0,41707327,DRAGON STATE KITCHEN,Bronx,4124A,WHITE PLAINS ROAD,10466.0,7185152329,Chinese,2018-01-23T00:00:00.000,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or conditions conducive to attracting vermin to the premise...,Not Critical,9.0,A,2018-01-23T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Re-inspection,40.891434,-73.858446,212.0,12.0,42200.0,2063647.0,2.048440e+09,BX44
1,50043322,THE CRUS-Z FAMILY RESTAURANT,Queens,8709,ROOSEVELT AVE,11372.0,3473372628,Mexican,2018-09-11T00:00:00.000,Violations were cited in the following area(s).,02G,Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) exce...,Critical,49.0,,,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.748143,-73.879107,403.0,21.0,27900.0,4036273.0,4.014750e+09,QN28
2,50000613,SUSHI DOJO,Manhattan,110,1 AVENUE,10009.0,6466929398,Japanese,2022-03-16T00:00:00.000,Violations were cited in the following area(s).,06C,"Food not protected from potential source of contamination during storage, preparation, transport...",Critical,31.0,,,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.726725,-73.985904,103.0,2.0,3200.0,1005773.0,1.004340e+09,MN22
3,50073108,AREPALICIOUS,Queens,13720,CROSSBAY BLVD,11417.0,7184806049,Bakery Products/Desserts,2019-05-01T00:00:00.000,Violations were cited in the following area(s).,06D,"Food contact surface not properly washed, rinsed and sanitized after each use and following any ...",Critical,11.0,A,2019-05-01T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.671361,-73.843006,410.0,32.0,5800.0,4458765.0,4.114090e+09,QN56
4,41326216,NONNA'S OLD FASHIONED PIZZERIA,Staten Island,27,BROWER COURT,10308.0,7182278844,Pizza,2019-03-19T00:00:00.000,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140º F.,Critical,12.0,A,2019-03-19T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.550966,-74.151186,503.0,51.0,14604.0,5070227.0,5.054330e+09,SI54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331517,50045329,SOCIAL TERRACE,Manhattan,570,10 AVENUE,10036.0,9175510811,Bottled Beverages,2018-08-08T00:00:00.000,Violations were cited in the following area(s).,06C,"Food not protected from potential source of contamination during storage, preparation, transport...",Critical,11.0,A,2018-08-08T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.759405,-73.995614,104.0,3.0,11500.0,1088437.0,1.010518e+09,MN15
331518,40711301,CHICKEN FESTIVAL,Queens,29-19,DITMARS BOULEVARD,11105.0,7187289696,Chicken,2019-06-27T00:00:00.000,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructed. Unacceptable material used. Non-food contact su...,Not Critical,27.0,,,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.776718,-73.911525,401.0,22.0,11300.0,4017714.0,4.008460e+09,QN72
331519,50085173,GOSSIP HOUSE,Queens,14746,NORTHERN BLVD,11354.0,7183586262,Korean,2019-07-25T00:00:00.000,Violations were cited in the following area(s).,10J,Hand wash sign not posted,Not Critical,25.0,,,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.765161,-73.819056,407.0,20.0,116300.0,4113530.0,4.050160e+09,QN51
331520,41565544,TANDOOR & CO,Queens,9524,QUEENS BOULEVARD,11374.0,7189976800,Indian,2019-03-11T00:00:00.000,Violations were cited in the following area(s).,04H,"Raw, cooked or prepared food is adulterated, contaminated, cross-contaminated, or not discarded ...",Critical,9.0,A,2019-03-11T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.730592,-73.863671,406.0,29.0,69300.0,4072067.0,4.030800e+09,QN18


In [39]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cityofnewyork.us", None)



# Example authenticated client (needed for non-public datasets):
client = Socrata(data.cityofnewyork.us,
                 MyAppToken,
                 userame="user@example.com",
                 password="AFakePassword")

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("43nn-pn8j", limit=2000)

# Convert to pandas DataFrame
dohmh_df = pd.DataFrame.from_records(results)



In [40]:
dohmh_df

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,violation_code,violation_description,critical_flag,score,grade,grade_date,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
0,41707327,DRAGON STATE KITCHEN,Bronx,4124A,WHITE PLAINS ROAD,10466,7185152329,Chinese,2018-01-23T00:00:00.000,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or conditions conducive to attracting vermin to the premise...,Not Critical,9,A,2018-01-23T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Re-inspection,40.891433609009,-73.858445915336,212,12,042200,2063647,2048440035,BX44
1,50043322,THE CRUS-Z FAMILY RESTAURANT,Queens,8709,ROOSEVELT AVE,11372,3473372628,Mexican,2018-09-11T00:00:00.000,Violations were cited in the following area(s).,02G,Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) exce...,Critical,49,,,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.748143103699,-73.879106766814,403,21,027900,4036273,4014750042,QN28
2,50000613,SUSHI DOJO,Manhattan,110,1 AVENUE,10009,6466929398,Japanese,2022-03-16T00:00:00.000,Violations were cited in the following area(s).,06C,"Food not protected from potential source of contamination during storage, preparation, transport...",Critical,31,,,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.726725405316,-73.985903892131,103,02,003200,1005773,1004340006,MN22
3,50073108,AREPALICIOUS,Queens,13720,CROSSBAY BLVD,11417,7184806049,Bakery Products/Desserts,2019-05-01T00:00:00.000,Violations were cited in the following area(s).,06D,"Food contact surface not properly washed, rinsed and sanitized after each use and following any ...",Critical,11,A,2019-05-01T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.671360998626,-73.843006327716,410,32,005800,4458765,4114090010,QN56
4,41326216,NONNA'S OLD FASHIONED PIZZERIA,Staten Island,27,BROWER COURT,10308,7182278844,Pizza,2019-03-19T00:00:00.000,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140º F.,Critical,12,A,2019-03-19T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Initial Inspection,40.550965887755,-74.151186174889,503,51,014604,5070227,5054330010,SI54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,50099170,OMAR'S,Manhattan,302,BROOME STREET,10002,2122035552,French,2020-03-05T00:00:00.000,Violations were cited in the following area(s).,02G,Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) exce...,Critical,11,N,,2022-03-21T06:00:38.000,Pre-permit (Operational) / Initial Inspection,40.718727756026,-73.992211458718,103,01,001800,1066637,1004197501,MN27
1996,50099170,OMAR'S,Manhattan,302,BROOME STREET,10002,2122035552,French,2020-03-05T00:00:00.000,Violations were cited in the following area(s).,02G,Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) exce...,Critical,11,N,,2022-03-21T06:00:38.000,Pre-permit (Operational) / Initial Inspection,40.718727756026,-73.992211458718,103,01,001800,1066637,1004197501,MN27
1997,41225718,ROMAN'S,Brooklyn,243,DEKALB AVENUE,11205,7186225300,American,2018-11-16T00:00:00.000,Violations were cited in the following area(s).,04N,Filth flies or food/refuse/sewage-associated (FRSA) flies present in facility’s food and/or non-...,Critical,12,A,2018-11-16T00:00:00.000,2022-03-21T06:00:38.000,Cycle Inspection / Re-inspection,40.689483919583,-73.969497792791,302,35,018300,3058587,3020920066,BK68
1998,40546082,DON PEPI PIZZA,Manhattan,000,PENN STATION,10121,2129674385,Pizza,2019-03-05T00:00:00.000,No violations were recorded at the time of this inspection.,,"Single service item reused, improperly stored, dispensed; not used when required.",Not Critical,0,,,2022-03-21T06:00:44.000,Cycle Inspection / Initial Inspection,40.750059234828,-73.992106724456,105,03,010100,1083026,1007810002,MN17


## Yelp API

In [39]:
with open('/Users/Rob/.secret/yelp_api.json') as f:
    creds = json.load(f)

In [40]:
creds.keys()

dict_keys(['api_key'])

In [42]:
#Business Search      
url = 'https://api.yelp.com/v3/businesses/search'
SEARCH_LIMIT = 10
#Business Match       
# url = 'https://api.yelp.com/v3/businesses/matches'

#Phone Search         
# url = 'https://api.yelp.com/v3/businesses/search/phone'

#Business Details     
# url = 'https://api.yelp.com/v3/businesses/{id}'

#Business Reviews     
# url = 'https://api.yelp.com/v3/businesses/{id}/reviews'

In [47]:
headers = {
    'Authorization': 'Bearer ' + creds['api_key']
          }

url_params = {
    'limit': SEARCH_LIMIT,
    'location' : 'NYC',
    'offset': 0
             }


response = requests.get(url, headers=headers, params=url_params)
print(response.status_code)

200


In [62]:
# Functionize the Yelp API call
def get_results(location, SEARCH_LIMIT=20,
                url='https://api.yelp.com/v3/businesses/search',
                cred=None,fpath='/Users/Rob/.secret/yelp_api.json',
                offset=0):
    
    
    if cred is None:
    
        with open(fpath) as f:
            cred = json.load(f)

    headers = {
        'Authorization': 'Bearer ' + cred['api_key']
    }

    url_params = {
        'limit': SEARCH_LIMIT,
        'location' : location,
        'offset': offset
    }


    response = requests.get(url, headers=headers, params=url_params)
    return response.json()

In [63]:
response = get_results('NYC')

In [65]:
n_per_page = len(response['businesses'])

In [67]:
response2 = get_results('NYC',offset=n_per_page)
pd.DataFrame(response2['businesses']).head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,xt4sa64WOrpJvZBDPNPNYg,jacobs-pickles-new-york,Jacob's Pickles,https://s3-media1.fl.yelpcdn.com/bphoto/yU9q4U1WSk6diDF0dcA9ag/o.jpg,False,https://www.yelp.com/biz/jacobs-pickles-new-york?adjust_creative=82uXkAt1Tiw7u9_h33zr1A&utm_camp...,4739,"[{'alias': 'comfortfood', 'title': 'Comfort Food'}, {'alias': 'southern', 'title': 'Southern'}, ...",4.0,"{'latitude': 40.7866504411994, 'longitude': -73.9755284786224}","[delivery, pickup]",$$,"{'address1': '509 Amsterdam Ave', 'address2': '', 'address3': '', 'city': 'New York', 'zip_code'...",12124705566,(212) 470-5566,9166.964738
1,ga6sRtE0l85iftw_5-W84Q,dominique-ansel-bakery-new-york,Dominique Ansel Bakery,https://s3-media3.fl.yelpcdn.com/bphoto/ae6sDhEfUXTE5EKfWSOrCQ/o.jpg,False,https://www.yelp.com/biz/dominique-ansel-bakery-new-york?adjust_creative=82uXkAt1Tiw7u9_h33zr1A&...,4951,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {'alias': 'desserts', 'title': 'Desserts'}]",4.0,"{'latitude': 40.72516, 'longitude': -74.00296}",[delivery],$$,"{'address1': '189 Spring St', 'address2': '', 'address3': '', 'city': 'New York', 'zip_code': '1...",12122192773,(212) 219-2773,2312.769277
2,vk7W3_sQwr7eZbRFsXv6rw,taiyaki-nyc-new-york,Taiyaki NYC,https://s3-media4.fl.yelpcdn.com/bphoto/F3Vyd-otu36oE8B8M1XXug/o.jpg,False,https://www.yelp.com/biz/taiyaki-nyc-new-york?adjust_creative=82uXkAt1Tiw7u9_h33zr1A&utm_campaig...,3089,"[{'alias': 'desserts', 'title': 'Desserts'}, {'alias': 'japanese', 'title': 'Japanese'}, {'alias...",4.5,"{'latitude': 40.71789, 'longitude': -73.9988}","[delivery, pickup]",$,"{'address1': '119 Baxter St', 'address2': '', 'address3': None, 'city': 'New York', 'zip_code': ...",12129662882,(212) 966-2882,1439.725809
3,FEVQpbOPOwAPNIgO7D3xxw,shake-shack-new-york-2,Shake Shack,https://s3-media3.fl.yelpcdn.com/bphoto/5Tnq5PlJ1wkEU5op-yolcg/o.jpg,False,https://www.yelp.com/biz/shake-shack-new-york-2?adjust_creative=82uXkAt1Tiw7u9_h33zr1A&utm_campa...,5655,"[{'alias': 'burgers', 'title': 'Burgers'}, {'alias': 'foodstands', 'title': 'Food Stands'}, {'al...",4.0,"{'latitude': 40.74212, 'longitude': -73.98707}","[delivery, pickup]",$$,"{'address1': 'E 23rd St Madison Ave', 'address2': '', 'address3': 'Madison Square Park', 'city':...",12128896600,(212) 889-6600,3975.513089
4,ysqgdbSrezXgVwER2kQWKA,julianas-brooklyn-3,Juliana's,https://s3-media2.fl.yelpcdn.com/bphoto/clscwgOF9_Ecq-Rwsq7jyQ/o.jpg,False,https://www.yelp.com/biz/julianas-brooklyn-3?adjust_creative=82uXkAt1Tiw7u9_h33zr1A&utm_campaign...,2408,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.5,"{'latitude': 40.70274718768062, 'longitude': -73.99343490196397}",[delivery],$$,"{'address1': '19 Old Fulton St', 'address2': '', 'address3': '', 'city': 'Brooklyn', 'zip_code':...",17185966700,(718) 596-6700,308.569844


### Loop Through Pagination

In [74]:
offset=0
response = get_results('NYC',offset=offset)
n_per_page = len(response['businesses'])
total_result = response['total']

results = response['businesses']

while offset < total_result:
    offset+=n_per_page
    res = get_results('NYC',offset=offset)
    results.extend(res['businesses'])
results[:5]

In [84]:
res

{'error': {'code': 'INTERNAL_ERROR',
  'description': 'Something went wrong internally, please try again later.'}}

In [85]:
while offset < total_result:
    offset+=n_per_page
    returns = get_results('NYC',offset=offset)
    results.extend(returns['businesses'])
results[:5]

KeyError: 'businesses'

In [86]:
returns

{'error': {'code': 'INTERNAL_ERROR',
  'description': 'Something went wrong internally, please try again later.'}}

In [81]:
df_yelp = pd.DataFrame(results)
df_yelp

In [49]:
# response.json().keys()

dict_keys(['businesses', 'total', 'region'])

In [69]:
# yelp_df = pd.DataFrame(response.json()['businesses'])
# # yelp_df.head(3)

In [70]:
# lat = []
# long = []

# for _,business in yelp_df.iterrows():
#     lat.append(business['coordinates']['latitude'])
#     long.append(business['coordinates']['longitude'])

# yelp_df['lat'] = lat
# yelp_df['long'] = long

# Data Understanding

For this project there will be two sources and types of data used:

* Historical health and hygiene inspections recorded by New York City Department of Health and Mental Hygiene (DOHMH) public health inspectors
* User generated Yelp business ratings and reviews

This dataset contains 337943 records. Let's explore further.

In [42]:
dohmh_df['boro'].value_counts()

Manhattan        802
Brooklyn         467
Queens           445
Bronx            216
Staten Island     70
Name: boro, dtype: int64

In [43]:
dohmh_df['action'].value_counts()

Violations were cited in the following area(s).                                                                                       1852
Establishment Closed by DOHMH. Violations were cited in the following area(s) and those requiring immediate action were addressed.      85
No violations were recorded at the time of this inspection.                                                                             31
Establishment re-opened by DOHMH.                                                                                                       18
Name: action, dtype: int64

In [45]:
dohmh_df['critical_flag'].value_counts()

Critical          1078
Not Critical       897
Not Applicable      25
Name: critical_flag, dtype: int64

In [46]:
dohmh_df['critical_flag'].value_counts(normalize=True)

Critical          0.5390
Not Critical      0.4485
Not Applicable    0.0125
Name: critical_flag, dtype: float64

In [47]:
dohmh_df['inspection_type'].value_counts(normalize=True)

Cycle Inspection / Initial Inspection                0.566465
Cycle Inspection / Re-inspection                     0.234642
Pre-permit (Operational) / Initial Inspection        0.097684
Pre-permit (Operational) / Re-inspection             0.037764
Administrative Miscellaneous / Initial Inspection    0.015609
Cycle Inspection / Reopening Inspection              0.008056
Pre-permit (Non-operational) / Initial Inspection    0.007553
Smoke-Free Air Act / Initial Inspection              0.007049
Administrative Miscellaneous / Re-inspection         0.005035
Trans Fat / Initial Inspection                       0.004532
Pre-permit (Operational) / Compliance Inspection     0.004532
Cycle Inspection / Compliance Inspection             0.003021
Inter-Agency Task Force / Initial Inspection         0.002518
Pre-permit (Operational) / Reopening Inspection      0.001511
Smoke-Free Air Act / Re-inspection                   0.001007
Trans Fat / Re-inspection                            0.001007
Calorie 

In [None]:
# # Python program to convert
# # JSON file to CSV


# import json
# import csv


# # Opening JSON file and loading the data
# # into the variable data
# with open('data/yelp_academic_dataset_review.json') as json_file:
# yelp_data = json.load(json_file)

# employee_data = data['emp_details']

# # now we will open a file for writing
# data_file = open('data_file.csv', 'w')

# # create the csv writer object
# csv_writer = csv.writer(data_file)

# # Counter variable used for writing
# # headers to the CSV file
# count = 0

# for emp in employee_data:
# 	if count == 0:

# 		# Writing headers of CSV file
# 		header = emp.keys()
# 		csv_writer.writerow(header)
# 		count += 1

# 	# Writing data of CSV file
# 	csv_writer.writerow(emp.values())

# data_file.close()
