In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
# Import the required Python libraries 

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import requests
import sqlalchemy
from sqlalchemy import create_engine

In [4]:
# Connect to SQL database using 'create_engine' with SQLAlchemy

conn_string_business = 'mysql://{user}:{password}@{host}:{port}/{db}?charset=utf8'.format(
    user='notebook', 
    password='notebook', 
    host = 'dreamteam.conr4khphv8c.us-east-2.rds.amazonaws.com', 
    port=3306, 
    db='nyhealth',
    encoding = 'utf-8'
)

engine = create_engine(conn_string_business)

In [5]:
# Sebastian, Ross, and Ethan created API keys at https://www.yelp.com/developers/ - All keys listed below
# This is a test request to make sure the data Yelp returns is sufficient

rosskey = 'yJnJ8IxCWjMYXqoomIg2ySfDoHSSwpa5dcMzgKe7YNgYfzeF8jA049dzxDd6-EHZqATK2n07KS8ZOML_dbPqpceAA1WGoPpzzrnz_AnMOgFikw4XFVfcoHZabzYpW3Yx'
auth_header = {'Authorization': 'Bearer ' + rosskey}

# For Yelp API Documentation, see https://www.yelp.com/developers/documentation/v3
# We're use the URL below to search by phone number

url = 'https://api.yelp.com/v3/businesses/search/phone'

# By simply passing the restaurant phone number with a "+1" we get a response

parameters = {
    "phone": "+12128658777"
}

# Issue the authenticated request

resp = requests.get(url, headers=auth_header, params=parameters)
testdata = resp.json()

In [6]:
# The response contains three keys
# The main response is the "business", which contains a list of the results
testdata.keys()

dict_keys(['businesses', 'total'])

In [7]:
testdata

{'businesses': [{'alias': 'jimbos-hamburger-palace-new-york-10',
   'categories': [{'alias': 'burgers', 'title': 'Burgers'},
    {'alias': 'sandwiches', 'title': 'Sandwiches'}],
   'coordinates': {'latitude': 40.8135833740234,
    'longitude': -73.9558029174805},
   'display_phone': '(212) 865-8777',
   'id': 'Rx3m7vtf-eWBSQ4AavQl8g',
   'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/b0gAXXUeO8-v70fRCx0nDg/o.jpg',
   'is_closed': False,
   'location': {'address1': '1345 Amsterdam Ave',
    'address2': '',
    'address3': '',
    'city': 'New York',
    'country': 'US',
    'display_address': ['1345 Amsterdam Ave', 'New York, NY 10027'],
    'state': 'NY',
    'zip_code': '10027'},
   'name': "Jimbo's Hamburger Palace",
   'phone': '+12128658777',
   'price': '$',
   'rating': 4.0,
   'review_count': 16,
   'transactions': [],
   'url': 'https://www.yelp.com/biz/jimbos-hamburger-palace-new-york-10?adjust_creative=11VDKWSHlwVqVDIBUer9Kw&utm_campaign=yelp_api_v3&utm_medium=api_v3_p

In [8]:
# We can put the results directly into a dataframe
# The issue is that some columns (e.g location) are composite and
# instead of containing values, the cells contain *dictionaries*
# Panos showed us how to use assorted Pandas methods to clean this up.
df = pd.DataFrame(testdata['businesses'])

# We take the dictionaries in the location and coordinates columns
# and create new dataframes for both of them, with each element of the dictionary
# now being a column
address_df = df.location.apply(pd.Series)
coordinates_df = df.coordinates.apply(pd.Series)

# We remove what we don't need from the original dataframe
df.drop('location', axis='columns', inplace=True)
df.drop('coordinates', axis='columns', inplace=True)
df.drop('url', axis='columns', inplace=True)
df.drop('display_phone', axis='columns', inplace=True)
df.drop('alias', axis='columns', inplace=True)
df.drop('categories', axis='columns', inplace=True)
df.drop('transactions', axis='columns', inplace=True)
df.drop('image_url', axis='columns', inplace=True)

# Put together the original dataframe, the addresses, and the coordinates dataframes
# (We concatenates the 3 dataframes bu putting them all next to each other)
new_df = pd.concat([df, address_df, coordinates_df], axis='columns')
new_df



Unnamed: 0,id,is_closed,name,phone,price,rating,review_count,address1,address2,address3,city,zip_code,country,state,display_address,latitude,longitude
0,Rx3m7vtf-eWBSQ4AavQl8g,False,Jimbo's Hamburger Palace,12128658777,$,4.0,16,1345 Amsterdam Ave,,,New York,10027,US,NY,"[1345 Amsterdam Ave, New York, NY 10027]",40.813583,-73.955803


In [9]:
# Must CONCAT +1 FOR nyhealth PHONE numbers to send Yelp the right request

query = '''
select distinct CONCAT("+1", phone) as phone
from NYHEALTH
where BORO = 'MANHATTAN'
;
'''
phone_numbers = pd.read_sql(query, con=engine)

In [10]:
phone_numbers

Unnamed: 0,phone
0,+12127521495
1,+19172620380
2,+12122576434
3,+16469129061
4,+12129410911
5,+12126775834
6,+16468613403
7,+12122475756
8,+12127772188
9,+12128658777


In [27]:
# DO NOT RUN UNLESS YOU CHANGE THE SUBSECT OTHERWISE WILL CREATE OVER 9000 JSON FILES

sebastiankey = 'mh0lIHxmIVO9hFI0G3bexfV5h-1WLHNqze9Vb14SZaLM4diYfD5NgPR0-noXYkTZAFUyci1F5OYaCTNr68cIZWFINX3JZh4_GVucrLGaawHHbmQZkQiYVdibIL0mW3Yx'
rosskey = 'yJnJ8IxCWjMYXqoomIg2ySfDoHSSwpa5dcMzgKe7YNgYfzeF8jA049dzxDd6-EHZqATK2n07KS8ZOML_dbPqpceAA1WGoPpzzrnz_AnMOgFikw4XFVfcoHZabzYpW3Yx'
ethankey = '57qiAnSnRWl08a84pPm-ftJqb_WseqUphYrxLqOUxURXGYfHkU3WqDVr10_1ppi2G4wXyLGgR6RKIZklEOJpiSjgNWhJJXNuZWZhA_dH6083WJDOS2UH2n9DpDUpW3Yx'

auth_header = {'Authorization': 'Bearer ' + sebastiankey}
url = 'https://api.yelp.com/v3/businesses/search/phone'
i=0
for i in range(len(phone_numbers)):
    parameter = {'phone': str(phone_numbers['phone'][i])}
    resp = requests.get(url, headers=auth_header, params=parameter)
    data = resp.json()
    n = str(i)+'.json'
    f = open(n, "w")
    json.dump(data, f)
    i+1
    
    if i == 4990:
        break

# Duplicating the function above to run again with a different API key.
# Yelp has a daily limit of 5000 API requests. To download all of the files we need in one day, need two keys.        

auth_header = {'Authorization': 'Bearer ' + ethankey}
url = 'https://api.yelp.com/v3/businesses/search/phone'
i=4990
for i in range(i, len(phone_numbers)):
    parameter = {'phone': str(phone_numbers['phone'][i])}
    resp = requests.get(url, headers=auth_header, params=parameter)
    data = resp.json()
    n = str(i)+'.json'
    f = open(n, "w")
    json.dump(data, f)
    i+1
    
    if i == 9937:
        break

In [11]:
# Testing loop above with 10 phone numbers

ten_test = phone_numbers.head(10)

In [12]:
# Testing loop above with 10 phone numbers, remember, arrays start at 0!

ten_test

Unnamed: 0,phone
0,12127521495
1,19172620380
2,12122576434
3,16469129061
4,12129410911
5,12126775834
6,16468613403
7,12122475756
8,12127772188
9,12128658777


In [None]:
# Put JSON files into a single DataFrame and append each file to the end of the frame
# Deletes the JSON file once it has been processed

i=0
for i in range(len(phone_numbers)):
    data = json.load(open(str(i)+".json"))
    if i == 0:
        appended = pd.DataFrame(data['businesses'])
        os.remove(str(i)+".json")
        i+1
    else:
        appended = pd.DataFrame(data['businesses']).append(appended, sort=False)
        os.remove(str(i)+".json")
        i+1

In [35]:
# Final DataFrame with all JSON Files

appended

Unnamed: 0,alias,categories,coordinates,display_phone,id,image_url,is_closed,location,name,phone,price,rating,review_count,transactions,url
0,bleecker-heights-tavern-new-york,"[{'alias': 'sportsbars', 'title': 'Sports Bars'}]","{'latitude': 40.7323112487793, 'longitude': -7...",(212) 675-6157,bgv_tR6ZJjw3js44zvcJUg,https://s3-media1.fl.yelpcdn.com/bphoto/u6X8oi...,True,"{'address1': '296 Bleecker St', 'address2': ''...",Bleecker Heights Tavern,+12126756157,$$,3.5,58,[],https://www.yelp.com/biz/bleecker-heights-tave...
1,the-garret-new-york,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...","{'latitude': 40.73238, 'longitude': -74.0038457}",(212) 675-6157,dJoWkpVxWcvdqNRo7w622w,https://s3-media4.fl.yelpcdn.com/bphoto/wvfw7O...,False,"{'address1': '296 Bleecker St', 'address2': ''...",The Garret,+12126756157,$$,3.5,195,[],https://www.yelp.com/biz/the-garret-new-york?a...
0,voyager-espresso-new-york,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...","{'latitude': 40.708897, 'longitude': -74.006676}",(212) 227-2744,qzdCvtgpCDjNVV_zRJB5EQ,https://s3-media3.fl.yelpcdn.com/bphoto/f5U1Zl...,False,"{'address1': '110 William St', 'address2': Non...",Voyager Espresso,+12122272744,$,5.0,87,[],https://www.yelp.com/biz/voyager-espresso-new-...
1,amanzi-tea-new-york,"[{'alias': 'coffee', 'title': 'Coffee & Tea'}]","{'latitude': 40.7159805, 'longitude': -74.0105...",(212) 227-2744,EPJjEMdlsGtEridFDqvSFg,https://s3-media2.fl.yelpcdn.com/bphoto/EhzM8L...,True,"{'address1': '166 Chambers St', 'address2': ''...",Amanzi Tea,+12122272744,$,4.0,9,[],https://www.yelp.com/biz/amanzi-tea-new-york?a...
0,fu-on-kitchen-new-york,"[{'alias': 'restaurants', 'title': 'Restaurant...","{'latitude': 40.81914, 'longitude': -73.94038}",(212) 862-5660,JAjbYabOKnPaZyKLcNjFAA,,False,"{'address1': '2419 Adam Clayton Powell', 'addr...",Fu On Kitchen,+12128625660,,3.0,2,[],https://www.yelp.com/biz/fu-on-kitchen-new-yor...
0,cafe-water-new-york,"[{'alias': 'delis', 'title': 'Delis'}, {'alias...","{'latitude': 40.7056344, 'longitude': -74.0068...",(212) 785-1320,aoYDuvipcT_XGAM5ckFJnA,https://s3-media2.fl.yelpcdn.com/bphoto/TNboxc...,False,"{'address1': '130 Water St', 'address2': '', '...",Cafe Water,+12127851320,$$,3.0,68,"[delivery, pickup]",https://www.yelp.com/biz/cafe-water-new-york?a...
0,b-side-new-york,"[{'alias': 'divebars', 'title': 'Dive Bars'}]","{'latitude': 40.728454, 'longitude': -73.978988}",(212) 475-4600,WvzuSPayCKimxkjWYQHg0Q,https://s3-media1.fl.yelpcdn.com/bphoto/i4j0M0...,False,"{'address1': '204 Ave B', 'address2': '', 'add...",B Side,+12124754600,$,4.0,90,[],https://www.yelp.com/biz/b-side-new-york?adjus...
0,sanatorium-new-york,"[{'alias': 'cocktailbars', 'title': 'Cocktail ...","{'latitude': 40.721057, 'longitude': -73.980773}",(212) 614-0300,18CzGWvJqc_9xoMpwEgl-w,https://s3-media3.fl.yelpcdn.com/bphoto/o37bfU...,False,"{'address1': '14 Avenue C', 'address2': '', 'a...",Sanatorium,+12126140300,$$$,3.5,29,[],https://www.yelp.com/biz/sanatorium-new-york?a...
0,sushi-on-jones-new-york-3,"[{'alias': 'sushi', 'title': 'Sushi Bars'}, {'...","{'latitude': 40.7338884211791, 'longitude': -7...",(646) 357-0453,IS94JDTcq_WedBcj6P4qpQ,https://s3-media2.fl.yelpcdn.com/bphoto/PhY0ZQ...,False,"{'address1': '210 W 10th St', 'address2': '', ...",Sushi on Jones,+16463570453,$$$,4.0,78,[],https://www.yelp.com/biz/sushi-on-jones-new-yo...
0,ashoka-on-2nd-avenue-new-york,"[{'alias': 'indpak', 'title': 'Indian'}]","{'latitude': 40.7798555, 'longitude': -73.9500...",(212) 876-9100,jJYQJGygWjIX6_0MiIoJRQ,https://s3-media1.fl.yelpcdn.com/bphoto/g-vL8I...,False,"{'address1': '1718 2nd Ave', 'address2': None,...",Ashoka on 2nd Avenue,+12128769100,$$,5.0,21,"[restaurant_reservation, pickup, delivery]",https://www.yelp.com/biz/ashoka-on-2nd-avenue-...


In [36]:
# Run the final appended DataFrame through Panos' cleanup method outlined above

address_df = appended.location.apply(pd.Series)
coordinates_df = appended.coordinates.apply(pd.Series)

appended.drop('location', axis='columns', inplace=True)
appended.drop('coordinates', axis='columns', inplace=True)
appended.drop('url', axis='columns', inplace=True)
appended.drop('display_phone', axis='columns', inplace=True)
appended.drop('alias', axis='columns', inplace=True)
appended.drop('categories', axis='columns', inplace=True)
appended.drop('transactions', axis='columns', inplace=True)
appended.drop('image_url', axis='columns', inplace=True)

final_appended = pd.concat([appended, address_df, coordinates_df], axis='columns')
final_appended

Unnamed: 0,id,is_closed,name,phone,price,rating,review_count,address1,address2,address3,city,zip_code,country,state,display_address,latitude,longitude
0,bgv_tR6ZJjw3js44zvcJUg,True,Bleecker Heights Tavern,+12126756157,$$,3.5,58,296 Bleecker St,,,New York,10014,US,NY,"[296 Bleecker St, New York, NY 10014]",40.732311,-74.003876
1,dJoWkpVxWcvdqNRo7w622w,False,The Garret,+12126756157,$$,3.5,195,296 Bleecker St,,,New York,10014,US,NY,"[296 Bleecker St, New York, NY 10014]",40.732380,-74.003846
0,qzdCvtgpCDjNVV_zRJB5EQ,False,Voyager Espresso,+12122272744,$,5.0,87,110 William St,,,New York,10038,US,NY,"[110 William St, New York, NY 10038]",40.708897,-74.006676
1,EPJjEMdlsGtEridFDqvSFg,True,Amanzi Tea,+12122272744,$,4.0,9,166 Chambers St,,,New York,10007,US,NY,"[166 Chambers St, New York, NY 10007]",40.715981,-74.010597
0,JAjbYabOKnPaZyKLcNjFAA,False,Fu On Kitchen,+12128625660,,3.0,2,2419 Adam Clayton Powell,,,New York,10030,US,NY,"[2419 Adam Clayton Powell, New York, NY 10030]",40.819140,-73.940380
0,aoYDuvipcT_XGAM5ckFJnA,False,Cafe Water,+12127851320,$$,3.0,68,130 Water St,,,New York,10005,US,NY,"[130 Water St, New York, NY 10005]",40.705634,-74.006867
0,WvzuSPayCKimxkjWYQHg0Q,False,B Side,+12124754600,$,4.0,90,204 Ave B,,,New York,10009,US,NY,"[204 Ave B, New York, NY 10009]",40.728454,-73.978988
0,18CzGWvJqc_9xoMpwEgl-w,False,Sanatorium,+12126140300,$$$,3.5,29,14 Avenue C,,,New York,10009,US,NY,"[14 Avenue C, New York, NY 10009]",40.721057,-73.980773
0,IS94JDTcq_WedBcj6P4qpQ,False,Sushi on Jones,+16463570453,$$$,4.0,78,210 W 10th St,,,New York,10014,US,NY,"[210 W 10th St, New York, NY 10014]",40.733888,-74.004155
0,jJYQJGygWjIX6_0MiIoJRQ,False,Ashoka on 2nd Avenue,+12128769100,$$,5.0,21,1718 2nd Ave,,,New York,10128,US,NY,"[1718 2nd Ave, New York, NY 10128]",40.779855,-73.950090


In [37]:
# Convert the final_appended DataFrame into a CSV to Import into SQL Database

final_appended.to_csv('yelp.csv',index=False)