In [77]:
# Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Additional Imports
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook



In [78]:
# Load API Credentials
# The key file is looked up under the current user's home directory
# (~/.secret/yelp_api.json) rather than one machine's absolute path.
with open(os.path.expanduser('~/.secret/yelp_api.json')) as f:   #use your path here!
    login = json.load(f)
# Instantiate YelpAPI Variable (the key is read from the 'API Key' entry of the file)
yelp_api = YelpAPI(login['API Key'], timeout_s=5.0)



In [79]:
# Search parameters reused by every API call below; set before the first call
LOCATION, TERM = 'NY', 'pizza'
## INFORM USER AND SAVE EMPTY LIST
# NOTE(review): this message prints unconditionally -- no file is checked or
# written in this cell; the real check happens in a later cell.
print("[i] JSON_FILE not found. Saving empty list to file.")


[i] JSON_FILE not found. Saving empty list to file.


In [80]:
# Specifying JSON_FILE filename (can include a folder)
# The search parameters are interpolated into the name so that each
# LOCATION/TERM combination is saved to its own results file.
JSON_FILE = f"Data/results_in_progress_{LOCATION}_{TERM}.json"
JSON_FILE



'Data/results_in_progress_NY_pizza.json'

In [81]:
## Make sure JSON_FILE is present before any results are read back from it
file_exists = os.path.isfile(JSON_FILE)

if file_exists:
    ## Nothing to create; just inform the user
    print(f"[i] {JSON_FILE} already exists.")
else:
    ## INFORM USER AND SAVE EMPTY LIST
    print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")

    ## CREATE ANY NEEDED FOLDERS
    # dirname is an empty string when JSON_FILE has no folder part
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## start the file off with an empty list
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)



[i] Data/results_in_progress_NY_pizza.json not found. Saving empty list to new file.


In [82]:
## Read back everything saved to JSON_FILE so far
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)
    
## The number of saved results is what later cells pass as the API offset
n_results = len(previous_results)
print(f'- {n_results} previous results found.')



- 0 previous results found.


In [83]:
# use our yelp_api variable's search_query method to perform our API call
# The offset is the number of results already saved (n_results), so a fresh
# file starts at the first business instead of skipping the leading ones.
results = yelp_api.search_query(location=LOCATION,
                                term=TERM,
                                offset=n_results)
results.keys()



dict_keys(['businesses', 'total', 'region'])

In [84]:
## How many results total? (value stored under the response's 'total' key)
total_results = results['total']
total_results



12000

In [85]:
## How many businesses came back in this single response?
## Used below as the page size when computing the number of pages.
results_per_page = len(results['businesses'])
results_per_page



20

In [86]:
# time and math are already imported in the notebook's first cell.
# Pages still to fetch = results not yet saved / page size, rounded up with math.ceil.
# An empty first page (results_per_page == 0) means there is nothing to fetch,
# so n_pages falls back to 0 instead of dividing by zero.
n_pages = math.ceil((results['total'] - n_results) / results_per_page) if results_per_page else 0
n_pages



600

In [87]:
# Merge the freshly fetched page into the running list, then persist the
# whole list back to JSON_FILE (the file is overwritten each time)
previous_results.extend(results['businesses'])
with open(JSON_FILE, 'w') as f:
    json.dump(previous_results, f)


In [88]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [89]:
# from tqdm.notebook import tqdm_notebook
# import time
# for i in tqdm_notebook(range(n_pages)):
#     # adds 200 ms pause
#     time.sleep(.2) 


In [90]:
# Cap on how many results this trial run may accumulate in JSON_FILE
max_results = 50

for i in tqdm_notebook(range(1, n_pages + 1)):

    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## number of results saved so far; doubles as the offset of the next page
    n_results = len(previous_results)

    ## stop before the next page would push the saved results past the cap
    if (n_results + results_per_page) > max_results:
        print(f'Exceeded {max_results} results. Stopping loop.')
        break

    ## use n_results as the OFFSET so every call requests the next unseen page
    ## (a fixed offset would fetch the same page on each iteration)
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)

    ## append new results and save to file
    previous_results.extend(results['businesses'])

    # display(previous_results)
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

    # add a 200ms pause
    #time.sleep(.2)



  0%|          | 0/600 [00:00<?, ?it/s]

Exceeded 50api calls. Stopping loop.


In [91]:
## Delete the in-progress file and confirm it no longer exists
## (os.path.isfile is the cell's displayed value).
os.remove(JSON_FILE)
os.path.isfile(JSON_FILE)



False

In [92]:
# def create_json_file(JSON_FILE,  delete_if_exists=False):
    
#     ## Check if JSON_FILE exists
#     file_exists = os.path.isfile(JSON_FILE)
    
#     ## If it DOES exist:
#     if file_exists == True:
        
#         ## Check if user wants to delete if exists
#         if delete_if_exists==True:            
#             print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
#             ## delete file and confirm it no longer exits.
#             os.remove(JSON_FILE
#             create_json_file(JSON_FILE,delete_if_exists=False)
#             # Create new file in the folder
# #             folder = os.path.dirname(JSON_FILE)
# #             ## If JSON_FILE included a folder:
# #             if len(folder)>0:
# #                 # create the folder
# #                 os.makedirs(folder,exist_ok=True)
    
    
# #                 ## save an empty list to start the file
# #             with open(JSON_FILE,'w') as f:
# #                 json.dump([],f)  
                      
#         else:
#             print(f"[i] {JSON_FILE} already exists.")            
#             #return None
            
#             ## If it does NOT exist:
#     else:
        
#         ## INFORM USER AND SAVE EMPTY LIST
#         print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")
        
#         ## CREATE ANY NEEDED FOLDERS
#         # Get the Folder Name only
#         folder = os.path.dirname(JSON_FILE)
        
#         ## If JSON_FILE included a folder:
#         if len(folder)>0:
#             # create the folder
#             os.makedirs(folder,exist_ok=True)
#         ## Save empty list to start the json file
#         with open(JSON_FILE,'w') as f:
#             json.dump([],f)  



In [95]:
def create_json_file(JSON_FILE,  delete_if_exists=False):
    """Ensure JSON_FILE exists, creating it with an empty JSON list when needed.

    Parameters
    ----------
    JSON_FILE : str
        Path of the JSON file. Any folder in the path is created if missing.
    delete_if_exists : bool, default False
        Only matters when the file already exists: if true, the file is
        deleted and recreated holding an empty list; if false, the file is
        left untouched.

    Returns
    -------
    None
        The function only prints status messages and writes to disk.
    """
    if os.path.isfile(JSON_FILE):
        if not delete_if_exists:
            ## Keep the existing file as-is
            print(f"[i] {JSON_FILE} already exists.")
            return

        ## Remove the old file, then fall through to the creation steps below
        print(f"[!] {JSON_FILE} already exists. Deleting previous file...")
        os.remove(JSON_FILE)

    ## INFORM USER AND SAVE EMPTY LIST
    print(f"[i] {JSON_FILE} not found. Saving empty list to new file.")

    ## CREATE ANY NEEDED FOLDERS
    # dirname is an empty string when JSON_FILE has no folder part
    folder = os.path.dirname(JSON_FILE)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## Save empty list to start the json file
    with open(JSON_FILE, 'w') as f:
        json.dump([], f)

In [102]:
## Create a new empty json file (deleting the previous one if it exists)
create_json_file(JSON_FILE, delete_if_exists=True)
## Read back the saved results; their count is used as the API offset
with open(JSON_FILE,'r') as f:
    previous_results = json.load(f)

## set offset based on previous results
n_results = len(previous_results)
print(f'- {n_results} previous results found.')
# use our yelp_api variable's search_query method to perform our API call,
# starting at the first result that has not been saved yet
results = yelp_api.search_query(location=LOCATION,
                                term=TERM,
                               offset=n_results)
## How many results total? (value stored under the response's 'total' key)
total_results = results['total']
## How many businesses came back in this single response (the page size)?
results_per_page = len(results['businesses'])
# Use math.ceil to round up for the total number of pages of results.
# NOTE(review): this divides by results_per_page, so an empty first page
# would raise ZeroDivisionError here.
n_pages = math.ceil((results['total']-n_results)/ results_per_page)
n_pages



[!] Data/results_in_progress_NY_pizza.json already exists. Deleting previous file...
[i] Data/results_in_progress_NY_pizza.json not found. Saving empty list to new file.
- 0 previous results found.


615

In [97]:
# Cap on how many results may accumulate in JSON_FILE; the guard and its
# message both read this value so they cannot drift apart.
max_results = 100

for i in tqdm_notebook(range(1, n_pages + 1)):

    ## Read in results in progress file and check the length
    with open(JSON_FILE, 'r') as f:
        previous_results = json.load(f)
    ## number of results saved so far; doubles as the offset of the next page
    n_results = len(previous_results)

    ## stop before the next page would push the saved results past the cap
    if (n_results + results_per_page) > max_results:
        print(f'Exceeded {max_results} results. Stopping loop.')
        break

    ## use n_results as the OFFSET
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)

    ## append new results and save to file
    previous_results.extend(results['businesses'])

    # display(previous_results)
    with open(JSON_FILE, 'w') as f:
        json.dump(previous_results, f)

    # 200 ms pause between API calls
    time.sleep(.2)



  0%|          | 0/610 [00:00<?, ?it/s]

Exceeded 1000 api calls. Stopping loop.


In [98]:
# Load the accumulated results from JSON_FILE into a DataFrame and
# preview its first and last rows
final_df = pd.read_json(JSON_FILE)
display(final_df.head(), final_df.tail())


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,zj8Lq1T8KIC5zwFief15jg,prince-street-pizza-new-york-2,Prince Street Pizza,https://s3-media4.fl.yelpcdn.com/bphoto/HVjttL...,False,https://www.yelp.com/biz/prince-street-pizza-n...,4512,"[{'alias': 'pizza', 'title': 'Pizza'}, {'alias...",4.5,"{'latitude': 40.72308755605564, 'longitude': -...","[pickup, delivery]",$,"{'address1': '27 Prince St', 'address2': None,...",12129664100,(212) 966-4100,1961.877142
1,ysqgdbSrezXgVwER2kQWKA,julianas-brooklyn-3,Juliana's,https://s3-media1.fl.yelpcdn.com/bphoto/OCDZ4n...,False,https://www.yelp.com/biz/julianas-brooklyn-3?a...,2528,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.5,"{'latitude': 40.70274718768062, 'longitude': -...",[delivery],$$,"{'address1': '19 Old Fulton St', 'address2': '...",17185966700,(718) 596-6700,308.569844
2,v1DHGRNCH9247WLYoaoA9A,l-industrie-pizzeria-brooklyn,L'industrie Pizzeria,https://s3-media3.fl.yelpcdn.com/bphoto/Llq71W...,False,https://www.yelp.com/biz/l-industrie-pizzeria-...,774,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.5,"{'latitude': 40.71162, 'longitude': -73.95783}",[delivery],$,"{'address1': '254 S 2nd St', 'address2': '', '...",17185990002,(718) 599-0002,3145.016041
3,WIhm0W9197f_rRtDziq5qQ,lombardis-pizza-new-york-4,Lombardi's Pizza,https://s3-media1.fl.yelpcdn.com/bphoto/UZ6V_h...,False,https://www.yelp.com/biz/lombardis-pizza-new-y...,6414,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.7215934960083, 'longitude': -7...","[pickup, delivery]",$$,"{'address1': '32 Spring St', 'address2': '', '...",12129417994,(212) 941-7994,1798.995978
4,WG639VkTjmK5dzydd1BBJA,rubirosa-new-york-2,Rubirosa,https://s3-media4.fl.yelpcdn.com/bphoto/LuSzR8...,False,https://www.yelp.com/biz/rubirosa-new-york-2?a...,2768,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 40.722766, 'longitude': -73.996233}",[pickup],$$,"{'address1': '235 Mulberry St', 'address2': ''...",12129650500,(212) 965-0500,1932.94677


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
95,ejuFKvSfGl0JMZO9yzSXNw,louies-pizza-elmhurst-2,Louie's Pizza,https://s3-media2.fl.yelpcdn.com/bphoto/g0TYOE...,False,https://www.yelp.com/biz/louies-pizza-elmhurst...,501,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 40.74600424769782, 'longitude': -...","[delivery, pickup]",$,"{'address1': '81-34 Baxter Ave', 'address2': '...",17184409346,(718) 440-9346,10363.958244
96,58YvTYsyxNyGixTAZiMHcg,williamsburg-pizza-new-york-3,Williamsburg Pizza,https://s3-media1.fl.yelpcdn.com/bphoto/uMXAMU...,False,https://www.yelp.com/biz/williamsburg-pizza-ne...,265,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.71822, 'longitude': -73.99123}","[delivery, pickup]",$,"{'address1': '277 Broome St', 'address2': '', ...",12122264455,(212) 226-4455,1447.543509
97,SSy5YkPo6S8RdCXiJ4EHXw,marks-red-hook-pizza-brooklyn,Mark's Red Hook Pizza,https://s3-media1.fl.yelpcdn.com/bphoto/cuGFpW...,False,https://www.yelp.com/biz/marks-red-hook-pizza-...,76,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.6788288, 'longitude': -74.0113...","[delivery, pickup]",$,"{'address1': '326 Van Brunt St', 'address2': '...",17186240690,(718) 624-0690,3289.399717
98,Ajvcvk6tvlKbRF8Zkf0M5Q,marinara-pizza-new-york-8,Marinara Pizza,https://s3-media3.fl.yelpcdn.com/bphoto/jHPHU5...,False,https://www.yelp.com/biz/marinara-pizza-new-yo...,60,"[{'alias': 'pizza', 'title': 'Pizza'}]",4.0,"{'latitude': 40.74241, 'longitude': -73.98452}","[delivery, pickup]",,"{'address1': '379 Park Ave S', 'address2': '',...",16466495282,(646) 649-5282,4191.65953
99,hdiuRS9sVZSMReZm4oV5SA,da-andrea-new-york,Da Andrea,https://s3-media2.fl.yelpcdn.com/bphoto/ZbJxx7...,False,https://www.yelp.com/biz/da-andrea-new-york?ad...,1575,"[{'alias': 'italian', 'title': 'Italian'}, {'a...",4.5,"{'latitude': 40.736218, 'longitude': -73.99597}","[restaurant_reservation, delivery, pickup]",$$,"{'address1': '35 W 13th St', 'address2': '', '...",12123671979,(212) 367-1979,3424.729337


In [99]:
# load previous final results, if that file is available
# The existence check keeps this cell from raising (and halting a Run All)
# when the earlier results file has not been produced on this machine.
D1_FILE = 'Data/D1results_in_progress_NY_pizza.json'
if os.path.isfile(D1_FILE):
    d1_df = pd.read_json(D1_FILE)
    display(d1_df.head(), d1_df.tail())
else:
    print(f"[i] {D1_FILE} not found. Skipping previous results.")

ValueError: Expected object or value

In [None]:
# check for duplicate results
# Several columns (e.g. categories, coordinates, transactions, location) hold
# lists/dicts, which are unhashable, so the rows are compared by their string
# form rather than by the raw objects.
final_df.astype(str).duplicated().sum()



In [None]:
# check for duplicate IDs: count rows whose 'id' value already appeared earlier
final_df.duplicated(subset='id').sum()



In [None]:
## Drop rows with a repeated 'id' (drop_duplicates returns a new frame, which
## is bound back to final_df), then confirm the duplicate count is now zero
final_df = final_df.drop_duplicates(subset='id')
final_df.duplicated(subset='id').sum()


In [None]:
# save the final results to a gzip-compressed csv, leaving out the index column
final_df.to_csv('Data/final_results_NY_pizza.csv.gz', compression='gzip',index=False)

