# Yelp ADI

- Robert Yonce
- 5/3/23
- Updated 5/7/23

# Imports

In [None]:
# Standard Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Additional Imports

import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

# Keys & Login

In [None]:
# Credentials
#
# Read the Yelp API credentials from a JSON file kept outside the project.
# NOTE: the path below is absolute and specific to one user's machine.

with open('/Users/robertyonce/.secret/yelp_api.json') as secret_file:
    login = json.load(secret_file)

# Show only the key names so the secret values are not echoed in the output.
login.keys()

In [None]:
# Yelp client
#
# Build the YelpAPI client from the stored key, using a 5 second timeout.

api_key = login['api-key']
yelp_api = YelpAPI(api_key, timeout_s=5.0)
yelp_api


# Set Parameters and File

In [None]:
# Search parameters shared by every API call in this notebook.
TERM = 'Asian'               # search term passed to the query
LOCATION = 'Knoxville,TN'    # location passed to the query

In [None]:
# Path of the JSON file that accumulates results between API calls.
# The location ("Knox") and term ("Asian") are written into the name by hand;
# they are not derived from LOCATION / TERM, so update this path if those change.

JSON_FILE_KNOX = "Data/results_in_progress_Knox_Asian.json"
JSON_FILE_KNOX

In [None]:
# Make sure the in-progress results file exists before it is read.
# If it is missing, create its folder (when the path names one) and seed the
# file with an empty JSON list so later cells can always load and extend it.

file_exists = os.path.isfile(JSON_FILE_KNOX)

if file_exists:
    # Nothing to create; earlier progress will be picked up from this file.
    print(f"[i] {JSON_FILE_KNOX} already exists.")
else:
    # os.path.dirname returns '' when the path has no folder component, and
    # os.makedirs('') would raise, so only create a folder when one is named.
    folder = os.path.dirname(JSON_FILE_KNOX)
    if folder:
        os.makedirs(folder, exist_ok=True)

    # INFORM USER AND SAVE EMPTY LIST
    print(f'[i] {JSON_FILE_KNOX} not found. Saving empty list to file.')

    with open(JSON_FILE_KNOX, 'w') as f:
        json.dump([], f)

In [None]:
# Read back whatever has been saved so far; its length becomes the offset
# for the next API call, so an interrupted run continues where it stopped.

with open(JSON_FILE_KNOX, 'r') as progress_file:
    previous_results = json.load(progress_file)

# Number of businesses already on disk
n_results = len(previous_results)

print(f'- {n_results} previous results found.')


# API CALL

In [None]:
# First API call: query Yelp with the configured location and term,
# skipping the n_results businesses that are already saved.

search_params = {
    'location': LOCATION,
    'term': TERM,
    'offset': n_results,
}
results = yelp_api.search_query(**search_params)
results.keys()

In [None]:
# How many results total?
# Read the 'total' field of the first response.

total_results = results['total']
total_results


In [None]:
# Results per page
# Measured as the number of businesses returned by the first call.
# NOTE(review): when resuming near the end of the result set this first page
# may be shorter than a full page (or empty) - confirm before relying on it.

results_per_page = len(results['businesses'])
results_per_page

In [None]:
# Number of pages still to fetch, rounded up with math.ceil so a final
# partial page is counted.
#
# When the first call returned no businesses (for example, every result is
# already saved in the progress file) results_per_page is 0 and the division
# would raise ZeroDivisionError; in that case there is nothing left to
# request, so n_pages is 0.

remaining_results = results['total'] - n_results

if results_per_page > 0 and remaining_results > 0:
    n_pages = math.ceil(remaining_results / results_per_page)
else:
    n_pages = 0

n_pages

In [None]:
# Add the businesses from the first API call to the saved results and
# write the combined list back to the progress file.
first_page = results['businesses']
previous_results.extend(first_page)

with open(JSON_FILE_KNOX, 'w') as progress_file:
    json.dump(previous_results, progress_file)

# Extend and Loop

In [None]:
# Fetch the remaining pages one at a time, saving progress to disk after
# every page so an interrupted run can be resumed without losing results.
for i in tqdm_notebook(range(1, n_pages + 1)):

    ## Read in results in progress file and check the length
    with open(JSON_FILE_KNOX, 'r') as f:
        previous_results = json.load(f)

    ## Number of businesses already saved; used as the offset
    n_results = len(previous_results)

    ## Use n_results as the OFFSET. The offset is the count of results to
    ## skip, so it must equal the number already saved - the same value the
    ## first API call uses. Passing n_results + 1 would silently drop one
    ## business on every page.
    results = yelp_api.search_query(location=LOCATION,
                                    term=TERM,
                                    offset=n_results)

    ## An empty page means there is nothing left to collect; stop instead of
    ## issuing further requests that cannot add anything.
    new_businesses = results['businesses']
    if not new_businesses:
        break

    ## Append new results and save to file
    previous_results.extend(new_businesses)

    with open(JSON_FILE_KNOX, 'w') as f:
        json.dump(previous_results, f)

    # add a 200ms pause between requests
    time.sleep(.2)


# Final DF

In [None]:
# Read the accumulated JSON results into a DataFrame and preview
# the first and last rows.

final_df = pd.read_json(JSON_FILE_KNOX)

first_rows = final_df.head()
last_rows = final_df.tail()
display(first_rows, last_rows)

In [None]:
# Summarize the DataFrame before de-duplicating it in the next cell.
final_df.info()

In [None]:
## Remove rows that repeat a business id, then count any duplicates that
## remain (the displayed value should be 0).

deduplicated = final_df.drop_duplicates(subset='id')
final_df = deduplicated

remaining_duplicates = final_df.duplicated(subset='id').sum()
remaining_duplicates

In [None]:
# Save the final results to a compressed csv
#
# The CSV path is derived from JSON_FILE_KNOX (same folder and stem, with the
# '.json' extension swapped for '.csv.gz') so the two file names cannot drift
# apart if the JSON path is ever changed.

csv_file_knox = os.path.splitext(JSON_FILE_KNOX)[0] + '.csv.gz'
final_df.to_csv(csv_file_knox, compression='gzip', index=False)
