### Rose Tovar
### Efficient Yelp Commands
### 9/20/2022

In [1]:
# imports
import numpy as np
import pandas as pd
import os, json, math, time
from yelpapi import YelpAPI
from tqdm.notebook import tqdm_notebook

### Setting up Yelp Creds and Yelp Search

In [2]:
# Read the Yelp API credentials from a JSON file instead of hardcoding the key.
# NOTE(review): the path is absolute and user-specific; it must be adjusted
# on any other machine.
with open('/home/vanica/.secret/yelp_api.json') as f:
    login = json.load(f)
    
# Client reused by every search below. The credentials file must contain an
# 'api-key' entry. timeout_s is forwarded to YelpAPI (presumably a per-request
# timeout in seconds — confirm against the yelpapi docs).
yelp = YelpAPI(login['api-key'], timeout_s=5.0)

## Location, Term, and JSON File name

In [3]:
# Search parameters passed to the Yelp search calls below.
location = "Memphis, TN"
term = 'Wings'
# Output directory; the trailing slash matters because paths are built by
# plain string concatenation.
folder = 'Data/'
# File name is "<text before the first comma in location>-<term>.json".
json_file = folder + f"{location.split(',')[0]}-{term}.json"

### Helper Methods

In [4]:
def create_json_file(json_file, delete_if_exist=False):
    """Make sure `json_file` exists and holds a JSON list.

    After this function returns, the file is guaranteed to exist, so callers
    can open it for reading right away.

    Parameters
    ----------
    json_file : str
        Path of the JSON file. Missing parent folders are created.
    delete_if_exist : bool, default False
        If True and the file already exists, it is deleted and replaced by a
        file containing an empty list. If False, an existing file is left
        untouched.

    Returns
    -------
    None
    """
    if os.path.isfile(json_file):
        if not delete_if_exist:
            # Keep previously saved results as they are.
            print(f"[!] {json_file} already exists")
            return

        print(f"[!] {json_file} is being deleted")
        os.remove(json_file)
        # Fall through so the file is recreated empty; leaving it deleted
        # would make the caller's subsequent read fail.
    else:
        print(f"[!] {json_file} does not exist, creating now")

    # Create any folders the path goes through (none for a bare file name).
    folder = os.path.dirname(json_file)
    if len(folder) > 0:
        os.makedirs(folder, exist_ok=True)

    # Start from an empty list so results can be appended later.
    with open(json_file, 'w') as f:
        json.dump([], f)

### Setting up Initial Load

In [5]:
# Start a fresh results file for this location/term.
create_json_file(json_file, delete_if_exist=True)

# Load previous results (an empty list right after a reset).
with open(json_file, 'r') as f:
    prev_results = json.load(f)

# Set the offset based on how many results are already stored.
n_results = len(prev_results)
print(f"- {n_results} prev results found")

# First request: only used to learn the page size and the total hit count.
results = yelp.search_query(location=location, term=term, offset=n_results)

results_per_page = len(results['businesses'])

# A search with no hits returns an empty page; guard the division so the
# notebook reports 0 pages instead of raising ZeroDivisionError.
if results_per_page > 0:
    n_pages = math.ceil((results['total'] - n_results) / results_per_page)
else:
    n_pages = 0
n_pages

[!] Data/Memphis-Wings.json does not exist, creating now
- 0 prev results found


42

In [6]:
# Page through the search results. The file is re-read and re-written on every
# iteration, so an interrupted run keeps everything fetched so far.

for i in tqdm_notebook(range(1, n_pages+1)):

    # Read in the results saved so far.
    with open(json_file, 'r') as f:
        prev_results = json.load(f)

    n_results = len(prev_results)

    # Stop before the stored results plus one more page would pass 1000.
    # The check counts results, not API calls, so the message says so.
    if (n_results + results_per_page) > 1000:
        print('Exceeding 1000 results, stopping')
        break

    # The number of stored results is the offset of the next page.
    results = yelp.search_query(location=location, term=term, offset=n_results)

    new_businesses = results['businesses']

    # An empty page means nothing more can be fetched. Without this check the
    # offset would never advance and every remaining iteration would repeat
    # the same request.
    if len(new_businesses) == 0:
        break

    # Append the new page and save the file.
    prev_results.extend(new_businesses)
    with open(json_file, 'w') as f:
        json.dump(prev_results, f)

    # Short pause between requests to avoid hammering the API.
    time.sleep(0.2)
    

  0%|          | 0/42 [00:00<?, ?it/s]

### Dataframe Extraction

In [7]:
# Load the saved list of businesses into a DataFrame (one row per list entry).
df = pd.read_json(json_file)
# Summarize columns, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 838 entries, 0 to 837
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             838 non-null    object 
 1   alias          838 non-null    object 
 2   name           838 non-null    object 
 3   image_url      838 non-null    object 
 4   is_closed      838 non-null    bool   
 5   url            838 non-null    object 
 6   review_count   838 non-null    int64  
 7   categories     838 non-null    object 
 8   rating         838 non-null    float64
 9   coordinates    838 non-null    object 
 10  transactions   838 non-null    object 
 11  location       838 non-null    object 
 12  phone          838 non-null    object 
 13  display_phone  838 non-null    object 
 14  distance       838 non-null    float64
 15  price          595 non-null    object 
dtypes: bool(1), float64(2), int64(1), object(12)
memory usage: 99.1+ KB


### Checking for Duplicates

In [8]:
# Count rows whose 'id' repeats an earlier row's 'id'.
df.duplicated(subset='id').sum()

0

In [9]:
# Keep the first row for each 'id'. This rebinds df, so earlier displays of
# df may no longer match it.
df = df.drop_duplicates(subset='id')
# Re-check: this should now be 0.
df.duplicated(subset='id').sum()

0

In [10]:
# Build the output path with the same "<city>-<term>" stem as the JSON file,
# using a .csv.gz extension.
csv = folder + f"{location.split(',')[0]}-{term}.csv.gz"

# Write the de-duplicated results as a gzip-compressed CSV without the index column.
df.to_csv(csv, compression='gzip', index=False)