# Source Yelp Review & Ratings Data from Fusion API

## Imports

In [1]:
# standard library
import ast
import json
import os
import urllib.request
from pathlib import Path

# import pandas, numpy, matplotlib, seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing the requests library
import requests

# Yelp Fusion API search endpoints
https://www.yelp.com/developers/documentation/v3/get_started

In [2]:
## @deprecated ##
def fetchYelpDataByNameDeprecated(forceFetch=False):
    """Deprecated: fetch Yelp businesses by searching each restaurant name.

    Returns the cached ``yelpdataraw.csv`` when it exists, unless
    ``forceFetch`` is true. Otherwise every ``NAME`` in
    ``preprocessed_restaurants.csv`` is searched in "Wake County" through the
    Yelp Fusion business search endpoint, the businesses found are
    de-duplicated by ``id``, written to ``yelpdataraw.csv`` and returned.

    Args:
        forceFetch: When true, ignore any cached CSV and query the API again.

    Returns:
        pandas.DataFrame with one row per distinct Yelp business.

    Raises:
        RuntimeError: If a fetch is required and the ``YELP_API_KEY``
            environment variable is not set.
    """
    cache_file = Path('yelpdataraw.csv')

    # Reuse the cached download unless the caller explicitly asks for a refresh.
    if cache_file.exists() and not forceFetch:
        print('Using pre-fetched yelp data data')
        df = pd.read_csv(cache_file)
        print('yelpdataraw df shape:', df.shape)
        return df

    # The API token is a secret and must not live in the notebook source.
    auth = os.environ.get('YELP_API_KEY')
    if not auth:
        raise RuntimeError(
            'Set the YELP_API_KEY environment variable before fetching from the Yelp Fusion API'
        )

    print('fetching yelp data (will take a WHILE...)')
    headers = {'Authorization': f'Bearer {auth}'}
    rests = pd.read_csv('preprocessed_restaurants.csv', dtype={'PHONENUMBER': str})
    rows = []
    seen_ids = set()

    # Rows without a name cannot be searched, so they are skipped up front.
    for restName in rests['NAME'].dropna():
        # Strip whitespace and the "#XXXX" store-number suffix used by chains.
        restName = str(restName).split("#")[0].strip()
        try:
            # `params` lets requests escape the values, so names containing
            # characters such as '&' or '#' no longer corrupt the query string.
            response = requests.get(
                'https://api.yelp.com/v3/businesses/search',
                headers=headers,
                params={'term': restName, 'location': 'Wake County', 'limit': 50},
                timeout=30,
            ).json()
            for business in response['businesses']:
                if business['id'] not in seen_ids:
                    rows.append(business)
                    seen_ids.add(business['id'])
        except Exception as e:
            # Best effort: report the failure and move on to the next restaurant.
            print('Continuing. Error fetching for restName=', restName, '-', repr(e))

    print('done fetching data')
    # Rows are already unique by id, so no further de-duplication is needed.
    df = pd.DataFrame(rows)
    df.to_csv('yelpdataraw.csv', index=False)
    print('Done')
    return df

In [3]:
# Count the restaurants that have no phone number on record; those cannot be
# looked up by phone, so we won't have Yelp data for them.
rests = pd.read_csv('preprocessed_restaurants.csv', dtype={'PHONENUMBER': str})
print(rests['PHONENUMBER'].isnull().sum())

91


# Search businesses by phone 

One of the challenges here is that Yelp does not have a bulk business search API, so we have to fetch businesses by phone/name one by one.

In [4]:
def fetchYelpDataByPhone(forceFetch=False):
    """Fetch Yelp businesses for each restaurant by phone, falling back to name.

    Returns the cached ``yelpdataraw.csv`` when it exists, unless
    ``forceFetch`` is true. Otherwise every row of
    ``preprocessed_restaurants.csv`` that has a phone number is looked up
    through the Yelp Fusion phone search endpoint. If that lookup fails
    (request error, or a response without a ``businesses`` list), the
    restaurant's name is searched in "Wake County" instead. Businesses are
    de-duplicated by ``id``, written to ``yelpdataraw.csv`` and returned.

    Args:
        forceFetch: When true, ignore any cached CSV and query the API again.

    Returns:
        pandas.DataFrame with one row per distinct Yelp business.

    Raises:
        RuntimeError: If a fetch is required and the ``YELP_API_KEY``
            environment variable is not set.
    """
    cache_file = Path('yelpdataraw.csv')

    # Reuse the cached download unless the caller explicitly asks for a refresh.
    if cache_file.exists() and not forceFetch:
        print('Using pre-fetched yelp data')
        df = pd.read_csv(cache_file)
        print('yelpdataraw df shape:', df.shape)
        return df

    # The API token is a secret and must not live in the notebook source.
    auth = os.environ.get('YELP_API_KEY')
    if not auth:
        raise RuntimeError(
            'Set the YELP_API_KEY environment variable before fetching from the Yelp Fusion API'
        )

    print('fetching yelp data by phone (this will take a WHILE...X_X)')
    headers = {'Authorization': f'Bearer {auth}'}
    rests = pd.read_csv('preprocessed_restaurants.csv', dtype={'PHONENUMBER': str})
    rows = []
    seen_ids = set()

    def collect(endpoint, params):
        """Query one search endpoint and keep the businesses not seen before."""
        # `params` lets requests escape the values ('+' in phone numbers,
        # '&' or '#' in names) instead of splicing them raw into the URL.
        response = requests.get(endpoint, headers=headers, params=params, timeout=30).json()
        for business in response['businesses']:
            if business['id'] not in seen_ids:
                rows.append(business)
                seen_ids.add(business['id'])

    print('Total:', len(rests['PHONENUMBER']))

    # TODO: in case not valid phone/empty, search business by name (use diff yelp api)
    for position, (phone, name) in enumerate(zip(rests['PHONENUMBER'], rests['NAME'])):
        # Progress marker keyed on the row position, so it prints once per
        # 200 input rows rather than whenever the result count is stuck.
        if position % 200 == 0:
            print('Aggregating next 200 rows...')

        if pd.isna(phone) or str(phone).strip() in ('', 'nan'):
            continue

        try:
            collect('https://api.yelp.com/v3/businesses/search/phone',
                    {'phone': str(phone).strip()})
        except Exception:
            print('Error fetching restaurant for phone=', phone)
            try:
                # Fall back to a name search, dropping the "#XXXX" chain suffix.
                term = str(name).split("#")[0].strip()
                collect('https://api.yelp.com/v3/businesses/search',
                        {'term': term, 'location': 'Wake County', 'limit': 50})
            except Exception:
                # Best effort: report the failure and move on to the next row.
                print('Continuing. Error fetching restaurant for name=', name)

    print('Done fetching data...')
    df = pd.DataFrame(rows)
    print('Data will be persisted in yelpdataraw.csv')
    df.to_csv('yelpdataraw.csv', index=False)
    print('All done!')
    return df

In [5]:
# Load the raw Yelp data (without forcing a re-fetch) and preview the first rows.
df = fetchYelpDataByPhone()
df.head(5)

Using pre-fetched yelp data
yelpdataraw df shape: (2144, 15)


Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone
0,RjELMSrh2DuTBJQ4YpzXUA,peace-china-raleigh-2,Peace China,https://s3-media0.fl.yelpcdn.com/bphoto/8pWw6x...,True,https://www.yelp.com/biz/peace-china-raleigh-2...,63,"[{'alias': 'chinese', 'title': 'Chinese'}]",3.5,"{'latitude': 35.90946322502375, 'longitude': -...","['pickup', 'delivery']",$,"{'address1': '13220 Strickland Rd', 'address2'...",19196769968,(919) 676-9968
1,OXdKUXoDnjLGpNW9Go1e8A,asian-cafe-raleigh,Asian Cafe,,True,https://www.yelp.com/biz/asian-cafe-raleigh?ad...,7,"[{'alias': 'chinese', 'title': 'Chinese'}, {'a...",3.0,"{'latitude': 35.906972810626, 'longitude': -78...",[],$$,"{'address1': '13220 Strickland Rd', 'address2'...",19196769968,(919) 676-9968
2,wDZG-Ry6IcC_QITBLBPHxQ,northside-bistro-and-cocktails-raleigh,Northside Bistro & Cocktails,https://s3-media0.fl.yelpcdn.com/bphoto/CGb1Gs...,False,https://www.yelp.com/biz/northside-bistro-and-...,23,"[{'alias': 'newamerican', 'title': 'American (...",4.5,"{'latitude': 35.86631037241957, 'longitude': -...",[],,"{'address1': '832 Spring Forest Rd', 'address2...",19198905225,(919) 890-5225
3,-qCrGWYePySXmcngRhal4Q,the-daily-planet-cafe-raleigh,The Daily Planet Cafe,https://s3-media0.fl.yelpcdn.com/bphoto/YM1ZTo...,False,https://www.yelp.com/biz/the-daily-planet-cafe...,89,"[{'alias': 'cafes', 'title': 'Cafes'}, {'alias...",4.0,"{'latitude': 35.7823492580703, 'longitude': -7...",['delivery'],$$,"{'address1': '121 W Jones St', 'address2': '',...",19197078060,(919) 707-8060
4,21FAnridQkQCJMM_PfyfcA,hibachi-88-raleigh,Hibachi 88,https://s3-media0.fl.yelpcdn.com/bphoto/sjRGZe...,False,https://www.yelp.com/biz/hibachi-88-raleigh?ad...,46,"[{'alias': 'japanese', 'title': 'Japanese'}, {...",3.5,"{'latitude': 35.76724, 'longitude': -78.57953}","['delivery', 'pickup']",$,"{'address1': '3416-100 Poole Rd', 'address2': ...",19192311688,(919) 231-1688


# Preprocessing

In [6]:
def encode_price(price_str):
    """Translate a Yelp price marker into its numeric tier.

    '$' maps to 1, '$$' to 2, and so on up to '$$$$$' for 5. Any other value
    (including missing prices) maps to -1.
    """
    tiers = ('$', '$$', '$$$', '$$$$', '$$$$$')
    for level, marker in enumerate(tiers, start=1):
        if price_str == marker:
            return level
    return -1

def getTitle(category_json):
    """Return the title of the first Yelp category in a ``categories`` cell.

    Accepted inputs:
      * a list of category dicts, as found in a freshly fetched frame;
      * the text form of such a list, as read back from the cached CSV
        (Python literal syntax) or as JSON;
      * the legacy form this function used to receive: JSON text wrapped in
        one extra leading and trailing character.

    Returns "" when no category title can be extracted.
    """
    categories = category_json
    if isinstance(category_json, str):
        text = category_json.strip()
        categories = None
        # Most faithful parser first. literal_eval copes with the single-quoted
        # dict text stored in the CSV, including titles containing apostrophes.
        attempts = (
            (ast.literal_eval, text),
            (json.loads, text),
            (json.loads, text[1:-1]),  # legacy wrapped form
        )
        for parse, candidate in attempts:
            try:
                value = parse(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(value, (list, tuple)):
                categories = value
                break

    try:
        title = categories[0]['title']
    except (TypeError, LookupError):
        # Missing cell, empty category list, or an entry without a title.
        return ""
    return title if isinstance(title, str) else ""

def preprocess_yelpdata(df):
    """Reduce raw Yelp business data to the columns used downstream.

    Steps:
      1. Remove fully duplicated rows.
      2. Add ``category_title``: the lower-cased title of the first category.
      3. Encode ``price`` with ``encode_price``.
      4. Drop the columns listed in ``cols_to_remove`` (those that exist).

    The input frame is not modified; a new DataFrame is returned.

    Raises:
        KeyError: If ``df`` has no ``categories`` or ``price`` column.
    """
    cols_to_remove = ['id', 'image_url', 'url', 'transactions', 'alias', 'coordinates', 'categories', 'is_closed', 'location']

    # Drop duplicates by comparing a stringified view: freshly fetched frames
    # hold list/dict cells, which are unhashable and make drop_duplicates()
    # raise TypeError. Original index labels are kept.
    df = df.loc[~df.astype(str).duplicated()].copy()

    # Extract the first category title straight from the raw cell.
    df['category_title'] = df['categories'].apply(lambda cell: getTitle(cell).lower())

    df['price'] = df['price'].apply(encode_price)

    # Drop irrelevant columns and return the processed frame.
    return df.drop(columns=cols_to_remove, errors='ignore')

In [7]:
# Preprocess a copy so that the raw frame `df` itself is not passed in.
yelpdatadf = preprocess_yelpdata(df.copy())

In [8]:
# Summarize the columns, non-null counts and dtypes of the preprocessed frame.
yelpdatadf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2144 entries, 0 to 2143
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            2144 non-null   object 
 1   review_count    2144 non-null   int64  
 2   rating          2144 non-null   float64
 3   price           2144 non-null   int64  
 4   phone           2144 non-null   int64  
 5   display_phone   2144 non-null   object 
 6   category_title  2144 non-null   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 134.0+ KB


In [9]:
# Preview the first rows of the preprocessed frame.
yelpdatadf.head()

Unnamed: 0,name,review_count,rating,price,phone,display_phone,category_title
0,Peace China,63,3.5,1,19196769968,(919) 676-9968,chinese
1,Asian Cafe,7,3.0,2,19196769968,(919) 676-9968,chinese
2,Northside Bistro & Cocktails,23,4.5,-1,19198905225,(919) 890-5225,american (new)
3,The Daily Planet Cafe,89,4.0,2,19197078060,(919) 707-8060,cafes
4,Hibachi 88,46,3.5,1,19192311688,(919) 231-1688,japanese


In [10]:
# Distribution of the encoded price values (dropna=False also counts missing values).
print(yelpdatadf['price'].value_counts(dropna=False))

 2    931
 1    807
-1    370
 3     33
 4      3
Name: price, dtype: int64


In [11]:
# Persist the preprocessed frame to CSV without writing the index.
yelpdatadf.to_csv('preprocessed_yelpdata.csv', index=False)