# Imports

In [3]:
import pandas as pd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup as soup
import pickle


In [23]:
## get business IDs for the category-restaurant##
id_list =[]
def get_business_id(country):
    
    """Yelp allows 1000 business IDs per location with 50 IDs per request
      country: string -> name of a country from locales available on Yelp API
    """
    for c in country:
        i = 1; j= 1
        while j<21:
            url = "https://api.yelp.com/v3/businesses/search"
            API_KEY = 'API'
            headers = {'Authorization': 'bearer %s' % API_KEY}
            PARAMETERS = {'locale':country[c], 'categories':'Restaurants(restaurants,All)', 'location': c, 'limit':50, 'offset' : i}
            response = requests.get(url, headers= headers, params = PARAMETERS)
            retrieve = response.json()
            i+=50; j+=1
            vals = list(retrieve.values())
            for v in vals[0]:
                ### some entries in the retrieved business details are in string format instead of dictionary.### 
                ###  We only keep those entries that are dictionaries ###
                if isinstance(v,dict):
                    id_list.append(v['id'])
                else:
                    break
    return id_list

In [24]:
## list of locales available on Yelp API ##
## 'Country name':'Country code' ##
countries = {'Czech Republic':'cs_CZ','Denmark':'da_DK','Austria':'de_AT','Switzerland':'de_CH','Germany':'de_DE',
            'Australia':'en_AU', 'Belgium':'en_BE','Canada':'en_CA','Switzerland':'en_CH','United Kingdom':'en_GB',
             'Hong Kong':'en_HK','Republic of Ireland':'en_IE','Malaysia':'en_MY','New Zealand':'en_NZ',
             'Philippines':'en_PH','Singapore':'en_SG','United States':'en_US','Argentina':'es_AR','Chile':'es_CL',
             'Spain':'es_ES','Mexico':'es_MX','Finland':'fi_FI','Philippines':'fil_PH','Belgium':'fr_BE',
             'Canada':'fr_CA', 'Switzerland':'fr_CH','France':'fr_FR', 'Switzerland':'it_CH',
             'Italy':'it_IT','Japan':'ja_JP','Malaysia':'ms_MY','Norway':'nb_NO',
             'Belgium':'nl_BE', 'The Netherlands':'nl_NL', 'Poland':'pl_PL','Brazil':'pt_BR','Portugal':'pt_PT',
              'Finland':'sv_FI', 'Sweden':'sv_SE', 'Turkey':'tr_TR', 'Hong Kong':'zh_HK', 'Taiwan':'zh_TW'}



In [25]:
### Get business IDs for restaurants for all available locations on Yelp ###
ID  = get_business_id(countries)

In [27]:
## Around 30k+ business IDs scraped from Yelp API ##
len(ID)

30400

In [30]:
df = pd.DataFrame(ID, columns =['businessID'])

In [35]:
df.to_pickle('business_ID.pkl')

# Load data

In [2]:
df = pd.read_pickle('business_ID')

In [3]:
df['businessID'].value_counts(ascending = False)

5UKX9xWwIMRLRdtzkRyZuA    2
TofRTCAfrXcvf7BYx0tqkQ    2
-Y_9AT6NpvHz15vd4B0CbA    2
iFZuhhzJCfq5qQBOV5LWKw    2
SqAMMAhQosxIcocdn-eJSQ    1
                         ..
_8vPU0nyEQ_cOtQyhc2P7w    1
qO9JM9rzZ99HwistT-y1yA    1
Wm1B9fpVOJdwrwDBYZNt3A    1
2ooCfDE50zgIkHNRCAalzw    1
y_0AvTblYwjyS1Qcv0Lpqg    1
Name: businessID, Length: 30396, dtype: int64

In [3]:
ID = df['businessID'].tolist()

# Get reviews

In [8]:
## Get user reviews for the restaurant IDs scraped above ##
def get_reviews(bizID):
    """Get user reviews for different business IDs
      returns a dataframe with biz_id, review id, review text, review rate and user id
  """
    data_final = pd.DataFrame()
    for i in bizID:
        data ={}
        url_rev = "https://api.yelp.com/v3/businesses/{}/reviews".format(i)
        API_KEY = 'API'
        headers = {'Authorization': 'bearer %s' % API_KEY}
        res = requests.get(url_rev, headers = headers)
        retrieve_rev = res.json()
        my_list = retrieve_rev['reviews']
        for j in my_list:
            data['biz_id']= i
            data['review_ID'] = j['id']
            data['review_text'] = j['text']
            data['review_rate'] = j['rating']
            data['review_time'] = j['time_created']
            data['userid'] = j['user']['id']
            dataset = pd.DataFrame([data])
            data_final = pd.concat([dataset, data_final],axis = 0)
    return data_final

Yelp returns 3 reviews per business ID. But we can only make 5000 API calls per day. Considering we have 30396 business IDs X 3 reviews , it far exceeds 5000 API calls. So the user reviews were retrieved in blocks. 
For instance in the code below we retrieve data for ID in rows 425 to 594

In [None]:
data_final = pd.DataFrame()
for i in ID[425:594]:
        data ={}
        url_rev = "https://api.yelp.com/v3/businesses/{}/reviews".format(i)
        API_KEY = 'API'
        headers = {'Authorization': 'bearer %s' % API_KEY}
        res_new = requests.get(url_rev, headers = headers)
        retrieve_rev_new = res_new.json()
        my_list = retrieve_rev_new['reviews']
        for j in my_list:
            data['biz_id']= i
            data['review_ID'] = j['id']
            data['review_text'] = j['text']
            data['review_rate'] = j['rating']
            data['review_time'] = j['time_created']
            data['userid'] = j['user']['id']
            dataset = pd.DataFrame([data])
            data_final = pd.concat([dataset, data_final],axis = 0)
            
           # data_final = pd.concat([dataset, data_final],axis = 0)

In [64]:
url_rev

'https://api.yelp.com/v3/businesses/iXawIr-ldV5Y-tSo4mQadA/reviews'

In [58]:
i

'ytn7WJi3y-KzSPYZFwRybA'

In [59]:
data_final

Unnamed: 0,biz_id,review_ID,review_text,review_rate,review_time,userid
0,2Uc7gpy3pyNBa1r-PFE_Yg,WK3tqWzoWuapvJWqjrYAzA,By far the best food in Prague! Very good pric...,5,2016-10-21 06:18:58,vT3LoECamH6elCxLDZ7DvQ
0,2Uc7gpy3pyNBa1r-PFE_Yg,DdSy510-P7WhZ5oHZkrHEA,Wonderful quaint Greek restaurant nestled in a...,4,2017-05-13 11:46:52,F2cMAsmFfxT1iy92fVCmFw
0,2Uc7gpy3pyNBa1r-PFE_Yg,xzdI-INb3RZbyIM_peUJYg,A place with great atmosphere and the best Gre...,5,2020-02-16 01:11:48,NfZNMppuV4hxLIcGP8voOg
0,yzXTHivtYStUD5v_QPO6IA,wcuRIGlsBC98__Mg44XmGQ,I had the chicken steak and it was a very very...,5,2018-08-08 11:45:27,IzOxW_z1DAAmF70uTwNLjg
0,yzXTHivtYStUD5v_QPO6IA,3qp43SpGwFMRuyw2cSErPg,Great restaurant. I couldn't make up my mind ...,5,2019-10-22 14:18:00,cI2_2ioC0M3_XkA1_NhLEw
...,...,...,...,...,...,...
0,o-M1z05z7MBTk_m2GCS6bA,l-qCoF5sHhov0iWtXOFKRA,This was such a great place to get some drinks...,5,2019-06-07 12:52:27,vkteh7lM65ZBn-ZG-KfhKw
0,o-M1z05z7MBTk_m2GCS6bA,Haz0_SI_aNC-gxeSAL29hQ,"This is a rare gem ! Lovely, well located, de...",5,2019-06-10 10:03:02,-NShH45YjbP72USLZgjn8g
0,vFl3TGZixk4jf2_qxr3Gmg,ZTEG3rkMxPjkfk5QIsQq3Q,A cool place near the river to drink beer. Can...,3,2016-09-13 04:42:46,ZH2oefuJlPKhynQRpxBb7g
0,vFl3TGZixk4jf2_qxr3Gmg,4e-k89JyH_EWaouD3FOGWg,Thanks for making Prague an even awesome-er (m...,5,2015-09-03 00:25:07,gtKm-eFYW8b6FBBGi3OPtQ


However, only 3 reviews per restaurant is very small data to build a recommender system utilizing Latent Factor Collaborative Filtering