In [1]:
import os
import requests
import datetime
import time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

In [2]:
# Start a Chrome session with default options. The chromedriver location
# returned by webdriver_manager is handed to Chrome as its first argument.
# NOTE(review): newer Selenium releases may expect the driver path to be
# wrapped in a Service object instead -- confirm against the installed version.
options = Options()
driver_path = ChromeDriverManager().install()
browser = Chrome(driver_path, options=options)

[WDM] - Current google-chrome version is 110.0.5481
[WDM] - Get LATEST driver version for 110.0.5481






[WDM] - Driver [/Users/natalie/.wdm/drivers/chromedriver/mac64/110.0.5481.77/chromedriver] found in cache


In [3]:
# Maximise the browser window, then pause for 5 seconds before the notebook
# goes on to request any page.
browser.maximize_window()
time.sleep(5)

## Functions to Collect All Reviews for a Restaurant

In [4]:
def get_restaurant_details(browser):
    """Read the restaurant header fields from the page currently loaded in `browser`.

    The page source is parsed once and the fields are looked up by CSS class.

    Args:
        browser: object exposing the current page's HTML as `page_source`.

    Returns:
        Tuple `(name, num_reviews, neighbourhood, price, categories)` where
        `name`, `neighbourhood` and `price` are element texts with newlines
        removed, `num_reviews` is an int, and `categories` is a list with the
        raw text of every "venue-tag" element.

    Raises:
        AttributeError: if one of the single-value elements is not on the page.
        IndexError, ValueError: if the review-count text is not in the
            expected layout.
    """
    soup = BeautifulSoup(browser.page_source, "html.parser")

    def flat_text(css_class):
        # Text of the first element carrying `css_class`, newlines removed.
        return soup.find(class_=css_class).text.replace('\n', '')

    name = flat_text("venue-name")

    # The count is read from the second blank-line-separated chunk of the
    # element text, taking whatever precedes " Review".
    count_text = soup.find(class_="venue-count-reviews").text
    count_chunk = count_text.split('\n\n')[1]
    num_reviews = int(count_chunk.split(' Review')[0])

    neighbourhood = flat_text("venue-area")
    price = flat_text("venue-price")

    categories = [tag.text for tag in soup.find_all(class_="venue-tag")]

    return name, num_reviews, neighbourhood, price, categories

In [5]:
def _find_text(node, css_class):
    """Return the text of the first element under `node` with `css_class`, or None if absent."""
    if node is None:
        return None
    match = node.find(class_=css_class)
    return None if match is None else match.text


def _is_recent_review(date, cutoff_year=2020):
    """Return False only when `date` ends in a parseable year earlier than `cutoff_year`."""
    # Strings containing 'ago' or 'at' are treated as recent without a year check.
    if ('ago' in date) or ('at' in date):
        return True
    try:
        return int(date.split(', ')[-1]) >= cutoff_year
    except ValueError:
        # No parseable year: keep the review rather than stop the scrape.
        return True


def get_restaurant_reviews_per_page(browser, reviews_by_restaurant, url, name, neighbourhood, price, categories):
    """Collect the review cards currently present in the browser's page source.

    Every "food card feed-item" element is read in document order and appended
    to `reviews_by_restaurant` as
    `[url, name, neighbourhood, price, categories, review, user, date]`.
    Scanning stops at the first card whose date carries a year before 2020.

    The whole page is parsed on every call. If the "load more" button appends
    cards to the existing feed (the usual behaviour -- confirm against the
    site), earlier cards are seen again on later calls, so a card whose
    (review, user, date) already exists in `reviews_by_restaurant` is skipped
    instead of being stored twice.

    Args:
        browser: object exposing the current page's HTML as `page_source`.
        reviews_by_restaurant: list of rows gathered so far; extended in place.
        url, name, neighbourhood, price, categories: restaurant fields copied
            into each row unchanged.

    Returns:
        Tuple `(reviews_by_restaurant, continue_loading)`. `continue_loading`
        is False once a card older than the cutoff was reached and True
        otherwise, including when the page holds no review cards at all.
    """
    page_html = BeautifulSoup(browser.page_source, "html.parser")
    all_reviews = page_html.find_all(class_="food card feed-item")

    # Bound before the loop so that a page without cards still returns a value.
    continue_loading = True
    # Columns 5..7 of a stored row are (review, user, date).
    already_collected = {tuple(row[5:8]) for row in reviews_by_restaurant}

    for review_listing in all_reviews:
        # A missing element yields None for that field, for all three fields alike.
        review = _find_text(review_listing, "food-description")

        user_card = review_listing.find(class_="food-user card-item")

        user = _find_text(user_card, "card-item-set--link-title")
        if user is not None:
            user = user.replace('\n', '')

        date = _find_text(user_card, "card-item-set--link-subtitle")
        if date is not None:
            # Keep only the part before the '·' separator.
            date = date.split('·')[0].replace('\n', '')
            # Only take recent reviews, back to 2020.
            if not _is_recent_review(date):
                continue_loading = False
                break  # no need to take the remaining reviews on the page

        key = (review, user, date)
        if key in already_collected:
            continue
        already_collected.add(key)
        reviews_by_restaurant.append([url, name, neighbourhood, price, categories, review, user, date])

    return reviews_by_restaurant, continue_loading

In [6]:
def get_all_reviews(browser, url, name, neighbourhood, price, categories):
    """Collect a restaurant's reviews, pressing "load more" until no further page is needed.

    Each pass hands the current page to `get_restaurant_reviews_per_page`. While
    that call reports that loading should continue, the page is scrolled to the
    bottom and the element with id "load-more-reviews" is activated with ENTER.
    The loop ends when the per-page call reports the cutoff, or when the button
    does not become clickable within 10 seconds (or any other error occurs
    while pressing it); in the latter case the exception is printed.

    Args:
        browser: Selenium driver already showing the restaurant page.
        url, name, neighbourhood, price, categories: restaurant fields passed
            through to every collected row.

    Returns:
        List of review rows gathered across all loaded pages.
    """
    reviews_by_restaurant = []
    continue_loading = True

    while continue_loading: # to check if need to continue to click load more
        # for each page, collect review data
        reviews_by_restaurant, continue_loading = get_restaurant_reviews_per_page(browser, reviews_by_restaurant, url, name, neighbourhood, price, categories)
        if not continue_loading:
            # Cutoff reached: skip the pause and the extra "load more" request
            # whose result would never be read.
            break
        time.sleep(2)

        try:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            more_reviews = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable((By.ID, "load-more-reviews")))
            more_reviews.send_keys(Keys.ENTER)
            time.sleep(3) # pause after pressing the button before the page is parsed again
        except Exception as e:
            # Deliberately broad: any failure to press the button ends the
            # loop and keeps what was collected so far.
            continue_loading = False
            print(e)

    return reviews_by_restaurant
        

## Scrape Reviews for Restaurants

In [7]:
# Spreadsheet listing the restaurants to process; the 'link' column supplies
# the URLs that the scraping loop opens one by one.
file_name = 'restaurant-data/data_00.xlsx' ### CHANGE THIS
restaurant_table = pd.read_excel(file_name, index_col=0, engine='openpyxl')
restaurant_list = restaurant_table['link']

In [8]:
# Running log, one line per restaurant that is skipped by the scraping loop;
# the final cell writes it to a text file.
output_text = ""

In [9]:
# Visit every restaurant URL, save its reviews to one CSV per restaurant, and
# log the restaurants that are skipped (too few reviews, closed, or failed).
output_dir = 'restaurant-data/section0/' ### CHANGE FOLDER TO YOUR SECTION
os.makedirs(output_dir, exist_ok=True)  # create the folder up front so a finished scrape is never lost at write time
written_files = set()  # CSV paths written in this run, used to detect name clashes

for url in restaurant_list:
    try:
        browser.get(url)
        name, num_reviews, neighbourhood, price, categories = get_restaurant_details(browser)

        if num_reviews <= 20:
            message = "### TOO FEW ### " + name + ' not collected. Only has ' + str(num_reviews) + ' reviews'
        elif name.startswith("[CLOSED] "): # check if restaurant closed down
            message = "### CLOSED  ### " + name
        else:
            reviews_by_restaurant = get_all_reviews(browser, url, name, neighbourhood, price, categories)
            reviews_by_restaurant_df = pd.DataFrame(reviews_by_restaurant, columns=['url', 'name', 'neighbourhood', 'price', 'categories', 'review', 'user', 'date'])

            # The name comes from the scraped page: path separators in it
            # would point the write at a different (possibly missing) folder.
            safe_name = name.replace('/', '_').replace('\\', '_')
            csv_path = output_dir + safe_name + '_reviews.csv'
            if csv_path in written_files:
                # Another restaurant with the same name was already saved in
                # this run; add the last URL segment so it is not overwritten.
                url_slug = url.rstrip('/').rsplit('/', 1)[-1]
                csv_path = output_dir + safe_name + '_' + url_slug + '_reviews.csv'
            reviews_by_restaurant_df.to_csv(csv_path)
            written_files.add(csv_path)

            time.sleep(5) # sleep for each restaurant
            continue
    except Exception as error:
        # One failing page must not abort the whole run: record it and move on.
        message = "### ERROR   ### " + str(url) + ' not collected. ' + repr(error)

    output_text = output_text + "\n" + message
    print(message)

### TOO FEW ### Ya Kun Kaya Toast not collected. Only has 1 reviews
### TOO FEW ### 511 Lor Mee not collected. Only has 5 reviews
### TOO FEW ### McDonald's not collected. Only has 1 reviews
### TOO FEW ### Tian Yu Tian Fish Head Steamboat not collected. Only has 7 reviews
### TOO FEW ### Kiroi Freshly Baked Cheese Cake not collected. Only has 13 reviews


In [10]:
# Save the skip log built during the scrape. The encoding is fixed to UTF-8 so
# the file is written the same way on every platform, whatever characters the
# scraped restaurant names contain; the `with` block closes the file itself.
with open("section0_errors.txt", "w", encoding="utf-8") as text_file: ### CHANGE FILE NAME
    text_file.write(output_text)