In [None]:
import datetime
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Start the Chrome session used by every cell below; the driver binary is
# located (and installed if needed) by webdriver_manager.
# NOTE(review): the driver path is passed positionally -- confirm the installed
# Selenium version still accepts that call form.
options = Options()
driver_path = ChromeDriverManager().install()
browser = Chrome(driver_path, options=options)

In [None]:
# Enlarge the browser window, then pause before anything else runs
# (presumably so the window has settled before the first page load -- TODO confirm).
startup_pause_seconds = 5
browser.maximize_window()
time.sleep(startup_pause_seconds)

## Functions to Collect All Reviews for a Restaurant

In [None]:
def _optional_text(page_html, css_class):
    """Return the newline-stripped text of the first element with ``css_class``, or None if absent."""
    element = page_html.find(class_=css_class)
    if element is None:
        return None
    return element.text.replace('\n', '')


def _parse_review_count(raw_text):
    """Extract the integer review count from the text of the review-count element.

    Accepts both "Review" and "Reviews" as well as comma-grouped digits
    ("1,234 Reviews"). If no number is directly followed by "Review(s)", the
    first integer found in the text is used instead.

    Raises:
        ValueError: if the text contains no digits at all.
    """
    match = (re.search(r"(\d[\d,]*)\s*Reviews?\b", raw_text)
             or re.search(r"(\d[\d,]*)", raw_text))
    if match is None:
        raise ValueError("Could not find a review count in: " + repr(raw_text))
    return int(match.group(1).replace(",", ""))


def get_restaurant_details(browser):
    """Read the restaurant's summary fields from the page currently loaded in ``browser``.

    The page source is parsed as it is at call time, i.e. before any additional
    reviews have been loaded.

    Args:
        browser: object exposing the current page's HTML as ``page_source``.

    Returns:
        A tuple ``(name, num_reviews, neighbourhood, price, categories)``:
        ``name`` is a string with newlines removed, ``num_reviews`` is an int,
        ``neighbourhood`` and ``price`` are strings with newlines removed or
        None when the page has no such element, and ``categories`` is a list
        with the text of every ``venue-tag`` element.

    Raises:
        AttributeError: if the ``venue-name`` or ``venue-count-reviews``
            element is missing from the page.
        ValueError: if the review-count text contains no number.
    """
    page_html_pre_load_more = BeautifulSoup(browser.page_source, "html.parser")

    # Name and review count are required: the caller filters on both of them.
    name = page_html_pre_load_more.find(class_="venue-name").text.replace('\n', '')
    num_reviews = _parse_review_count(
        page_html_pre_load_more.find(class_="venue-count-reviews").text)

    # Area and price are optional; a venue without them should not abort the scrape.
    neighbourhood = _optional_text(page_html_pre_load_more, "venue-area")
    price = _optional_text(page_html_pre_load_more, "venue-price")

    categories = [category_html.text
                  for category_html in page_html_pre_load_more.find_all(class_="venue-tag")]

    return name, num_reviews, neighbourhood, price, categories

In [None]:
def load_all_reviews(browser, max_pages=5, wait_timeout=10, pause_seconds=3):
    """Repeatedly press the "load more reviews" button on the current page.

    Each round scrolls to the bottom of the page, waits for the element with
    id ``load-more-reviews`` to become clickable, sends it ENTER and then
    pauses so the next batch can render.

    Args:
        browser: Selenium driver showing the restaurant page.
        max_pages: maximum number of successful button presses. The default of
            5 keeps the previous behaviour; pass None to continue until the
            button can no longer be pressed.
        wait_timeout: seconds to wait for the button to become clickable.
        pause_seconds: seconds to sleep after each press.

    Returns:
        The number of times the button was pressed successfully.
    """
    pages_loaded = 0
    while max_pages is None or pages_loaded < max_pages:
        try:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            more_reviews = WebDriverWait(browser, wait_timeout).until(
                EC.element_to_be_clickable((By.ID, "load-more-reviews")))
            more_reviews.send_keys(Keys.ENTER)
            time.sleep(pause_seconds)
        except Exception as e:
            # Deliberately broad: any failure (for example the wait timing out
            # because the button is no longer available) ends the loading loop
            # instead of aborting the whole scrape.
            print(e)
            break
        pages_loaded += 1
    return pages_loaded

In [None]:
def _text_or_none(parent, css_class):
    """Return the text of the first ``css_class`` element under ``parent``.

    Returns None when ``parent`` itself is None or contains no such element.
    """
    if parent is None:
        return None
    element = parent.find(class_=css_class)
    return None if element is None else element.text


def get_restaurant_reviews(browser, url, name, num_reviews, neighbourhood, price, categories):
    """Collect the reviews shown on the restaurant page loaded in ``browser``.

    Calls ``load_all_reviews`` first so that additional reviews are rendered,
    then parses every ``food card feed-item`` element on the page.

    Args:
        browser: Selenium driver showing the restaurant page.
        url, name, neighbourhood, price, categories: restaurant details copied
            unchanged into every output row.
        num_reviews: expected number of reviews; only used to print a notice
            when the number of collected reviews differs.

    Returns:
        A list with one row per review, each row being
        ``[url, name, neighbourhood, price, categories, review, user, date]``.
        ``review``, ``user`` and ``date`` are None when the corresponding
        element is missing from the review card.
    """
    load_all_reviews(browser)

    # find all reviews for each restaurant
    page_html = BeautifulSoup(browser.page_source, "html.parser")
    all_reviews = page_html.find_all(class_="food card feed-item")

    if len(all_reviews) != num_reviews:
        print("### DIFF NUM ###: " + str(len(all_reviews)) + ' out of ' + str(num_reviews) + ' reviews collected.')

    reviews_by_restaurant = []
    for review_listing in all_reviews:
        # A card without a description yields None instead of aborting the
        # whole restaurant, matching how missing user/date are handled.
        review = _text_or_none(review_listing, "food-description")

        user_card = review_listing.find(class_="food-user card-item")

        user = _text_or_none(user_card, "card-item-set--link-title")
        if user is not None:
            user = user.replace('\n', '')

        date = _text_or_none(user_card, "card-item-set--link-subtitle")
        if date is not None:
            # Keep only the part of the subtitle that precedes the first '·'.
            date = date.split('·')[0].replace('\n', '')

        reviews_by_restaurant.append([url, name, neighbourhood, price, categories, review, user, date])

    return reviews_by_restaurant

## Scrape Reviews for Restaurants

In [None]:
# CSV holding the restaurant page URLs to scrape in this run (column 'links',
# first column used as the index).
file_name = 'restaurant_links_0.csv' ### CHANGE THIS
links_table = pd.read_csv(file_name, index_col=0)
restaurant_list = links_table['links']

In [None]:
# Make sure the output directory exists so the first to_csv call cannot fail
# on a fresh checkout.
os.makedirs('restaurant-data', exist_ok=True)

# Visit every restaurant page and write one CSV of reviews per restaurant.
for url in restaurant_list:
    browser.get(url)
    time.sleep(5) # sleep for each restaurant

    name, num_reviews, neighbourhood, price, categories = get_restaurant_details(browser)

    # The review-count filter is checked first, so a closed venue with too few
    # reviews is reported as "TOO FEW".
    if num_reviews <= 50:
        print("### TOO FEW  ### " + name + ' not collected. Only has ' + str(num_reviews) + ' reviews')
        continue
    if name.startswith("[CLOSED] "):
        print("### CLOSED   ### " + name)
        continue

    reviews_by_restaurant = get_restaurant_reviews(browser, url, name, num_reviews, neighbourhood, price, categories)
    reviews_by_restaurant_df = pd.DataFrame(reviews_by_restaurant, columns=['url', 'name', 'neighbourhood', 'price', 'categories', 'review', 'user', 'date'])

    # Path separators inside a venue name would otherwise be interpreted as
    # directories; the unmodified name is still stored in the 'name' column.
    safe_name = name.replace('/', '_').replace('\\', '_')
    reviews_by_restaurant_df.to_csv('restaurant-data/' + safe_name + '_reviews.csv')