# Scrapes "Rotten Tomatoes" reviews using Selenium

More detailed:
 - List of movie titles as input (eventually stored somewhere in the repo)
 - Maybe a number N as input too for number of reviews to scrape per movie
 - Uses selenium to open reviews by "Top Critics" and "Verified Audience" for the given movie
 - Based on N, clicks "Next" to keep scraping reviews until the required number is reached or no more reviews exist

# Setup

## Imports

In [136]:
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup as bs
import re

## Selenium configs

In [3]:
# specify Edge driver location
service = webdriver.EdgeService(executable_path='../selenium/msedgedriver.exe')

# 
# options = webdriver.EdgeOptions()

In [4]:
def open_browser_instance():
    '''For instantiating a selenium browser'
    Tries to visit the rottentomatoes homepage
    Waits max 30 seconds for it to load (waits for cookie popup to load)
    If it loads, clicks reject cookies and returns the instance
    If it fails to load, aborts and prints an error message'''
    
    browser = webdriver.Edge()
    browser.get('https://www.rottentomatoes.com/')
    
    # wait until cookie popup appears
    try:
        reject_button = WebDriverWait(browser, 30).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="onetrust-reject-all-handler"]'))
        )
        print('Cookie popup appeared')
    except TimeoutException:
        print('Cookie popup never appeared')

    # click reject button
    reject_button.click()
    print('Cookies rejected')
    
    return browser

In [133]:
def grab_critic_reviewer_data(browser, reviews):
    '''Given a selenium instance of critic reviews on rotten tomatoes and the reviews dict, uses BeautifulSoup to find all:
     - reviewer_name
     - is_rotten
     - review_text
    Adds them to the reviews dict and returns it'''

    soup = bs(browser.page_source)

    # get names
    reviewer_name_soup = soup.find_all('a', class_='display-name')
    for i in reviewer_name_soup:
        name = i.contents[0].strip()
        reviews['reviewer_name'].append(name)
    
    # get review texts
    reviewer_text_soup = soup.find_all('p', class_='review-text')
    for i in reviewer_text_soup:
        review_text = i.contents[0].strip()
        reviews['review_text'].append(review_text)

    # get reviewer rating
    reviewer_rating_soup = soup.find_all('score-icon-critic-deprecated')
    for i in reviewer_rating_soup:
        # finds unwanted tags (not sure who they belong to or what) so need to exclude those
        # the desired reviews have hidden rotten percentages, we only see if they are fesh or rotten
        percentage = i['percentage']
        if percentage == 'hide':
            rating = i['state']
            if rating == 'rotten':
                reviews['is_rotten'].append(True)
            elif rating == 'fresh':
                reviews['is_rotten'].append(False)
            else:
                print('Something went wrong when checking review rating')
                break

    return reviews

In [139]:
def scrape_critic_reviews(browser, movie_title, N):
    '''Given a selenium instance, movie title, and number of reviews to scrape:
     - Loads the first page for "Top Critics"
     - Scrapes the 20 reviews in the page (reviewer name, if movie is rotten or a tomato, and text of review)
     - Clicks next button to load more reviews
     - Continues until N reviews gathered or all reviews gathered
     - Puts reviews in pandas DataFrame
     - Returns the DataFrame
    '''

    # format movie_title to appear as it should in rotten tomatoes url
    movie_title = movie_title.replace(" ", "_").lower()
    url = 'https://www.rottentomatoes.com/m/' + movie_title + '/reviews?type=top_critics'

    # load first page
    browser.get(url)

    # wait until the text of first review appears
    WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((By.XPATH, '/html/body/div[3]/main/div/div/section/div/div[2]/div[1]/div[2]/p[1]'))
    )

    # scrape initial page
    reviews = {'reviewer_name': [], 'review_text': [], 'is_rotten': []}
    reviews = grab_critic_reviewer_data(browser, reviews)

    # keep clicking next page and scraping more reviews
    done = False
    while done == False:
        # stop when either the desired number of reviews scraped (N) is reached...
        if len(reviews) >= N:
            done = True
            break

        # ...or when no next page exists
        try:
            next_page = browser.find_element(By.XPATH, '/html/body/div[3]/main/div/div/section/div/div[1]/rt-button[2]')
            next_page.click()
        except NoSuchElementException:
            done = True
            break

        # wait until the text of first review appears
        WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div[3]/main/div/div/section/div/div[2]/div[1]/div[2]/p[1]'))
        )

        # scrape it
        reviews = grab_critic_reviewer_data(browser, reviews)
    
    # print(reviews)

    # # turn data into pandas DataFrame
    df = pd.DataFrame(reviews)
    print(df)
    
    return 

# Running Scrapes

In [140]:
movie_title_test_list = ['Cocaine Bear']
reviews_per_movie = 100

browser = open_browser_instance()

for movie_title in movie_title_test_list:
    df = scrape_critic_reviews(browser, movie_title, reviews_per_movie)


browser.close()

Cookie popup appeared
Cookies rejected


ElementNotInteractableException: Message: element not interactable
  (Session info: MicrosoftEdge=117.0.2045.60)
Stacktrace:
	GetHandleVerifier [0x00007FF7366D99F2+63682]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF736661942+267762]
	(No symbol) [0x00007FF73641AA40]
	(No symbol) [0x00007FF73646C991]
	(No symbol) [0x00007FF73645F1DC]
	(No symbol) [0x00007FF73648C84A]
	(No symbol) [0x00007FF73645EC51]
	(No symbol) [0x00007FF73645EAFD]
	(No symbol) [0x00007FF73648CB70]
	(No symbol) [0x00007FF73645EC51]
	(No symbol) [0x00007FF7364A6A81]
	(No symbol) [0x00007FF73648C623]
	(No symbol) [0x00007FF73645DA7A]
	(No symbol) [0x00007FF73645CD6B]
	(No symbol) [0x00007FF73645E204]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF7368A5EF9+1233737]
	(No symbol) [0x00007FF7364DB274]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF7365A32AA+33498]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF73659BDE9+3609]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF7368A4C94+1229028]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF73666BAE8+309144]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7366667C4+287860]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF7366668F2+288162]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF736659FF1+236705]
	BaseThreadInitThunk [0x00007FFAA7687344+20]
	RtlUserThreadStart [0x00007FFAA78226B1+33]
