# Scrapes "Rotten Tomatoes" reviews using Selenium

### Notebook info:
 - The functions are defined first and then called at the bottom of the notebook
 - Uses a list of movie titles (We need to create one at some point)
 - Can customise how many reviews per movie to scrape
 - Scrapes both "Top Critics" and "Verified Audience" reviews

# Setup

## Imports

In [None]:
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup as bs
import time

# specify Edge driver location for selenium
# NOTE(review): `service` is not passed to webdriver.Edge() in open_browser_instance,
# so this driver path currently has no effect - confirm whether it should be used
service = webdriver.EdgeService(executable_path='../selenium/msedgedriver.exe')

# note to self for listing versions
# pip list --format=freeze

## Functions

The critic and audience review pages differ slightly, so there are two sections of largely copy-pasted code, each adjusted for those differences.

Was not bothered to make it nicer in the moment lmao.

In [None]:
def open_browser_instance():
    '''For instantiating a selenium browser.

    Visits the rottentomatoes homepage and waits max 30 seconds for the
    cookie popup's "reject all" button to become clickable, then clicks it.

    Returns:
        The selenium Edge instance, with the cookie popup dismissed.

    Raises:
        Whatever selenium raised while loading the page, waiting for the
        popup or clicking the button. Before re-raising, an error message
        is printed and the browser is shut down.
    '''

    browser = webdriver.Edge()

    try:
        browser.get('https://www.rottentomatoes.com/')

        # wait until cookie popup appears
        reject_button = WebDriverWait(browser, 30).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="onetrust-reject-all-handler"]'))
        )

        # click reject button
        reject_button.click()
    except Exception:
        # abort: do not leave an orphaned browser window / driver process behind
        print('Failed to load the rottentomatoes homepage, closing the browser')
        browser.quit()
        raise

    return browser

### Top Critic functions

In [None]:
def grab_critic_reviewer_data(browser, reviews):
    '''Given a selenium instance of critic reviews on rotten tomatoes and the reviews dict,
    uses BeautifulSoup to find, on the currently loaded page, all:
     - reviewer_name (first child of each <a class="display-name">)
     - review_text (first child of each <p class="review-text">)
     - is_rotten (True/False from the "state" of each <score-icon-critic-deprecated>
       tag whose "percentage" attribute is "hide")

    Args:
        browser: selenium instance whose page_source is parsed
        reviews: dict holding lists under 'reviewer_name', 'review_text' and
            'is_rotten'; the lists are appended to in place

    Returns:
        The same reviews dict that was passed in
    '''

    # name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's "guessed parser" warning)
    soup = bs(browser.page_source, 'html.parser')

    # get names
    for name_tag in soup.find_all('a', class_='display-name'):
        reviews['reviewer_name'].append(name_tag.contents[0].strip())

    # get review texts
    for text_tag in soup.find_all('p', class_='review-text'):
        reviews['review_text'].append(text_tag.contents[0].strip())

    # get reviewer rating
    for icon in soup.find_all('score-icon-critic-deprecated'):
        # finds unwanted tags (not sure who they belong to or what) so need to exclude those
        # the desired reviews have hidden rotten percentages, we only see if they are fresh or rotten
        # .get() so that a tag without the attribute is skipped instead of raising KeyError
        if icon.get('percentage') != 'hide':
            continue

        rating = icon.get('state')
        if rating == 'rotten':
            reviews['is_rotten'].append(True)
        elif rating == 'fresh':
            reviews['is_rotten'].append(False)
        else:
            # stops reading ratings for this page; remaining icons are not processed
            print('Something went wrong when checking review rating')
            break

    return reviews

In [None]:
def create_critic_url(movie_title):
    '''Builds the rotten tomatoes "Top Critics" reviews url for a movie title.

    The title is turned into the form used in the url by:
     - lowercasing all letters
     - turning spaces and dashes into underscores
     - dropping every :
    '''

    # one pass over the lowercased title: ' ' and '-' become '_', ':' is deleted
    slug_table = str.maketrans({' ': '_', '-': '_', ':': None})
    slug = movie_title.lower().translate(slug_table)

    prefix = 'https://www.rottentomatoes.com/m/'
    suffix = '/reviews?type=top_critics'

    return prefix + slug + suffix

In [None]:
def scrape_critic_reviews(browser, movie_title, N):
    '''Given a selenium instance, movie title, and number of reviews to scrape:
     - Loads the first page for "Top Critics"
     - Scrapes the reviews in the page (reviewer name, if movie is rotten or a tomato, and text of review)
     - Clicks next button to load more reviews
     - Continues until at least N reviews gathered or all reviews gathered
     - Puts reviews in pandas DataFrame
     - Returns the DataFrame

    Whole pages are scraped at a time and nothing is trimmed, so the result
    can hold more than N reviews.

    Args:
        browser: selenium instance to drive
        movie_title: movie title, converted to a url by create_critic_url
        N: number of reviews after which no further pages are loaded

    Returns:
        DataFrame with columns movie_title, reviewer_name, review_text, is_rotten
    '''

    # absolute XPaths into the reviews page
    first_review_xpath = '/html/body/div[3]/main/div/div/section/div/div[2]/div[1]/div[2]/p[1]'
    buttons_xpath = '/html/body/div[3]/main/div/div/section/div/div[1]'
    next_button_xpath = '/html/body/div[3]/main/div/div/section/div/div[1]/rt-button[2]'

    # getting full rotten tomatoes url for top critic reviews of given movie
    url = create_critic_url(movie_title)

    # load first page
    browser.get(url)

    # wait until the text of first review appears
    WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((By.XPATH, first_review_xpath))
    )

    reviews = {'movie_title': [], 'reviewer_name': [], 'review_text': [], 'is_rotten': []}

    # scrape initial page
    reviews = grab_critic_reviewer_data(browser, reviews)

    # keep clicking next page and scraping more reviews
    # stops when either the desired number of reviews scraped (N) is reached or when no more reviews to scrape exist
    while True:
        # Check for if N has been reached
        # (count the gathered reviews; len(reviews) would only count the dict's keys)
        if len(reviews['reviewer_name']) >= N:
            print(f'Found {N} reviews for {movie_title}')
            break

        # Issue with not being able to smartly check when prev/next buttons are loaded, so using time.sleep
        time.sleep(1)

        # Check if next button is missing
        buttons = browser.find_element(By.XPATH, buttons_xpath)
        soup = bs(buttons.get_attribute('innerHTML'), 'html.parser')
        if soup.find('rt-button', class_='next hide'):
            print(f'Only found {len(reviews["reviewer_name"])} reviews for {movie_title}')
            break

        # Next button must exist, click it
        browser.find_element(By.XPATH, next_button_xpath).click()

        # Issue with not being able to smartly check when next page is loaded, so using time.sleep
        time.sleep(1)

        # scrape it
        reviews = grab_critic_reviewer_data(browser, reviews)

    # also adding movie titles, one per gathered review
    reviews['movie_title'] = [movie_title] * len(reviews['reviewer_name'])

    return pd.DataFrame(reviews)

### Verified Audience functions

In [None]:
def grab_audience_reviewer_data(browser, reviews):
    '''Given a selenium instance of audience reviews on rotten tomatoes and the reviews dict,
    uses BeautifulSoup to find, on the currently loaded page, all:
     - reviewer_name (text of the second child of each <div class="audience-reviews__name-wrap">)
     - review_text (first child of each <p class="audience-reviews__review js-review-text">)
     - review_score (number of filled stars in each <span class="star-display">)

    Args:
        browser: selenium instance whose page_source is parsed
        reviews: dict holding lists under 'reviewer_name', 'review_text' and
            'review_score'; the lists are appended to in place

    Returns:
        The same reviews dict that was passed in
    '''

    # name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's "guessed parser" warning)
    soup = bs(browser.page_source, 'html.parser')

    # get names
    for name_wrap in soup.find_all('div', class_='audience-reviews__name-wrap'):
        reviews['reviewer_name'].append(name_wrap.contents[1].text.strip())

    # get review texts
    for text_tag in soup.find_all('p', class_='audience-reviews__review js-review-text'):
        reviews['review_text'].append(text_tag.contents[0].strip())

    # get reviewer rating
    for star_display in soup.find_all('span', class_='star-display'):
        # count number of filled stars
        # only direct child tags are looked at, so whitespace/text nodes between
        # the stars (which have no attributes) cannot break the count
        filled = 0
        for star in star_display.find_all(True, recursive=False):
            classes = star.get('class') or []
            if classes and classes[0] == 'star-display__filled':
                filled += 1
        # NOTE(review): stars with any other first class count as 0 - confirm
        # whether the page marks partially filled stars that should be scored
        reviews['review_score'].append(filled)

    return reviews

In [None]:
def create_audience_url(movie_title):
    '''Builds the rotten tomatoes "Verified Audience" reviews url for a movie title.

    The title is turned into the form used in the url by:
     - lowercasing all letters
     - turning spaces into underscores
     - turning dashes into underscores
     - dropping every :
    '''

    slug = movie_title.lower()

    # (old, new) pairs applied one after another, in this order
    substitutions = ((' ', '_'), ('-', '_'), (':', ''))
    for old, new in substitutions:
        slug = slug.replace(old, new)

    prefix = 'https://www.rottentomatoes.com/m/'
    suffix = '/reviews?type=verified_audience&intcmp=rt-scorecard_audience-score-reviews'

    return prefix + slug + suffix

In [None]:
def scrape_audience_reviews(browser, movie_title, N):
    '''Given a selenium instance, movie title, and number of reviews to scrape:
     - Loads the first page for "Verified Audience"
     - Scrapes the reviews in the page (reviewer name, 1-5 score of movie, and text of review)
     - Clicks next button to load more reviews
     - Continues until at least N reviews gathered or all reviews gathered
     - Puts reviews in pandas DataFrame
     - Returns the DataFrame

    Whole pages are scraped at a time and nothing is trimmed, so the result
    can hold more than N reviews.

    Args:
        browser: selenium instance to drive
        movie_title: movie title, converted to a url by create_audience_url
        N: number of reviews after which no further pages are loaded

    Returns:
        DataFrame with columns movie_title, reviewer_name, review_text, review_score
    '''

    # absolute XPaths into the reviews page
    first_review_xpath = '/html/body/div[3]/main/div/div/section/div/div[2]/div[2]/div[1]/div[2]/drawer-more/p'
    buttons_xpath = '/html/body/div[3]/main/div/div/section/div/div[1]'
    next_button_xpath = '/html/body/div[3]/main/div/div/section/div/div[1]/rt-button[2]'

    # getting full rotten tomatoes url for verified audience reviews of given movie
    url = create_audience_url(movie_title)

    # load first page
    browser.get(url)

    # wait until the text of first review appears
    WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((By.XPATH, first_review_xpath))
    )

    reviews = {'movie_title': [], 'reviewer_name': [], 'review_text': [], 'review_score': []}

    # scrape initial page
    reviews = grab_audience_reviewer_data(browser, reviews)

    # keep clicking next page and scraping more reviews
    # stops when either the desired number of reviews scraped (N) is reached or when no more reviews to scrape exist
    while True:
        # Check for if N has been reached
        if len(reviews['reviewer_name']) >= N:
            print(f'Found {N} reviews for {movie_title}')
            break

        # Issue with not being able to smartly check when prev/next buttons are loaded, so using time.sleep
        time.sleep(1)

        # Check if next button is missing
        # parser named explicitly so the check does not depend on which parsers are installed
        buttons = browser.find_element(By.XPATH, buttons_xpath)
        soup = bs(buttons.get_attribute('innerHTML'), 'html.parser')
        if soup.find('rt-button', class_='next hide'):
            print(f'Only found {len(reviews["reviewer_name"])} reviews for {movie_title}')
            break

        # Next button must exist, click it
        browser.find_element(By.XPATH, next_button_xpath).click()

        # Issue with not being able to smartly check when next page is loaded, so using time.sleep
        time.sleep(1)

        # scrape it
        reviews = grab_audience_reviewer_data(browser, reviews)

    # also adding movie titles, one per gathered review
    reviews['movie_title'] = [movie_title] * len(reviews['reviewer_name'])

    return pd.DataFrame(reviews)

# Running Scrapes

In [None]:
# ~50 seconds to run scrapes with these params
# eventually use list of movies we have agreed on
# each title is passed as movie_title to the scrape functions in the cells below
movie_title_list = ['Cocaine Bear', 'Spider-Man: No Way Home']
# passed as N (number of reviews to scrape) for every movie
reviews_per_movie = 100

## Top Critics

In [None]:
# scrape "Top Critics" reviews for every movie, one DataFrame per movie
browser = open_browser_instance()

df_list = []
try:
    for movie_title in movie_title_list:
        df_list.append(scrape_critic_reviews(browser, movie_title, reviews_per_movie))
finally:
    # quit() ends the whole WebDriver session (close() only closes the current window);
    # the finally makes sure the browser is shut down even if a scrape fails
    browser.quit()

# combine the per-movie DataFrames into one
df = pd.concat(df_list, ignore_index=True, sort=False)
print(f'Scraped {df.shape[0]} total reviews!')

# saving data
df.to_csv('../data/raw/reviews/RT_top_critics.csv', index=False)

## Verified Audience

In [None]:
# scrape "Verified Audience" reviews for every movie, one DataFrame per movie
browser = open_browser_instance()

df_list = []
try:
    for movie_title in movie_title_list:
        df_list.append(scrape_audience_reviews(browser, movie_title, reviews_per_movie))
finally:
    # quit() ends the whole WebDriver session (close() only closes the current window);
    # the finally makes sure the browser is shut down even if a scrape fails
    browser.quit()

# combine the per-movie DataFrames into one
df = pd.concat(df_list, ignore_index=True, sort=False)
print(f'Scraped {df.shape[0]} total reviews!')

# saving data
df.to_csv('../data/raw/reviews/RT_verified_audience.csv', index=False)

## Minor test for if reading them is easy and if it looks right

In [None]:
# read the saved top critics csv back in and check its shape, column dtypes and first rows
df = pd.read_csv('../data/raw/reviews/RT_top_critics.csv')
print(df.shape)
print(df.dtypes)
df.head()

In [None]:
# read the saved verified audience csv back in and check its shape, column dtypes and first rows
df = pd.read_csv('../data/raw/reviews/RT_verified_audience.csv')
print(df.shape)
print(df.dtypes)
df.head()