# Scrapes "Rotten Tomatoes" reviews using Selenium

### Notebook info:
 - Various functions are put together, they are called at the bottom of the notebook
 - Uses a list of movie titles (We need to create one at some point)
 - Can customise how many reviews per movie to scrape
 - Scrapes both "Critics" and "Audience" reviews

# Setup

## Imports

In [None]:
import pandas as pd
import json
from os import listdir
from os.path import isfile, join

from selenium import webdriver
from selenium.webdriver import EdgeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup as bs
import time

# specify Edge driver location for selenium
service = webdriver.EdgeService(executable_path='../selenium/msedgedriver.exe')

# To run in headless mode, uncomment the two lines below (comment them out again to disable it).
# open_browser_instance() only passes options to the browser when a global named `options` exists.
# NOTE: add_argument() returns None, so it must not be chained onto EdgeOptions() in a single
# assignment - `options` would end up being None instead of the options object.
# options = EdgeOptions()
# options.add_argument('--headless')

# note to self for listing versions
# pip list --format=freeze

## Functions

There are some differences between critic and audience reviews, so I made two sections with a lot of copy-pasted code that accounts for those differences.

Was not bothered to make it nicer in the moment lmao.

In [None]:
def open_browser_instance():
    '''Start a selenium Edge browser and reject the Rotten Tomatoes cookie popup.

    Uses the module-level `service` for the driver location and, if a global
    named `options` has been defined, passes it to the browser as well.

    Visits the rottentomatoes homepage and waits max 30 seconds for the
    reject button of the cookie popup to become clickable, then clicks it.

    Returns:
        The running selenium browser instance.

    Raises:
        Whatever selenium raised while loading the page or handling the popup
        (e.g. a timeout of the 30 second wait). Before the exception is
        re-raised an error message is printed and the browser is shut down,
        so a failed start does not leave a browser behind.
    '''

    if 'options' in globals():
        browser = webdriver.Edge(service=service, options=options)
    else:
        browser = webdriver.Edge(service=service)

    try:
        browser.get('https://www.rottentomatoes.com/')

        # wait until cookie popup appears
        reject_button = WebDriverWait(browser, 30).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="onetrust-reject-all-handler"]'))
        )

        # click reject button
        reject_button.click()
    except Exception:
        # abort: nothing else holds a reference to this browser yet, so shut it
        # down here and let the caller see the original error
        print('Failed to open Rotten Tomatoes and reject cookies, closing the browser')
        browser.quit()
        raise

    return browser

### Critic functions

In [None]:
def grab_critic_reviewer_data(browser, reviews):
    '''Parse the critic reviews currently loaded in the browser and add them to the reviews dict.

    Given a selenium instance of critic reviews on rotten tomatoes and the reviews dict,
    uses BeautifulSoup to find, for every review row:
     - reviewer name -> appended to reviews['user']
     - review text   -> appended to reviews['text']
     - is_rotten     -> appended to reviews['is_rotten']
                        (True for "rotten", False for "fresh", None for any other state)
     - review date   -> appended to reviews['date']

    The reviews dict is modified in place and also returned.'''

    soup = bs(browser.page_source)
    row_soup = soup.find_all('div', class_='review-row')

    for row in row_soup:
        # names
        name = row.select_one('a[data-qa*=review-critic-link]').text.strip()
        
        # review text
        text = row.select_one('p[data-qa*=review-quote]').text.strip()

        # reviewer rating
        state = row.select_one('score-icon-critic-deprecated')['state']
        if state == 'fresh':
            rating = False
        elif state == 'rotten':
            rating = True
        else:
            print('Something went wrong getting rotten state')
            # store an explicit missing value: otherwise `rating` is undefined on the
            # first row, or silently reuses the previous row's value on later rows
            rating = None

        # date of review posted
        date = row.select_one('span[data-qa*=review-date]').text.strip()

        # adding data to dict
        reviews['user'].append(name)
        reviews['text'].append(text)
        reviews['is_rotten'].append(rating)
        reviews['date'].append(date)

    return reviews

In [None]:
def create_critic_url(movie_title):
    '''Build the rotten tomatoes critic reviews url for a movie title.

    A few titles whose url does not follow the usual pattern are first swapped
    for their hand-picked name (exact, case-sensitive match). The title is then:
     - lowercased
     - spaces replaced with underscores
     - dashes replaced with underscores
     - ":" and "." removed'''

    # handling annoying specific movie titles
    special_titles = {
        'the-avengers': 'marvels_the_avengers',
        'black-panther': 'black_panther_2018',
        'black-widow': 'black_widow_2021',
        'doctor-strange': 'doctor_strange_2016',
    }
    slug = special_titles.get(movie_title, movie_title).lower()

    # applied in this order: space, dash, colon, full stop
    for old, new in ((' ', '_'), ('-', '_'), (':', ''), ('.', '')):
        slug = slug.replace(old, new)

    return 'https://www.rottentomatoes.com/m/' + slug + '/reviews'

In [None]:
def scrape_critic_reviews(browser, movie_title, N):
    '''Scrape up to N critic reviews for one movie and save them to a temporary csv file.

    Given a selenium instance, movie title, and number of reviews to scrape:
     - Loads the first page of critic reviews for the movie
     - Scrapes the reviews in the page (reviewer name, if movie is rotten or a tomato, date, and text of review)
     - Clicks next button to load more reviews
     - Continues until at least N reviews are gathered or there is no next page
     - Puts reviews in pandas DataFrame, keeping at most N rows
     - Saves the DataFrame to ../data/raw/reviews/tmp_RT_critics_{movie_title}.csv

    Returns None, the csv file is the output.

    Raises RuntimeError if the prev/next buttons are still disabled after 1000
    checks following a click on the next button.
    '''

    # getting full rotten tomatoes url for critic reviews of given movie
    url = create_critic_url(movie_title)

    # load first page
    browser.get(url)

    # wait until the text of first review appears
    WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((By.XPATH, '/html/body/div[3]/main/div/div/section/div/div[2]/div[1]/div[2]/p[1]'))
    )

    reviews = {'movie': [], 'user': [], 'is_rotten': [], 'date': [], 'text': []}

    # scrape initial page
    reviews = grab_critic_reviewer_data(browser, reviews)
    time.sleep(1)

    # keep clicking next page and scraping more reviews
    # stops when either the desired number of reviews scraped (N) is reached or when no more reviews to scrape exist
    while True:
        # Check for if N has been reached
        # (count the scraped rows: len(reviews) would be the number of dict keys, which is always 5)
        if len(reviews['user']) >= N:
            print(f'Found {N} reviews for {movie_title}')
            break

        # Check if next button is missing
        buttons = browser.find_element(By.XPATH, '/html/body/div[3]/main/div/div/section/div/div[1]')
        soup = bs(buttons.get_attribute('innerHTML'))
        if soup.find('rt-button', class_='next hide'):
            print(f'Only found {len(reviews["user"])} reviews for {movie_title}')
            break

        # Next button must exist, click it
        next_button = browser.find_element(By.XPATH, '/html/body/div[3]/main/div/div/section/div/div[1]/rt-button[2]')
        next_button.click()

        # wait until buttons have finished loading
        # (bounded like the audience scraper, so a stuck page cannot hang the notebook forever)
        count = 0
        while True:
            soup = bs(browser.page_source)
            prev_html = soup.select_one('rt-button[class*=prev]').prettify()
            next_html = soup.select_one('rt-button[class*=next]').prettify()
            if 'disabled' not in prev_html and 'disabled' not in next_html:
                break
            count += 1
            if count >= 1000:
                raise RuntimeError(f'prev/next buttons never left disabled state for some reason... ({movie_title})')

        # scrape it
        reviews = grab_critic_reviewer_data(browser, reviews)
    
    # also adding movie titles
    reviews['movie'] = [movie_title] * len(reviews['user'])

    # saving to temporary file
    df = pd.DataFrame(reviews)
    df = df.head(N) # keep only N reviews (the last page can overshoot N)
    df.to_csv(f'../data/raw/reviews/tmp_RT_critics_{movie_title}.csv', index=False)
    
    return

### Audience functions

In [None]:
def grab_audience_reviewer_data(browser, reviews):
    '''Parse the audience reviews currently loaded in the browser and add them to the reviews dict.

    Given a selenium instance of audience reviews on rotten tomatoes and the reviews dict,
    uses BeautifulSoup to find, for every review row:
     - reviewer name (None when the row has no name) -> reviews['user']
     - review text                                   -> reviews['text']
     - review score (filled stars + half stars / 2)  -> reviews['score']
     - review date                                   -> reviews['date']

    The reviews dict is modified in place and also returned.'''

    page = bs(browser.page_source)

    # one div per review
    for review_row in page.find_all('div', class_='audience-review-row'):

        # the name sits in a <span> for verified users and in an <a> for non verified users,
        # sometimes a reviewer has no name at all
        name_tag = (
            review_row.select_one('span[data-qa*=review-name]')
            or review_row.select_one('a[data-qa*=review-name]')
        )
        reviewer = name_tag.text.strip() if name_tag else None

        # text of the review
        body = review_row.select_one('p[data-qa*=review-text]').text.strip()

        # score: every filled star counts 1, every half star counts 0.5
        filled = review_row.select('span[class*=star-display__filled]')
        halves = review_row.select('span[class*=star-display__half]')
        stars = len(filled) + (len(halves) / 2)

        # when the review was posted
        posted = review_row.select_one('span[data-qa*=review-duration]').text.strip()

        # only store the row once every field has been read
        reviews['user'].append(reviewer)
        reviews['text'].append(body)
        reviews['score'].append(stars)
        reviews['date'].append(posted)

    return reviews

In [None]:
def create_audience_url(movie_title):
    '''Build the rotten tomatoes audience reviews url for a movie title.

    A few titles whose url does not follow the usual pattern are first swapped
    for their hand-picked name (exact, case-sensitive match). The title is then:
     - lowercased
     - spaces replaced with underscores
     - dashes replaced with underscores
     - ":" and "." removed'''

    # handling annoying specific movie titles
    special_titles = {
        'the-avengers': 'marvels_the_avengers',
        'black-panther': 'black_panther_2018',
        'black-widow': 'black_widow_2021',
        'doctor-strange': 'doctor_strange_2016',
    }
    slug = special_titles.get(movie_title, movie_title).lower()

    # applied in this order: space, dash, colon, full stop
    for old, new in ((' ', '_'), ('-', '_'), (':', ''), ('.', '')):
        slug = slug.replace(old, new)

    return 'https://www.rottentomatoes.com/m/' + slug + '/reviews?type=user'

In [None]:
def scrape_audience_reviews(browser, movie_title, N):
    '''Scrape up to N audience reviews for one movie and save them to a temporary csv file.

    Given a selenium instance, movie title, and number of reviews to scrape:
     - Loads the first page of audience reviews for the movie
     - Scrapes the reviews in the page (reviewer name, star score of movie, date, and text of review)
     - Clicks next button to load more reviews
     - Continues until at least N reviews are gathered or there is no next page
     - Puts reviews in pandas DataFrame, keeping at most N rows
     - Saves the DataFrame to ../data/raw/reviews/tmp_RT_audience_{movie_title}.csv

    Returns None, the csv file is the output.

    Raises RuntimeError if the prev/next buttons are still disabled after 1000
    checks following a click on the next button.
    '''

    # getting full rotten tomatoes url for audience reviews of given movie
    url = create_audience_url(movie_title)

    # load first page
    browser.get(url)

    # wait until the text of first review appears
    WebDriverWait(browser, 30).until(
        EC.presence_of_element_located((By.XPATH, '/html/body/div[3]/main/div/div/section/div/div[2]/div[2]/div[1]/div[2]/drawer-more/p'))
    )

    reviews = {'movie': [], 'user': [], 'score': [], 'date': [], 'text': []}

    # scrape initial page
    reviews = grab_audience_reviewer_data(browser, reviews)

    time.sleep(1)

    # keep clicking next page and scraping more reviews
    # stops when either the desired number of reviews scraped (N) is reached or when no more reviews to scrape exist
    while True:
        # Check for if N has been reached
        if len(reviews['user']) >= N:
            print(f'Found {N} reviews for {movie_title}')
            break

        # Check if next button is missing
        buttons = browser.find_element(By.XPATH, '/html/body/div[3]/main/div/div/section/div/div[1]')
        soup = bs(buttons.get_attribute('innerHTML'))
        if soup.find('rt-button', class_='next hide'):
            print(f'Only found {len(reviews["user"])} reviews for {movie_title}')
            break

        # Next button must exist, click it
        next_button = browser.find_element(By.XPATH, '/html/body/div[3]/main/div/div/section/div/div[1]/rt-button[2]')
        next_button.click()

        # wait until buttons have finished loading (gives up after 1000 checks)
        count = 0
        while True:
            soup = bs(browser.page_source)
            prev_html = soup.select_one('rt-button[class*=prev]').prettify()
            next_html = soup.select_one('rt-button[class*=next]').prettify()
            if 'disabled' not in prev_html and 'disabled' not in next_html:
                break
            count += 1
            if count >= 1000:
                # abort with a real exception instead of relying on an undefined name
                raise RuntimeError(f'prev/next buttons never left disabled state for some reason... ({movie_title})')

        # scrape it
        reviews = grab_audience_reviewer_data(browser, reviews)
    
    # also adding movie titles
    reviews['movie'] = [movie_title] * len(reviews['user'])
    
    # saving to temporary file
    df = pd.DataFrame(reviews)
    df = df.head(N) # keep only N reviews
    df.to_csv(f'../data/raw/reviews/tmp_RT_audience_{movie_title}.csv', index=False)
    
    return

# Running Scrapes

In [None]:
# ~50 seconds to run scrapes with these params
# NOTE(review): that timing presumably predates reviews_per_movie = 10000 - confirm
# eventually use list of movies we have agreed on
# (the with-block closes the json file as soon as the list has been read)
with open('../data/raw/movie_stats/mcu_list.json') as movie_list_file:
    movie_title_list = json.load(movie_list_file)
reviews_per_movie = 10000

# don't scrape already scraped movies
scraped_files = [f for f in listdir('../data/raw/reviews/') if isfile(join('../data/raw/reviews/', f))]

# a movie is scraped again only if its temporary csv is missing
critic_movie_title_list = []
audience_movie_title_list = []
for movie_title in movie_title_list:
    if f'tmp_RT_critics_{movie_title}.csv' not in scraped_files:
        critic_movie_title_list.append(movie_title)
    if f'tmp_RT_audience_{movie_title}.csv' not in scraped_files:
        audience_movie_title_list.append(movie_title)

print(f'Critic   scraping: {reviews_per_movie} reviews from {len(critic_movie_title_list)} movies')
print(f'Audience scraping: {reviews_per_movie} reviews from {len(audience_movie_title_list)} movies')
# upper bound = reviews per movie times the number of scrapes that will actually run
# (the critic and audience lists can have different lengths)
print(f'(max {reviews_per_movie * (len(critic_movie_title_list) + len(audience_movie_title_list))} total reviews)')

## Critics

In [None]:
# Scrape critic reviews for every movie that has no temporary critics csv yet.
browser = open_browser_instance()
try:
    for movie_title in critic_movie_title_list:
        scrape_critic_reviews(browser, movie_title, reviews_per_movie)
        print(f'Scraping {movie_title} successful!')
finally:
    # runs even when a scrape raises; quit() ends the whole driver session
    # instead of only closing the current window
    browser.quit()

## Audience

In [None]:
# Scrape audience reviews for every movie that has no temporary audience csv yet.
browser = open_browser_instance()
try:
    for movie_title in audience_movie_title_list:
        scrape_audience_reviews(browser, movie_title, reviews_per_movie)
        print(f'Scraping {movie_title} successful!')
        # two minute pause between movies
        # NOTE(review): presumably to avoid being rate limited by the site - confirm
        time.sleep(120)
finally:
    # runs even when a scrape raises; quit() ends the whole driver session
    # instead of only closing the current window
    browser.quit()

## Compiling files into two large files

In [None]:
# Combine the per-movie temporary csv files into one audience file and one critics file.
# sorted() makes the row order of the combined files independent of the directory listing order
scraped_files = sorted(f for f in listdir('../data/raw/reviews/') if isfile(join('../data/raw/reviews/', f)))
audience_dfs = []
critic_dfs = []
for file in scraped_files:
    # match on the temporary file prefix only: the combined RT_audience.csv / RT_critics.csv
    # (and their *_old.csv copies) are stored in the same folder and also contain the words
    # "audience" / "critic", so a substring match would read them back in and duplicate rows
    if file.startswith('tmp_RT_audience_'):
        audience_dfs.append(pd.read_csv(f'../data/raw/reviews/{file}'))
    elif file.startswith('tmp_RT_critics_'):
        critic_dfs.append(pd.read_csv(f'../data/raw/reviews/{file}'))

# concatting the files
audience_df = pd.concat(audience_dfs, ignore_index=True, sort=False)
critic_df = pd.concat(critic_dfs, ignore_index=True, sort=False)

# saving them
audience_df.to_csv('../data/raw/reviews/RT_audience.csv', index=False)
critic_df.to_csv('../data/raw/reviews/RT_critics.csv', index=False)

## Minor test for if reading them is easy and if it looks right

In [None]:
# read the combined critics file back in and eyeball its size, column types and first rows
df = pd.read_csv('../data/raw/reviews/RT_critics.csv')
print(df.shape, df.dtypes, sep='\n')
df.head()

In [None]:
# read the combined audience file back in and eyeball its size, column types and first rows
df = pd.read_csv('../data/raw/reviews/RT_audience.csv')
print(df.shape, df.dtypes, sep='\n')
df.head()

# Formatting files to be as agreed

In [None]:
# reading them in
# NOTE(review): no cell in this notebook writes the *_old.csv files; presumably they are
# renamed copies of the compiled RT_critics.csv / RT_audience.csv - confirm
df_critics = pd.read_csv('../data/raw/reviews/RT_critics_old.csv')
df_audience = pd.read_csv('../data/raw/reviews/RT_audience_old.csv')

### Removing rows where text is nan values

In [None]:
def remove_nan(df):
    '''Given df, removes rows where text is NaN.

    The input df is left untouched. The result is returned as an independent
    copy, so adding or overwriting columns on it afterwards does not raise
    pandas' SettingWithCopyWarning. The original index labels of the kept rows
    are preserved.

    returns new df'''
    df = df[df['text'].notna()].copy()
    return df

In [None]:
# test
critics_test = df_critics.copy()
audience_test = df_audience.copy()
ct_before = critics_test.shape[0]
at_before = audience_test.shape[0]
critics_test = remove_nan(critics_test)
audience_test = remove_nan(audience_test)
ct_after = critics_test.shape[0]
at_after = audience_test.shape[0]
print(f'Removed {ct_before-ct_after} from critics')
print(f'Removed {at_before-at_after} from audience')

### Removing quotes is unnecessary: pandas needs them to read in the data correctly

### Change date columns to be pandas datetime format

Done using pandas to_datetime converter

In [None]:
def convert_date(df):
    '''Given df, converts its date column to pandas datetime format.

    The column is overwritten on the df that was passed in, and that same
    df is returned.'''
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    return df

In [None]:
# test
at = df_audience.copy()
at = convert_date(at)
at.head()

### Adding url columns to files

Done by using previously made url functions

In [None]:
def convert_url(df):
    '''Given df, adds a url column built from its movie column.

    A df with an is_rotten column is treated as critic reviews and gets the
    critic url, any other df gets the audience url. The column is added to the
    df that was passed in, and that same df is returned.'''
    is_critic_file = 'is_rotten' in df.columns
    build_url = create_critic_url if is_critic_file else create_audience_url
    df['url'] = df['movie'].transform(lambda title: build_url(title))
    return df

In [None]:
# test
at = df_audience.copy()
ct = df_critics.copy()

at = convert_url(at)
ct = convert_url(ct)

print(at['url'][0])
print(ct['url'][0])

### Changing audience ratings from 0-5 to 0-10

Done by multiplying score by 2

In [None]:
def convert_score(df):
    '''Given df, multiplies every value in its score column by 2.

    The column is overwritten on the df that was passed in, and that same
    df is returned.'''
    doubled = df['score'].transform(lambda stars: stars * 2)
    df['score'] = doubled
    return df

In [None]:
# test
# test
# works on a copy; convert_score overwrites the score column of the df it is given,
# so the first print has to happen before the call to show the original values
test = df_audience.copy()
print(test['score'][:4])  # first four scores before conversion
test = convert_score(test)
print(test['score'][:4])  # the same four scores after being multiplied by 2

## Pipeline conversion function

Just runs all the other functions one after the other

In [None]:
def convert_pipeline(df):
    '''Given df, runs all conversion steps one after the other and returns the result.

    Steps, in order: remove_nan, convert_date, convert_url and, only when the
    df has a score column, convert_score.'''
    for step in (remove_nan, convert_date, convert_url):
        df = step(df)

    # only a df with a score column gets rescaled
    if 'score' not in df.columns:
        return df
    return convert_score(df)

In [None]:
# testing audience
at = df_audience.copy()
at = convert_pipeline(at)
at.head()

In [None]:
# testing critics
ct = df_critics.copy()
ct = convert_pipeline(ct)
ct.head()

## Running and saving conversion

In [None]:
# Re-read BOTH raw files so this cell always converts untouched data.
# Without re-reading the audience file, running this cell a second time (or after the
# data check cell below, which loads the already converted file into df_audience)
# would push converted data through the pipeline again and double the scores once more.
df_critics = pd.read_csv('../data/raw/reviews/RT_critics_old.csv')
df_audience = pd.read_csv('../data/raw/reviews/RT_audience_old.csv')

df_critics = convert_pipeline(df_critics)
df_audience = convert_pipeline(df_audience)

df_critics.to_csv('../data/raw/reviews/RT_critics.csv', index=False)
df_audience.to_csv('../data/raw/reviews/RT_audience.csv', index=False)

## Small data check

In [None]:
# load the converted audience file and list the distinct score values it contains
df_audience = pd.read_csv('../data/raw/reviews/RT_audience.csv')

unique_scores = sorted(df_audience['score'].unique())
print(unique_scores, len(unique_scores), sep='\n')