# Scraping the BoxOfficeMojo with Selenium and BeautifulSoup

#### Tools and Libraries

In [147]:
from bs4 import BeautifulSoup
import requests
import time, os
import dateutil.parser
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import re
import seaborn as sns 
import matplotlib.pyplot as plt

from urllib.parse import urljoin

chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

#### URL details for the request

In [148]:
extension_url = '/year/2020/?ref_=bo_yl_table_1' 
base_url = 'https://www.boxofficemojo.com'   

In [149]:
response = requests.get(base_url + extension_url)
response.text
soup = BeautifulSoup(page, 'lxml')
table = soup.find_all('table')
df = pd.read_html(str(table))[0]
df.drop(columns=['Genre', 'Budget', 'Running Time'], inplace=True)

In [150]:
df.head()

Unnamed: 0,Rank,Release,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated
0,1,Avengers: Endgame,"$858,373,000",4662,"$858,373,000",Apr 26,-,False
1,2,The Lion King,"$543,638,043",4802,"$543,638,043",Jul 19,-,False
2,3,Toy Story 4,"$434,038,008",4575,"$434,038,008",Jun 21,-,False
3,4,Frozen II,"$430,144,682",4440,"$477,373,578",Nov 22,-,False
4,5,Captain Marvel,"$426,829,839",4310,"$426,829,839",Mar 8,-,False


#### Helper functions: money --> integer, hours+min --> min, date --> datestring

In [158]:
def money_to_int(moneystring):
    if type(moneystring) != float:
        if moneystring != None:
            moneystring = moneystring.replace('$', '').replace(',', '')
        else:
            moneystring = 'None'
        return int(moneystring)

def runtime_to_minutes(runtimestring):
    if runtimestring != None:
        runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

#### Helper function: findNext( ) to find sibling object values.

In [159]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    if next_element:
        return next_element.text 
    else:
        return None

#### Helper function: extracts movie stats, such as title, money, rating etc. and puts them into a dictionary.

In [160]:
def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic gross
        - runtime 
        - MPAA rating
        - full release date
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + extension_url
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    
    headers = ['movie_title', 'domestic_total_gross',
               'runtime_minutes', 'rating', 'release_date', 'budget']
    
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()
    
    #Get domestic gross
    try:
        raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                    .find_all('span', class_='money')[0]
                                    .text
                               )
    except:
        raw_domestic_total_gross = float("NaN")
    
    if raw_domestic_total_gross is None:
        print('This is NaN')
        domestic_total_gross = float("NaN")
    else:
        domestic_total_gross = money_to_int(raw_domestic_total_gross)

    #Get runtime
    raw_runtime = get_movie_value(soup,'Running')
    if raw_runtime != None:
        runtime = runtime_to_minutes(raw_runtime)
    
    #Get rating
    rating = get_movie_value(soup,'MPAA')

    #Get release date
    if '-' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('-')[0]
    elif '(' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    else:
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    release_date = to_date(raw_release_date)
    


    # Get budget alt 
    raw_budget = get_movie_value(soup,'Budget')
    if raw_budget is None:
        budget = raw_budget
    else:
        budget = money_to_int(raw_budget)

    #Create movie dictionary and return
    movie_dict = dict(zip(headers,[title,
                                domestic_total_gross,
                                runtime,
                                rating, 
                                release_date,
                                budget]))

    return movie_dict

In [161]:
def get_selenium_dict(driver):
    current_url = driver.current_url
    response = requests.get(current_url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    headers = ['movie_title', 'domestic_total_gross',
               'runtime_minutes', 'rating', 'release_date', 'budget']
    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()
    print('Title: {}'.format(title))
    #Get domestic gross
    try:
        raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                    .find_all('span', class_='money')[0]
                                    .text
                               )
    except:
        raw_domestic_total_gross = float("NaN")
    print('Raw gross: {}'.format(raw_domestic_total_gross))
    if raw_domestic_total_gross == None or type(raw_domestic_total_gross) == float:
        domestic_total_gross = float("NaN")
    else:
        domestic_total_gross = money_to_int(raw_domestic_total_gross)
    print('Gross gross: {}'.format(domestic_total_gross))
    #Get runtime
    raw_runtime = get_movie_value(soup,'Running')
    if type(raw_runtime) != float and raw_runtime != None:
        runtime = runtime_to_minutes(raw_runtime)
    print('Runtime: {}'.format(runtime))
    #Get rating
    rating = get_movie_value(soup,'MPAA')
    print(f'Rating: {rating}')
    #Get release date
    if '-' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('-')[0]
    elif '(' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    else:
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    # release_date = to_date(raw_release_date)
    release_date = raw_release_date
    print(f'Release Date: {release_date}')
    # Get budget alt
    obj = soup.find(text=re.compile('Budget'))
    if not obj:
        obj = None
    # this works for most of the values
    if obj:
        next_element = obj.findNext()
    else:
        next_element=None
    if next_element:
        raw_budget = next_element.text
    else:
        raw_budget = None
    if raw_budget != None:
        budget = money_to_int(raw_budget)
    else:
        budget = 0
    print(f'Budget: {budget}')
    #Create movie dictionary and return
    movie_dict = dict(zip(headers,[title,
                                domestic_total_gross,
                                runtime,
                                rating,
                                release_date,
                                budget]))
    print(movie_dict)
    return movie_dict (edited) 

In [162]:
mojo_links = [ 
        'rl1182631425',
        'rl2969994753',
        'rl4244997633',
        'rl755467777',
        'rl3305145857',
        'rl3640886785',
        'rl2164295169',
        'rl218596865',
        'rl50628097',
        'rl2533524993',
        'rl3433267713',
        'rl3204875777',
        'rl2424210945',
        'rl1333691905',
        'rl3473442305',
        'rl419792385',
        'rl1258849793',
        'rl1611040257',
        'rl3825763841',
        'rl4278486529',
        'rl1745126913',
        'rl1655931393',
        'rl302548481',
        'rl3020195329',
        'rl3842541057',
        'rl867926529',
        'rl2550760961',
        'rl4127819265',
        'rl2567538177',
        'rl2030601729',
        'rl2998501889',
        'rl990348801',
        'rl3221784065',
        'rl235374081',
        'rl1628014081',
        'rl2611249665',
        'rl3669066241',
        'rl2684847617',
        'rl1459979777',
        'rl3775038977',
        'rl2651096577',
        'rl2903213569',
        'rl1325958657',
        'rl615875073'
        'rl50824705',
        'rl537298433',
        'rl1617987073',
        'rl1386316289',
        'rl716997121',
        'rl252151297'
        ]

dicts = []

for link in mojo_links:
    dicts.append(get_movie_dict('/release/{}/'.format(link)))

dicts

ValueError: ('String does not contain a date:', '')

## Step 1: extracting movie titles from the soup.

In [127]:
titles = soup.find_all(class_ = "a-text-left mojo-field-type-release mojo-cell-wide")
print(titles[0])

<td class="a-text-left mojo-field-type-release mojo-cell-wide"><a class="a-link-normal" href="/release/rl3059975681/?ref_=bo_yld_table_1">Avengers: Endgame</a></td>


In [128]:
titles[0].select("a")

[<a class="a-link-normal" href="/release/rl3059975681/?ref_=bo_yld_table_1">Avengers: Endgame</a>]

In [129]:
titles[0].select("a")[0].string

'Avengers: Endgame'

In [130]:
all_titles = []
for title in titles:
    all_titles.append(title.select("a")[0].text)

In [131]:
all_titles

['Avengers: Endgame',
 'The Lion King',
 'Toy Story 4',
 'Frozen II',
 'Captain Marvel',
 'Star Wars: Episode IX - The Rise of Skywalker',
 'Spider-Man: Far from Home',
 'Aladdin',
 'Joker',
 'It Chapter Two',
 'Jumanji: The Next Level',
 'Us',
 'Fast & Furious Presents: Hobbs & Shaw',
 'John Wick: Chapter 3 - Parabellum',
 'How to Train Your Dragon: The Hidden World',
 'The Secret Life of Pets 2',
 'Pokémon Detective Pikachu',
 'Once Upon a Time... in Hollywood',
 'Shazam!',
 'Aquaman',
 'Knives Out',
 'Dumbo',
 'Maleficent: Mistress of Evil',
 'Glass',
 'Godzilla: King of the Monsters',
 'The Upside',
 'Ford v Ferrari',
 'The Lego Movie 2: The Second Part',
 'Hustlers',
 'The Addams Family',
 'Downton Abbey',
 'Rocketman',
 'Alita: Battle Angel',
 'Good Boys',
 'Spider-Man: Into the Spider-Verse',
 'Men in Black: International',
 'Annabelle Comes Home',
 'Yesterday',
 'A Madea Family Funeral',
 'Zombieland: Double Tap',
 'Angel Has Fallen',
 'Scary Stories to Tell in the Dark',
 'Mar

In [86]:
len(all_titles)

909

## Step 2: extracting gross domestic revenue for 2019.

In [87]:
print(soup.find(class_= "a-text-right mojo-field-type-money mojo-estimatable").prettify())

<td class="a-text-right mojo-field-type-money mojo-estimatable">
 $858,373,000
</td>



In [77]:
gross_revenue = soup.find_all(class_ = "a-text-right mojo-field-type-money mojo-estimatable" )
print(gross_revenue[0])

<td class="a-text-right mojo-field-type-money mojo-estimatable">$858,373,000</td>


In [93]:
gross_revenue[0].text

'$858,373,000'

In [102]:
# ATTN: The code below return twice as many values as there are movie titles.
# Conclusion: It groups ALL the money values together.
all_revenues = []
for item in gross_revenue:
    all_revenues.append(item.text)

In [95]:
all_revenues

['$858,373,000',
 '$858,373,000',
 '$543,638,043',
 '$543,638,043',
 '$434,038,008',
 '$434,038,008',
 '$430,144,682',
 '$477,373,578',
 '$426,829,839',
 '$426,829,839',
 '$390,706,234',
 '$515,202,542',
 '$390,532,085',
 '$390,532,085',
 '$355,559,216',
 '$355,559,216',
 '$333,772,511',
 '$335,451,311',
 '$211,593,228',
 '$211,593,228',
 '$192,094,536',
 '$316,831,246',
 '$175,084,580',
 '$175,084,580',
 '$173,956,935',
 '$173,956,935',
 '$171,015,687',
 '$171,015,687',
 '$160,799,505',
 '$160,799,505',
 '$157,949,395',
 '$158,874,395',
 '$144,105,346',
 '$144,105,346',
 '$141,076,968',
 '$142,502,728',
 '$140,371,656',
 '$140,371,656',
 '$136,001,983',
 '$335,061,807',
 '$115,711,579',
 '$165,363,234',
 '$114,766,307',
 '$114,766,307',
 '$113,294,737',
 '$113,929,605',
 '$111,048,468',
 '$111,048,468',
 '$110,500,138',
 '$110,500,138',
 '$108,252,517',
 '$108,252,517',
 '$107,196,444',
 '$117,624,357',
 '$105,806,508',
 '$105,806,508',
 '$104,960,643',
 '$104,963,598',
 '$97,185,807'

In [103]:
grosses = soup.find_all(class_='mojo-field-type-money')
all_grosses = []

for gross in grosses:
    all_grosses.append(gross.get_text())

all_grosses
# Note that the budget is missing from the output

['Budget\n            ',
 'Gross',
 'Total Gross',
 '-',
 '$858,373,000',
 '$858,373,000',
 '-',
 '$543,638,043',
 '$543,638,043',
 '-',
 '$434,038,008',
 '$434,038,008',
 '-',
 '$430,144,682',
 '$477,373,578',
 '-',
 '$426,829,839',
 '$426,829,839',
 '-',
 '$390,706,234',
 '$515,202,542',
 '-',
 '$390,532,085',
 '$390,532,085',
 '-',
 '$355,559,216',
 '$355,559,216',
 '-',
 '$333,772,511',
 '$335,451,311',
 '-',
 '$211,593,228',
 '$211,593,228',
 '-',
 '$192,094,536',
 '$316,831,246',
 '-',
 '$175,084,580',
 '$175,084,580',
 '-',
 '$173,956,935',
 '$173,956,935',
 '-',
 '$171,015,687',
 '$171,015,687',
 '-',
 '$160,799,505',
 '$160,799,505',
 '-',
 '$157,949,395',
 '$158,874,395',
 '-',
 '$144,105,346',
 '$144,105,346',
 '-',
 '$141,076,968',
 '$142,502,728',
 '-',
 '$140,371,656',
 '$140,371,656',
 '-',
 '$136,001,983',
 '$335,061,807',
 '-',
 '$115,711,579',
 '$165,363,234',
 '-',
 '$114,766,307',
 '$114,766,307',
 '-',
 '$113,294,737',
 '$113,929,605',
 '-',
 '$111,048,468',
 '$111

In [104]:
gross_list1 = []
gross_list2 = []

for i, gross in enumerate(all_grosses):
    if gross == '-':
        gross_list1.append(all_grosses[i + 1])
        gross_list2.append(all_grosses[i + 2])

gross_list3 = []
gross_list4 = []

for gross in gross_list1:
    gross_list3.append(gross[1:])
    
for gross in gross_list2:
    gross_list4.append(gross[1:])
        
print(gross_list1)
print(gross_list2)
print(gross_list3)
print(gross_list4)

['$858,373,000', '$543,638,043', '$434,038,008', '$430,144,682', '$426,829,839', '$390,706,234', '$390,532,085', '$355,559,216', '$333,772,511', '$211,593,228', '$192,094,536', '$175,084,580', '$173,956,935', '$171,015,687', '$160,799,505', '$157,949,395', '$144,105,346', '$141,076,968', '$140,371,656', '$136,001,983', '$115,711,579', '$114,766,307', '$113,294,737', '$111,048,468', '$110,500,138', '$108,252,517', '$107,196,444', '$105,806,508', '$104,960,643', '$97,185,807', '$96,854,135', '$96,368,160', '$85,710,210', '$83,140,306', '$82,094,865', '$80,001,807', '$74,152,591', '$73,286,650', '$73,257,045', '$72,930,156', '$69,030,436', '$68,947,075', '$66,027,977', '$65,845,974', '$62,253,077', '$60,679,265', '$60,477,943', '$57,005,601', '$56,706,993', '$56,279,164', '$55,769,470', '$54,733,739', '$54,724,696', '$54,611,903', '$52,927,165', '$50,188,370', '$48,791,187', '$48,546,770', '$45,896,028', '$45,729,221', '$45,216,793', '$44,819,352', '$44,151,387', '$42,547,700', '$42,004,3

## Step 3: extracting the budget data

*How do we get the budget from the separate page and incorporate it with the whole soup?*

In [120]:
budget = soup.find("span", class_ = "money" )
print(budget)

None


In [108]:
budget_regex = re.compile('Budget')
soup.find(text=budget_regex)

'Budget'

In [109]:
budget_string = soup.find(text = budget_regex)
print(budget_string)

Budget


In [116]:
budget_string.findNext()

<th class="a-text-right mojo-field-type-duration hidden mojo-sortable-column hidden a-nowrap"><span title="Running Time">Running Time</span>
            </th>

In [112]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [115]:
runtime = get_movie_value(soup,'Run')
print(runtime)

Gross


In [105]:
Data = pd.read_html(r'https://www.boxofficemojo.com/year/2019/?sortDir=asc&sort=rank&ref_=bo_yld__resort#table')
for data in Data:
    data.to_csv('Data.csv', ',')