# Scraping BoxOfficeMojo with BeautifulSoup

In [26]:
from bs4 import BeautifulSoup
import requests
import time, os
import dateutil.parser

import pandas as pd
import re
import seaborn as sns 
import matplotlib.pyplot as plt

from urllib.parse import urljoin
chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

In [34]:
link = '/year/2020/?ref_=bo_yl_table_1'
base_url = 'https://www.boxofficemojo.com' 
url = base_url + link

In [35]:
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
table = soup.find_all('table')
df = pd.read_html(str(table))[0]
df.drop(columns=['Genre', 'Budget', 'Running Time'], inplace=True)

In [36]:
df.head()

Unnamed: 0,Rank,Release,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated
0,1,Bad Boys for Life,"$204,417,855",3775,"$204,417,855",Jan 17,-,False
1,2,1917,"$157,901,466",3987,"$159,227,644",Dec 25,Universal Pictures,False
2,3,Sonic the Hedgehog,"$146,066,470",4198,"$146,066,470",Feb 14,Paramount Pictures,False
3,4,Jumanji: The Next Level,"$124,736,710",4227,"$316,831,246",Dec 13,-,False
4,5,Star Wars: Episode IX - The Rise of Skywalker,"$124,496,308",4406,"$515,202,542",Dec 20,-,False


In [37]:
def money_to_int(moneystring):
    if type(moneystring) != float:
        moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def runtime_to_minutes(runtimestring):
    if runtimestring != None:
        runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

In [38]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [43]:
def get_movie_dict(link):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - domestic gross
        - runtime 
        - MPAA rating
        - full release date
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = urljoin(base_url, link)
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")


    headers = ['movie_title', 'domestic_total_gross',
               'runtime_minutes', 'rating', 'release_date', 'budget']

    #Get title
    title_string = soup.find('title').text
    title = title_string.split('-')[0].strip()

    #Get domestic gross
    try:
        raw_domestic_total_gross = (soup.find(class_='mojo-performance-summary-table')
                                    .find_all('span', class_='money')[0]
                                    .text
                               )
    except:
        raw_domestic_total_gross = float("NaN")

    if type(raw_domestic_total_gross) == float or type(raw_domestic_total_gross) == 'NoneType':
        print('This is NaN')
        domestic_total_gross = float("NaN")
    else:
        domestic_total_gross = money_to_int(raw_domestic_total_gross)

    #Get runtime
    raw_runtime = get_movie_value(soup,'Running')
    if type(raw_runtime) != float and type(raw_runtime) != 'NoneType':
        runtime = runtime_to_minutes(raw_runtime)

    #Get rating
    rating = get_movie_value(soup,'MPAA')

    #Get release date
    if '-' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('-')[0]
    elif '(' in get_movie_value(soup, 'Release Date'):
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    else:
        raw_release_date = get_movie_value(soup,'Release Date').split('(')[0]
    release_date = to_date(raw_release_date)



    # Get budget alt
    raw_budget = get_movie_value(soup,'Budget')
    if raw_budget:
        budget = money_to_int(raw_budget)
    else:
        budget = 0

    #Create movie dictionary and return
    movie_dict = dict(zip(headers,[title,
                                domestic_total_gross,
                                runtime,
                                rating,
                                release_date,
                                budget]))

    return movie_dict


In [44]:
mojo_links = [ 
        'rl1182631425',
        'rl2969994753',
        'rl4244997633',
        'rl755467777',
        'rl3305145857',
        'rl3640886785',
        'rl2164295169',
        'rl218596865',
        'rl50628097',
        'rl2533524993',
        'rl3433267713',
        'rl3204875777',
        'rl2424210945',
        'rl1333691905',
        'rl3473442305',
        'rl419792385',
        'rl1258849793',
        'rl1611040257',
        'rl3825763841',
        'rl4278486529',
        'rl1745126913',
        'rl1655931393',
        'rl302548481',
        'rl3020195329',
        'rl3842541057',
        'rl867926529',
        'rl2550760961',
        'rl4127819265',
        'rl2567538177',
        'rl2030601729',
        'rl2998501889',
        'rl990348801',
        'rl3221784065',
        'rl235374081',
        'rl1628014081',
        'rl2611249665',
        'rl3669066241',
        'rl2684847617',
        'rl1459979777',
        'rl3775038977',
        'rl2651096577',
        'rl2903213569',
        'rl1325958657',
        'rl615875073'
        'rl50824705',
        'rl537298433',
        'rl1617987073',
        'rl1386316289',
        'rl716997121',
        'rl252151297'
        ]

dicts = []

for link in mojo_links:
    dicts.append(get_movie_dict('/release/{}/'.format(link)))

dicts

[{'movie_title': 'Bad Boys for Life',
  'domestic_total_gross': 204417855,
  'runtime_minutes': 124,
  'rating': 'R',
  'release_date': datetime.datetime(2020, 1, 17, 0, 0),
  'budget': 90000000},
 {'movie_title': '1917',
  'domestic_total_gross': 159227644,
  'runtime_minutes': 119,
  'rating': 'R',
  'release_date': datetime.datetime(2019, 12, 25, 0, 0),
  'budget': 95000000},
 {'movie_title': 'Sonic the Hedgehog',
  'domestic_total_gross': 146066470,
  'runtime_minutes': 99,
  'rating': 'PG',
  'release_date': datetime.datetime(2020, 2, 14, 0, 0),
  'budget': 85000000},
 {'movie_title': 'Jumanji: The Next Level',
  'domestic_total_gross': 316831246,
  'runtime_minutes': 123,
  'rating': 'PG-13',
  'release_date': datetime.datetime(2019, 12, 13, 0, 0),
  'budget': 125000000},
 {'movie_title': 'Star Wars: Episode IX',
  'domestic_total_gross': 515202542,
  'runtime_minutes': 142,
  'rating': 'PG-13',
  'release_date': datetime.datetime(2019, 12, 20, 0, 0),
  'budget': 275000000},
 {'m

In [52]:
headers = ['movie title', 'domestic total gross',
           'runtime (mins)', 'rating', 'release date']

movie_data = []
movie_dict = dict(zip(headers, [title,
                                domestic_total_gross,
                                runtime,
                                rating, 
                                release_date]))

movie_data.append(movie_dict)
movie_data

NameError: name 'title' is not defined

In [55]:
movie_df = pd.DataFrame(dicts)  #convert list of dict to df
movie_df.set_index('movie_title', inplace=True)
movie_df.head()

Unnamed: 0_level_0,domestic_total_gross,runtime_minutes,rating,release_date,budget
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bad Boys for Life,204417855,124,R,2020-01-17,90000000
1917,159227644,119,R,2019-12-25,95000000
Sonic the Hedgehog,146066470,99,PG,2020-02-14,85000000
Jumanji: The Next Level,316831246,123,PG-13,2019-12-13,125000000
Star Wars: Episode IX,515202542,142,PG-13,2019-12-20,275000000
