In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests

import pandas as pd

from splinter import Browser


In [2]:
# Box Office Mojo opening-weekend listing, "> $50M" tab
url = 'https://www.boxofficemojo.com/alltime/weekends/?pagenum=m50&sort=opengross&p=.htm&order=DESC'

# read_html returns one DataFrame per <table> found on the page;
# the listing we need is the one at position 3
tables = pd.read_html(url)
df = tables[3]

# Give the columns readable names
df.columns = ['Rank', 'Title', 'Studio', 'Opening', '% of Total', 'Theaters', 'Average', 'Total Gross', 'Date']

# The page's heading row arrives as the first data row: drop it, then index by rank
df = df.iloc[1:].set_index('Rank')

df.head()

Unnamed: 0_level_0,Title,Studio,Opening,% of Total,Theaters,Average,Total Gross,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Avengers: Endgame,BV,"$357,115,007",41.6%,4662,"$76,601","$858,373,000",4/26/2019
2,Avengers: Infinity War,BV,"$257,698,183",38.0%,4474,"$57,599","$678,815,482",4/27/2018
3,Star Wars: The Force Awakens,BV,"$247,966,675",26.5%,4134,"$59,982","$936,662,225",12/18/2015
4,Star Wars: The Last Jedi,BV,"$220,009,584",35.5%,4232,"$51,987","$620,181,382",12/15/2017
5,Jurassic World,Uni.,"$208,806,270",32.0%,4274,"$48,855","$652,270,625",6/12/2015


In [3]:
# The "$40 - 50M" tab of the same listing holds the remaining rows
url2 = 'https://www.boxofficemojo.com/alltime/weekends/?pagenum=m4050&sort=opengross&p=.htm&order=DESC'

# read_html returns one DataFrame per <table> found on the page;
# the listing we need is the one at position 3
tables2 = pd.read_html(url2)
df2 = tables2[3]

# Give the columns readable names
df2.columns = ['Rank', 'Title', 'Studio', 'Opening', '% of Total', 'Theaters', 'Average', 'Total Gross', 'Date']

# The page's heading row arrives as the first data row: drop it, then index by rank
df2 = df2.iloc[1:].set_index('Rank')

df2.head()

Unnamed: 0_level_0,Title,Studio,Opening,% of Total,Theaters,Average,Total Gross,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
249,Taken 2,Fox,"$49,514,769",35.4%,3661,"$13,525","$139,854,287",10/5/2012
250,How to Train Your Dragon 2,Fox,"$49,451,322",27.9%,4253,"$11,627","$177,002,924",6/13/2014
251,Rush Hour 3,NL,"$49,100,158",35.0%,3778,"$12,996","$140,125,968",8/10/2007
252,Wreck-It Ralph,BV,"$49,038,712",25.9%,3752,"$13,070","$189,422,889",11/2/2012
253,Neighbors,Uni.,"$49,033,915",32.7%,3279,"$14,954","$150,157,400",5/9/2014


In [4]:
# Stack both tabs and keep only the first 250 rows (the top 250)
boxOffice_df = pd.concat([df, df2]).iloc[:250]

boxOffice_df.head()


Unnamed: 0_level_0,Title,Studio,Opening,% of Total,Theaters,Average,Total Gross,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Avengers: Endgame,BV,"$357,115,007",41.6%,4662,"$76,601","$858,373,000",4/26/2019
2,Avengers: Infinity War,BV,"$257,698,183",38.0%,4474,"$57,599","$678,815,482",4/27/2018
3,Star Wars: The Force Awakens,BV,"$247,966,675",26.5%,4134,"$59,982","$936,662,225",12/18/2015
4,Star Wars: The Last Jedi,BV,"$220,009,584",35.5%,4232,"$51,987","$620,181,382",12/15/2017
5,Jurassic World,Uni.,"$208,806,270",32.0%,4274,"$48,855","$652,270,625",6/12/2015


In [5]:
#Replace abbreviated studio names with actual Studio names
print(boxOffice_df['Studio'].unique())

# Studio abbreviation as it appears in the scraped table -> full studio name.
# A single mapping keeps each abbreviation next to its replacement, so the two
# sides cannot drift out of alignment the way two parallel lists can.
studioNames = {
    'BV': 'Buena Vista',
    'Uni.': 'Universal',
    'WB': 'Warner Bros.',
    'LGF': 'Lionsgate',
    'Sony': 'Sony / Columbia',
    'Sum.': 'Summit Entertainment',
    'LG/S': 'Lionsgate',
    'Fox': '20th Century Fox',
    'Par.': 'Paramount',
    'WB (NL)': 'Warner Bros.',
    'P/DW': 'Paramount',
    'DW': 'Dreamworks SKG',
    'NM': 'Newmarket',
    'NL': 'New Line',
    'MGM': 'MGM',
}

# Calling .replace(..., inplace=True) on boxOffice_df['Studio'] is a chained
# in-place update: it can act on a temporary object and leave boxOffice_df
# untouched (pandas warns about this, and with Copy-on-Write it never updates
# the parent frame). Build the replaced column and rebind the frame instead.
boxOffice_df = boxOffice_df.assign(Studio=boxOffice_df['Studio'].replace(studioNames))


['BV' 'Uni.' 'WB' 'LGF' 'Sony' 'Sum.' 'LG/S' 'Fox' 'Par.' 'WB (NL)' 'P/DW'
 'DW' 'NM' 'NL' 'MGM']


In [6]:
# Preview the first rows to check the Studio column after the replacement
boxOffice_df.head()

Unnamed: 0_level_0,Title,Studio,Opening,% of Total,Theaters,Average,Total Gross,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Avengers: Endgame,Buena Vista,"$357,115,007",41.6%,4662,"$76,601","$858,373,000",4/26/2019
2,Avengers: Infinity War,Buena Vista,"$257,698,183",38.0%,4474,"$57,599","$678,815,482",4/27/2018
3,Star Wars: The Force Awakens,Buena Vista,"$247,966,675",26.5%,4134,"$59,982","$936,662,225",12/18/2015
4,Star Wars: The Last Jedi,Buena Vista,"$220,009,584",35.5%,4232,"$51,987","$620,181,382",12/15/2017
5,Jurassic World,Universal,"$208,806,270",32.0%,4274,"$48,855","$652,270,625",6/12/2015


## Retrieve Movie URLs

In [7]:
# Keyword arguments telling splinter which ChromeDriver executable to launch
executable_path = dict(executable_path='chromedriver.exe')

# Headless Chrome session used to load the listing pages below
browser = Browser('chrome', headless=True, **executable_path)

In [8]:
# Load the first listing page (url) in the splinter-driven browser
browser.visit(url)

In [9]:
# Parse the HTML currently held by the browser
soup = bs(browser.html, features='html.parser')

# The movie listing is the <table> at position 4 in the document
movieTable = soup.find_all("table")[4]

In [10]:

# Every <tr> of the listing table
eachMovie = movieTable.find_all("tr")

# One record per row: absolute movie link, title text and rank text
movieList = []
for row in eachMovie:
    cells = row.find_all("td")
    titleCell = cells[1]
    link = titleCell.find("a")["href"]
    title = titleCell.text
    rank = cells[0].text
    movieList.append({
        "movieURL": "https://www.boxofficemojo.com" + link,
        "movieName": title,
        "movieRank": rank,
    })

# Position 0 is dropped (presumably the table's heading row); keep positions 1-249
movieList_df = pd.DataFrame(movieList).iloc[1:250]

movieList_df

Unnamed: 0,movieName,movieRank,movieURL
1,Avengers: Endgame,1,https://www.boxofficemojo.com/movies/?id=marve...
2,Avengers: Infinity War,2,https://www.boxofficemojo.com/movies/?id=marve...
3,Star Wars: The Force Awakens,3,https://www.boxofficemojo.com/movies/?id=starw...
4,Star Wars: The Last Jedi,4,https://www.boxofficemojo.com/movies/?id=starw...
5,Jurassic World,5,https://www.boxofficemojo.com/movies/?id=juras...
6,Marvel's The Avengers,6,https://www.boxofficemojo.com/movies/?id=aveng...
7,Black Panther,7,https://www.boxofficemojo.com/movies/?id=marve...
8,The Lion King (2019),8,https://www.boxofficemojo.com/movies/?id=lionk...
9,Avengers: Age of Ultron,9,https://www.boxofficemojo.com/movies/?id=aveng...
10,Incredibles 2,10,https://www.boxofficemojo.com/movies/?id=thein...


In [11]:
#Second tab

# Load the second listing page (url2) in the splinter-driven browser
browser.visit(url2)

In [12]:
# Parse the HTML currently held by the browser
soup = bs(browser.html, features='html.parser')

# The movie listing is the <table> at position 4 in the document
movieTable2 = soup.find_all("table")[4]

In [14]:

# Every <tr> of the second tab's listing table
eachMovie2 = movieTable2.find_all("tr")

# One record per row: absolute movie link, title text and rank text
movieList2 = []
for row in eachMovie2:
    cells = row.find_all("td")
    titleCell = cells[1]
    link = titleCell.find("a")["href"]
    title = titleCell.text
    rank = cells[0].text
    movieList2.append({
        "movieURL": "https://www.boxofficemojo.com" + link,
        "movieName": title,
        "movieRank": rank,
    })

# Position 0 is dropped (presumably the table's heading row); keep only the
# next two rows, which complete the top 250
movieList_df2 = pd.DataFrame(movieList2).iloc[1:3]

movieList_df2.head()

Unnamed: 0,movieName,movieRank,movieURL
1,Taken 2,249,https://www.boxofficemojo.com/movies/?id=taken...
2,How to Train Your Dragon 2,250,https://www.boxofficemojo.com/movies/?id=howto...


In [15]:
# Append the rows kept from the second tab to those from the first.
# NOTE: this rebinds movieList_df, so running the cell again appends the
# second tab's rows once more.
movieList_df = pd.concat(objs=[movieList_df, movieList_df2])

movieList_df.head()


Unnamed: 0,movieName,movieRank,movieURL
1,Avengers: Endgame,1,https://www.boxofficemojo.com/movies/?id=marve...
2,Avengers: Infinity War,2,https://www.boxofficemojo.com/movies/?id=marve...
3,Star Wars: The Force Awakens,3,https://www.boxofficemojo.com/movies/?id=starw...
4,Star Wars: The Last Jedi,4,https://www.boxofficemojo.com/movies/?id=starw...
5,Jurassic World,5,https://www.boxofficemojo.com/movies/?id=juras...


In [17]:

# Use the scraped rank as the index of the link table
movieList_df = movieList_df.set_index('movieRank')

movieList_df

Unnamed: 0_level_0,movieName,movieURL
movieRank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Avengers: Endgame,https://www.boxofficemojo.com/movies/?id=marve...
2,Avengers: Infinity War,https://www.boxofficemojo.com/movies/?id=marve...
3,Star Wars: The Force Awakens,https://www.boxofficemojo.com/movies/?id=starw...
4,Star Wars: The Last Jedi,https://www.boxofficemojo.com/movies/?id=starw...
5,Jurassic World,https://www.boxofficemojo.com/movies/?id=juras...
6,Marvel's The Avengers,https://www.boxofficemojo.com/movies/?id=aveng...
7,Black Panther,https://www.boxofficemojo.com/movies/?id=marve...
8,The Lion King (2019),https://www.boxofficemojo.com/movies/?id=lionk...
9,Avengers: Age of Ultron,https://www.boxofficemojo.com/movies/?id=aveng...
10,Incredibles 2,https://www.boxofficemojo.com/movies/?id=thein...
