In [1]:
import pandas as pd
import numpy as np
from datetime import date

In [2]:
# Box Office Mojo yearly page listing the 2021 domestic box office
url_2021 = "https://www.boxofficemojo.com/year/2021/?ref_=bo_yl_table_1"

In [3]:
# pd.read_html fetches the page at url_2021 and parses every HTML <table> it finds,
# returning them as a list of DataFrames (one entry per table)
table_2021 = pd.read_html(io=url_2021)
table_2021

[    Rank                                          Release Genre Budget  \
 0      1                            The Croods: A New Age     -      -   
 1      2                                Wonder Woman 1984     -      -   
 2      3                                    Tom and Jerry     -      -   
 3      4                                     The Marksman     -      -   
 4      5                                The Little Things     -      -   
 ..   ...                                              ...   ...    ...   
 74    75                                     Elbow Grease     -      -   
 75    76                             In the Life of Music     -      -   
 76    77  Killer Raccoons! 2! Dark Christmas in the Dark!     -      -   
 77    78                                          The Bra     -      -   
 78    79                                       The Rescue     -      -   
 
    Running Time        Gross Theaters  Total Gross Release Date  \
 0             -  $20,402,655 

In [4]:
# table_2021 is the list returned by pd.read_html; pick out its first DataFrame
# with ordinary list indexing. Note this is the same object as table_2021[0],
# not a copy, so later in-place changes to df_21 are visible through both names.
df_21 = table_2021[0]
df_21

Unnamed: 0,Rank,Release,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated
0,1,The Croods: A New Age,-,-,-,"$20,402,655",2211,"$52,736,935",Nov 25,Universal Pictures,False
1,2,Wonder Woman 1984,-,-,-,"$17,755,000",2013,"$43,618,000",Dec 25,Warner Bros.,False
2,3,Tom and Jerry,-,-,-,"$14,110,000",2475,"$14,110,000",Feb 26,Warner Bros.,False
3,4,The Marksman,-,-,-,"$12,468,367",2018,"$12,468,367",Jan 15,Open Road Films (II),False
4,5,The Little Things,-,-,-,"$11,405,000",2206,"$12,910,000",Jan 29,Warner Bros.,False
...,...,...,...,...,...,...,...,...,...,...,...
74,75,Elbow Grease,-,-,-,$917,5,$917,Feb 19,-,False
75,76,In the Life of Music,-,-,-,$912,1,"$7,238",Aug 14,Indican Pictures,False
76,77,Killer Raccoons! 2! Dark Christmas in the Dark!,-,-,-,$900,12,"$13,282",Jul 31,Indican Pictures,False
77,78,The Bra,-,-,-,$572,2,"$8,314",Oct 16,Indican Pictures,False


In [5]:
# tag every 2021 row with its year so the rows stay distinguishable
# once they are combined with the other years' tables later on
# (this adds the column to df_21 in place)
df_21["Year"] = 2021

In [7]:
# years to scrape: 2010 through 2020 inclusive (the end of range() is exclusive)
years = list(range(2010, 2021))
years

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

In [9]:
# capture df_21's column labels (including the added "Year") as a plain list;
# they are used to define the columns of the empty accumulator frame
column_list = list(df_21.columns)
print(column_list)

['Rank', 'Release', 'Genre', 'Budget', 'Running Time', 'Gross', 'Theaters', 'Total Gross', 'Release Date', 'Distributor', 'Estimated', 'Year']


In [10]:
# create an empty DataFrame (no rows) whose columns are those in column_list
new_df = pd.DataFrame(columns=column_list)
new_df

Unnamed: 0,Rank,Release,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated,Year


In [11]:
# Scrape the calendar-gross table for every year in `years` and stack the results
# onto new_df.
#
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and calling
# it once per iteration re-copies the whole accumulated frame each time. Collect the
# per-year frames in a list instead and concatenate them in a single pd.concat call.
yearly_frames = []
for year in years:
    url = f'https://www.boxofficemojo.com/year/{year}/?grossesOption=calendarGrosses'
    tables = pd.read_html(url)   # list of every HTML table parsed from the page
    df = tables[0]               # keep the first table, as done for the 2021 page
    df["Year"] = year            # tag rows so years stay distinguishable after stacking
    yearly_frames.append(df)

# new_df goes first so its column order is preserved; as with the old .append calls,
# each year's original row index is kept (no ignore_index), so index labels repeat
# across years until the index is reset.
new_df = pd.concat([new_df, *yearly_frames])

In [12]:
# check results: display new_df after the per-year tables have been added
# (the notebook shows a truncated head/tail view of the frame)
new_df

Unnamed: 0,Rank,Release,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated,Year
0,1,Avatar,-,-,-,"$466,141,929",3461,"$749,766,139",Dec 18,Twentieth Century Fox,False,2010
1,2,Toy Story 3,-,-,-,"$415,004,880",4028,"$415,004,880",Jun 18,Walt Disney Studios Motion Pictures,False,2010
2,3,Alice in Wonderland,-,-,-,"$334,191,110",3739,"$334,191,110",Mar 5,Walt Disney Studios Motion Pictures,False,2010
3,4,Iron Man 2,-,-,-,"$312,433,331",4390,"$312,433,331",May 7,Paramount Pictures,False,2010
4,5,The Twilight Saga: Eclipse,-,-,-,"$300,531,751",4468,"$300,531,751",Jun 30,Summit Entertainment,False,2010
...,...,...,...,...,...,...,...,...,...,...,...,...
449,450,Asako I & II,-,-,-,$231,3,"$25,559",May 17,Grasshopper Film,False,2020
450,451,Chained for Life,-,-,-,$115,3,"$17,431",Sep 13,Kino International,False,2020
451,452,Shooting the Mafia,-,-,-,$88,3,"$10,881",Nov 22,Cohen Media Group,False,2020
452,453,Benjamin the Elephant (2020),-,-,-,$49,1,$49,Oct 16,Viva Pictures,False,2020


In [13]:
# add the 2021 table (df_21) below the rows already in new_df.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; like .append, it keeps each frame's existing row index.
new_df = pd.concat([new_df, df_21])

In [14]:
# display the combined frame to confirm the 2021 rows were added at the end
new_df

Unnamed: 0,Rank,Release,Genre,Budget,Running Time,Gross,Theaters,Total Gross,Release Date,Distributor,Estimated,Year
0,1,Avatar,-,-,-,"$466,141,929",3461,"$749,766,139",Dec 18,Twentieth Century Fox,False,2010
1,2,Toy Story 3,-,-,-,"$415,004,880",4028,"$415,004,880",Jun 18,Walt Disney Studios Motion Pictures,False,2010
2,3,Alice in Wonderland,-,-,-,"$334,191,110",3739,"$334,191,110",Mar 5,Walt Disney Studios Motion Pictures,False,2010
3,4,Iron Man 2,-,-,-,"$312,433,331",4390,"$312,433,331",May 7,Paramount Pictures,False,2010
4,5,The Twilight Saga: Eclipse,-,-,-,"$300,531,751",4468,"$300,531,751",Jun 30,Summit Entertainment,False,2010
...,...,...,...,...,...,...,...,...,...,...,...,...
74,75,Elbow Grease,-,-,-,$917,5,$917,Feb 19,-,False,2021
75,76,In the Life of Music,-,-,-,$912,1,"$7,238",Aug 14,Indican Pictures,False,2021
76,77,Killer Raccoons! 2! Dark Christmas in the Dark!,-,-,-,$900,12,"$13,282",Jul 31,Indican Pictures,False,2021
77,78,The Bra,-,-,-,$572,2,"$8,314",Oct 16,Indican Pictures,False,2021


In [15]:
# remove the columns that carry no usable data in the scraped tables
# (Genre / Budget / Running Time show only "-" and Estimated only False in the output above)
unused_columns = ['Genre', 'Budget', 'Running Time', 'Estimated']
new_df = new_df.drop(columns=unused_columns)

In [16]:
# reset index: the stacked frame still carries each source table's own row index,
# so labels repeat; replace them with one continuous 0..n-1 index and
# discard the old labels (drop=True) instead of keeping them as a column
new_df = new_df.reset_index(drop=True)

In [17]:
# final check: display the cleaned frame (unused columns dropped, index renumbered)
new_df

Unnamed: 0,Rank,Release,Gross,Theaters,Total Gross,Release Date,Distributor,Year
0,1,Avatar,"$466,141,929",3461,"$749,766,139",Dec 18,Twentieth Century Fox,2010
1,2,Toy Story 3,"$415,004,880",4028,"$415,004,880",Jun 18,Walt Disney Studios Motion Pictures,2010
2,3,Alice in Wonderland,"$334,191,110",3739,"$334,191,110",Mar 5,Walt Disney Studios Motion Pictures,2010
3,4,Iron Man 2,"$312,433,331",4390,"$312,433,331",May 7,Paramount Pictures,2010
4,5,The Twilight Saga: Eclipse,"$300,531,751",4468,"$300,531,751",Jun 30,Summit Entertainment,2010
...,...,...,...,...,...,...,...,...
8849,75,Elbow Grease,$917,5,$917,Feb 19,-,2021
8850,76,In the Life of Music,$912,1,"$7,238",Aug 14,Indican Pictures,2021
8851,77,Killer Raccoons! 2! Dark Christmas in the Dark!,$900,12,"$13,282",Jul 31,Indican Pictures,2021
8852,78,The Bra,$572,2,"$8,314",Oct 16,Indican Pictures,2021


In [18]:
# build a YYYY_MM_DD stamp from today's date; it prefixes the export filename
today = date.today()
d_today = f"{today:%Y_%m_%d}"

In [19]:
# write the combined table to a date-stamped CSV in the working directory for use
# elsewhere; the header row is written and the row index is left out
output_name = f"{d_today}_DomesticBoxOffice_BoxOfficeMojo_2010_2021.csv"
new_df.to_csv(output_name, header=True, index=False)