# Web Scrape Box Office Mojo
## Import Modules

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [2]:
url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?ref_=bo_lnav_hm_shrt'
response = requests.get(url)
response.status_code

200

In [3]:
response.text[:1000]

'<!doctype html><html class="a-no-js" data-19ax5a9jf="dingo"><head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1.0" />\n                <meta charset="utf-8" />\n            <title dir="ltr">Top Lifetime Grosses - Box Office Mojo</title><meta content="Top Lifetime Grosses" name="title" />\n            <meta content="Box Office Mojo" property="og:site_name" />\n            <meta content="https://m.media-amazon.com/images/G/01/boxofficemojo/logo/mojo-logo-bg.png" property="og:image"/>\n            <meta name="format-detection" content="telephone=no" />\n            <link href="https://m.media-amazon.com/images/G/01/boxofficemojo/v2/favicon._CB448965889_.ico" type="image/x-icon" rel="icon" />\n            <link rel="stylesheet" href="https://images-na.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|012LjolmrML.css,41DAFIecsVL.css,51IB+wfP8qL.css,01ZfXnjPmmL.css,01oDR3IULNL.css,01Vctty9pOL.c

In [3]:
page = response.text
soup = BeautifulSoup(page, 'lxml')
table = soup.find('table')
#table

In [5]:
rows = [row for row in table.find_all('tr')] 

rows[1]

<tr><td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">1</td><td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt2488496/?ref_=bo_cso_table_1">Star Wars: Episode VII - The Force Awakens</a></td><td class="a-text-right mojo-field-type-money">$936,662,225</td><td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/2015/?ref_=bo_cso_table_1">2015</a></td></tr>

## Collect All Movies
Begin by scraping the 1000 rows of the top grossing movies

In [6]:
url_list = ['https://www.boxofficemojo.com/chart/top_lifetime_gross/?ref_=bo_lnav_hm_shrt', 
            'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=200', 
            'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=400', 
            'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=600', 
            'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=800']

rows = []

for url in url_list:
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    table = soup.find('table')
    rows.extend(table.find_all('tr')[1:]) 

In [7]:
print(len(rows))

1000


In [8]:
movies = {}

for row in rows[:1001]:
    items = row.find_all('td')
    title_link = items[1].find('a')
    title, url = title_link.text, title_link['href']
    link_stub = title_link['href']
    movies[link_stub] = [url] + [i.text for i in items]
    
print(len(movies))

1000


### Create dataframe 
Convert dictionary to dataframe. Create columns for collected data.

In [9]:
import pandas as pd

top_movies = pd.DataFrame(movies).T  #transpose
top_movies.columns = ['link_stub', 'rank', 'title','lifetime_gross', 'year']

print(len(top_movies))

1000


In [10]:
top_movies.head()

Unnamed: 0,link_stub,rank,title,lifetime_gross,year
/title/tt2488496/?ref_=bo_cso_table_1,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015
/title/tt4154796/?ref_=bo_cso_table_2,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019
/title/tt0499549/?ref_=bo_cso_table_3,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009
/title/tt1825683/?ref_=bo_cso_table_4,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018
/title/tt4154756/?ref_=bo_cso_table_5,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018


### Create functions
Creating functions to further scrape tables for each movie title.

In [4]:
url = 'https://www.boxofficemojo.com/title/tt2488496/?ref_=bo_cso_table_1'
response = requests.get(url)

response.status_code

200

In [5]:
page = response.text

In [3]:
soup = BeautifulSoup(page, 'lxml')

NameError: name 'page' is not defined

In [4]:
def movie_value(soup, field_name):
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [25]:
'''distributor_str = movie_value(soup, 'Distributor').split('See')[0]
distributor_str'''

"distributor_str = movie_value(soup, 'Distributor').split('See')[0]\ndistributor_str"

In [26]:
'''world_gross_str = soup.find(class_ = 'a-section a-spacing-none mojo-performance-summary-table').find_all(class_ = 'money')
worldwide_gross = world_gross_str[-1].text
worldwide_gross'''

"world_gross_str = soup.find(class_ = 'a-section a-spacing-none mojo-performance-summary-table').find_all(class_ = 'money')\nworldwide_gross = world_gross_str[-1].text\nworldwide_gross"

In [27]:
'''# rating
try:
    rating = movie_value(soup, 'MPAA')
except Exception:
    pass

print(rating)'''

"# rating\ntry:\n    rating = movie_value(soup, 'MPAA')\nexcept Exception:\n    pass\n\nprint(rating)"

Created function using selenium to scrape the lead actor/actress for all 1000 movies and put them into a list. This was added as a new column in my dataframe.

In [5]:
from bs4 import BeautifulSoup
import requests
import time, os

In [16]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

In [17]:
driver = webdriver.Chrome(chromedriver)

In [18]:
actor_list = []

try:
    for link in top_movies.link_stub:
        base_url = 'https://www.boxofficemojo.com'
        url = base_url + link
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        short_link = driver.find_element_by_xpath("//*[@id='tabs']/div/a[2]")
        short_link.click()
        time.sleep(1)
        actor = driver.find_element_by_xpath('//*[@id="principalCast"]/tbody/tr[2]/td[1]/a')
        actor = actor.text
        actor_list.append(actor)
except NoSuchElementException:
    pass

In [20]:
len(actor_list)

1000

In [21]:
top_movies['actor_actress'] = actor_list
top_movies.head()

Unnamed: 0,link_stub,rank,title,lifetime_gross,year,actor_actress
/title/tt2488496/?ref_=bo_cso_table_1,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,Daisy Ridley
/title/tt4154796/?ref_=bo_cso_table_2,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,Robert Downey Jr.
/title/tt0499549/?ref_=bo_cso_table_3,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009,Sam Worthington
/title/tt1825683/?ref_=bo_cso_table_4,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018,Chadwick Boseman
/title/tt4154756/?ref_=bo_cso_table_5,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018,Robert Downey Jr.


In [22]:
top_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, /title/tt2488496/?ref_=bo_cso_table_1 to /title/tt0075265/?ref_=bo_cso_table_200
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   link_stub       1000 non-null   object
 1   rank            1000 non-null   object
 2   title           1000 non-null   object
 3   lifetime_gross  1000 non-null   object
 4   year            1000 non-null   object
 5   actor_actress   1000 non-null   object
dtypes: object(6)
memory usage: 54.7+ KB


In [23]:
top_movies.to_csv('first_movies_df.csv', index=False)

## More functions
Created a function to scrape all info I wanted from each movie's page. This dictionary was converted into a dataframe and merged with the earlier dataframe.

In [7]:
def get_movie_dict(link):
    
    base_url = 'https://www.boxofficemojo.com'
    
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    
    movie_dict = {}

    #link_stub
    movie_dict['link_stub'] = link
    
    #title
    title_string = soup.find('title').text
    movie_dict['title'] = title_string.split(' - Box')[0].strip()
    
    # budget
    try:
        movie_dict['budget'] = movie_value(soup,'Budget')
    except AttributeError:
        pass
    
    #rating
    try:
        movie_dict['rating'] = movie_value(soup, 'MPAA')
    except AttributeError:
        pass
    
    # genre
    try:
        movie_dict['genre'] = movie_value(soup,'Genres').split()
    except AttributeError:
        pass
  
    #runtime
    raw_runtime = movie_value(soup,'Running')
    try:
        raw_runtime = raw_runtime.split()
        movie_dict['runtime (mins)'] = int(raw_runtime[0])*60 + int(raw_runtime[2])
    except:
        pass

    #release date
    try:
        release_date_str = soup.find(text = re.compile('Release Date'))
        movie_dict['release date'] = release_date_str.findNext().text.split('\n')[0]
    except AttributeError:
        pass
    
    #ditributor2
    try:
        movie_dict['distributor'] = movie_value(soup, 'Distributor').split('See')[0]
    except AttributeError:
        pass
    
    #opening gross
    try:
        movie_dict['opening gross'] = movie_value(soup, 'Opening')
    except AttributeError:
        pass
    
    return movie_dict

    

In [6]:
top_movies = pd.read_csv('first_movies_df.csv')

In [8]:
top_movies_list = []

for link in top_movies.link_stub:
    top_movies_list.append(get_movie_dict(link))
    time.sleep(2)

In [9]:
print(len(top_movies_list))

1000


In [18]:
#convert list to df
top_movies_df = pd.DataFrame(top_movies_list)  
top_movies_df.set_index('link_stub', inplace=True)

top_movies_df.shape

(1000, 8)

In [19]:
df = top_movies_df['genre'].str.join('|').str.get_dummies()

In [20]:
df

Unnamed: 0_level_0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
link_stub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/title/tt2488496/?ref_=bo_cso_table_1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
/title/tt4154796/?ref_=bo_cso_table_2,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
/title/tt0499549/?ref_=bo_cso_table_3,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
/title/tt1825683/?ref_=bo_cso_table_4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
/title/tt4154756/?ref_=bo_cso_table_5,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/title/tt1060277/?ref_=bo_cso_table_196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
/title/tt0087277/?ref_=bo_cso_table_197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/title/tt0989757/?ref_=bo_cso_table_198,0,0,0,0,0,0,0,1,0,0,...,0,0,1,1,0,0,0,1,1,0
/title/tt2283336/?ref_=bo_cso_table_199,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [29]:
movies_df = top_movies_df.merge(df, left_index=True, right_index=True)

movies_df.head()

Unnamed: 0_level_0,title,budget,rating,genre,runtime (mins),release date,distributor,opening gross,Action,Adventure,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
link_stub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/title/tt2488496/?ref_=bo_cso_table_1,Star Wars: Episode VII - The Force Awakens,"$245,000,000",PG-13,"[Action, Adventure, Sci-Fi]",138.0,"December 16, 2015",Walt Disney Studios Motion Pictures,"$247,966,675",1,1,...,0,0,0,0,1,0,0,0,0,0
/title/tt4154796/?ref_=bo_cso_table_2,Avengers: Endgame,"$356,000,000",PG-13,"[Action, Adventure, Drama, Sci-Fi]",181.0,"April 24, 2019",Walt Disney Studios Motion Pictures,"$357,115,007",1,1,...,0,0,0,0,1,0,0,0,0,0
/title/tt0499549/?ref_=bo_cso_table_3,Avatar,"$237,000,000",PG-13,"[Action, Adventure, Fantasy, Sci-Fi]",162.0,"December 16, 2009",Twentieth Century Fox,"$77,025,481",1,1,...,0,0,0,0,1,0,0,0,0,0
/title/tt1825683/?ref_=bo_cso_table_4,Black Panther,,PG-13,"[Action, Adventure, Sci-Fi]",134.0,"February 13, 2018",Walt Disney Studios Motion Pictures,"$202,003,951",1,1,...,0,0,0,0,1,0,0,0,0,0
/title/tt4154756/?ref_=bo_cso_table_5,Avengers: Infinity War,,PG-13,"[Action, Adventure, Sci-Fi]",149.0,"April 25, 2018",Walt Disney Studios Motion Pictures,"$257,698,183",1,1,...,0,0,0,0,1,0,0,0,0,0


## EDA
Check for nulls and either replace or drop. Convert data types as needed. Create dummies for categorical features.

In [30]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, /title/tt2488496/?ref_=bo_cso_table_1 to /title/tt0075265/?ref_=bo_cso_table_200
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           1000 non-null   object 
 1   budget          823 non-null    object 
 2   rating          868 non-null    object 
 3   genre           996 non-null    object 
 4   runtime (mins)  975 non-null    float64
 5   release date    996 non-null    object 
 6   distributor     995 non-null    object 
 7   opening gross   972 non-null    object 
 8   Action          1000 non-null   int64  
 9   Adventure       1000 non-null   int64  
 10  Animation       1000 non-null   int64  
 11  Biography       1000 non-null   int64  
 12  Comedy          1000 non-null   int64  
 13  Crime           1000 non-null   int64  
 14  Documentary     1000 non-null   int64  
 15  Drama           1000 non-null   int64  
 16  Family      

In [31]:
bool_series = pd.isnull(movies_df['rating'])

In [32]:
movies_df[bool_series].head()

Unnamed: 0_level_0,title,budget,rating,genre,runtime (mins),release date,distributor,opening gross,Action,Adventure,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
link_stub,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/title/tt0435761/?ref_=bo_cso_table_30,Toy Story 3,"$200,000,000",,"[Adventure, Animation, Comedy, Family, Fantasy]",103.0,"June 16, 2010",Walt Disney Studios Motion Pictures,"$110,307,189",0,1,...,0,0,0,0,0,0,0,0,0,0
/title/tt0266543/?ref_=bo_cso_table_43,Finding Nemo,"$94,000,000",,"[Adventure, Animation, Comedy, Family]",100.0,"May 30, 2003",Walt Disney Studios Motion Pictures,"$70,251,710",0,1,...,0,0,0,0,0,0,0,0,0,0
/title/tt0080684/?ref_=bo_cso_table_94,Server Error,,,,,,,,0,0,...,0,0,0,0,0,0,0,0,0,0
/title/tt0099785/?ref_=bo_cso_table_102,Home Alone,"$18,000,000",,"[Comedy, Family]",103.0,"November 16, 1990",Twentieth Century Fox,"$17,081,997",0,0,...,0,0,0,0,0,0,0,0,0,0
/title/tt0317705/?ref_=bo_cso_table_116,The Incredibles,"$92,000,000",,"[Action, Adventure, Animation, Family]",115.0,"November 5, 2004",Walt Disney Studios Motion Pictures,"$70,467,623",1,1,...,0,0,0,0,0,0,0,0,0,0


In [34]:
top_movies.head()

Unnamed: 0,link_stub,rank,title,lifetime_gross,year,actor_actress
0,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,Daisy Ridley
1,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,Robert Downey Jr.
2,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009,Sam Worthington
3,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018,Chadwick Boseman
4,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018,Robert Downey Jr.


In [38]:
top_movies_merge = top_movies.merge(movies_df)

In [45]:
top_movies_merge.shape
top_movies_merge.head()

Unnamed: 0,link_stub,rank,title,lifetime_gross,year,actor_actress,budget,rating,genre,runtime (mins),...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,Daisy Ridley,"$245,000,000",PG-13,"[Action, Adventure, Sci-Fi]",138.0,...,0,0,0,0,1,0,0,0,0,0
1,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,Robert Downey Jr.,"$356,000,000",PG-13,"[Action, Adventure, Drama, Sci-Fi]",181.0,...,0,0,0,0,1,0,0,0,0,0
2,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009,Sam Worthington,"$237,000,000",PG-13,"[Action, Adventure, Fantasy, Sci-Fi]",162.0,...,0,0,0,0,1,0,0,0,0,0
3,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018,Chadwick Boseman,,PG-13,"[Action, Adventure, Sci-Fi]",134.0,...,0,0,0,0,1,0,0,0,0,0
4,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018,Robert Downey Jr.,,PG-13,"[Action, Adventure, Sci-Fi]",149.0,...,0,0,0,0,1,0,0,0,0,0


In [46]:
#set(top_movies.index) - set(top_movies_df.index)

In [47]:
#set(top_movies_df.index)-set(top_movies.index)

In [48]:
#check for nulls
top_movies_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1020 entries, 0 to 1019
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   link_stub       1020 non-null   object 
 1   rank            1020 non-null   object 
 2   title           1020 non-null   object 
 3   lifetime_gross  1020 non-null   object 
 4   year            1020 non-null   int64  
 5   actor_actress   1020 non-null   object 
 6   budget          843 non-null    object 
 7   rating          885 non-null    object 
 8   genre           1020 non-null   object 
 9   runtime (mins)  999 non-null    float64
 10  release date    1020 non-null   object 
 11  distributor     1019 non-null   object 
 12  opening gross   993 non-null    object 
 13  Action          1020 non-null   int64  
 14  Adventure       1020 non-null   int64  
 15  Animation       1020 non-null   int64  
 16  Biography       1020 non-null   int64  
 17  Comedy          1020 non-null   i

In [49]:
top_movies_merge.to_csv('top_movies_merge_1.csv', index=False)

In [50]:
bool_series = pd.isnull(top_movies_merge['genre'])

In [52]:
top_movies_merge[bool_series]

Unnamed: 0,link_stub,rank,title,lifetime_gross,year,actor_actress,budget,rating,genre,runtime (mins),...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western


In [53]:
#turn year into int
top_movies_merge['year'] = top_movies_merge['year'].astype(int)

In [54]:
top_movies_merge['year']

0       2015
1       2019
2       2009
3       2018
4       2018
        ... 
1015    2015
1016    2012
1017    2008
1018    2010
1019    2019
Name: year, Length: 1020, dtype: int64

In [55]:
#handle runtime nulls
top_movies_merge['runtime (mins)'] = top_movies_merge['runtime (mins)'].fillna(113.0)

top_movies_merge['runtime (mins)']

0       138.0
1       181.0
2       162.0
3       134.0
4       149.0
        ...  
1015    103.0
1016    130.0
1017     85.0
1018    108.0
1019    114.0
Name: runtime (mins), Length: 1020, dtype: float64

In [56]:
#handle budget nulls
top_movies_merge['budget'] = top_movies_merge['budget'].astype(str)

top_movies_merge["budget"] = top_movies_merge["budget"].str.replace(",","")

In [57]:
top_movies_merge['budget'] = top_movies_merge['budget'].str.replace('$',"")

  top_movies_merge['budget'] = top_movies_merge['budget'].str.replace('$',"")


In [58]:
#top_movies_merge['new budget'] = top_movies_merge['budget'].dropna()

#top_movies_merge['new budget'] = top_movies_merge['new budget'].replace('None', '0')

#top_movies_merge['new budget'] = top_movies_merge['new budget'].astype(float)

#top_movies_merge['new budget'].mean()

In [61]:
top_movies_merge['budget'] = top_movies_merge.budget.replace('None','72067774')

In [62]:
top_movies_merge['budget'].head()

0    245000000
1    356000000
2    237000000
3     72067774
4     72067774
Name: budget, dtype: object

In [63]:
top_movies_merge['budget'] = top_movies_merge['budget'].astype(int)

top_movies_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1020 entries, 0 to 1019
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   link_stub       1020 non-null   object 
 1   rank            1020 non-null   object 
 2   title           1020 non-null   object 
 3   lifetime_gross  1020 non-null   object 
 4   year            1020 non-null   int64  
 5   actor_actress   1020 non-null   object 
 6   budget          1020 non-null   int64  
 7   rating          885 non-null    object 
 8   genre           1020 non-null   object 
 9   runtime (mins)  1020 non-null   float64
 10  release date    1020 non-null   object 
 11  distributor     1019 non-null   object 
 12  opening gross   993 non-null    object 
 13  Action          1020 non-null   int64  
 14  Adventure       1020 non-null   int64  
 15  Animation       1020 non-null   int64  
 16  Biography       1020 non-null   int64  
 17  Comedy          1020 non-null   i

In [64]:
#top_movies_merge['new open gross'] = top_movies_merge['opening gross'].dropna() 

In [65]:
#top_movies_merge['new open gross'] = top_movies_merge['new open gross'].astype(str)

#top_movies_merge['new open gross'] = top_movies_merge['new open gross'].str.replace(",","")

In [66]:
#top_movies_merge['new open gross'] = top_movies_merge['new open gross'].str.replace('$',"")

In [67]:
#top_movies_merge['new open gross'] = top_movies_merge['new open gross'].replace('nan', '0')

In [68]:
#top_movies_merge['new open gross'] = top_movies_merge['new open gross'].replace('Gross', '0')

In [69]:
#top_movies_merge['new open gross'] = top_movies_merge['new open gross'].astype(int)

In [70]:
#top_movies_merge['new open gross'].median()

In [71]:
#handle opening gross nulls
top_movies_merge['opening gross'] = top_movies_merge['opening gross'].astype(str)

top_movies_merge['opening gross'] = top_movies_merge['opening gross'].str.replace(",","")

In [74]:
top_movies_merge['opening gross'] = top_movies_merge['opening gross'].str.replace('$',"")

top_movies_merge['opening gross'] = top_movies_merge['opening gross'].replace('None','31113954')

top_movies_merge['opening gross'] = top_movies_merge['opening gross'].replace('Gross','31113954')

  top_movies_merge['opening gross'] = top_movies_merge['opening gross'].str.replace('$',"")


In [75]:
top_movies_merge['opening gross'] = top_movies_merge['opening gross'].astype(int)

top_movies_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1020 entries, 0 to 1019
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   link_stub       1020 non-null   object 
 1   rank            1020 non-null   object 
 2   title           1020 non-null   object 
 3   lifetime_gross  1020 non-null   object 
 4   year            1020 non-null   int64  
 5   actor_actress   1020 non-null   object 
 6   budget          1020 non-null   int64  
 7   rating          885 non-null    object 
 8   genre           1020 non-null   object 
 9   runtime (mins)  1020 non-null   float64
 10  release date    1020 non-null   object 
 11  distributor     1019 non-null   object 
 12  opening gross   1020 non-null   int64  
 13  Action          1020 non-null   int64  
 14  Adventure       1020 non-null   int64  
 15  Animation       1020 non-null   int64  
 16  Biography       1020 non-null   int64  
 17  Comedy          1020 non-null   i

In [76]:
#top_movies_merge = top_movies_merge.drop(['new open gross','new budget'], axis=1)

In [77]:
#top_movies_merge.info()

In [78]:
#create dummies for actor_actress, distributor
top_movies_merge = pd.get_dummies(top_movies_merge, columns = ['distributor'], drop_first=True)

In [79]:
top_movies_merge.head()

Unnamed: 0,link_stub,rank,title,lifetime_gross,year,actor_actress,budget,rating,genre,runtime (mins),...,distributor_Summit Entertainment,distributor_The Weinstein Company,distributor_TriStar Pictures,distributor_Twentieth Century Fox,distributor_USA Films,distributor_United Artists,distributor_United Artists Releasing,distributor_Universal Pictures,distributor_Walt Disney Studios Motion Pictures,distributor_Warner Bros.
0,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,Daisy Ridley,245000000,PG-13,"[Action, Adventure, Sci-Fi]",138.0,...,0,0,0,0,0,0,0,0,1,0
1,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,Robert Downey Jr.,356000000,PG-13,"[Action, Adventure, Drama, Sci-Fi]",181.0,...,0,0,0,0,0,0,0,0,1,0
2,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009,Sam Worthington,237000000,PG-13,"[Action, Adventure, Fantasy, Sci-Fi]",162.0,...,0,0,0,1,0,0,0,0,0,0
3,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018,Chadwick Boseman,72067774,PG-13,"[Action, Adventure, Sci-Fi]",134.0,...,0,0,0,0,0,0,0,0,1,0
4,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018,Robert Downey Jr.,72067774,PG-13,"[Action, Adventure, Sci-Fi]",149.0,...,0,0,0,0,0,0,0,0,1,0


In [80]:
#top_movies_merge = pd.get_dummies(top_movies_merge, columns = ['actor_actress'], drop_first=True)

In [81]:
#top_movies_merge.head()

In [82]:
final_df = pd.get_dummies(top_movies_merge, columns = ['rating'], drop_first=True)

In [83]:
'''#create covid
for date_str in final_df['release date']:
    if date_str < 'March 15, 2020':
        final_df['pre-covid'].append(date_str)
    else:
        final_df['covid'].append(date_str)'''

"#create covid\nfor date_str in final_df['release date']:\n    if date_str < 'March 15, 2020':\n        final_df['pre-covid'].append(date_str)\n    else:\n        final_df['covid'].append(date_str)"

In [84]:
#try to fix rating so that not as many drop
final_df = final_df.dropna()

In [85]:
final_df.corr()

Unnamed: 0,year,budget,runtime (mins),opening gross,Action,Adventure,Animation,Biography,Comedy,Crime,...,distributor_USA Films,distributor_United Artists,distributor_United Artists Releasing,distributor_Universal Pictures,distributor_Walt Disney Studios Motion Pictures,distributor_Warner Bros.,rating_G,rating_PG,rating_PG-13,rating_R
year,1.000000,0.275371,0.018428,0.327105,0.143629,0.205207,0.068410,0.046772,-0.005764,-0.012820,...,-0.010630,-0.124919,0.002940,0.035054,-0.020455,0.064458,-0.139887,0.076948,0.262431,0.064274
budget,0.275371,1.000000,0.272658,0.576296,0.399747,0.502570,0.101184,-0.125850,-0.213509,-0.138174,...,-0.007568,-0.033730,-0.048767,-0.068688,0.217986,0.056526,-0.014791,0.094049,0.273979,-0.264676
runtime (mins),0.018428,0.272658,1.000000,0.220360,0.228239,-0.012688,-0.434188,0.138553,-0.497500,0.007912,...,0.046684,-0.010183,-0.061729,0.008876,-0.112879,0.133590,-0.176626,-0.280495,0.238543,0.136431
opening gross,0.327105,0.576296,0.220360,1.000000,0.285752,0.377636,0.038886,-0.129026,-0.170880,-0.093528,...,-0.035364,-0.043950,-0.013122,0.010142,0.195212,0.037664,-0.025533,0.031740,0.284564,-0.151038
Action,0.143629,0.399747,0.228239,0.285752,1.000000,0.323887,-0.210166,-0.131458,-0.320200,0.070866,...,-0.025578,-0.062807,-0.036191,-0.033717,-0.088317,0.075178,-0.139674,-0.175162,0.334610,-0.074203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
distributor_Warner Bros.,0.064458,0.056526,0.133590,0.037664,0.075178,-0.024774,-0.122460,-0.028365,-0.116920,0.058222,...,-0.014010,-0.034401,-0.019822,-0.170165,-0.202817,1.000000,-0.076503,-0.087412,0.034782,0.126206
rating_G,-0.139887,-0.014791,-0.176626,-0.025533,-0.139674,0.100390,0.306804,-0.035888,0.066104,-0.066812,...,-0.005359,-0.013159,-0.007582,-0.065091,0.283108,-0.076503,1.000000,-0.089707,-0.142256,-0.089187
rating_PG,0.076948,0.094049,-0.280495,0.031740,-0.175162,0.282202,0.437973,-0.038843,0.252958,-0.134474,...,-0.016428,-0.009168,0.084523,-0.070449,0.180418,-0.087412,-0.089707,1.000000,-0.436090,-0.273406
rating_PG-13,0.262431,0.273979,0.238543,0.284564,0.334610,0.077670,-0.339540,0.033946,-0.217816,-0.012895,...,-0.026051,-0.037890,-0.036860,0.049569,-0.138561,0.034782,-0.142256,-0.436090,1.000000,-0.433561


In [86]:
final_df.shape

(1020, 74)

In [87]:
final_df.to_csv('complete_web_scraping.csv', index=False)