In [822]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [823]:
#request page's HTML 
url = 'https://www.boxofficemojo.com/title/tt2488496/?ref_=bo_cso_table_1'
response = requests.get(url)

In [824]:
response.status_code

200

In [825]:
response.text[:1000]

'<!doctype html><html class="a-no-js" data-19ax5a9jf="dingo"><head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1.0" />\n                <meta charset="utf-8" />\n            <title dir="ltr">Star Wars: Episode VII - The Force Awakens - Box Office Mojo</title><meta content="Star Wars: Episode VII - The Force Awakens" name="title" />\n            <meta content="Box Office Mojo" property="og:site_name" />\n            <meta name="format-detection" content="telephone=no" />\n            <link href="https://m.media-amazon.com/images/G/01/boxofficemojo/v2/favicon._CB448965889_.ico" type="image/x-icon" rel="icon" />\n            <link rel="stylesheet" href="https://images-na.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|012LjolmrML.css,41DAFIecsVL.css,51IB+wfP8qL.css,01ZfXnjPmmL.css,01oDR3IULNL.css,01Vctty9pOL.css,31q1y1irc5L.css,01XPHJk60-L.css,21qPwhPKAAL.css,01R0k0yxPXL.css,21xVR0NtxzL.css

In [826]:
page = response.text

In [827]:
soup = BeautifulSoup(page, 'lxml')

In [828]:
#get title
title_str = soup.find('title').text
title_str

'Star Wars: Episode VII - The Force Awakens - Box Office Mojo'

In [829]:
title = title_str.split('-')[0].strip()
title

'Star Wars: Episode VII'

In [830]:
#get domestic gross
dom_gross = soup.find(class_ = 'a-section a-spacing-none mojo-performance-summary-table').find(class_ = 'money').text
dom_gross

'$936,662,225'

In [831]:
#get runtime. First create navigable string
import re
runtime_str = soup.find(text = re.compile('Run'))
print(runtime_str)
type(runtime_str)

Running Time


bs4.element.NavigableString

In [832]:
rt = runtime_str.findNext().text
rt = rt.split()
minutes = int(rt[0])*60 + int(rt[2])
print(minutes)

138


In [833]:
def movie_value(soup, field_name):
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [834]:
# rating
rating = movie_value(soup,'MPAA')
print(rating)

PG-13


In [835]:
#rank
rank_str = soup.find_all(class_= 'a-text-right')[17]
rank = rank_str.text
print(rank)

1


In [836]:
# distributor
distributor = movie_value(soup,'Distributor')
distributor = distributor.split('See')[0]
print(distributor)

Walt Disney Studios Motion Pictures


In [837]:
#release date
release_date = movie_value(soup,'Release Date')
release_date = release_date.split('\n')[0]
print(release_date)

December 16, 2015


In [838]:
# budget
budget = movie_value(soup,'Budget')
print(budget)

$245,000,000


In [839]:
# genre
genre = movie_value(soup,'Genres').split()
print(genre)

['Action', 'Adventure', 'Sci-Fi']


In [840]:
# number of opening theatres


In [841]:
import dateutil.parser

def money_to_int(moneystring):
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)

def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

In [842]:
'''raw_domestic_total_gross = dom_gross
domestic_total_gross = money_to_int(raw_domestic_total_gross)

raw_runtime = movie_value(soup,'Running')
runtime = runtime_to_minutes(raw_runtime)'''

"raw_domestic_total_gross = dom_gross\ndomestic_total_gross = money_to_int(raw_domestic_total_gross)\n\nraw_runtime = movie_value(soup,'Running')\nruntime = runtime_to_minutes(raw_runtime)"

In [843]:
headers = ['movie title', 'domestic total gross',
           'runtime (mins)', 'rating', 'rank', 'release date', 'budget', 'genre', 'distributor']

movie_data = []
movie_dict = dict(zip(headers, [title,
                                domestic_total_gross,
                                runtime,
                                rating, 
                                rank, 
                                release_date, 
                                budget,
                                genre, 
                                distributor]))

movie_data.append(movie_dict)
movie_data

[{'movie title': 'Star Wars: Episode VII',
  'domestic total gross': 936662225,
  'runtime (mins)': 138,
  'rating': 'PG-13',
  'rank': '1',
  'release date': 'December 16, 2015',
  'budget': '$245,000,000',
  'genre': ['Action', 'Adventure', 'Sci-Fi'],
  'distributor': 'Walt Disney Studios Motion Pictures'}]

In [844]:
#scrape table

url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?ref_=bo_lnav_hm_shrt'

response = requests.get(url)
page = response.text

soup = BeautifulSoup(page,"lxml")

In [845]:
table = soup.find('table')
table

<table class="a-bordered a-horizontal-stripes a-size-base a-span12 mojo-body-table mojo-table-annotated"><tr><th class="a-text-right mojo-field-type-rank a-nowrap"><span title="Rank">Rank</span>
</th><th class="a-text-left mojo-field-type-title a-nowrap"><span title="Title">Title</span>
</th><th class="a-text-right mojo-field-type-money a-nowrap"><span title="Lifetime Gross">Lifetime Gross</span>
</th><th class="a-text-left mojo-field-type-year a-nowrap"><span title="Year">Year</span>
</th></tr><tr><td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">1</td><td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt2488496/?ref_=bo_cso_table_1">Star Wars: Episode VII - The Force Awakens</a></td><td class="a-text-right mojo-field-type-money">$936,662,225</td><td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/2015/?ref_=bo_cso_table_1">2015</a></td></tr><tr><td class="a-text-right mojo-header-column mojo-t

In [846]:
#rows = [row for row in table.find_all('tr')] 

In [847]:
rows[1]

<tr><td class="a-text-right mojo-header-column mojo-truncate mojo-field-type-rank">2</td><td class="a-text-left mojo-field-type-title"><a class="a-link-normal" href="/title/tt4154796/?ref_=bo_cso_table_2">Avengers: Endgame</a></td><td class="a-text-right mojo-field-type-money">$858,373,000</td><td class="a-text-left mojo-field-type-year"><a class="a-link-normal" href="/year/2019/?ref_=bo_cso_table_2">2019</a></td></tr>

In [848]:
url_list = ['https://www.boxofficemojo.com/chart/top_lifetime_gross/?ref_=bo_lnav_hm_shrt', 
            'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=200', 
           'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=400', 
           'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=600', 
           'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=800']

#url = 'https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=i'
rows = []

for url in url_list:
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    table = soup.find('table')
    rows.extend(table.find_all('tr')[1:]) 

In [849]:
print(len(rows))

1000


In [850]:
#rows[1].find_all('td')[1].find('a')['href']

In [851]:
movies = {}

for row in rows[:1001]:
    items = row.find_all('td')
    link = items[1].find('a')
    title, url = link.text, link['href']
    movies[title] = [url] + [i.text for i in items]
    
print(len(movies))

988


In [852]:
#scrape multiple pages
import pandas as pd

In [853]:
top_movies = pd.DataFrame(movies).T  #transpose
top_movies.columns = ['link_stub', 'rank', 'title', 
                    'lifetime_gross', 'year']

top_movies

Unnamed: 0,link_stub,rank,title,lifetime_gross,year
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018
...,...,...,...,...,...
Cloverfield,/title/tt1060277/?ref_=bo_cso_table_195,995,Cloverfield,"$80,048,433",2008
Footloose,/title/tt0087277/?ref_=bo_cso_table_196,996,Footloose,"$80,035,402",1984
Dear John,/title/tt0989757/?ref_=bo_cso_table_197,997,Dear John,"$80,014,842",2010
Men in Black: International,/title/tt2283336/?ref_=bo_cso_table_198,998,Men in Black: International,"$80,001,807",2019


In [854]:
def get_movie_dict(link):
    
    base_url = 'https://www.boxofficemojo.com'
    
    #Create full url to scrape
    url = base_url + link
    
    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    
    headers = ['movie title', 'domestic total gross', 
               'runtime (mins)', 'rating', 'release date', 'budget', 'genre', 'distributor']

  
    
    #title
    title_string = soup.find('title').text
    title = title_string.split(' - Box')[0].strip()

    #domestic gross
    dom_gross = soup.find(class_ = 'a-section a-spacing-none mojo-performance-summary-table').find(class_ = 'money').text
    domestic_total_gross = money_to_int(dom_gross)

    #runtime
    raw_runtime = movie_value(soup,'Running')
    runtime = runtime_to_minutes(raw_runtime)
    
    #rating
    rating = movie_value(soup,'MPAA')

    #release date
    release_date = movie_value(soup,'Release Date')
    release_date = release_date.split('\n')[0]
    
    # budget
    budget = movie_value(soup,'Budget')

    # genre
    genre = movie_value(soup,'Genres').split()
    
    '''# distributor
    distributor = movie_value(soup,'Distributor')
    distributor = distributor.split('See')[0]'''
    
    #create movie dict
    movie_dict = dict(zip(headers, [title,
                                domestic_total_gross,
                                runtime,
                                rating, 
                                release_date, 
                                budget,
                                genre, 
                                distributor]))

    return movie_dict

In [855]:
top_movies_list = []

for link in top_movies.link_stub:
    top_movies_list.append(get_movie_dict(link))

In [856]:
print(len(top_movies_list))

988


In [857]:
#convert list to df
top_movies_df = pd.DataFrame(top_movies_list)  
top_movies_df.set_index('movie title', inplace=True)

top_movies_df.shape

(988, 7)

In [858]:
top_movies.shape

(988, 5)

In [883]:
top_movies_merge = top_movies.merge(top_movies_df, left_index=True, right_index=True)

top_movies_merge.shape
top_movies_merge.head()

Unnamed: 0,link_stub,rank,title,lifetime_gross,year,domestic total gross,runtime (mins),rating,release date,budget,genre,distributor
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,936662225,138.0,PG-13,"December 16, 2015","$245,000,000","[Action, Adventure, Sci-Fi]",Walt Disney Studios Motion Pictures
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,858373000,181.0,PG-13,"April 24, 2019","$356,000,000","[Action, Adventure, Drama, Sci-Fi]",Walt Disney Studios Motion Pictures
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009,760507625,162.0,PG-13,"December 16, 2009","$237,000,000","[Action, Adventure, Fantasy, Sci-Fi]",Walt Disney Studios Motion Pictures
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018,700426566,134.0,PG-13,"February 13, 2018",,"[Action, Adventure, Sci-Fi]",Walt Disney Studios Motion Pictures
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018,678815482,149.0,PG-13,"April 25, 2018",,"[Action, Adventure, Sci-Fi]",Walt Disney Studios Motion Pictures


In [860]:
#set(top_movies.index) - set(top_movies_df.index)

In [861]:
#set(top_movies_df.index)-set(top_movies.index)

In [885]:
top_movies_merge['runtime (mins)'] = top_movies_merge['runtime (mins)'].fillna(113.0)

In [886]:
top_movies_merge['runtime (mins)']

Star Wars: Episode VII - The Force Awakens    138.0
Avengers: Endgame                             181.0
Avatar                                        162.0
Black Panther                                 134.0
Avengers: Infinity War                        149.0
                                              ...  
Cloverfield                                    85.0
Footloose                                     107.0
Dear John                                     108.0
Men in Black: International                   114.0
Knowing                                       121.0
Name: runtime (mins), Length: 988, dtype: float64

In [888]:
top_movies_merge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 988 entries, Star Wars: Episode VII - The Force Awakens to Knowing
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   link_stub             988 non-null    object 
 1   rank                  988 non-null    object 
 2   title                 988 non-null    object 
 3   lifetime_gross        988 non-null    object 
 4   year                  988 non-null    object 
 5   domestic total gross  988 non-null    int64  
 6   runtime (mins)        988 non-null    float64
 7   rating                860 non-null    object 
 8   release date          988 non-null    object 
 9   budget                816 non-null    object 
 10  genre                 988 non-null    object 
 11  distributor           988 non-null    object 
dtypes: float64(1), int64(1), object(10)
memory usage: 132.6+ KB


In [889]:
top_movies_merge.to_csv('top_movies_merge_1.csv', index=False)

In [890]:
top_movies_merge.dtypes

link_stub                object
rank                     object
title                    object
lifetime_gross           object
year                     object
domestic total gross      int64
runtime (mins)          float64
rating                   object
release date             object
budget                   object
genre                    object
distributor              object
dtype: object

In [891]:
top_movies_merge['budget'] = top_movies_merge['budget'].astype(str)

In [892]:
top_movies_merge["budget"] = top_movies_merge["budget"].str.replace(",","")

In [893]:
top_movies_merge['budget'] = top_movies_merge['budget'].str.replace('$',"")

  top_movies_merge['budget'] = top_movies_merge['budget'].str.replace('$',"")


In [894]:
top_movies_merge['budget'] = top_movies_merge.budget.replace('None', 60000000)

In [895]:
top_movies_merge['budget'] = top_movies_merge['budget'].astype(int)

In [896]:
top_movies_merge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 988 entries, Star Wars: Episode VII - The Force Awakens to Knowing
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   link_stub             988 non-null    object 
 1   rank                  988 non-null    object 
 2   title                 988 non-null    object 
 3   lifetime_gross        988 non-null    object 
 4   year                  988 non-null    object 
 5   domestic total gross  988 non-null    int64  
 6   runtime (mins)        988 non-null    float64
 7   rating                860 non-null    object 
 8   release date          988 non-null    object 
 9   budget                988 non-null    int64  
 10  genre                 988 non-null    object 
 11  distributor           988 non-null    object 
dtypes: float64(1), int64(2), object(9)
memory usage: 132.6+ KB


In [897]:
top_movies_merge.head()

Unnamed: 0,link_stub,rank,title,lifetime_gross,year,domestic total gross,runtime (mins),rating,release date,budget,genre,distributor
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,"$936,662,225",2015,936662225,138.0,PG-13,"December 16, 2015",245000000,"[Action, Adventure, Sci-Fi]",Walt Disney Studios Motion Pictures
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,"$858,373,000",2019,858373000,181.0,PG-13,"April 24, 2019",356000000,"[Action, Adventure, Drama, Sci-Fi]",Walt Disney Studios Motion Pictures
Avatar,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,"$760,507,625",2009,760507625,162.0,PG-13,"December 16, 2009",237000000,"[Action, Adventure, Fantasy, Sci-Fi]",Walt Disney Studios Motion Pictures
Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,"$700,426,566",2018,700426566,134.0,PG-13,"February 13, 2018",60000000,"[Action, Adventure, Sci-Fi]",Walt Disney Studios Motion Pictures
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$678,815,482",2018,678815482,149.0,PG-13,"April 25, 2018",60000000,"[Action, Adventure, Sci-Fi]",Walt Disney Studios Motion Pictures


In [898]:
top_movies_merge['lifetime_gross']= top_movies_merge['lifetime_gross'].astype(str)

In [899]:
top_movies_merge["lifetime_gross"] = top_movies_merge["lifetime_gross"].str.replace(",","").str.replace('$','')

  top_movies_merge["lifetime_gross"] = top_movies_merge["lifetime_gross"].str.replace(",","").str.replace('$','')


In [900]:
top_movies_merge['lifetime_gross']= top_movies_merge['lifetime_gross'].astype(int)

In [901]:
top_movies_merge.lifetime_gross

Star Wars: Episode VII - The Force Awakens    936662225
Avengers: Endgame                             858373000
Avatar                                        760507625
Black Panther                                 700426566
Avengers: Infinity War                        678815482
                                                ...    
Cloverfield                                    80048433
Footloose                                      80035402
Dear John                                      80014842
Men in Black: International                    80001807
Knowing                                        79957634
Name: lifetime_gross, Length: 988, dtype: int64

In [902]:
top_movies_merge.info()

<class 'pandas.core.frame.DataFrame'>
Index: 988 entries, Star Wars: Episode VII - The Force Awakens to Knowing
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   link_stub             988 non-null    object 
 1   rank                  988 non-null    object 
 2   title                 988 non-null    object 
 3   lifetime_gross        988 non-null    int64  
 4   year                  988 non-null    object 
 5   domestic total gross  988 non-null    int64  
 6   runtime (mins)        988 non-null    float64
 7   rating                860 non-null    object 
 8   release date          988 non-null    object 
 9   budget                988 non-null    int64  
 10  genre                 988 non-null    object 
 11  distributor           988 non-null    object 
dtypes: float64(1), int64(3), object(8)
memory usage: 132.6+ KB


In [903]:
top_movies_merge.to_csv('top_movies_merge_2.csv', index=False)

In [904]:
top_movies_merge['genre']

Star Wars: Episode VII - The Force Awakens             [Action, Adventure, Sci-Fi]
Avengers: Endgame                               [Action, Adventure, Drama, Sci-Fi]
Avatar                                        [Action, Adventure, Fantasy, Sci-Fi]
Black Panther                                          [Action, Adventure, Sci-Fi]
Avengers: Infinity War                                 [Action, Adventure, Sci-Fi]
                                                              ...                 
Cloverfield                                     [Action, Horror, Sci-Fi, Thriller]
Footloose                                                  [Drama, Music, Romance]
Dear John                                                    [Drama, Romance, War]
Men in Black: International                    [Action, Adventure, Comedy, Sci-Fi]
Knowing                                          [Action, Drama, Sci-Fi, Thriller]
Name: genre, Length: 988, dtype: object

In [905]:
#dummies for categorical

from sklearn.preprocessing import MultiLabelBinarizer

# Binarise labels
mlb = MultiLabelBinarizer()
expandedLabelData = mlb.fit_transform(top_movies_merge['genre'])
labelClasses = mlb.classes_


# Create a pandas.DataFrame from output
expandedLabels = pd.DataFrame(expandedLabelData, columns=labelClasses)

#credit: https://www.generacodice.com/en/articolo/1117363/split-a-list-of-values-into-columns-of-a-dataframe&amp;quest;

In [906]:
print(len(expandedLabels))

988


In [907]:
final_df = pd.merge(top_movies_merge, expandedLabels, on=top_movies_merge.index)

In [908]:
final_df.head()

Unnamed: 0,key_0,link_stub,rank,title,lifetime_gross,year,domestic total gross,runtime (mins),rating,release date,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_1,1,Star Wars: Episode VII - The Force Awakens,936662225,2015,936662225,138.0,PG-13,"December 16, 2015",...,0,0,0,0,1,0,0,0,0,0
1,Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_2,2,Avengers: Endgame,858373000,2019,858373000,181.0,PG-13,"April 24, 2019",...,0,0,0,0,1,0,0,0,0,0
2,Avatar,/title/tt0499549/?ref_=bo_cso_table_3,3,Avatar,760507625,2009,760507625,162.0,PG-13,"December 16, 2009",...,0,0,0,0,1,0,0,0,0,0
3,Black Panther,/title/tt1825683/?ref_=bo_cso_table_4,4,Black Panther,700426566,2018,700426566,134.0,PG-13,"February 13, 2018",...,0,0,0,0,1,0,0,0,0,0
4,Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,678815482,2018,678815482,149.0,PG-13,"April 25, 2018",...,0,0,0,0,1,0,0,0,0,0


In [909]:
final_df['rank'] = final_df['rank'].astype(str)
final_df['rank'] = final_df['rank'].str.replace(',','').astype(int)
final_df['rank']

0         1
1         2
2         3
3         4
4         5
       ... 
983     995
984     996
985     997
986     998
987    1000
Name: rank, Length: 988, dtype: int64

In [910]:
for date_str in final_df['release date']:
    final_df['release date'] = dateutil.parser.parse(date_str)

In [911]:
final_df['release date'] = final_df['release date'].astype(str)

In [912]:
final_df["release date"] = final_df["release date"].str.replace("-","").astype(float)

In [913]:
final_df['release date']

0      20070516.0
1      20070516.0
2      20070516.0
3      20070516.0
4      20070516.0
          ...    
983    20070516.0
984    20070516.0
985    20070516.0
986    20070516.0
987    20070516.0
Name: release date, Length: 988, dtype: float64

In [914]:
final_df['year'] = final_df['year'].astype(int)

In [915]:
#dummy = pd.get_dummies(final_df['rating'])

In [916]:
#print(dummy)
#merge dummy to final_df

In [936]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 988 entries, 0 to 987
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   key_0                 988 non-null    object 
 1   link_stub             988 non-null    object 
 2   rank                  988 non-null    int64  
 3   title                 988 non-null    object 
 4   lifetime_gross        988 non-null    int64  
 5   year                  988 non-null    int64  
 6   domestic total gross  988 non-null    int64  
 7   runtime (mins)        988 non-null    float64
 8   rating                860 non-null    object 
 9   release date          988 non-null    float64
 10  budget                988 non-null    int64  
 11  genre                 988 non-null    object 
 12  distributor           988 non-null    object 
 13  Action                988 non-null    int64  
 14  Adventure             988 non-null    int64  
 15  Animation             9

In [937]:
final_df.corr()

Unnamed: 0,rank,lifetime_gross,year,domestic total gross,runtime (mins),release date,budget,Action,Adventure,Animation,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
rank,1.0,-0.792687,-0.166033,-0.792687,-0.232542,,-0.406551,-0.196869,-0.337648,-0.131862,...,0.078633,-0.023544,0.055334,0.108885,-0.187775,0.041107,0.092082,0.067091,0.048472,0.042971
lifetime_gross,-0.792687,1.0,0.180941,1.0,0.277472,,0.444792,0.217946,0.360292,0.089324,...,-0.067614,-0.001205,-0.051438,-0.106192,0.276978,-0.023575,-0.080404,-0.098686,-0.042145,-0.049788
year,-0.166033,0.180941,1.0,0.180941,0.022784,,0.33115,0.150882,0.210446,0.06111,...,-0.003506,-0.100239,0.026542,-0.140611,0.154984,-0.01611,-0.054329,0.031573,-0.057391,-0.053695
domestic total gross,-0.792687,1.0,0.180941,1.0,0.277472,,0.444792,0.217946,0.360292,0.089324,...,-0.067614,-0.001205,-0.051438,-0.106192,0.276978,-0.023575,-0.080404,-0.098686,-0.042145,-0.049788
runtime (mins),-0.232542,0.277472,0.022784,0.277472,1.0,,0.275162,0.2244,-0.008719,-0.42818,...,-0.027676,-0.120813,0.074231,-0.006056,0.110959,-0.003877,-0.022921,0.132338,0.185342,0.077013
release date,,,,,,,,,,,...,,,,,,,,,,
budget,-0.406551,0.444792,0.33115,0.444792,0.275162,,1.0,0.426108,0.502147,0.104472,...,-0.12458,-0.043648,-0.085279,-0.181245,0.371808,-0.012927,-0.078003,-0.012727,-0.038046,0.020724
Action,-0.196869,0.217946,0.150882,0.217946,0.2244,,0.426108,1.0,0.335334,-0.202653,...,-0.114635,-0.177319,-0.086656,-0.276984,0.419176,-0.026033,-0.081888,0.293113,-0.002606,0.009606
Adventure,-0.337648,0.360292,0.210446,0.360292,-0.008719,,0.502147,0.335334,1.0,0.363433,...,-0.090717,0.0355,-0.092197,-0.230876,0.33548,-0.027827,-0.104598,-0.107521,-0.094303,0.017296
Animation,-0.131862,0.089324,0.06111,0.089324,-0.42818,,0.104472,-0.202653,0.363433,1.0,...,0.019903,0.310593,-0.092819,-0.053555,-0.043186,-0.013308,0.009771,-0.21964,-0.061888,-0.031107


In [938]:
final_df.to_csv('web_scraping_final.csv', index=False)

In [927]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

lm = LinearRegression()

#drop 'lifetime_gross' because that is the target (y value) value 
X, y = final_df.drop(['lifetime_gross', 'domestic total gross', 'rating','distributor','title','link_stub','key_0','genre'], axis=1), final_df['lifetime_gross']

In [929]:
lm.fit(X, y)
lm.score(X, y)

0.6694608979892236

In [930]:
X.head()

Unnamed: 0,rank,year,runtime (mins),release date,budget,Action,Adventure,Animation,Biography,Comedy,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,1,2015,138.0,20070516.0,245000000,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2,2019,181.0,20070516.0,356000000,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,2009,162.0,20070516.0,237000000,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,2018,134.0,20070516.0,60000000,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,2018,149.0,20070516.0,60000000,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [931]:
y.head()

0    936662225
1    858373000
2    760507625
3    700426566
4    678815482
Name: lifetime_gross, dtype: int64