# Exporting American Movie Box Office Hits 

### boxofficemojo.com exploratory data analysis

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import re
import dateutil.parser
import time
import random


In [2]:
# movies_df = pd.read_pickle('adaptation_movies_df.pkl')
df = pd.read_csv('adaptation_movies_df.csv')

In [3]:
df.info()

# n = 901
# features = 19

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   link_stub                  901 non-null    object 
 1   rank                       901 non-null    int64  
 2   movie_title                901 non-null    object 
 3   lifetime_gross             901 non-null    object 
 4   max_theaters               901 non-null    object 
 5   opening_weekend            901 non-null    object 
 6   opening_theathers          901 non-null    object 
 7   release_date_x             901 non-null    object 
 8   distributor_x              901 non-null    object 
 9   domestic_total_gross       901 non-null    int64  
 10  international_total_gross  901 non-null    int64  
 11  worldwide_total_gross      901 non-null    int64  
 12  domestic_opening           901 non-null    int64  
 13  budget                     901 non-null    int64  

In [4]:
# drop duplicate columns 
df.drop(columns=['lifetime_gross',
                        'opening_weekend',
                        'release_date_x',
                        'distributor_x'],inplace=True)
df.head(2)

Unnamed: 0,link_stub,rank,movie_title,max_theaters,opening_theathers,domestic_total_gross,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date_y,runtime,distributor_y,genres,rating
0,/title/tt0115433/?ref_=bo_ge_table_49,149,101 Dalmatians,2901,2794,136189294,184500000,320689294,33504025,0,1996-11-27,103.0,Walt Disney Studios Motion Pictures,"Adventure, Comedy, Crime, Family",
1,/title/tt0211181/?ref_=bo_ge_table_35,335,102 Dalmatians,2704,2704,66957026,116654745,183611771,19883351,85000000,2000-11-22,100.0,Walt Disney Studios Motion Pictures,"Adventure, Comedy, Family",


In [5]:
# rename columns
df = df.rename({'distributor_y':'distributor', 'release_date_y':'release_date'}, axis=1)

df.head(2)

Unnamed: 0,link_stub,rank,movie_title,max_theaters,opening_theathers,domestic_total_gross,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating
0,/title/tt0115433/?ref_=bo_ge_table_49,149,101 Dalmatians,2901,2794,136189294,184500000,320689294,33504025,0,1996-11-27,103.0,Walt Disney Studios Motion Pictures,"Adventure, Comedy, Crime, Family",
1,/title/tt0211181/?ref_=bo_ge_table_35,335,102 Dalmatians,2704,2704,66957026,116654745,183611771,19883351,85000000,2000-11-22,100.0,Walt Disney Studios Motion Pictures,"Adventure, Comedy, Family",


In [6]:
# strip commas and dashes
df = df.replace(',','', regex=True)
df = df.replace('-', '', regex=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   link_stub                  901 non-null    object 
 1   rank                       901 non-null    int64  
 2   movie_title                901 non-null    object 
 3   max_theaters               901 non-null    object 
 4   opening_theathers          901 non-null    object 
 5   domestic_total_gross       901 non-null    int64  
 6   international_total_gross  901 non-null    int64  
 7   worldwide_total_gross      901 non-null    int64  
 8   domestic_opening           901 non-null    int64  
 9   budget                     901 non-null    int64  
 10  release_date               901 non-null    object 
 11  runtime                    888 non-null    float64
 12  distributor                899 non-null    object 
 13  genres                     901 non-null    object 

In [7]:
# change theather data to float
df["max_theaters"] = pd.to_numeric(df["max_theaters"], downcast="float")
df["opening_theathers"] = pd.to_numeric(df["opening_theathers"], downcast="float")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   link_stub                  901 non-null    object 
 1   rank                       901 non-null    int64  
 2   movie_title                901 non-null    object 
 3   max_theaters               892 non-null    float32
 4   opening_theathers          883 non-null    float32
 5   domestic_total_gross       901 non-null    int64  
 6   international_total_gross  901 non-null    int64  
 7   worldwide_total_gross      901 non-null    int64  
 8   domestic_opening           901 non-null    int64  
 9   budget                     901 non-null    int64  
 10  release_date               901 non-null    object 
 11  runtime                    888 non-null    float64
 12  distributor                899 non-null    object 
 13  genres                     901 non-null    object 

In [8]:
# shift link_stub to last column
link_stub_col = df.pop('link_stub')
df.insert(14, 'link_stub', link_stub_col)
df.head(2)

Unnamed: 0,rank,movie_title,max_theaters,opening_theathers,domestic_total_gross,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating,link_stub
0,149,101 Dalmatians,2901.0,2794.0,136189294,184500000,320689294,33504025,0,19961127,103.0,Walt Disney Studios Motion Pictures,Adventure Comedy Crime Family,,/title/tt0115433/?ref_=bo_ge_table_49
1,335,102 Dalmatians,2704.0,2704.0,66957026,116654745,183611771,19883351,85000000,20001122,100.0,Walt Disney Studios Motion Pictures,Adventure Comedy Family,,/title/tt0211181/?ref_=bo_ge_table_35


In [9]:
# shift domestic_total_gross to first column
domestic_total_gross_col = df.pop('domestic_total_gross')
df.insert(0, 'domestic_total_gross', domestic_total_gross_col)
df.head(2)

Unnamed: 0,domestic_total_gross,rank,movie_title,max_theaters,opening_theathers,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating,link_stub
0,136189294,149,101 Dalmatians,2901.0,2794.0,184500000,320689294,33504025,0,19961127,103.0,Walt Disney Studios Motion Pictures,Adventure Comedy Crime Family,,/title/tt0115433/?ref_=bo_ge_table_49
1,66957026,335,102 Dalmatians,2704.0,2704.0,116654745,183611771,19883351,85000000,20001122,100.0,Walt Disney Studios Motion Pictures,Adventure Comedy Family,,/title/tt0211181/?ref_=bo_ge_table_35


In [10]:
# shift international_total_gross to first column
international_total_gross_col = df.pop('international_total_gross')
df.insert(0, 'international_total_gross', international_total_gross_col)
df.head(2)

Unnamed: 0,international_total_gross,domestic_total_gross,rank,movie_title,max_theaters,opening_theathers,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating,link_stub
0,184500000,136189294,149,101 Dalmatians,2901.0,2794.0,320689294,33504025,0,19961127,103.0,Walt Disney Studios Motion Pictures,Adventure Comedy Crime Family,,/title/tt0115433/?ref_=bo_ge_table_49
1,116654745,66957026,335,102 Dalmatians,2704.0,2704.0,183611771,19883351,85000000,20001122,100.0,Walt Disney Studios Motion Pictures,Adventure Comedy Family,,/title/tt0211181/?ref_=bo_ge_table_35


In [11]:
# sort by domestic_total_gross instead of rank
df.sort_values('domestic_total_gross', ascending=False, inplace=True)
df.head(2)

Unnamed: 0,international_total_gross,domestic_total_gross,rank,movie_title,max_theaters,opening_theathers,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating,link_stub
702,1119261396,543638043,6,The Lion King,4802.0,4725.0,1662899439,191770759,260000000,20190711,118.0,Walt Disney Studios Motion Pictures,Adventure Animation Drama Family Musical,PG,/title/tt6105098/?ref_=bo_ge_table_6
637,471115201,534987076,7,The Dark Knight,4366.0,4366.0,1006102277,158411483,185000000,20080716,152.0,Warner Bros.,Action Crime Drama Thriller,PG13,/title/tt0468569/?ref_=bo_ge_table_7


In [12]:
df.reset_index(drop=True)

Unnamed: 0,international_total_gross,domestic_total_gross,rank,movie_title,max_theaters,opening_theathers,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating,link_stub
0,1119261396,543638043,6,The Lion King,4802.0,4725.0,1662899439,191770759,260000000,20190711,118.0,Walt Disney Studios Motion Pictures,Adventure Animation Drama Family Musical,PG,/title/tt6105098/?ref_=bo_ge_table_6
1,471115201,534987076,7,The Dark Knight,4366.0,4366.0,1006102277,158411483,185000000,20080716,152.0,Warner Bros.,Action Crime Drama Thriller,PG13,/title/tt0468569/?ref_=bo_ge_table_7
2,769095055,504481165,8,Beauty and the Beast,4210.0,4210.0,1273576220,174750616,160000000,20170316,129.0,Walt Disney Studios Motion Pictures,Adventure Family Fantasy Musical Romance,PG,/title/tt2771200/?ref_=bo_ge_table_8
3,943803672,459005868,9,Avengers: Age of Ultron,4276.0,4276.0,1402809540,191271109,250000000,20150422,141.0,Walt Disney Studios Motion Pictures,Action Adventure SciFi,PG13,/title/tt2395427/?ref_=bo_ge_table_9
4,633003513,448149584,10,The Dark Knight Rises,4404.0,4404.0,1081153097,160887295,250000000,20120719,164.0,Warner Bros.,Action Crime Drama,PG13,/title/tt1345836/?ref_=bo_ge_table_10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,269097,2638,960,Come Out and Play,10.0,10.0,271735,1600,0,20130322,105.0,Cinedigm Entertainment Group,Horror,R,/title/tt2341664/?ref_=bo_ge_table_60
897,7100936,1519,961,Postman Pat: The Movie,2.0,2.0,7102455,1254,0,20140523,88.0,Shout! Factory,Animation Comedy Family,,/title/tt2062622/?ref_=bo_ge_table_61
898,17399159,894,962,Billy and Buddy,2.0,,17400053,0,0,20130227,82.0,Distrib Films,Comedy Family,,/title/tt2538654/?ref_=bo_ge_table_62
899,677795,809,963,I Spit on Your Grave 2,1.0,1.0,678604,441,0,20130905,106.0,Anchor Bay Films,Horror Thriller,R,/title/tt2537176/?ref_=bo_ge_table_63


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 901 entries, 702 to 44
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   international_total_gross  901 non-null    int64  
 1   domestic_total_gross       901 non-null    int64  
 2   rank                       901 non-null    int64  
 3   movie_title                901 non-null    object 
 4   max_theaters               892 non-null    float32
 5   opening_theathers          883 non-null    float32
 6   worldwide_total_gross      901 non-null    int64  
 7   domestic_opening           901 non-null    int64  
 8   budget                     901 non-null    int64  
 9   release_date               901 non-null    object 
 10  runtime                    888 non-null    float64
 11  distributor                899 non-null    object 
 12  genres                     901 non-null    object 
 13  rating                     734 non-null    object

In [14]:
# picke df 
clean_df = df 
clean_df.to_pickle('clean_df.pkl')


In [15]:
# save to csv
clean_df.to_csv(r'/Users/sandraparedes/Dropbox/Mac/Downloads/clean_df.csv', index=False)
