# Exporting American Movie Box Office Hits 

### boxofficemojo.com exploratory data analysis

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import re
import dateutil.parser
import time
import random


In [2]:
# movies_df = pd.read_pickle('adaptation_movies_df.pkl')
df = pd.read_csv('adaptation_movies_df.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   link_stub                  901 non-null    object 
 1   rank                       901 non-null    int64  
 2   movie_title                901 non-null    object 
 3   lifetime_gross             901 non-null    object 
 4   max_theaters               901 non-null    object 
 5   opening_weekend            901 non-null    object 
 6   opening_theathers          901 non-null    object 
 7   release_date_x             901 non-null    object 
 8   distributor_x              901 non-null    object 
 9   domestic_total_gross       901 non-null    int64  
 10  international_total_gross  901 non-null    int64  
 11  worldwide_total_gross      901 non-null    int64  
 12  domestic_opening           901 non-null    int64  
 13  budget                     901 non-null    int64  

In [4]:
# drop duplicate columns 
df.drop(columns=['lifetime_gross',
                        'opening_weekend',
                        'release_date_x',
                        'distributor_x'],inplace=True)
df.head(2)

Unnamed: 0,link_stub,rank,movie_title,max_theaters,opening_theathers,domestic_total_gross,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date_y,runtime,distributor_y,genres,rating
0,/title/tt0115433/?ref_=bo_ge_table_49,149,101 Dalmatians,2901,2794,136189294,184500000,320689294,33504025,0,1996-11-27,103.0,Walt Disney Studios Motion Pictures,"Adventure, Comedy, Crime, Family",
1,/title/tt0211181/?ref_=bo_ge_table_35,335,102 Dalmatians,2704,2704,66957026,116654745,183611771,19883351,85000000,2000-11-22,100.0,Walt Disney Studios Motion Pictures,"Adventure, Comedy, Family",


In [5]:
# rename columns
df = df.rename({'distributor_y':'distributor', 'release_date_y':'release_date'}, axis=1)

df.head(2)

Unnamed: 0,link_stub,rank,movie_title,max_theaters,opening_theathers,domestic_total_gross,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating
0,/title/tt0115433/?ref_=bo_ge_table_49,149,101 Dalmatians,2901,2794,136189294,184500000,320689294,33504025,0,1996-11-27,103.0,Walt Disney Studios Motion Pictures,"Adventure, Comedy, Crime, Family",
1,/title/tt0211181/?ref_=bo_ge_table_35,335,102 Dalmatians,2704,2704,66957026,116654745,183611771,19883351,85000000,2000-11-22,100.0,Walt Disney Studios Motion Pictures,"Adventure, Comedy, Family",


In [6]:
# strip commas 
df = df.replace(',','', regex=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 901 entries, 0 to 900
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   link_stub                  901 non-null    object 
 1   rank                       901 non-null    int64  
 2   movie_title                901 non-null    object 
 3   max_theaters               901 non-null    object 
 4   opening_theathers          901 non-null    object 
 5   domestic_total_gross       901 non-null    int64  
 6   international_total_gross  901 non-null    int64  
 7   worldwide_total_gross      901 non-null    int64  
 8   domestic_opening           901 non-null    int64  
 9   budget                     901 non-null    int64  
 10  release_date               901 non-null    object 
 11  runtime                    888 non-null    float64
 12  distributor                899 non-null    object 
 13  genres                     901 non-null    object 

In [7]:
df.head(2)

Unnamed: 0,link_stub,rank,movie_title,max_theaters,opening_theathers,domestic_total_gross,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating
0,/title/tt0115433/?ref_=bo_ge_table_49,149,101 Dalmatians,2901,2794,136189294,184500000,320689294,33504025,0,1996-11-27,103.0,Walt Disney Studios Motion Pictures,Adventure Comedy Crime Family,
1,/title/tt0211181/?ref_=bo_ge_table_35,335,102 Dalmatians,2704,2704,66957026,116654745,183611771,19883351,85000000,2000-11-22,100.0,Walt Disney Studios Motion Pictures,Adventure Comedy Family,


In [None]:
# change theather data to int32

# df.astype({'max_theaters': 'int64'}).dtypes
# df.astype({'opening_theathers': 'int64'}).dtypes

In [8]:
# sort by domestic_total_gross instead of rank
df.sort_values('domestic_total_gross', ascending=False, inplace=True)
df.head(10)

Unnamed: 0,link_stub,rank,movie_title,max_theaters,opening_theathers,domestic_total_gross,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating
702,/title/tt6105098/?ref_=bo_ge_table_6,6,The Lion King,4802,4725,543638043,1119261396,1662899439,191770759,260000000,2019-07-11,118.0,Walt Disney Studios Motion Pictures,Adventure Animation Drama Family Musical,PG
637,/title/tt0468569/?ref_=bo_ge_table_7,7,The Dark Knight,4366,4366,534987076,471115201,1006102277,158411483,185000000,2008-07-16,152.0,Warner Bros.,Action Crime Drama Thriller,PG-13
78,/title/tt2771200/?ref_=bo_ge_table_8,8,Beauty and the Beast,4210,4210,504481165,769095055,1273576220,174750616,160000000,2017-03-16,129.0,Walt Disney Studios Motion Pictures,Adventure Family Fantasy Musical Romance,PG
60,/title/tt2395427/?ref_=bo_ge_table_9,9,Avengers: Age of Ultron,4276,4276,459005868,943803672,1402809540,191271109,250000000,2015-04-22,141.0,Walt Disney Studios Motion Pictures,Action Adventure Sci-Fi,PG-13
638,/title/tt1345836/?ref_=bo_ge_table_10,10,The Dark Knight Rises,4404,4404,448149584,633003513,1081153097,160887295,250000000,2012-07-19,164.0,Warner Bros.,Action Crime Drama,PG-13
120,/title/tt4154664/?ref_=bo_ge_table_11,11,Captain Marvel,4310,4310,426829839,701633133,1128462972,153433423,160000000,2019-03-06,123.0,Walt Disney Studios Motion Pictures,Action Adventure Sci-Fi,PG-13
681,/title/tt1951264/?ref_=bo_ge_table_12,12,The Hunger Games: Catching Fire,4163,4163,424668047,440343699,865011746,158074286,130000000,2013-11-15,146.0,Lionsgate,Action Adventure Drama Sci-Fi Thriller,PG-13
893,/title/tt0451279/?ref_=bo_ge_table_13,13,Wonder Woman,4165,4165,412845172,410009114,822854286,103251471,149000000,2017-05-30,141.0,Warner Bros.,Action Adventure Fantasy Sci-Fi War,PG-13
324,/title/tt1300854/?ref_=bo_ge_table_14,14,Iron Man 3,4253,4253,409013994,805797258,1214811252,174144585,200000000,2013-04-24,130.0,Walt Disney Studios Motion Pictures,Action Adventure Sci-Fi,PG-13
117,/title/tt3498820/?ref_=bo_ge_table_15,15,Captain America: Civil War,4226,4226,408084349,745253147,1153337496,179139142,250000000,2016-04-27,147.0,Walt Disney Studios Motion Pictures,Action Adventure Sci-Fi,PG-13


In [None]:
# # shift column 'Name' to first position
# first_column = df.pop('Name')
  
# # insert column using insert(position,column_name,
# # first_column) function
# df.insert(0, 'Name', first_column)
  
# print()
# print("After Shifting column to first position")
# display(df)

In [9]:
# movie link_stub to last column
link_stub_col = df.pop('link_stub')
df.insert(14, 'link_stub', link_stub_col)


In [10]:
domestic_total_gross_col = df.pop('domestic_total_gross')
df.insert(0, 'domestic_total_gross', domestic_total_gross_col)
df.head(2)

Unnamed: 0,domestic_total_gross,rank,movie_title,max_theaters,opening_theathers,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating,link_stub
702,543638043,6,The Lion King,4802,4725,1119261396,1662899439,191770759,260000000,2019-07-11,118.0,Walt Disney Studios Motion Pictures,Adventure Animation Drama Family Musical,PG,/title/tt6105098/?ref_=bo_ge_table_6
637,534987076,7,The Dark Knight,4366,4366,471115201,1006102277,158411483,185000000,2008-07-16,152.0,Warner Bros.,Action Crime Drama Thriller,PG-13,/title/tt0468569/?ref_=bo_ge_table_7


In [11]:
df.reset_index(drop=True)

Unnamed: 0,domestic_total_gross,rank,movie_title,max_theaters,opening_theathers,international_total_gross,worldwide_total_gross,domestic_opening,budget,release_date,runtime,distributor,genres,rating,link_stub
0,543638043,6,The Lion King,4802,4725,1119261396,1662899439,191770759,260000000,2019-07-11,118.0,Walt Disney Studios Motion Pictures,Adventure Animation Drama Family Musical,PG,/title/tt6105098/?ref_=bo_ge_table_6
1,534987076,7,The Dark Knight,4366,4366,471115201,1006102277,158411483,185000000,2008-07-16,152.0,Warner Bros.,Action Crime Drama Thriller,PG-13,/title/tt0468569/?ref_=bo_ge_table_7
2,504481165,8,Beauty and the Beast,4210,4210,769095055,1273576220,174750616,160000000,2017-03-16,129.0,Walt Disney Studios Motion Pictures,Adventure Family Fantasy Musical Romance,PG,/title/tt2771200/?ref_=bo_ge_table_8
3,459005868,9,Avengers: Age of Ultron,4276,4276,943803672,1402809540,191271109,250000000,2015-04-22,141.0,Walt Disney Studios Motion Pictures,Action Adventure Sci-Fi,PG-13,/title/tt2395427/?ref_=bo_ge_table_9
4,448149584,10,The Dark Knight Rises,4404,4404,633003513,1081153097,160887295,250000000,2012-07-19,164.0,Warner Bros.,Action Crime Drama,PG-13,/title/tt1345836/?ref_=bo_ge_table_10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,2638,960,Come Out and Play,10,10,269097,271735,1600,0,2013-03-22,105.0,Cinedigm Entertainment Group,Horror,R,/title/tt2341664/?ref_=bo_ge_table_60
897,1519,961,Postman Pat: The Movie,2,2,7100936,7102455,1254,0,2014-05-23,88.0,Shout! Factory,Animation Comedy Family,,/title/tt2062622/?ref_=bo_ge_table_61
898,894,962,Billy and Buddy,2,-,17399159,17400053,0,0,2013-02-27,82.0,Distrib Films,Comedy Family,,/title/tt2538654/?ref_=bo_ge_table_62
899,809,963,I Spit on Your Grave 2,1,1,677795,678604,441,0,2013-09-05,106.0,Anchor Bay Films,Horror Thriller,R,/title/tt2537176/?ref_=bo_ge_table_63


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 901 entries, 702 to 44
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   domestic_total_gross       901 non-null    int64  
 1   rank                       901 non-null    int64  
 2   movie_title                901 non-null    object 
 3   max_theaters               901 non-null    object 
 4   opening_theathers          901 non-null    object 
 5   international_total_gross  901 non-null    int64  
 6   worldwide_total_gross      901 non-null    int64  
 7   domestic_opening           901 non-null    int64  
 8   budget                     901 non-null    int64  
 9   release_date               901 non-null    object 
 10  runtime                    888 non-null    float64
 11  distributor                899 non-null    object 
 12  genres                     901 non-null    object 
 13  rating                     734 non-null    object

In [13]:
# picke df 
clean_df = df 
clean_df.to_pickle('clean_df.pkl')


In [14]:
# save to csv
clean_df.to_csv(r'/Users/sandraparedes/Dropbox/Mac/Downloads/clean_df.csv', index=False)
