In [155]:
import pickle
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import seaborn as sns

In [156]:
# Import all the dataframes
# Each frame is read from a pickle file given by a relative path, so the files
# must sit in the kernel's current working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load .pkl files
# that come from a trusted source.
mojo_df = pd.read_pickle('mojo_df.pkl')
youtube_df = pd.read_pickle('youtube_df.pkl')
omdb_df = pd.read_pickle('omdb_df.pkl')
director_df = pd.read_pickle('director_df.pkl')
actor_df = pd.read_pickle('actor_df.pkl')

In [157]:
# Clean mojo df: strip "$" signs, thousands separators and surrounding
# whitespace from the string columns, then convert them to numbers.
# The pattern is a raw string holding a character class: writing "\$" inside a
# plain string literal is an invalid escape sequence that Python warns about.
mojo_df.opening = pd.to_numeric(
    mojo_df.opening.replace(r"[$,]", "", regex=True).str.strip()
)
mojo_df.tot_gross = pd.to_numeric(
    mojo_df.tot_gross.replace(r"[$,]", "", regex=True).str.strip()
)
# theaters only has thousands separators removed before conversion
mojo_df.theaters = pd.to_numeric(
    mojo_df.theaters.replace(",", "", regex=True).str.strip()
)
mojo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
movie_name      500 non-null object
movie           500 non-null object
opening         500 non-null int64
date_time       500 non-null datetime64[ns]
release_date    500 non-null object
theaters        500 non-null int64
tot_gross       500 non-null int64
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 27.4+ KB


In [158]:
# Clean youtube df in one chain:
#   1. keep only the movie name and the four counter columns
#   2. replace nulls with 0
#   3. convert the counter columns from str to numeric
#   4. give the counters short "Y"-prefixed names
youtube_df = (
    youtube_df[["movie_name", "viewCount", "commentCount", "dislikeCount", "likeCount"]]
    .fillna(0)
    .assign(
        viewCount=lambda frame: pd.to_numeric(frame["viewCount"]),
        dislikeCount=lambda frame: pd.to_numeric(frame["dislikeCount"]),
        commentCount=lambda frame: pd.to_numeric(frame["commentCount"]),
        likeCount=lambda frame: pd.to_numeric(frame["likeCount"]),
    )
    .rename(columns={
        "viewCount": "Yviews",
        "commentCount": "Ycomments",
        "dislikeCount": "Ydislikes",
        "likeCount": "Ylikes",
    })
)
youtube_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
movie_name    500 non-null object
Yviews        500 non-null int64
Ycomments     500 non-null int64
Ydislikes     500 non-null int64
Ylikes        500 non-null int64
dtypes: int64(4), object(1)
memory usage: 19.6+ KB


In [159]:
# Clean omdb data: remove the "/10", "/100", "%" and "min" suffixes from the
# rating/runtime strings, trim whitespace, and convert each column to numeric.
omdb_df.imdb = pd.to_numeric(omdb_df.imdb.replace("/10", "", regex=True).str.strip())
omdb_df.metacritic = pd.to_numeric(omdb_df.metacritic.replace("/100", "", regex=True).str.strip())
omdb_df.rotten_tomatoes = pd.to_numeric(omdb_df.rotten_tomatoes.replace("%", "", regex=True).str.strip())
omdb_df.runtime = pd.to_numeric(omdb_df.runtime.replace("min", "", regex=True).str.strip())
omdb_df.year = pd.to_numeric(omdb_df.year)
# Remove 11 movies that have been wrongly identified on omdb (wrong year)
year = [2013, 2014, 2015, 2016, 2017]
# .copy() detaches the filtered rows from the original frame; without it the
# column assignments made on omdb_df further down (actors / director) operate
# on a slice and raise SettingWithCopyWarning.
omdb_df = omdb_df[omdb_df.year.isin(year)].copy()
omdb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 469 entries, 0 to 479
Data columns (total 9 columns):
actors             469 non-null object
director           469 non-null object
imdb               469 non-null float64
metacritic         469 non-null int64
movie              469 non-null object
rated              469 non-null object
rotten_tomatoes    469 non-null int64
runtime            469 non-null int64
year               469 non-null int64
dtypes: float64(1), int64(4), object(4)
memory usage: 36.6+ KB


In [160]:
# Clean directors data
# Convert str to numeric. Both gross columns drop "$" and thousands separators
# (the same cleaning the actor gross columns receive); previously dir_agross
# kept its commas, so a value such as "$1,234.5" would fail to convert.
# The raw-string character class also avoids the invalid "\$" escape sequence.
director_df.dir_agross = pd.to_numeric(
    director_df.dir_agross.replace(r"[$,]", "", regex=True).str.strip()
)
director_df.dir_gross = pd.to_numeric(
    director_df.dir_gross.replace(r"[$,]", "", regex=True).str.strip()
)
director_df.dir_nmovies = pd.to_numeric(director_df.dir_nmovies)
director_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
dir_agross     500 non-null float64
dir_gross      500 non-null float64
dir_nmovies    500 non-null int64
director       500 non-null object
dtypes: float64(2), int64(1), object(1)
memory usage: 15.7+ KB


In [161]:
# Clean actors data
# Convert str to numeric: drop "$" and thousands separators, trim whitespace.
# The pattern is a raw-string character class because "\$" inside a plain
# string literal is an invalid escape sequence that Python warns about.
actor_df.act_agross = pd.to_numeric(
    actor_df.act_agross.replace(r"[$,]", "", regex=True).str.strip()
)
actor_df.act_gross = pd.to_numeric(
    actor_df.act_gross.replace(r"[$,]", "", regex=True).str.strip()
)
actor_df.act_nmovies = pd.to_numeric(actor_df.act_nmovies)
actor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
act_agross     500 non-null float64
act_gross      500 non-null float64
act_nmovies    500 non-null int64
actor          500 non-null object
dtypes: float64(2), int64(1), object(1)
memory usage: 15.7+ KB


In [162]:
# Preview the first five rows of the cleaned actor data
actor_df.head(n=5)

Unnamed: 0,act_agross,act_gross,act_nmovies,actor
0,70.5,5149.1,73,Samuel L. Jackson
1,118.2,4963.8,42,Harrison Ford
2,96.0,4605.6,48,Tom Hanks
3,71.8,4522.2,63,Morgan Freeman
4,166.7,4333.5,26,Andy Serkis


In [163]:
# Turn the comma-separated actor and director strings into lists of names,
# with the whitespace around every name stripped.
omdb_df["actors"] = omdb_df["actors"].apply(
    lambda names: list(map(str.strip, names.split(',')))
)
omdb_df["director"] = omdb_df["director"].apply(
    lambda names: list(map(str.strip, names.split(',')))
)
omdb_df.head()

Unnamed: 0,actors,director,imdb,metacritic,movie,rated,rotten_tomatoes,runtime,year
0,"[Jennifer Lawrence, Liam Hemsworth, Jack Quaid...",[Francis Lawrence],7.5,76,The Hunger Games: Catching Fire,PG-13,89,146,2013
1,"[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",[Shane Black],7.2,62,Iron Man 3,PG-13,80,130,2013
2,"[Kristen Bell, Idina Menzel, Jonathan Groff, J...","[Chris Buck, Jennifer Lee]",7.5,74,Frozen,PG,90,102,2013
3,"[Steve Carell, Kristen Wiig, Benjamin Bratt, M...","[Pierre Coffin, Chris Renaud]",7.4,62,Despicable Me 2,PG,74,98,2013
4,"[Henry Cavill, Amy Adams, Michael Shannon, Dia...",[Zack Snyder],7.1,55,Man of Steel,PG-13,55,143,2013


In [164]:
# Five actors with the largest act_gross, highest first
actor_df.sort_values("act_gross", ascending=False).iloc[:5]

Unnamed: 0,act_agross,act_gross,act_nmovies,actor
0,70.5,5149.1,73,Samuel L. Jackson
1,118.2,4963.8,42,Harrison Ford
2,96.0,4605.6,48,Tom Hanks
3,71.8,4522.2,63,Morgan Freeman
4,166.7,4333.5,26,Andy Serkis


# Merge the dataframes

In [165]:
# Merge mojo and youtube: inner join on the shared movie_name column
merged = mojo_df.merge(youtube_df, on='movie_name', how='inner')
# Drop the release_date column. The axis is passed by keyword because the
# positional form drop(labels, 1) was deprecated and later removed from
# pandas, where it raises a TypeError.
merged = merged.drop(["release_date"], axis=1)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 10 columns):
movie_name    500 non-null object
movie         500 non-null object
opening       500 non-null int64
date_time     500 non-null datetime64[ns]
theaters      500 non-null int64
tot_gross     500 non-null int64
Yviews        500 non-null int64
Ycomments     500 non-null int64
Ydislikes     500 non-null int64
Ylikes        500 non-null int64
dtypes: datetime64[ns](1), int64(7), object(2)
memory usage: 43.0+ KB


In [166]:
# Merge omdb: inner join of the mojo/youtube frame with omdb_df on the "movie" column
merged2 = merged.merge(omdb_df, how="inner", on="movie")

In [167]:
# Lookup table pairing each release year with its movie ticket value
year = [2013, 2014, 2015, 2016, 2017]
tick = [8.13, 8.17, 8.43, 8.65, 8.97]
adjuster = pd.DataFrame({"year": year, "tick": tick})

# Attach the matching "tick" value to every row of merged2 via an inner join on year
merged2 = merged2.merge(adjuster, how="inner", on="year")

# est_tick = opening gross floor-divided by the ticket value, stored as int64
# and placed as the fourth column; "opening" is then renamed to "op_gross".
est_tick = merged2["opening"].floordiv(merged2["tick"]).astype("int64")
merged2.insert(3, "est_tick", est_tick)
merged2 = merged2.rename(columns={"opening": "op_gross"})

In [168]:
# Display merged2 as the cell output to inspect the combined columns
# (movie info, YouTube counters, omdb ratings, year and tick).
merged2

Unnamed: 0,movie_name,movie,op_gross,est_tick,date_time,theaters,tot_gross,Yviews,Ycomments,Ydislikes,Ylikes,actors,director,imdb,metacritic,rated,rotten_tomatoes,runtime,year,tick
0,The Hunger Games: Catching Fire,The Hunger Games: Catching Fire,158074286,19443331,2013-11-22,4163,424668047,12389287,9493,1385,52030,"[Jennifer Lawrence, Liam Hemsworth, Jack Quaid...",[Francis Lawrence],7.5,76,PG-13,89,146,2013,8.13
1,Iron Man 3,Iron Man 3,174144585,21419998,2013-05-03,4253,409013994,1274074,1643,259,7201,"[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",[Shane Black],7.2,62,PG-13,80,130,2013,8.13
2,Frozen,Frozen,243390,29937,2013-11-22,1,400738009,34267342,7458,8664,54157,"[Kristen Bell, Idina Menzel, Jonathan Groff, J...","[Chris Buck, Jennifer Lee]",7.5,74,PG,90,102,2013,8.13
3,Despicable Me 2,Despicable Me 2,83517315,10272732,2013-07-03,3997,368061265,7725854,2146,1733,17764,"[Steve Carell, Kristen Wiig, Benjamin Bratt, M...","[Pierre Coffin, Chris Renaud]",7.4,62,PG,74,98,2013,8.13
4,Man of Steel,Man of Steel,116619362,14344324,2013-06-14,4207,291045518,42683390,66568,6511,155344,"[Henry Cavill, Amy Adams, Michael Shannon, Dia...",[Zack Snyder],7.1,55,PG-13,55,143,2013,8.13
5,Gravity,Gravity,55785112,6861637,2013-10-04,3575,274092705,15729744,13123,3679,32653,"[Sandra Bullock, George Clooney, Ed Harris, Or...",[Alfonso Cuarón],7.8,96,PG-13,96,91,2013,8.13
6,Monsters University,Monsters University,82429469,10138926,2013-06-21,4004,268492764,3375218,1290,695,8927,"[Billy Crystal, John Goodman, Steve Buscemi, H...",[Dan Scanlon],7.3,65,G,79,104,2013,8.13
7,The Hobbit: The Desolation of Smaug,The Hobbit: The Desolation of Smaug,73645197,9058449,2013-12-13,3903,258366855,14097771,17057,1629,60932,"[Ian McKellen, Martin Freeman, Richard Armitag...",[Peter Jackson],7.9,66,PG-13,74,161,2013,8.13
8,Fast & Furious 6,Fast & Furious 6,97375245,11977274,2013-05-24,3658,238679850,26499303,17041,3415,78973,"[Vin Diesel, Paul Walker, Dwayne Johnson, Jord...",[Justin Lin],7.1,61,PG-13,69,130,2013,8.13
9,Oz The Great and Powerful,Oz The Great and Powerful,79110453,9730683,2013-03-08,3912,234911825,1456121,893,179,3911,"[James Franco, Mila Kunis, Rachel Weisz, Miche...",[Sam Raimi],6.3,44,PG,59,130,2013,8.13
