In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 12
# Disable warnings in Anaconda
import warnings
warnings.simplefilter('ignore')
# We will display plots right inside Jupyter Notebook
%matplotlib inline
import matplotlib.pyplot as plt
# We will use the Seaborn library
import seaborn as sns
sns.set()
# Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'
# Increase the default plot size
from pylab import rcParams
rcParams['figure.figsize'] = 5, 4

In [2]:
df = pd.read_csv('box_office_predictions.csv')

In [20]:
df2 = df

In [21]:
df2.head()

Unnamed: 0,budget,country,director,genre,gross,name,rating,runtime,score,star,studio,votes
0,237000000.0,UK,James Cameron,Action,760507625.0,Avatar (2009),PG-13,162,7.8,Sam Worthington,Twentieth Century Fox Film Corporation,958400
1,200000000.0,USA,James Cameron,Drama,658672302.0,Titanic (1997),PG-13,194,7.8,Leonardo DiCaprio,Twentieth Century Fox Film Corporation,865551
2,150000000.0,USA,Colin Trevorrow,Action,652270625.0,Jurassic World (2015),PG-13,124,7.0,Chris Pratt,Universal Pictures,470625
3,220000000.0,USA,Joss Whedon,Action,623357910.0,The Avengers (2012),PG-13,143,8.1,Robert Downey Jr.,Marvel Studios,1069292
4,185000000.0,USA,Christopher Nolan,Action,534858444.0,The Dark Knight (2008),PG-13,152,9.0,Christian Bale,Warner Bros.,1845853


In [38]:
#create new column called outcome to calculate the financial outcome of the movie
col = df2.apply(lambda row: row.gross - row.budget, axis=1)
df2 = df2.assign(outcome=col.values)

In [39]:
df2.head()

Unnamed: 0,budget,country,director,genre,gross,name,...,score,star,studio,votes,year,outcome
0,237000000.0,UK,James Cameron,Action,760507625.0,Avatar,...,7.8,Sam Worthington,Twentieth Century Fox Film Corporation,958400,2009,523507625.0
1,200000000.0,USA,James Cameron,Drama,658672302.0,Titanic,...,7.8,Leonardo DiCaprio,Twentieth Century Fox Film Corporation,865551,1997,458672302.0
2,150000000.0,USA,Colin Trevorrow,Action,652270625.0,Jurassic World,...,7.0,Chris Pratt,Universal Pictures,470625,2015,502270625.0
3,220000000.0,USA,Joss Whedon,Action,623357910.0,The Avengers,...,8.1,Robert Downey Jr.,Marvel Studios,1069292,2012,403357910.0
4,185000000.0,USA,Christopher Nolan,Action,534858444.0,The Dark Knight,...,9.0,Christian Bale,Warner Bros.,1845853,2008,349858444.0


In [26]:


#create a new col1 list from name column and remove non numeric objects
col1 = df2['name'].str.extract('(\d+)', expand=False)
#add new column called year from col1
df2 = df2.assign(year=col1.values)


In [41]:
df2.head()

Unnamed: 0,budget,country,director,genre,gross,name,...,score,star,studio,votes,year,outcome
0,237000000.0,UK,James Cameron,Action,760507625.0,Avatar,...,7.8,Sam Worthington,Twentieth Century Fox Film Corporation,958400,2009,523507625.0
1,200000000.0,USA,James Cameron,Drama,658672302.0,Titanic,...,7.8,Leonardo DiCaprio,Twentieth Century Fox Film Corporation,865551,1997,458672302.0
2,150000000.0,USA,Colin Trevorrow,Action,652270625.0,Jurassic World,...,7.0,Chris Pratt,Universal Pictures,470625,2015,502270625.0
3,220000000.0,USA,Joss Whedon,Action,623357910.0,The Avengers,...,8.1,Robert Downey Jr.,Marvel Studios,1069292,2012,403357910.0
4,185000000.0,USA,Christopher Nolan,Action,534858444.0,The Dark Knight,...,9.0,Christian Bale,Warner Bros.,1845853,2008,349858444.0


In [42]:
df3 = df2

In [43]:
#strip characters in  values from name
df3['name'] = df3['name'].str.replace('\([^)]*\)', '')

In [44]:
df3.head()

Unnamed: 0,budget,country,director,genre,gross,name,...,score,star,studio,votes,year,outcome
0,237000000.0,UK,James Cameron,Action,760507625.0,Avatar,...,7.8,Sam Worthington,Twentieth Century Fox Film Corporation,958400,2009,523507625.0
1,200000000.0,USA,James Cameron,Drama,658672302.0,Titanic,...,7.8,Leonardo DiCaprio,Twentieth Century Fox Film Corporation,865551,1997,458672302.0
2,150000000.0,USA,Colin Trevorrow,Action,652270625.0,Jurassic World,...,7.0,Chris Pratt,Universal Pictures,470625,2015,502270625.0
3,220000000.0,USA,Joss Whedon,Action,623357910.0,The Avengers,...,8.1,Robert Downey Jr.,Marvel Studios,1069292,2012,403357910.0
4,185000000.0,USA,Christopher Nolan,Action,534858444.0,The Dark Knight,...,9.0,Christian Bale,Warner Bros.,1845853,2008,349858444.0


In [49]:
df3.columns

Index(['budget', 'country', 'director', 'genre', 'gross', 'name', 'rating',
       'runtime', 'score', 'star', 'studio', 'votes', 'year', 'outcome'],
      dtype='object')

In [54]:
df3.describe()


Unnamed: 0,budget,gross,runtime,score,votes,outcome
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,24699180.0,33416350.0,106.587,6.386383,71885.37,8717169.0
std,37217100.0,57352050.0,18.026885,0.994921,130803.3,40178040.0
min,0.0,441.0,50.0,1.5,27.0,-176921900.0
25%,0.0,1527796.0,95.0,5.8,7791.75,-5121963.0
50%,11000000.0,12298970.0,102.0,6.5,26601.5,960212.5
75%,32625000.0,40072560.0,115.0,7.1,76774.75,14015080.0
max,300000000.0,760507600.0,366.0,9.3,1868308.0,523507600.0


In [55]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 14 columns):
budget      6000 non-null float64
country     6000 non-null object
director    6000 non-null object
genre       6000 non-null object
gross       6000 non-null float64
name        6000 non-null object
rating      6000 non-null object
runtime     6000 non-null int64
score       6000 non-null float64
star        6000 non-null object
studio      6000 non-null object
votes       6000 non-null int64
year        6000 non-null object
outcome     6000 non-null float64
dtypes: float64(4), int64(2), object(8)
memory usage: 656.3+ KB


In [57]:
df3.head()

Unnamed: 0,budget,country,director,genre,gross,name,...,score,star,studio,votes,year,outcome
0,237000000.0,UK,James Cameron,Action,760507625.0,Avatar,...,7.8,Sam Worthington,Twentieth Century Fox Film Corporation,958400,2009,523507625.0
1,200000000.0,USA,James Cameron,Drama,658672302.0,Titanic,...,7.8,Leonardo DiCaprio,Twentieth Century Fox Film Corporation,865551,1997,458672302.0
2,150000000.0,USA,Colin Trevorrow,Action,652270625.0,Jurassic World,...,7.0,Chris Pratt,Universal Pictures,470625,2015,502270625.0
3,220000000.0,USA,Joss Whedon,Action,623357910.0,The Avengers,...,8.1,Robert Downey Jr.,Marvel Studios,1069292,2012,403357910.0
4,185000000.0,USA,Christopher Nolan,Action,534858444.0,The Dark Knight,...,9.0,Christian Bale,Warner Bros.,1845853,2008,349858444.0


In [58]:
df3["year"] = df3.year.astype(float)

In [59]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 14 columns):
budget      6000 non-null float64
country     6000 non-null object
director    6000 non-null object
genre       6000 non-null object
gross       6000 non-null float64
name        6000 non-null object
rating      6000 non-null object
runtime     6000 non-null int64
score       6000 non-null float64
star        6000 non-null object
studio      6000 non-null object
votes       6000 non-null int64
year        6000 non-null float64
outcome     6000 non-null float64
dtypes: float64(5), int64(2), object(7)
memory usage: 656.3+ KB


In [69]:
df3.shape

(6000, 14)

In [60]:
df14 = df3[df3['year'] <= 2014]

In [70]:
df14.shape

(5634, 14)

In [71]:
dfgt14 = df3[df3['year'] >= 2014]

In [73]:
dfgt14.shape

(544, 14)