# Movie Analysis

> Business Problem: Microsoft has decided to create a new movie studio. Microsoft wants to know what types of films are currently doing the best at the box office to help decide what type of films to create.

## Load Data and Packages

In [1]:
# importing packages
import pandas as pd
# setting pandas display to avoid scientific notation
pd.options.display.float_format = '{:.2f}'.format
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [37]:
# Read all four raw datasets in one pass; the target names unpack in the same
# order as the paths listed below.
df_gross, df_titles, df_ratings, df_budget = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/bom.movie_gross.csv.gz',     # revenue numbers, title, and year
        'data/imdb.title.basics.csv.gz',   # tconst (ID), genres, primary_title, runtime_minutes
        'data/imdb.title.ratings.csv.gz',  # tconst (ID), ratings, numvotes
        'data/tn.movie_budgets.csv.gz',    # release_date, movie, production_budget, gross figures
    )
)

## Data Exploration

In [25]:
# Peek at the first five budget rows to see the columns and how money values are formatted
df_budget.head(n=5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [31]:
# Look up the budget record whose movie name is exactly 'Avatar'
df_budget[df_budget['movie'].eq('Avatar')]

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"


In [35]:
# Look up the budget record whose movie name is exactly 'Dark Phoenix'
df_budget[df_budget['movie'].eq('Dark Phoenix')]

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"


In [34]:
# Check which box-office titles contain 'Avatar' (compare against the budget data)
df_gross.loc[df_gross['title'].str.contains('Avatar')]


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
128,Avatar: Special Edition,Fox,10700000.0,22500000,2010


In [36]:
# Check which box-office titles contain 'Dark Phoenix' (compare against the budget data)
df_gross.loc[df_gross['title'].str.contains('Dark Phoenix')]


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year


In [3]:
# Peek at the first five box-office rows
df_gross.head(n=5)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [4]:
# Inspect column types: foreign_gross comes back as object rather than a numeric
# dtype, so it has to be converted before it can be added to domestic_gross.
df_gross.dtypes

title              object
studio             object
domestic_gross    float64
foreign_gross      object
year                int64
dtype: object

In [5]:
# Summary statistics for the numeric columns only; foreign_gross is absent here
# because it is still stored as object at this point.
df_gross.describe()

Unnamed: 0,domestic_gross,year
count,3359.0,3387.0
mean,28745845.07,2013.96
std,66982498.24,2.48
min,100.0,2010.0
25%,120000.0,2012.0
50%,1400000.0,2014.0
75%,27900000.0,2016.0
max,936700000.0,2018.0


In [20]:
# Peek at the first five IMDB title rows.
# The frame is loaded as `df_titles`; the old name `df_title` is undefined on a fresh kernel.
df_titles.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [7]:
# Summary statistics for the numeric IMDB title columns.
# Uses `df_titles` (the name assigned at load time); `df_title` is undefined on a fresh kernel.
# NOTE(review): the extreme maxima for start_year and runtime_minutes look like
# outliers / data-entry errors — confirm before using these columns in analysis.
df_titles.describe()

Unnamed: 0,start_year,runtime_minutes
count,146144.0,114405.0
mean,2014.62,86.19
std,2.73,166.36
min,2010.0,1.0
25%,2012.0,70.0
50%,2015.0,87.0
75%,2017.0,99.0
max,2115.0,51420.0


In [8]:
# Review normalized value counts of df_titles['genres'] to determine the main categories.
# Uses `df_titles` (the name assigned at load time); `df_title` is undefined on a fresh kernel.
df_titles['genres'].value_counts(normalize=True)


Documentary                  0.23
Drama                        0.15
Comedy                       0.07
Horror                       0.03
Comedy,Drama                 0.03
                             ... 
Action,Sport,War             0.00
Action,Adventure,Musical     0.00
Animation,Mystery,Thriller   0.00
Comedy,Sport,Western         0.00
Documentary,Horror,News      0.00
Name: genres, Length: 1085, dtype: float64

In [None]:

# Count plot of the most frequent genre labels.
# - Uses `df_titles` (the name assigned at load time); `df_title` is undefined on a fresh kernel.
# - The genres column holds over a thousand distinct combinations, so only the
#   top few are drawn to keep the axis readable.
# - The former `limit=5000` keyword was removed: it is not a valid text property
#   for an axis label and makes the call raise. Use `ax.set_ylim(...)` if an
#   axis cap is actually wanted.
top_genres = df_titles['genres'].value_counts().head(10).index

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='genres', data=df_titles, order=top_genres, ax=ax)
ax.set_title('Number of Movies Per Genre', fontsize=18)
ax.set_ylabel('Count', fontsize=16)
ax.set_xlabel('Genre', fontsize=16)
ax.tick_params(axis='x', rotation=45)  # long combined labels overlap when horizontal


In [10]:
# Peek at the first five IMDB rating rows
df_ratings.head(n=5)

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [11]:
# Summary statistics for ratings and vote counts.
# NOTE(review): numvotes looks heavily right-skewed (median far below the max),
# so ratings backed by very few votes may need filtering — confirm.
df_ratings.describe()

Unnamed: 0,averagerating,numvotes
count,73856.0,73856.0
mean,6.33,3523.66
std,1.47,30294.02
min,1.0,5.0
25%,5.5,14.0
50%,6.5,49.0
75%,7.4,282.0
max,10.0,1841066.0


In [12]:
# Inspect column types of the ratings frame before joining it to the titles on tconst
df_ratings.dtypes

tconst            object
averagerating    float64
numvotes           int64
dtype: object

## Data Preparation

### To-dos:
- df_gross
  - Convert foreign_gross revenue numbers to float so they can be manipulated as numbers
  - Create a total_gross column

- df_titles
  - Regroup genres which account for less than 2% of movies
  - Merge df_titles and df_ratings into the same df
 
- df_ratings
### Future Analysis Goals: 
- Revenues by genre
- Ratings by genre
- Revenues by genre
- Ratings by Runtime


In [13]:
# Remove commas from df_gross['foreign_gross'] and convert to float, so the gross
# columns can be combined.
# Casting to str first keeps this cell safe to re-run: once the column is already
# float, calling `.str` on it directly raises. Missing values survive the round
# trip and come back as NaN.
df_gross['foreign_gross'] = (
    df_gross['foreign_gross']
    .astype(str)
    .str.replace(',', '', regex=False)  # literal comma, not a regex pattern
    .astype(float)
)
df_gross.dtypes

title              object
studio             object
domestic_gross    float64
foreign_gross     float64
year                int64
dtype: object

In [18]:
# Add a total_gross column to df_gross: foreign plus domestic revenue.
# Rows missing either component get NaN for the total (no fill value is applied).
df_gross['total_gross'] = df_gross['foreign_gross'].add(df_gross['domestic_gross'])
df_gross.head(n=5)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,total_gross
0,Toy Story 3,BV,415000000.0,652000000.0,2010,1067000000.0
1,Alice in Wonderland (2010),BV,334200000.0,691300000.0,2010,1025500000.0
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000.0,2010,960300000.0
3,Inception,WB,292600000.0,535700000.0,2010,828300000.0
4,Shrek Forever After,P/DW,238700000.0,513900000.0,2010,752600000.0


In [38]:
# Join title metadata with ratings on the shared tconst ID.
# Inner join: only titles that have a ratings row are kept.
df_imbd = pd.merge(df_titles, df_ratings, how='inner', on='tconst')
df_imbd.head(n=5)

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,averagerating,numvotes
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",7.0,77
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama",7.2,43
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama,6.9,4517
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama",6.1,13
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy",6.5,119


## Data Analysis

## Summary