1. BUSINESS UNDERSTANDING

**Goal:** Provide the head of a new movie studio with actionable, data-driven recommendations that guide initial film-production choices to maximize worldwide box-office success.

**Key Questions:**

- Which film genres yield the highest average revenue?

- How does audience reception (IMDb rating) correlate with financial success?

- Is there an optimal film runtime that maximizes gross earnings?

In [36]:
# Standard library
import sqlite3

# Third-party: data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# `import scipy as stats` / `import scipy as norm` merely aliased the top-level
# scipy package under two names; import the statistics submodule and the normal
# distribution object so that `stats` and `norm` refer to what their names say.
from scipy import stats
from scipy.stats import norm

In [37]:
# Open the SQLite database file that the queries below run against.
conn = sqlite3.connect('im.db')

# Load the movie gross figures from CSV into a DataFrame.
df = pd.read_csv('bom.movie_gross.csv')

In [38]:
# Store the CSV-loaded frame as the "revenue" table so it can be joined in SQL;
# an existing table of that name is replaced and the DataFrame index is not written.
df.to_sql(name="revenue", con=conn, if_exists="replace", index=False)

In [39]:
# List the name of every table recorded in sqlite_master for this database.
query_1 = """
SELECT name
FROM sqlite_master
WHERE TYPE='table'


"""

pd.read_sql_query(sql=query_1, con=conn)

Unnamed: 0,name
0,movie_basics
1,directors
2,known_for
3,movie_akas
4,movie_ratings
5,persons
6,principals
7,writers
8,movies_from_csv
9,bom.movie_gross.csv


In [40]:
# Write the same DataFrame a second time, under a table named after the CSV file.
# An existing table of that name is replaced; the DataFrame index is not written.
# NOTE(review): this duplicates the contents of the "revenue" table created above,
# and the queries visible below only read "revenue" - confirm this copy is still needed.
df.to_sql(
    name="bom.movie_gross.csv",
    con=conn,
    if_exists="replace",
    index=False,
)

In [41]:
# Read back every row and column of the "revenue" table to check the load.
query_2 = """
SELECT *
FROM revenue

;

"""

pd.read_sql_query(sql=query_2, con=conn)



Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010
...,...,...,...,...,...
3382,The Quake,Magn.,6200.0,,2018
3383,Edward II (2018 re-release),FM,4800.0,,2018
3384,El Pacto,Sony,2500.0,,2018
3385,The Swan,Synergetic,2400.0,,2018


In [42]:
# Combine the three sources into one frame:
#   revenue      -> movie_basics  on exact title == primary_title
#   movie_basics -> movie_ratings on movie_id
# SELECT * keeps movie_id from both movie_basics and movie_ratings, so the
# result carries two columns with that name.
# NOTE(review): matching on title text alone may pair different films that
# share a title - confirm whether the release year should also be matched.
query_2 = """
SELECT DISTINCT *
FROM revenue as r 
JOIN  movie_basics as mb 
ON r.title = mb.primary_title 
JOIN movie_ratings as mr
on mr.movie_id=mb.movie_id


;

"""

df1 = pd.read_sql_query(sql=query_2, con=conn)


In [43]:
# Save the joined result to final.csv (without the DataFrame index) so it can be reloaded from disk.
df1.to_csv("final.csv", index=False) # write the joined frame to CSV

In [44]:
# Reload the joined data from final.csv. The file has two columns named
# movie_id; after reading, the second one appears as "movie_id.1".
df_final=pd.read_csv('final.csv')
# Preview the first rows to confirm the join produced the expected columns.
df_final.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,movie_id,primary_title,original_title,start_year,runtime_minutes,genres,movie_id.1,averagerating,numvotes
0,Toy Story 3,BV,415000000.0,652000000,2010,tt0435761,Toy Story 3,Toy Story 3,2010,103.0,"Adventure,Animation,Comedy",tt0435761,8.3,682218
1,Inception,WB,292600000.0,535700000,2010,tt1375666,Inception,Inception,2010,148.0,"Action,Adventure,Sci-Fi",tt1375666,8.8,1841066
2,Shrek Forever After,P/DW,238700000.0,513900000,2010,tt0892791,Shrek Forever After,Shrek Forever After,2010,93.0,"Adventure,Animation,Comedy",tt0892791,6.3,167532
3,The Twilight Saga: Eclipse,Sum.,300500000.0,398000000,2010,tt1325004,The Twilight Saga: Eclipse,The Twilight Saga: Eclipse,2010,124.0,"Adventure,Drama,Fantasy",tt1325004,5.0,211733
4,Iron Man 2,Par.,312400000.0,311500000,2010,tt1228705,Iron Man 2,Iron Man 2,2010,124.0,"Action,Adventure,Sci-Fi",tt1228705,7.0,657690


In [45]:
# Inspect column dtypes and non-null counts of the joined frame.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3027 entries, 0 to 3026
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            3027 non-null   object 
 1   studio           3024 non-null   object 
 2   domestic_gross   3005 non-null   float64
 3   foreign_gross    1832 non-null   object 
 4   year             3027 non-null   int64  
 5   movie_id         3027 non-null   object 
 6   primary_title    3027 non-null   object 
 7   original_title   3027 non-null   object 
 8   start_year       3027 non-null   int64  
 9   runtime_minutes  2980 non-null   float64
 10  genres           3020 non-null   object 
 11  movie_id.1       3027 non-null   object 
 12  averagerating    3027 non-null   float64
 13  numvotes         3027 non-null   int64  
dtypes: float64(3), int64(3), object(8)
memory usage: 331.2+ KB


In [46]:
# Columns to discard after the join: 'movie_id.1' is the second movie_id column
# and 'primary_title' equals 'title' by the join condition; 'start_year' and
# 'original_title' are removed as well.
columns_drop = ['movie_id.1', 'start_year', 'primary_title', 'original_title']

# Rebind the name instead of mutating with inplace=True, and ignore labels that
# are already gone, so re-running this cell does not raise KeyError.
df_final = df_final.drop(columns=columns_drop, errors="ignore")


In [47]:
# Re-check the schema to confirm which columns remain.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3027 entries, 0 to 3026
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            3027 non-null   object 
 1   studio           3024 non-null   object 
 2   domestic_gross   3005 non-null   float64
 3   foreign_gross    1832 non-null   object 
 4   year             3027 non-null   int64  
 5   movie_id         3027 non-null   object 
 6   runtime_minutes  2980 non-null   float64
 7   genres           3020 non-null   object 
 8   averagerating    3027 non-null   float64
 9   numvotes         3027 non-null   int64  
dtypes: float64(3), int64(2), object(5)
memory usage: 236.6+ KB


In [None]:
# Summary statistics for the numeric columns. foreign_gross does not appear
# in the result because it is stored with object dtype rather than as a number.
df_final.describe() ## checking on the continuous variables

Unnamed: 0,domestic_gross,year,runtime_minutes,averagerating,numvotes
count,3005.0,3027.0,2980.0,3027.0,3027.0
mean,30640330.0,2014.077635,107.217114,6.457582,61700.3
std,66716290.0,2.442245,20.073886,1.012277,125513.2
min,100.0,2010.0,3.0,1.6,5.0
25%,139000.0,2012.0,94.0,5.9,2117.0
50%,2000000.0,2014.0,105.0,6.6,13109.0
75%,32500000.0,2016.0,118.0,7.1,62765.5
max,700100000.0,2018.0,272.0,9.2,1841066.0
