# Movie Revenue Project Data Cleaning and Modeling Notebook

In [1]:
# Needed to pip install psycopg2
# pip install psycopg2-binary

In [2]:
# imports 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px
from sqlalchemy import create_engine

In [3]:
# Imports for modeling: 
import statsmodels.api as sm
# Need to import this to deal with missing data 
from sklearn.impute import SimpleImputer
# Need these for creating pipeline 
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Step 1: Connect to SQL server to get data

In [341]:
# Establish a connection using SQLAlchemy 

# Try following string 
# Note: 'psycopg2' is the assumed driver for postgres 
connection_url = 'postgresql+psycopg2://oahwyljl:sSrk8smQ16BCOVhHQBVWVtK2nVcCDmiF@peanut.db.elephantsql.com/oahwyljl'

try:
    # GET THE CONNECTION OBJECT (ENGINE) FOR THE DATABASE
    engine = create_engine(connection_url)
    print( f"Connection created successfully.")
    
except Exception as ex:
    print("Connection could not be made due to the following error: \n", ex)

Connection created successfully.


In [355]:
# There were 539 empty rows, but those got removed from the database 
engine.execute('''SELECT count(movieinfo_id) FROM moviesinfo WHERE title = 'NaN' ''').fetchall() 

[(539,)]

### Get data and do preliminary cleaning

In [408]:
# df is the movies_info dataframe
df = pd.read_sql_query("SELECT * from moviesinfo", con=engine, parse_dates = ['released'])

In [409]:
# We have 13,380 movies in total (final number)
df.shape

(13380, 15)

In [410]:
# df_revenue is the dataframe containing revenue 
df_revenue = pd.read_sql_query("SELECT * from moviesgross", con=engine, parse_dates = ['release_date'])

In [411]:
# We have 14,939 movies from The Numbers 
df_revenue.shape

(14939, 9)

In [412]:
# Remove movieinfo_id extra column from both datasets
# Otherwise, will have issue with removing duplicates later 
df = df.drop('movieinfo_id', axis = 1)
df_revenue = df_revenue.drop('moviegross_id', axis = 1)

In [413]:
# Replace 'NaN' strings and 'N/A' strings with none type in both dataframes 
df = df.replace('NaN', np.nan)
df = df.replace('N/A', np.nan)
df_revenue = df_revenue.replace('NaN', np.nan)
df_revenue = df_revenue.replace('N/A', np.nan)

### Remove duplicate values: 

In [414]:
# There are 2,567 duplicate values 
len(df[df.duplicated()])

2567

In [415]:
# No duplicate values in df_revenue! 
df_revenue[df_revenue.duplicated()]

Unnamed: 0,year,rank,title,release_date,distributor,genre,gross,tickets_sold


In [416]:
# Drop duplicates 
df = df.drop_duplicates(subset = ['title', 'released'])

In [417]:
# Now we have 10,807 observations with movie info 
len(df)

10807

In [418]:
df[df['title'] == 'Galapagos']

Unnamed: 0,title,year,rated,released,runtime,genre,director,writer,actors,plot,language,country,poster,seasons
3001,Galapagos,2006,,2007-03-18,180 min,Documentary,,,"Tilda Swinton, Richard Wollocombe, Tom Hiddleston",The history of these beautiful Islands from th...,English,United Kingdom,https://m.media-amazon.com/images/M/MV5BNzdiZm...,1.0


### Remove TV Series

In [428]:
# 10,646 observations are NOT TV series 
len(df[df['seasons'].isna()])

10646

In [429]:
# Remove the TV series (keep only entries where 'seasons' is NaN)
df = df[df['seasons'].isna()]

In [433]:
# Drop the Series column, since we don't need it anymore: 
df = df.drop('seasons', axis = 1)

## Step 2: Inspect and Remove 'NaN's from movie df: 

In [434]:
# See how much data is missing in df and df_revunue
df.isna().sum()

title          0
year           0
rated       1281
released     153
runtime       90
genre         15
director      68
writer       726
actors       164
plot         122
language      60
country      231
poster       130
dtype: int64

In [435]:
# Good - no missing values for gross revenue 
df_revenue.isna().sum()

year               0
rank               0
title              0
release_date      36
distributor     1127
genre            931
gross              0
tickets_sold       0
dtype: int64

In [437]:
# Good that 'gross' is already an integer and won't need to be recoded! 
df_revenue['gross']

0        572984769
1        224543292
2        212609036
3        183651655
4        173005945
           ...    
14934          869
14935          589
14936          516
14937          374
14938          150
Name: gross, Length: 14939, dtype: int64

## Step 3: Recode columns in df 

In [436]:
# The following columns will need to be recoded: 
# Year --> to int 
# Runtime --> to int 
# Genre --> need to convert to list and dummy code genres 
# Directors --> create extra columns code top 10, 50, 100 
# Writer --> ignore for now? Can do same thing as with Director and actors 
# Actors --> convert to list and code top 10, 50, and 100 
# Language --> code as English only, English and other(s), Foreign lang only --> then one hot encode as 2 variables 
# Country --> code as US only, US and other countries, Foreign only --> then one hot encode 
for column in df.columns: 
    print(column, df[column].dtype) 

title object
year object
rated object
released datetime64[ns]
runtime object
genre object
director object
writer object
actors object
plot object
language object
country object
poster object


In [438]:
# Function for converting 'runtime' to int 
def get_minutes(x): 
    if pd.isna(x): 
        return np.nan 
    else: 
        try: 
            return int(x.split(' ')[0])
        except: 
            return np.nan

In [439]:
# Convert 'runtime'
df['runtime'] = df['runtime'].apply(get_minutes) 

In [440]:
# 'Runtime' column has 93 NAN values 
df['runtime'].isna().sum()

93

# Create columns for Top Director and Top Actors

## Columns for Top 10, 50, 100 Directors: 

In [441]:
num_directors = len(df['director'].value_counts())
print(f'There are {num_directors} directors in the dataset.') 

There are 6488 directors in the dataset.


### What % of movies did the top 10, 50, 100 directors make? 

In [442]:
# Lists of the top directors: 
# Problem with this approach: need to see how many movies top 10 director made and include anyone 
# who made the same number of movies in that list too 
top_10_directors = list(df['director'].value_counts()[0:10].index)
top_50_directors = list(df['director'].value_counts()[0:50].index)
top_100_directors = list(df['director'].value_counts()[0:100].index)

In [443]:
df['director'].value_counts()[0:11]

Woody Allen             20
Ridley Scott            19
Clint Eastwood          18
Steven Spielberg        18
Steven Soderbergh       17
Ron Howard              15
Martin Scorsese         14
Michael Winterbottom    14
François Ozon           14
M. Night Shyamalan      12
David Gordon Green      12
Name: director, dtype: int64

In [444]:
# New approach: 
cutoff_10 = df['director'].value_counts()[9]
top_10_directors = list(df['director'].value_counts()[df['director'].value_counts() >= cutoff_10].index)
num_top = len((df['director'].value_counts()[df['director'].value_counts() >= cutoff_10]))

# Movies made by the top 10 directors: 
top_10 = df['director'].apply(lambda x: x in top_10_directors).sum()
print(f'There were {num_top_10} "top 10" directors, who EACH made at least {cutoff_10} movies. \n \
They made {top_10} movies ({round(top_10 / len(df) * 100, 1)}% of all movies in dataset).')

There were 10 "top 10" directors, who EACH made at least 12 movies. 
 They made 209 movies (2.0% of all movies in dataset).


In [445]:
cutoff_50 = df['director'].value_counts()[49]
top_50_directors = list(df['director'].value_counts()[df['director'].value_counts() >= cutoff_50].index)
num_top_50 = len((df['director'].value_counts()[df['director'].value_counts() >= cutoff_50]))

# Movies made by the top 50 directors: 
top_50 = df['director'].apply(lambda x: x in top_50_directors).sum()
print(f'There were {num_top_50} "top 50" directors, who EACH made at least {cutoff_50} movies.\n \
They made {top_50} movies ({round(top_50 / len(df) * 100, 1)}% of all movies in dataset).')

There were 73 "top 50" directors, who EACH made at least 8 movies.
 They made 739 movies (6.9% of all movies in dataset).


In [446]:
cutoff_100 = df['director'].value_counts()[99]
top_100_directors = list(df['director'].value_counts()[df['director'].value_counts() >= cutoff_100].index)
num_top_100 = len((df['director'].value_counts()[df['director'].value_counts() >= cutoff_100]))

# Movies made by the top 100 directors: 
top_100 = df['director'].apply(lambda x: x in top_100_directors).sum()
print(f'There were {num_top_100} "top 100" directors, who EACH made at least {cutoff_100} movies.\n \
They made {top_100} movies ({round(top_100 / len(df) * 100, 1)}% of all movies in dataset).')

There were 132 "top 100" directors, who EACH made at least 7 movies.
 They made 1152 movies (10.8% of all movies in dataset).


### Create columns to code for top directors: 

In [447]:
# Function for coding top director 
def has_top_director(x, director_list): 
    if pd.isna(x): 
        return np.nan 
    elif x in director_list: 
        return 1 
    else: 
        return 0 

In [448]:
# Apply function to create 3 new columns: 
df['top_10_dir'] = df['director'].apply(lambda x: has_top_director(x, top_10_directors))
df['top_50_dir'] = df['director'].apply(lambda x: has_top_director(x, top_50_directors))
df['top_100_dir'] = df['director'].apply(lambda x: has_top_director(x, top_100_directors))

## Columns for Top 10, 50, 100 Actors: 

In [449]:
# Function for converting column to list of actors instead of string 
def to_list(x): 
    if pd.isna(x): 
        return np.nan 
    else: 
        return x.split(', ')

In [450]:
# Apply function to 'actors' and 'writer' columns 
# Make sure to run only once 
df['actors'] = df['actors'].apply(to_list) 
df['writer'] = df['writer'].apply(to_list)

In [457]:
df['actors'][0]

['Tom Holland', 'Zendaya', 'Benedict Cumberbatch']

In [458]:
isinstance(df['actors'][0], list)

True

In [466]:
# Create a dictionary of actors to get the top actors by number of movies they've been in 
actor_dict = {}
for actors in df['actors']: 
    if isinstance(actors, list):
        for actor in actors: 
            actor_dict[actor] = actor_dict.get(actor, 0) + 1

In [499]:
# 16,374 actors in the dataset 
len(actor_dict)

16374

In [485]:
# Look at the top actors 
top_actors = sorted(actor_dict.items(), key=lambda item: item[1], reverse = True)
top_actors

[('Samuel L. Jackson', 41),
 ('Matt Damon', 38),
 ('Robert De Niro', 38),
 ('Ryan Reynolds', 37),
 ('Nicolas Cage', 36),
 ('Liam Neeson', 36),
 ('Julianne Moore', 35),
 ('Mark Wahlberg', 34),
 ('Ewan McGregor', 34),
 ('Nicole Kidman', 33),
 ('Morgan Freeman', 33),
 ('Scarlett Johansson', 32),
 ('Owen Wilson', 32),
 ('Tom Hanks', 31),
 ('Amitabh Bachchan', 31),
 ('Hugh Jackman', 30),
 ('Bruce Willis', 30),
 ('Ethan Hawke', 30),
 ('Matthew McConaughey', 29),
 ('Colin Farrell', 29),
 ('Dwayne Johnson', 28),
 ('Charlize Theron', 28),
 ('Willem Dafoe', 28),
 ('Robert Downey Jr.', 28),
 ('Ben Affleck', 28),
 ('Akshay Kumar', 28),
 ('Naomi Watts', 28),
 ('Woody Harrelson', 27),
 ('Cate Blanchett', 27),
 ('Kristen Stewart', 27),
 ('Gerard Butler', 27),
 ('Isabelle Huppert', 27),
 ('Mark Ruffalo', 27),
 ('Jesse Eisenberg', 27),
 ('Rachel Weisz', 27),
 ('Jason Statham', 26),
 ('Johnny Depp', 26),
 ('Pierce Brosnan', 26),
 ('Dennis Quaid', 26),
 ('Christian Bale', 26),
 ('Salman Khan', 26),
 ('Na

In [505]:
# Movie cutoffs for top 10, top 50, and top 100 actors 
cutoff_10_actors = top_actors[9][1]
cutoff_50_actors = top_actors[49][1]
# Top 100 actor made at least 21 movies since 2020 
cutoff_100_actors = top_actors[99][1]

In [508]:
top_10_actors = [key for key, value in actor_dict.items() if value >= cutoff_10_actors]
top_10_actors

['Ryan Reynolds',
 'Nicolas Cage',
 'Samuel L. Jackson',
 'Liam Neeson',
 'Julianne Moore',
 'Matt Damon',
 'Robert De Niro',
 'Mark Wahlberg',
 'Nicole Kidman',
 'Ewan McGregor',
 'Morgan Freeman']

In [113]:
df['actors'].apply()

0             Tom Holland, Zendaya, Benedict Cumberbatch
1               Simu Liu, Awkwafina, Tony Chiu-Wai Leung
2          Tom Hardy, Woody Harrelson, Michelle Williams
3       Scarlett Johansson, Florence Pugh, David Harbour
4       Vin Diesel, Michelle Rodriguez, Jordana Brewster
                              ...                       
8975              David Rosen, Alexa Jago, Jonathan Aube
8976             Tony Nardi, Timothy Webber, Tygh Runyan
8977              Steven Nelson, Honey Lauren, Jeri Ryan
8978                   Roshan Seth, Soni Razdan, Om Puri
8979       Colin Friels, Jack Thompson, Donald Pleasence
Name: actors, Length: 8980, dtype: object

In [300]:
df['director'].apply(lambda x: has_top_director(x, top_10_directors)).isna().sum()

210

In [303]:
df.describe()

Unnamed: 0,runtime,top_10_dir,top_50_dir,top_100_dir
count,8866.0,8770.0,8770.0,8770.0
mean,103.22028,0.020753,0.076169,0.106499
std,28.06257,0.142563,0.265283,0.308493
min,1.0,0.0,0.0,0.0
25%,91.0,0.0,0.0,0.0
50%,101.0,0.0,0.0,0.0
75%,115.0,0.0,0.0,0.0
max,780.0,1.0,1.0,1.0


In [509]:
df.rated.value_counts()

R            3519
PG-13        2093
Not Rated    2011
PG            895
Unrated       409
G             151
TV-MA          91
TV-14          59
TV-PG          36
Approved       34
Passed         21
TV-G           20
NC-17          12
X               3
GP              3
TV-Y7           3
UNRATED         2
M/PG            1
MA-17           1
M               1
Name: rated, dtype: int64

In [302]:
df.head()

Unnamed: 0,title,year,rated,released,runtime,genre,director,writer,actors,plot,language,country,poster,top_10_dir,top_50_dir,top_100_dir
0,Spider-Man: No Way Home,2021,PG-13,2021-12-17,148.0,"Action, Adventure, Fantasy",Jon Watts,"Chris McKenna, Erik Sommers, Stan Lee","Tom Holland, Zendaya, Benedict Cumberbatch","With Spider-Man's identity now revealed, Peter...",English,United States,https://m.media-amazon.com/images/M/MV5BZWMyYz...,0.0,0.0,0.0
1,Shang-Chi and the Legend of the Ten Rings,2021,PG-13,2021-09-03,132.0,"Action, Adventure, Fantasy",Destin Daniel Cretton,"Dave Callaham, Destin Daniel Cretton, Andrew L...","Simu Liu, Awkwafina, Tony Chiu-Wai Leung","Shang-Chi, the master of weaponry-based Kung F...","English, Mandarin",United States,https://m.media-amazon.com/images/M/MV5BNTliYj...,0.0,0.0,0.0
2,Venom: Let There Be Carnage,2021,PG-13,2021-10-01,97.0,"Action, Adventure, Sci-Fi",Andy Serkis,"Kelly Marcel, Tom Hardy","Tom Hardy, Woody Harrelson, Michelle Williams",Eddie Brock attempts to reignite his career by...,English,"United States, China",https://m.media-amazon.com/images/M/MV5BYTc3ZT...,0.0,0.0,0.0
3,Black Widow,2021,PG-13,2021-07-09,134.0,"Action, Adventure, Sci-Fi",Cate Shortland,"Eric Pearson, Jac Schaeffer, Ned Benson","Scarlett Johansson, Florence Pugh, David Harbour",Natasha Romanoff confronts the darker parts of...,"English, Russian, Norwegian, Hungarian, Macedo...",United States,https://m.media-amazon.com/images/M/MV5BNjRmND...,0.0,0.0,0.0
4,F9: The Fast Saga,2021,PG-13,2021-06-25,143.0,"Action, Crime, Thriller",Justin Lin,"Daniel Casey, Justin Lin, Alfredo Botello","Vin Diesel, Michelle Rodriguez, Jordana Brewster",Dom and the crew must take on an international...,English,United States,https://m.media-amazon.com/images/M/MV5BMjI0Nm...,0.0,0.0,0.0


In [91]:
df_revenue['gross']

0        572984769
1        224543292
2        212609036
3        183651655
4        173005945
           ...    
14934          869
14935          589
14936          516
14937          374
14938          150
Name: gross, Length: 14939, dtype: int64