In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import optuna
from time import time
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
import joblib

# All the Models I'll be using
from sklearn.tree import DecisionTreeRegressor
import catboost as cb
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

# 1. Get the Data

In [None]:
# Load the raw movie statistics CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='./dataset/movie_statistic_dataset.csv')
display(df)

In [None]:
# Hold out two random rows for deployment testing.
# random_state=1 pins the sample, so the same two rows are held out on every run.
extract = df.sample(n=2, random_state=1)
display(extract)
# Remove the held-out rows in one call instead of dropping them one index label at a time.
df = df.drop(index=extract.index)
display(df)

In [None]:
# Renumber the rows after the hold-out rows were removed.
# NOTE(review): drop=False (the default, spelled out here) keeps the old row labels as a new
# 'index' column, which is numeric and therefore shows up in later numeric summaries.
# Confirm that this is intended before switching to drop=True.
df = df.reset_index(drop=False)
display(df)

In [None]:
# Print the column names as a plain Python list
print(df.columns.tolist())

In [None]:
# Map the raw CSV headers onto short snake_case names (columns not listed keep their name)
renamed_columns = {
    'director_birthYear': 'director_birth',
    'director_deathYear': 'director_death',
    'movie_averageRating': 'rating',
    'movie_numerOfVotes': 'votes',
    'approval_Index': 'approval_index',
    'Production budget $': 'budget',
    'Domestic gross $': 'domestic_gross',
    'Worldwide gross $': 'worldwide_gross',
}
df.rename(columns=renamed_columns, inplace=True)
display(df)

# 2. Visualise the Data

## 2.1 Data Discovery and Profiling

In [None]:
# Preview the first five rows, the last five rows and five random rows
display(df.head(5), df.tail(5), df.sample(5))

In [None]:
# Dimensions as a (rows, columns) tuple
n_rows, n_cols = df.shape
print((n_rows, n_cols))

In [None]:
# Datatypes: print the dtype of every column
print(df.dtypes)

In [None]:
# Structural Information
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray 'None' line to the output.
df.info()

In [None]:
# Count the NaN / null cells in each column
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# Count the +/-inf cells in each numeric column (np.isinf is only defined for numeric data)
numeric_only = df.select_dtypes(include=[np.number])
print(np.isinf(numeric_only).sum())

In [None]:
# Checking for 'Missing Values' such as '-' or '/N' that a null check cannot see.
# Every column is compared as text, so numeric columns and NaN cells are handled uniformly
# (NaN becomes the string 'nan', which matches neither probe below).
# The summary table COUNTS the matching rows per column; calling .sum() on the filtered
# rows would add up their values instead of counting them.
columns_to_probe = [
    'production_date',
    'genres',
    'director_name',
    'rating',
    'votes',
    'approval_index',
    'budget',
    'domestic_gross',
    'worldwide_gross',
]
# These free-text columns are only probed for a lone '-'; a capital 'N' is ordinary text there.
dash_probe_only = {'genres', 'director_name'}

placeholder_counts = {}
for column in columns_to_probe:
    as_text = df[column].astype(str)
    placeholder_counts[column] = {
        # Cells that consist of nothing but '-'
        'dash_only': int(as_text.str.fullmatch('-').sum()),
        # Cells containing a capital 'N' anywhere (None = probe not applicable)
        'contains_N': (
            None if column in dash_probe_only
            else int(as_text.str.contains('N', regex=False).sum())
        ),
    }
display(pd.DataFrame.from_dict(placeholder_counts, orient='index'))

# director_name is where the author found '-' placeholders (noted as 326 rows, 7.4% of rows),
# so list those rows for inspection.
display(df[df['director_name'].astype(str).str.fullmatch('-')])

In [None]:
# Nominal Aggregation: number of rows per distinct value of each nominal column
for nominal_column in ('genres', 'director_name'):
    print(df.groupby(nominal_column).size())

In [None]:
# Numeric Aggregation: number of rows per distinct value of each numeric column.
# Author's notes: votes and budget span a HUGE range of values, and for the two gross
# columns it is interesting which 3 movies share the SAME lowest value.
for numeric_column in ('votes', 'budget', 'domestic_gross', 'worldwide_gross'):
    print(df.groupby(numeric_column).size())

In [None]:
# Inconsistencies: list the distinct rating values so odd entries stand out
print(df['rating'].unique())

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged)
duplicate_rows = df[df.duplicated()]
print(duplicate_rows)

In [None]:
# Distinct Values: number of unique values in each column
print(df.nunique())

In [None]:
# Statistical Summary of the numeric columns, using the current float display format
print(df.describe())

In [None]:
# Same summary with floats shown to two decimals.
# The display option is global: it stays in effect for every later cell as well.
pd.set_option('display.float_format', '{:.2f}'.format)
print(df.describe())

In [None]:
# Pairwise correlation between the float64 / int64 columns
numeric_columns = df.select_dtypes(include=['float64', 'int64'])
correlations = numeric_columns.corr()
display(correlations)

In [None]:
# Interesting Observations

# 1 - The three rows with the lowest domestic gross (the author noted they share one value)
df_sorted = df.sort_values(by='domestic_gross')
display(df_sorted.iloc[:3])

# 2 - Every movie whose title occurs more than once, ordered by title
repeated_title = df.duplicated(subset=['movie_title'], keep=False)
df_duplicates = df[repeated_title].sort_values(by='movie_title')
display(df_duplicates)

# 3 - Movie with the longest runtime (first row after sorting by runtime, descending)
df_sorted = df.sort_values(by='runtime_minutes', ascending=False)
top = pd.DataFrame(df_sorted.head(1))
display(top)

# 4 - Movie(s) with the highest budget
highest_budget = df['budget'].max()
display(df[df['budget'] == highest_budget])

# 5 - Movie(s) with the highest worldwide gross
highest_worldwide_gross = df['worldwide_gross'].max()
display(df[df['worldwide_gross'] == highest_worldwide_gross])

## 2.2 Exploratory Data Analysis

In [None]:
# Individual Frequency - Top 10 Genres
# A movie can list several comma-separated genres; each one is counted separately.
genres_separate = df['genres'].str.split(',').explode()
top_10_genres = genres_separate.value_counts().nlargest(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_10_genres.index,
    y=top_10_genres.values,
    hue=top_10_genres.index,
    palette='dark:lightcoral',
    ax=ax
)
ax.set_title('Top 10 Genres by Frequency', fontsize=14)
ax.set_xlabel('Genre')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Top 10 Directors
# Some rows carry '-' in director_name, which appears to be a placeholder for a missing
# director. Those rows are left out so the placeholder is not ranked as if it were a person.
known_directors = df.loc[df['director_name'] != '-', 'director_name']
top_10_directors_by_frequency = known_directors.value_counts().nlargest(10)
plt.figure(figsize=(10, 6))
sns.barplot(
    x=top_10_directors_by_frequency.index,
    y=top_10_directors_by_frequency.values,
    palette='dark:lightcoral',
    hue=top_10_directors_by_frequency.index
)
plt.title('Top 10 Directors by Frequency', fontsize=14)
plt.xlabel('Director')
plt.ylabel('Number of Movies Directed')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Years
# Work on a local Series instead of adding and then deleting temporary columns on df.
# errors='coerce' turns unparseable dates into NaT; those entries are dropped so the years
# can be cast to plain ints (range() below cannot take floats or NaN).
production_years = (
    pd.to_datetime(df['production_date'], errors='coerce')
    .dt.year
    .dropna()
    .astype(int)
    .rename('year')
)
plt.figure(figsize=(10, 6))
sns.histplot(
    production_years,
    # One unit-wide bin per year. The stop is max + 2 because range() excludes its stop and
    # n bins need n + 1 edges; with max + 1 the last two years shared a single bar.
    bins=range(production_years.min(), production_years.max() + 2),
    color='lightcoral',
    edgecolor='lightcoral',
    kde=True
)
plt.title('Histogram Distribution of Years', fontsize=14)
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Months
# Work on a local Series instead of adding and then deleting temporary columns on df.
# errors='coerce' turns unparseable dates into NaT; those entries are dropped so the month
# numbers can be cast to plain ints (range() below cannot take floats or NaN).
production_months = (
    pd.to_datetime(df['production_date'], errors='coerce')
    .dt.month
    .dropna()
    .astype(int)
    .rename('month')
)
plt.figure(figsize=(10, 6))
ax = sns.histplot(
    production_months,
    # One unit-wide bin per month. The stop is max + 2 because range() excludes its stop and
    # n bins need n + 1 edges; with max + 1 the last two months shared a single bar.
    bins=range(production_months.min(), production_months.max() + 2),
    color='lightcoral',
    edgecolor='lightcoral',
    kde=True
)
# Write the count above every non-empty bar
for patch in ax.patches:
    height = patch.get_height()
    if height > 0:
        plt.text(
            patch.get_x() + patch.get_width() / 2,  # horizontal centre of the bar
            height,  # Bar height
            f'{int(height)}',
            ha='center',
            va='bottom',
            fontsize=8
        )
plt.title('Histogram Distribution of Month', fontsize=14)
plt.xlabel('Month')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Days
# Work on a local Series instead of adding and then deleting temporary columns on df.
# errors='coerce' turns unparseable dates into NaT; those entries are dropped so the day
# numbers can be cast to plain ints (range() below cannot take floats or NaN).
production_days = (
    pd.to_datetime(df['production_date'], errors='coerce')
    .dt.day
    .dropna()
    .astype(int)
    .rename('day')
)
plt.figure(figsize=(10, 6))
sns.histplot(
    production_days,
    # One unit-wide bin per day of the month. The stop is max + 2 because range() excludes
    # its stop and n bins need n + 1 edges; with max + 1 the last two days shared one bar.
    bins=range(production_days.min(), production_days.max() + 2),
    color='lightcoral',
    edgecolor='lightcoral',
    kde=True
)
plt.title('Histogram Distribution of Days', fontsize=14)
plt.xlabel('Day')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Runtime Minutes
shortest_runtime = int(df['runtime_minutes'].min())
longest_runtime = int(df['runtime_minutes'].max())
plt.figure(figsize=(10, 6))
ax = sns.histplot(
    df['runtime_minutes'],
    # One unit-wide bin per minute. The stop is max + 2 because range() excludes its stop and
    # n bins need n + 1 edges; with max + 1 the two longest runtimes shared a single bar.
    bins=range(shortest_runtime, longest_runtime + 2),
    color='lightcoral',
    edgecolor='lightcoral',
    kde=True
)
# Write the count above every non-empty bar
for patch in ax.patches:
    height = patch.get_height()
    if height > 0:
        plt.text(
            patch.get_x() + patch.get_width() / 2,  # horizontal centre of the bar
            height,  # Bar height
            f'{int(height)}',
            ha='center',
            va='bottom',
            fontsize=8
        )
plt.title('Histogram Distribution of Runtime Minutes', fontsize=14)
plt.xlabel('Runtime Minutes')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Ratings
lowest_rating = int(df['rating'].min())
highest_rating = int(df['rating'].max())
plt.figure(figsize=(10, 6))
ax = sns.histplot(
    df['rating'],
    # Unit-wide bins from the whole number at or below the lowest rating up to a whole number
    # above the highest one. int() truncates the maximum, so a stop of max + 1 ended the bin
    # edges below the top ratings and left those movies out of the plot; max + 2 covers them.
    bins=range(lowest_rating, highest_rating + 2),
    color='lightcoral',
    edgecolor='lightcoral',
    kde=True
)
plt.title('Histogram Distribution of Ratings', fontsize=14)
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Votes (50 equal-width bins with a KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df['votes'],
    bins=50,
    kde=True,
    color='lightcoral',
    edgecolor='lightcoral',
    ax=ax
)
# Label every non-empty bar with its count, centred just above the bar
for bar in ax.patches:
    count = bar.get_height()
    if not count > 0:
        continue
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, count, f'{int(count)}', ha='center', va='bottom', fontsize=8)
ax.set_title('Histogram Distribution of Votes', fontsize=14)
ax.set_xlabel('Votes')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Approval Index (50 equal-width bins with a KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df['approval_index'],
    bins=50,
    kde=True,
    color='lightcoral',
    edgecolor='lightcoral',
    ax=ax
)
ax.set_title('Histogram Distribution of Approval Index', fontsize=14)
ax.set_xlabel('Approval Index')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Budget (50 equal-width bins with a KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df['budget'],
    bins=50,
    kde=True,
    color='lightcoral',
    edgecolor='lightcoral',
    ax=ax
)
# Label every non-empty bar with its count, centred just above the bar
for bar in ax.patches:
    count = bar.get_height()
    if not count > 0:
        continue
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, count, f'{int(count)}', ha='center', va='bottom', fontsize=8)
ax.set_title('Histogram Distribution of Budget', fontsize=14)
ax.set_xlabel('Budget')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Domestic Gross $ (log-scaled x axis, 50 bins, KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df['domestic_gross'],
    bins=50,
    log_scale=True,
    kde=True,
    color='lightcoral',
    edgecolor='lightcoral',
    ax=ax
)
ax.set_title('Histogram Distribution of Domestic Gross Revenue $', fontsize=14)
ax.set_xlabel('Domestic Gross $ (log scale)')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Individual Frequency - Worldwide Gross $ (log-scaled x axis, 50 bins, KDE overlay)
plt.figure(figsize=(10, 6))
sns.histplot(
    # Plot the worldwide column so the data matches the title and axis label of this figure
    # (the cell used to plot domestic_gross a second time).
    df['worldwide_gross'],
    bins=50,
    color='lightcoral',
    edgecolor='lightcoral',
    log_scale=True,
    kde=True
)
plt.title('Histogram Distribution of Worldwide Gross Revenue $')
plt.xlabel('Worldwide Gross $ (log scale)')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Comparison Against Worldwide Gross - Top 10 Movies
top_10_movies = df.nlargest(10, 'worldwide_gross')

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top_10_movies,
    x='movie_title',
    y='worldwide_gross',
    hue='movie_title',
    palette='dark:lightcoral',
    ax=ax
)
ax.set_title('Top 10 Movies by Worldwide Gross', fontsize=14)
ax.set_xlabel('Movie Title')
# TODO: confirm the unit - the values are plotted as stored, not rescaled to billions
ax.set_ylabel('Worldwide Gross (in billions)')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Rows whose domestic and worldwide gross are identical
# (author's note: 18% of the rows have the exact same values)
gross_is_identical = df['domestic_gross'] == df['worldwide_gross']
same_gross_df = df[gross_is_identical]
display(same_gross_df)

In [None]:
# Comparison Against Worldwide Gross - Top 10 Genres
# One entry per (movie, genre) pair; a movie's gross is credited in full to each of its genres.
genres_exploded = df['genres'].str.split(',').explode()
genres_exploded_with_gross = df[['worldwide_gross']].loc[genres_exploded.index].copy()
gross_per_genre = genres_exploded_with_gross.groupby(genres_exploded).sum()
top_10_genres_by_gross = gross_per_genre.nlargest(10, 'worldwide_gross')

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_10_genres_by_gross.index,
    y=top_10_genres_by_gross['worldwide_gross'],
    hue=top_10_genres_by_gross.index,
    palette='dark:lightcoral',
    ax=ax
)
ax.set_title('Top 10 Genres by Worldwide Gross', fontsize=14)
ax.set_xlabel('Genre')
ax.set_ylabel('Total Worldwide Gross (in billions)')  # assuming it's in billions
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Comparison Against Worldwide Gross - Top 10 Directors
# Some rows carry '-' in director_name, which appears to be a placeholder for a missing
# director. Those rows are left out so their combined gross is not ranked as one "director".
movies_with_known_director = df[df['director_name'] != '-']
top_10_directors = (
    movies_with_known_director
    .groupby('director_name')['worldwide_gross']
    .sum()
    .nlargest(10)
)
plt.figure(figsize=(10, 6))
sns.barplot(
    x=top_10_directors.index,
    y=top_10_directors.values,
    palette='dark:lightcoral',
    hue=top_10_directors.index
)
plt.title('Top 10 Directors by Worldwide Gross', fontsize=14)
plt.xlabel('Director')
plt.ylabel('Total Worldwide Gross (in billions)')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Relationships - Distribution of Domestic Gross in Worldwide Gross
# Domestic gross as a percentage of worldwide gross, kept in its own one-column frame
domestic_share = (df['domestic_gross'] / df['worldwide_gross']) * 100
percentage_df = domestic_share.to_frame(name='percentage')

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(
    percentage_df['percentage'],
    bins=10,
    kde=True,
    color='lightcoral',
    edgecolor='lightcoral',
    ax=ax
)
ax.set_title('Distribution of Domestic Gross in Worldwide Gross')
ax.set_xlabel('Domestic Gross Percentage (%)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Relationships - Heatmap of the pairwise correlations between the float64 / int64 columns
numeric_columns = df.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    cmap="pink",
    annot=True,  # write each coefficient into its cell
    fmt=".2f",
    linewidths=0.5,
    cbar_kws={'label': 'Correlation Coefficient'},
    ax=ax
)
ax.set_title('Heatmap of Numeric Features', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Relationships - Scatter / PairPlot Matrix
# Columns included in the grid: one scatter panel for every pair of them
numeric_features = [
    'runtime_minutes', 'rating', 'votes', 'approval_index',
    'budget', 'domestic_gross', 'worldwide_gross',
]
pairplot_data = df[numeric_features]
sns.pairplot(pairplot_data, plot_kws={'color': 'lightcoral'})
# y=1.02 places the title slightly above the top edge of the figure
plt.suptitle('Scatter/PairPlot Matrix of Numeric Features', fontsize=14, y=1.02)
plt.show()

In [None]:
# Show the DataFrame as it stands at the end of the exploration section
display(df)