Advanced Programming INFO 450 Final Project with Disney Success Data from:
https://www.kaggle.com/datasets/thedevastator/disney-character-success-a-comprehensive-analysi

In [None]:
# Import visualization tools
############################

import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import numpy as np
import seaborn as sns

# Import machine learning model tools
######################################

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Read in csv files from dataset
################################

# Folder that holds the two CSV files. It is relative to the notebook's working
# directory; change this single value if the data lives somewhere else.
DATA_DIR = "Downloads/disney_data"

# Gross income per movie and director per movie, loaded as separate DataFrames
disney_movies_total_gross = pd.read_csv(f"{DATA_DIR}/disney_movies_total_gross.csv")
disney_directors = pd.read_csv(f"{DATA_DIR}/disney-director.csv")


In [None]:
# Display the total gross dataset
# A bare expression on the last line of a cell is rendered as the cell output
#############################################################################

disney_movies_total_gross

In [None]:
# Display the disney directors dataset
# A bare expression on the last line of a cell is rendered as the cell output
#############################################################################

disney_directors

In [None]:
# Inner join of the gross-income table with the directors table
# A row is kept only when a movie's `movie_title` equals a director record's `name`;
# the result carries the columns of both input tables
####################################################################################

merger = disney_movies_total_gross.merge(
    disney_directors,
    how="inner",
    left_on="movie_title",
    right_on="name",
)

# Show the joined dataset as the cell output
############################################

merger

In [None]:
# Add an integer version of the gross income
# `total_gross` is text containing ',' and '$' characters, so both are stripped
# before the conversion. regex=False forces a literal match: read as a regular
# expression, '$' means "end of string" and would leave the dollar sign in place.
# The column is written to `merger` itself because later cells read
# merger["total_gross_int"].
################################################################################

merger["total_gross_int"] = (
    merger["total_gross"]
    .str.replace(",", "", regex=False)
    .str.replace("$", "", regex=False)
    .apply(int)
)

# Creation of dataframe
# Built after the new column exists, so both `merger` and `df` expose
# `total_gross_int` without depending on the two objects sharing memory.
#######################################################################

df = pd.DataFrame(merger)

# Display dataframe
###################

df

In [None]:
# Total gross income per director
# Result is a Series indexed by director name
#############################################

groupby_directors = (
    merger
    .groupby("director")["total_gross_int"]
    .sum()
)

# Same totals as a one-column DataFrame
# Only the last expression of a cell is rendered, so this frame is the cell output
##################################################################################

director_df = groupby_directors.to_frame()

director_df

In [None]:
# Ten directors with the highest summed gross, largest first
############################################################

top_directors = (
    groupby_directors
    .sort_values(ascending=False)
    .iloc[:10]
)

# Pie chart: one slice per director, labelled with the director name and the
# slice's share of the ten-director total as a percentage
############################################################################

plt.pie(x=top_directors, labels=top_directors.index, autopct='%1.1f%%')
plt.title(label='Top 10 Directors by Total Gross')

# Render the chart
##################

plt.show()

In [None]:
# Total gross income per genre
# reset_index() turns the genre labels back into a regular `genre` column
#########################################################################

groupby_genre = (
    merger
    .groupby("genre")["total_gross_int"]
    .sum()
    .reset_index()
)

# Wrap groupby_genre in a pandas DataFrame and show it as the cell output
#########################################################################

genre_df = pd.DataFrame(data=groupby_genre)

genre_df

In [None]:
# Bar chart of total gross income, one bar per genre
####################################################

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(groupby_genre['genre'], groupby_genre['total_gross_int'])

# Write each genre's total above its bar
# 1e8 is the vertical gap between the top of the bar and the text
#################################################################

for position, gross in enumerate(groupby_genre['total_gross_int']):
    ax.text(position, gross + 1e8, f'${gross:,.0f}', ha='center', fontweight='bold')

# Axis labels and chart title
#############################

ax.set_xlabel('Genre')
ax.set_ylabel('Total Gross Income')
ax.set_title('Total Gross Income by Genre')

# Render the chart
##################
plt.show()

In [None]:
# Total gross income per movie name, ordered from highest to lowest
###################################################################

groupby_name = (
    merger
    .groupby("name")["total_gross_int"]
    .sum()
    .reset_index()
    .sort_values(by='total_gross_int', ascending=False)
)

# Keep the ten highest-grossing movies, then re-order those ten ascending
#########################################################################

top_movies = groupby_name.head(10).sort_values(by='total_gross_int', ascending=True)

# Wrap top_movies in a pandas DataFrame and show it as the cell output
######################################################################

name_df = pd.DataFrame(data=top_movies)

name_df

In [None]:
# Scatter plot: gross income on the x-axis, movie name on the y-axis
####################################################################

fig = px.scatter(x=top_movies["total_gross_int"], y=top_movies["name"])

# x-axis: label, and a range from 0 to 10% beyond the largest gross
###################################################################

fig.update_xaxes(
    title_text='Total Gross Income',
    range=[0, top_movies["total_gross_int"].max() * 1.1],
)

# y-axis: label, and a range padded by half a slot on each end
##############################################################

fig.update_yaxes(
    title_text='Movie Title',
    range=[-0.5, len(top_movies["name"]) - 0.5],
)

# Chart title
fig.update_layout(title="Top Movies by Gross Income")

# Show the chart
################

fig.show()

In [None]:
# Total gross income per MPAA rating, ordered from highest to lowest
####################################################################

groupby_rating = (
    merger
    .groupby("MPAA_rating")["total_gross_int"]
    .sum()
    .reset_index()
    .sort_values(by='total_gross_int', ascending=False)
)

# Wrap groupby_rating in a pandas DataFrame and show it as the cell output
##########################################################################

rating_df = pd.DataFrame(data=groupby_rating)

rating_df

In [None]:
# Create a horizontal bar chart of total gross income per MPAA rating
# One Bar() trace is created per rating, with total gross income on the x-axis
# and the rating on the y-axis
# Layout() sets the barmode to stack and adds a title to the chart
#
# The rows are walked in the order they appear in rating_df (via zip) instead of
# being looked up with rating_df[col][i]: an integer inside [] is matched against
# index labels, and those labels were shuffled when rating_df was sorted, so the
# label lookup ignored the sort and would fail on any index other than 0..n-1.
################################################################################

data = [
    go.Bar(
        x=[gross],
        y=[rating],
        orientation='h',
        name=rating
    )
    for rating, gross in zip(rating_df['MPAA_rating'], rating_df['total_gross_int'])
]

layout = go.Layout(
    barmode='stack',
    title='Total Gross Income by MPAA Rating'
)

fig = go.Figure(data=data, layout=layout)

# Show the chart
################

fig.show()


In [None]:
# Pull the two columns out once, in row order
#############################################

movie_names = merger['name'].tolist()
director_names = merger['director'].tolist()

# Node labels: every distinct movie name and director name
# dict.fromkeys() drops duplicates while keeping first-seen order, so the node
# order is the same on every run (a plain set() gives no such guarantee)
##############################################################################

nodes = list(dict.fromkeys(movie_names + director_names))

# Label -> position in `nodes`, so each edge endpoint is resolved with one
# dictionary lookup instead of scanning the list
##########################################################################

node_position = {label: position for position, label in enumerate(nodes)}

# One (movie name, director name) tuple per row of merger
#########################################################

edges = list(zip(movie_names, director_names))

# Build the Sankey diagram from the nodes and edges
# node: padding, thickness, outline colour/width and label of each node
# link: source and target node positions for each edge, each with value 1
#########################################################################

fig = go.Figure(go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = nodes
    ),
    link = dict(
      source = [node_position[movie] for movie, _ in edges],
      target = [node_position[director] for _, director in edges],
      value = [1] * len(edges)
  )))

# Set the title for the graph
#############################
fig.update_layout(title='Movie Directors and Titles Network Graph')

# Show the chart
################
fig.show()

In [None]:
# Ten rows of df with the highest total gross
#############################################

top10 = df.sort_values(by='total_gross_int', ascending=False).head(10)

# Reshape for the heatmap: one row per movie title, one column per release date
# Cells hold total_gross_int, combined with pivot_table's default aggregation
###############################################################################
heatmap_data = top10.pivot_table(
    values='total_gross_int',
    index='movie_title',
    columns='release_date',
)

# Draw the heatmap with seaborn, then title and label it
########################################################

sns.set(font_scale=1)
plt.figure(figsize=(8, 5))
sns.heatmap(data=heatmap_data, cmap="YlGnBu", annot=False, linewidths=.5)
plt.title('Total Gross by Movie Title and Release Date')
plt.xlabel('Release Date')
plt.ylabel('Movie Title')
plt.show()

In [None]:
# Create ML Model to predict the director of a movie title
# Select the relevant columns into a new frame; `df` itself is left untouched so
# cells that need its other columns can still be re-run after this one
################################################################################

director_model_df = df[['name', 'director']]

# Split the dataset into training and testing sets
# 20% of the rows are held out; random_state fixes which rows those are
#######################################################################

X_train, X_test, y_train, y_test = train_test_split(
    director_model_df['name'],
    director_model_df['director'],
    test_size=0.2,
    random_state=42,
)

# Create a bag-of-words representation of the movie titles using CountVectorizer
# The vocabulary is learned from the training titles only and reused for the test titles
########################################################################################
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train a Decision Tree model
# random_state is fixed so repeated runs build the same tree
############################################################
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train_bow, y_train)

# Evaluate the performance of the model on the testing set
##########################################################

y_pred = model.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Predict the director of a new movie title
############################################

new_title = ['The Jungle Book']
new_title_bow = vectorizer.transform(new_title)
predicted_director = model.predict(new_title_bow)[0]
print(f'Predicted director: {predicted_director}')
