In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Source files: the character-appearance workbook and the box-office table
file_path1, file_path2 = 'marvel.xlsx', 'mcu_box_office.csv'

# Read each source file into its own DataFrame
marvel_names = pd.read_excel(file_path1)
marvel_numbers_reviews = pd.read_csv(file_path2)

In [3]:
# Columns taken from the character sheet: the film title plus one 'Yes'/'No'
# appearance flag per character or team
characters = ['Movie Title', 'Nick Fury', 'Iron Man', 'Hulk', 'Black Widow', 'Thor',
              'Hawkeye', 'Avengers', 'Loki', 'Captain America', 'Guardians', 'Ant-Man',
              'Doctor Strange', 'Black Panther', 'Captain Marvel', 'Shang-Chi', 'Eternals',
              'Wanda', 'Thanos']

# Columns taken from the box-office table
box_office = ['movie_title', 'worldwide_box_office']

# Inner-join the two sources on the film title, drop the duplicated title
# column and index the result by 'Movie Title'.
# NOTE: an inner join silently discards films whose titles are not spelled
# identically in both files.
merged_df = (
    pd.merge(marvel_names[characters], marvel_numbers_reviews[box_office],
             how='inner', left_on='Movie Title', right_on='movie_title')
    .drop(columns='movie_title')
    .set_index('Movie Title')
)

# Character columns only (everything after 'Movie Title')
selected_columns = characters[1:]

# Convert 'Yes' to 1 and anything else to 0, column by column. Casting to str
# first means an empty (NaN) cell counts as "no appearance" instead of raising
# AttributeError; surrounding whitespace and letter case are ignored.
character_data_binary = merged_df[selected_columns].apply(
    lambda flags: flags.astype(str).str.strip().str.lower().eq('yes')
).astype(int)

# Strip '$' and ',' from the box office amount and convert to float.
# The pattern is a raw string so that '\$' is not an invalid string escape.
merged_df['worldwide_box_office'] = (
    merged_df['worldwide_box_office'].replace(r'[\$,]', '', regex=True).astype(float)
)

# Per-character totals: number of films, and the summed gross of those films
# (flag matrix transposed, then dotted with the per-film gross)
appearances_df = pd.DataFrame({
    'Total_Appearances': character_data_binary.sum(axis=0),
    'Total_Box_Office': character_data_binary.T.dot(merged_df['worldwide_box_office'])
})

# Mean box office revenue per appearance.
# A character with zero appearances in the merged films yields NaN here (0 / 0).
appearances_df['Mean_Box_Office'] = appearances_df['Total_Box_Office'] / appearances_df['Total_Appearances']

In [8]:
# Standardize the two clustering features of appearances_df with StandardScaler
clustering_features = appearances_df[['Total_Appearances', 'Mean_Box_Office']]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(clustering_features)

# Defining a function for K-means clustering
def k_means_clustering(data, k, seed=None, max_iterations=100):
    '''
    returns: (labels, centroids) for k-means performed on a dataset.
        labels[i] is the index of the centroid closest to row i of data;
        centroids is a float array of shape (k, n_features).

    data: 2-D array-like of shape (n_samples, n_features); converted to float
    k: number of clusters, 1 <= k <= n_samples
    seed=None: seed for picking the initial centroids; with None every call
        starts from a different random selection of rows
    max_iterations: maximum number of centroid updates before the function
        terminates

    raises: ValueError if data is not 2-D or k is outside 1..n_samples
    '''
    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array of shape (n_samples, n_features)")

    n_samples = data.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and the number of samples ({n_samples}), got {k}")

    # A private generator gives the same draws as np.random.seed(seed) followed
    # by np.random.choice(...), without resetting the caller's global RNG state.
    rng = np.random.RandomState(seed)
    centroids = data[rng.choice(n_samples, size=k, replace=False)]

    def assign(current_centroids):
        # Euclidean distance from every centroid (axis 0) to every row (axis 1)
        distances = np.sqrt(((data - current_centroids[:, np.newaxis]) ** 2).sum(axis=2))
        return np.argmin(distances, axis=0)

    # Assign once up front so labels exist even when max_iterations is 0
    labels = assign(centroids)

    for _ in range(max_iterations):
        # Move each centroid to the mean of its members. A cluster that lost
        # all of its points keeps its previous centroid: the mean of an empty
        # slice is NaN, which would never converge and would poison every
        # later distance computation.
        new_centroids = centroids.copy()
        for i in range(k):
            members = data[labels == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        # Check convergence
        if np.allclose(new_centroids, centroids):
            break

        centroids = new_centroids
        # Re-assign so the returned labels always match the returned centroids
        labels = assign(centroids)

    return labels, centroids

# Number of clusters (k)
k = 3

# Seed for the random choice of initial centroids. k-means results depend on
# that choice, so fixing the seed keeps the cluster assignments and the cluster
# numbering identical on every run of the notebook.
KMEANS_SEED = 42

# Perform K-means clustering on scaled_data
labels, centroids = k_means_clustering(scaled_data, k, seed=KMEANS_SEED)

# Add cluster labels to the appearances_df DataFrame
appearances_df['Cluster'] = labels

In [9]:
# Hover text: character name, number of appearances and mean box office.
# The plotted values come straight from appearances_df, i.e. they are the raw
# (unscaled) figures; only the clustering input was standardized, so neither
# the hover text nor the axis is labelled "Standardized".
hover_template = "<b>%{hovertext}</b><br>" \
                 "Total Appearances: %{customdata[0]}<br>" \
                 "Mean Box Office: %{customdata[1]:,.0f}"

# Cluster ids are categories, not quantities: plot a copy whose 'Cluster'
# column holds strings so Plotly uses discrete colours and a legend instead of
# a continuous colour bar. appearances_df itself is left unchanged.
plot_df = appearances_df.assign(Cluster=appearances_df['Cluster'].astype(str))
cluster_order = sorted(plot_df['Cluster'].unique(), key=int)

# Plot data
fig = px.scatter(plot_df, x='Total_Appearances', y='Mean_Box_Office', color='Cluster',
                 category_orders={'Cluster': cluster_order},
                 hover_name=plot_df.index, custom_data=['Total_Appearances', 'Mean_Box_Office'])

# Update hover template. No per-trace `text` is set: there is now one trace
# per cluster, so assigning the full index to every trace would misalign it.
fig.update_traces(hovertemplate=hover_template)

# Update layout for hover mode
fig.update_layout(
    hovermode='closest',
    hoverlabel=dict(bgcolor="white", font_size=12, font_family="Rockwell")
)

# Set labels and title
fig.update_xaxes(title_text='Total Appearances')
fig.update_yaxes(title_text='Mean Box Office')
fig.update_layout(title='K-means Clustering of Marvel Characters (created by Stephanie Arezzi)')

fig.show()

# Retrieve and print the members of each cluster
labelled_clusters = np.unique(labels)

for cluster_label in labelled_clusters:
    cluster_characters = appearances_df.index[appearances_df['Cluster'] == cluster_label]
    print(f"Cluster {cluster_label}: {', '.join(cluster_characters)}")

Cluster 0: Shang-Chi, Eternals
Cluster 1: Guardians, Doctor Strange, Black Panther, Captain Marvel, Wanda, Thanos
Cluster 2: Nick Fury, Iron Man, Hulk, Black Widow, Thor, Hawkeye, Avengers, Loki, Captain America, Ant-Man
