## Clustering actors

In [None]:
import random 
from itertools import combinations

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import igraph as ig

from ch_11_funcs import (conductance, 
        average_internal_node_degree, print_clustering_stats,
        display_network_clusters_labels
)

# Render igraph plots through matplotlib
ig.config['plotting.backend']='matplotlib'
# Default size for every matplotlib figure created in this notebook
plt.rcParams["figure.figsize"] = (10,10)
# Apply seaborn's default theme to matplotlib plots
sns.set_theme()
# Seed Python's RNG; presumably to make the randomized layout/clustering
# steps below reproducible — confirm igraph draws from this generator
random.seed(2)

### Netflix movie dataset

Netflix TV Shows and Movies dataset.

The data was acquired in July 2022 and covers titles available in the United States.

Source: https://www.kaggle.com/datasets/victorsoeiro/netflix-tv-shows-and-movies?select=titles.csv

####  Analysis goal

Find groups of actors that frequently appear together.

In [None]:
# Load the credits table (one row per person credited on a title).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
actor_info_path = '/Users/dalibor/VsCodeProjects/clust/data/netflix_movies/credits.csv'
credits_df = pd.read_csv(actor_info_path)

In [None]:
# Preview the first 10 rows of the credits table
credits_df.head(10)

In [None]:
# Load the titles table and preview its first rows.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
movie_info_path = '/Users/dalibor/VsCodeProjects/clust/data/netflix_movies/titles.csv'
movie_info_df = pd.read_csv(movie_info_path)
movie_info_df.head()

In [None]:
# List the distinct values found in the 'role' column
credits_df['role'].unique()

In [None]:
# Discard every credit whose role is not 'ACTOR'
credits_df = credits_df.loc[credits_df['role'].eq('ACTOR')]

In [None]:
# Count credits per actor name (most frequent first) and keep the names
# that are credited in more than 5 shows/movies
show_counts = credits_df['name'].value_counts().sort_values(ascending=False)
selected_actors = show_counts.loc[show_counts.gt(5)].index.to_list()

In [None]:
# Number of actors that passed the appearance threshold
len(selected_actors)

In [None]:
# Restrict the credits to the selected actors, then drop every title that is
# left with a single credited actor (it cannot contribute an actor pair)
in_selection = credits_df['name'].isin(selected_actors)
credits_df = (
    credits_df.loc[in_selection]
    .groupby('id')
    .filter(lambda cast: len(cast) > 1)
)

In [None]:
# Give every remaining actor name a consecutive integer id, numbered in
# order of first appearance, and store it in a new 'actor_id' column
unique_actors = credits_df['name'].unique()
actor_mapping = {
    actor_name: actor_idx
    for actor_idx, actor_name in enumerate(unique_actors)
}
credits_df['actor_id'] = credits_df['name'].map(actor_mapping)

In [None]:
# Display the filtered credits table including the new 'actor_id' column
credits_df

In [None]:
# Extract actors from each movie:
# one entry per title id, holding the set of actor ids credited on it
movie_groups = credits_df.groupby('id')['actor_id'].apply(set)
movie_groups

In [None]:
# Accumulator: (actor_id, actor_id) pair -> number of titles the two share
actor_combinations_count = {}

In [None]:
# Make an edge for each pair of actors acting in the same movie.
# Edge weight is equal to the number of co-occurrences.
#
# The counter is rebuilt from scratch in this cell so that re-running it
# does not add every co-occurrence a second time and inflate the weights.
actor_combinations_count = {}

for actors in movie_groups:
    # combinations() yields nothing for a cast with fewer than two actors,
    # so no explicit size check is needed
    for actor1, actor2 in combinations(actors, 2):

        # Store the pair in sorted order so (a, b) and (b, a) share one key
        actor_pair = tuple(sorted((actor1, actor2)))

        # Update count
        actor_combinations_count[actor_pair] = (
            actor_combinations_count.get(actor_pair, 0) + 1
        )

In [None]:
# Inspect the pair -> co-occurrence count mapping
actor_combinations_count

In [None]:
# Number of distinct actor pairs, i.e. the number of edges in the graph
len(actor_combinations_count)

In [None]:
# Build the co-occurrence graph: one vertex per actor id, one edge per actor
# pair, and the pair's co-occurrence count stored as the edge 'weight'
actor_pairs = list(actor_combinations_count)
g = ig.Graph(n=len(unique_actors), edges=actor_pairs)
g.es['weight'] = [actor_combinations_count[pair] for pair in actor_pairs]

In [None]:
# Create graph layout once so every later plot can reuse the same vertex
# positions; "auto" lets igraph choose the algorithm, and the 'weight'
# edge attribute is passed on to it
layout = g.layout("auto", weights='weight')

In [None]:
# Plot the graph with the precomputed layout and thin edges
ig.plot(g, layout=layout, edge_width=0.2)

In [None]:
# Detect communities with igraph's multilevel method at resolution 1.
# NOTE(review): the 'weight' edge attribute is not passed here, so the
# co-occurrence counts presumably do not influence the clustering —
# confirm whether weights='weight' was intended.
ig_clust = g.community_multilevel(resolution=1)

In [None]:
# Print clustering statistics via the ch_11_funcs helper.
# min_cluster_size=10 presumably restricts the report to clusters of at
# least that size — confirm against the helper's implementation.
print_clustering_stats(
    ig_clusters=ig_clust, 
    min_cluster_size=10
)

In [None]:
# Modularity of the partition found above.
# NOTE(review): no weights are passed, so this is presumably the
# unweighted modularity — confirm that is intended.
g.modularity(ig_clust.membership)

In [None]:
# Conductance of the clustering, computed by the ch_11_funcs helper
conductance(ig_clust)

In [None]:
# Average internal node degree of the clustering (ch_11_funcs helper)
average_internal_node_degree(ig_clust)

In [None]:
# Draw the graph with its clusters using the ch_11_funcs helper and the
# layout computed earlier; the return value is discarded.
# min_size=10 presumably limits which clusters are shown/labelled — confirm
# against the helper's implementation.
_ = display_network_clusters_labels(
    g_clust=ig_clust,
    layout=layout,
    edge_width=0.2,
    min_size=10
)

### Cluster characterization

In [None]:
# Preview the titles table that will be used to describe the clusters
movie_info_df.head()

In [None]:
# Column dtypes and non-null counts of the titles table
movie_info_df.info()

In [None]:
# Inspect one raw 'genres' cell (row label 0) as it was read from the CSV
movie_info_df.loc[0, 'genres']

In [None]:
# Parse the same cell with literal_eval to see the Python object it encodes
# (presumably a list stored as a string in the CSV)
from ast import literal_eval
literal_eval(movie_info_df.loc[0, 'genres'])

In [None]:
# Turn the stringified list columns into real Python lists.
# Only str cells are handed to literal_eval: cells that are already parsed
# (this cell was run before) or that hold a missing value are kept as they
# are, so re-running the cell no longer raises inside literal_eval.
for list_col in ('genres', 'production_countries'):
    movie_info_df[list_col] = movie_info_df[list_col].apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
    )

In [None]:
# Preview the titles table after parsing the list columns
movie_info_df.head()

In [None]:
# Plot movie/show characteristics for all the clusters


def _mark_no_data(ax):
    """Write a centred 'No data' placeholder on *ax*."""
    ax.text(0.5, 0.5, 'No data', ha='center', va='center',
            transform=ax.transAxes)


def _plot_value_counts(values, ax, title):
    """Bar-plot how often each value of the Series *values* occurs.

    pandas refuses to plot an empty Series, so a placeholder is drawn
    instead when nothing is left to count (e.g. every value is missing).
    """
    counts = values.value_counts()
    if counts.empty:
        _mark_no_data(ax)
    else:
        counts.plot.bar(ax=ax)
    ax.set_title(title)


def _plot_box(values, ax, title):
    """Box-plot the numeric Series *values*, tolerating an all-missing one."""
    if values.dropna().empty:
        _mark_no_data(ax)
    else:
        values.plot.box(ax=ax)
    ax.set_title(title)


for clust_id, clust_nodes in enumerate(ig_clust):

    # Avoid noise clusters
    # NOTE(review): this keeps clusters with MORE than 10 actors, while the
    # helper calls earlier in the notebook pass min_cluster_size=10 /
    # min_size=10 — confirm the thresholds are meant to agree.
    if len(clust_nodes) <= 10:
        continue

    fig, axs = plt.subplots(2, 3, figsize=(14, 10))
    axs = axs.flatten()

    # Get all movie ids corresponding to actors from the current cluster
    in_cluster = credits_df['actor_id'].isin(clust_nodes)
    movie_ids = credits_df.loc[in_cluster, 'id'].unique()
    movie_df_subset = movie_info_df[movie_info_df['id'].isin(movie_ids)]

    # Genres and production countries are list-valued columns: explode()
    # yields one row per list element (empty lists become NaN, which
    # value_counts ignores)
    _plot_value_counts(movie_df_subset['genres'].explode(), axs[0], 'Genres')
    _plot_value_counts(
        movie_df_subset['production_countries'].explode(),
        axs[1],
        'Prod countries',
    )

    # Display the type
    _plot_value_counts(movie_df_subset['type'], axs[2], 'Type')

    # Display age certification
    _plot_value_counts(
        movie_df_subset['age_certification'], axs[3], 'Age certification'
    )

    # Display release year
    _plot_box(movie_df_subset['release_year'], axs[4], 'Release year')

    # Display runtime
    _plot_box(movie_df_subset['runtime'], axs[5], 'Runtime')

    fig.suptitle(
        f'Cluster id : {clust_id}, cluster size : {len(clust_nodes)}'
    )
    fig.tight_layout()

    plt.show()
    
    