In [1]:
import pandas as pd
from ast import literal_eval
import numpy as np
from nltk.stem.snowball import SnowballStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
# from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

# Original Dataset

Dataset: https://grouplens.org/datasets/movielens/100k/

## Ratings 

- [ua,ub].test -> disjoint subsets with almost 10k rows, with exactly 10 ratings per user.
- [u1,u2,u3,u4,u5].test -> disjoint subsets with almost 20k rows.
- [ua,ub].base -> subsets with almost 90k rows.
- [u1,u2,u3,u4,u5].base -> subsets with almost 80k rows.
- u.data -> complete dataset with 100k rows.

In [2]:
def read_and_process_ratings(filename):
    """Load a ratings file and keep only positive ratings of movies liked more than once.

    Parameters
    ----------
    filename : str
        Name of a tab-separated file inside ``datasets/ml-100k/``. The file is
        read with its first row as header and must provide the columns
        ``user_id``, ``item_id``, ``rating`` and ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        The ratings with ``timestamp`` and ``rating`` removed and ``item_id``
        renamed to ``movie_id``, restricted to rows with ``rating >= 4`` whose
        movie appears in more than one such row.

    Progress and before/after counts are printed to stdout.
    """
    # Read data.
    ratings = pd.read_csv(f'datasets/ml-100k/{filename}', sep='\t')
    ratings = ratings.drop('timestamp', axis=1)
    ratings = ratings.rename(columns={'item_id': 'movie_id'})

    # Filter only positive ratings so user->movie edges mean the user liked that movie.
    print("Filter only positive ratings...")
    print(f"Before filtering: {ratings.movie_id.nunique()} movies, {ratings.user_id.nunique()} users and {ratings.shape[0]} ratings.")
    ratings = ratings[ratings.rating>=4].drop('rating', axis=1)
    print(f"After filtering: {ratings.movie_id.nunique()} movies, {ratings.user_id.nunique()} users and {ratings.shape[0]} ratings.")

    # Filter only movies rated more than once, so every remaining movie links at least two rows.
    print("\nFilter only movies rated more than once...")
    print(f"Before filtering: {ratings.movie_id.nunique()} movies and {ratings.user_id.nunique()} users")
    # Work on the counts Series directly: the column names produced by
    # value_counts().reset_index() differ between pandas versions, so renaming
    # them by name is not portable.
    movie_counts = ratings.movie_id.value_counts()
    rating_per_movie = pd.DataFrame({'movie_id': movie_counts[movie_counts > 1].index})
    ratings = ratings.merge(rating_per_movie, how='inner', on='movie_id')
    print(f"After filtering: {ratings.movie_id.nunique()} movies and {ratings.user_id.nunique()} users")

    return ratings


def read_and_process_movies(ratings):
    """Load ``u.item`` and keep the movies referenced by ``ratings``.

    Rows whose ``unknown`` flag is non-zero are discarded, the date/URL/unknown
    columns are removed, the id and title columns are renamed to ``movie_id``
    and ``movie_title``, and the result is inner-joined with the distinct
    ``movie_id`` values found in ``ratings``.
    """
    unused_columns = ['release date', 'video release date', 'IMDb URL', 'unknown']
    new_names = {'movie id': 'movie_id', 'movie title': 'movie_title'}

    catalogue = pd.read_csv('datasets/ml-100k/u.item', sep='|')
    catalogue = (
        catalogue.loc[catalogue['unknown'] == 0]
        .drop(columns=unused_columns)
        .rename(columns=new_names)
    )

    # Only movies that still appear in the filtered ratings are of interest.
    rated_ids = ratings[['movie_id']].drop_duplicates()
    return catalogue.merge(rated_ids, how='inner', on='movie_id')


def read_and_process_ratings_and_movies(filename_ratings):
    """Build one table with a row per kept rating plus the rated movie's columns.

    The ratings come from ``read_and_process_ratings(filename_ratings)`` and the
    movie columns from ``read_and_process_movies``. The title is dropped, columns
    whose sum is zero are removed, and rows are sorted by user and movie id.
    """
    liked = read_and_process_ratings(filename_ratings)
    catalogue = read_and_process_movies(liked)

    # Attach the movie columns to each rating; the title is not needed further on.
    merged = liked.merge(catalogue, on='movie_id', how='inner').drop('movie_title', axis=1)

    # Remove columns that are zero for every row (the "- 2" in the messages
    # discounts the user_id and movie_id columns).
    print("\nFilter only genres related to at least one movie...")
    print(f"Before filtering: {merged.shape[1]-2} genres")
    nonzero_columns = merged.sum(axis=0) != 0
    merged = merged.loc[:, nonzero_columns]
    print(f"After filtering: {merged.shape[1]-2} genres")

    # Sorted output is easier to read later.
    return merged.sort_values(by=['user_id','movie_id'])

In [3]:
# Build the working table from the 'ua.test' split; the bare name on the last
# line makes the notebook display it.
ratings_small_a = read_and_process_ratings_and_movies(filename_ratings='ua.test')
ratings_small_a

Filter only positive ratings...
Before filtering: 1129 movies, 943 users and 9430 ratings.
After filtering: 879 movies, 934 users and 5469 ratings.

Filter only movies rated more than once...
Before filtering: 879 movies and 934 users
After filtering: 645 movies and 932 users

Filter only genres related to at least one movie...
Before filtering: 18 genres
After filtering: 18 genres


Unnamed: 0,user_id,movie_id,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,20,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
7,1,33,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
11,1,61,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
14,1,160,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
20,1,171,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
828,943,111,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1089,943,186,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
1097,943,215,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4958,943,232,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1


In [4]:
# ratings_small_a.user_id.value_counts().reset_index().user_id.hist()

In [5]:
# ratings_small_a.movie_id.value_counts().reset_index().movie_id.hist(bins=50)

In [6]:
# ratings_small_a.sum()[2:]

## Users

In [7]:
# def read_and_process_users(ratings):
#     users = pd.read_csv('datasets/ml-100k/u.user', sep='|') # Read data.
#     selected_users = ratings[['user_id']].drop_duplicates()
#     users = users.merge(selected_users, how='inner', on='user_id')
#     return users

In [8]:
# users_small_a = read_and_process_users(ratings=ratings_small_a)
# users_small_a

In [9]:
# users_small_a.occupation.value_counts()

# GRAPH

#### Bipartite 1

Cluster the users

'userId' -> likes -> 'movie'

[Test whether the groups like similar genres and have similar occupations and ages and, less importantly, gender and zip code.]

#### Tripartite 1 (user is connected to only 1 column)

Cluster the users

'userId' -> likes -> 'movie'

'movie' -> categorized as -> 'genres'

[Test whether the groups have similar occupations and ages and, less importantly, gender and zip code.]

#### Tripartite 2 (user is connected to 2 columns)

Cluster the users

'userId' -> likes -> 'movie'

'userId' -> works as -> 'occupation'

[Test whether the groups like similar genres and have similar ages and, less importantly, gender and zip code.]

#### Quadripartite 1

Cluster the users

'userId' -> likes -> 'movie'

'userId' -> works as -> 'occupation'

'userId' -> doesn't like -> 'genre' [TODO: decide whether the negative edge should point to a genre or to a movie]

[Test whether the groups have similar ages, gender and zip code.]

In [23]:
# Vertices of different layers must not share an id, so every value of the
# selected columns (string or already int) is re-encoded as a consecutive
# integer that is unique across all of those columns.
# TODO: Doing only user and movie and genre
def update_ids_according_to_layers(df_ratings, columns_to_use):
    """Re-encode the values of ``columns_to_use`` as globally unique consecutive ints.

    Columns are processed in the given order; the distinct values of each column
    receive the next free block of ids in order of first appearance.

    Parameters
    ----------
    df_ratings : pandas.DataFrame
        Source table. It is NOT modified; a re-encoded copy is returned, so the
        same frame can be passed again and the returned lookup still refers to
        the original values.
    columns_to_use : list of str
        Columns to re-encode, one graph layer per column.

    Returns
    -------
    dataset : pandas.DataFrame
        Copy of ``df_ratings`` with the selected columns replaced by the new ids.
    next_index : int
        First id not yet assigned.
    dict_indexes : dict
        ``{column: {new_id: original_value}}`` for every processed column.
    """
    dataset = df_ratings.copy()
    next_index = 0
    dict_indexes = {}
    for col in columns_to_use:
        categories = dataset[col].unique()
        indexes = range(next_index, next_index + len(categories))
        dict_categories = dict(zip(categories, indexes))
        dict_indexes[col] = dict(zip(indexes, categories))
        # A single dictionary lookup per value; every value is a key by construction.
        dataset[col] = dataset[col].map(dict_categories)
        # Advance by the block size (also correct when the column is empty).
        next_index += len(categories)
    return dataset, next_index, dict_indexes

In [11]:
def get_active_genre_list(df):
    """Return the names of the genre columns present in ``df``, in column order."""
    known_genres = (
        'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    )
    present = []
    for column in df.columns:
        if column in known_genres:
            present.append(column)
    return present

In [12]:
# TODO: Doing only user and movie and genre
def create_type_file(df_ratings, file_name, columns_to_use, connect_genre, debug=True):
    """Write ``<file_name>.type``: one line per vertex holding its layer index.

    Layer ``i`` corresponds to ``columns_to_use[i]`` and gets one line per
    distinct value of that column. When ``connect_genre`` is true an extra
    layer, numbered ``len(columns_to_use)``, gets one line per genre column
    returned by ``get_active_genre_list(df_ratings)``.

    Parameters
    ----------
    df_ratings : pandas.DataFrame
        Table containing the columns in ``columns_to_use``.
    file_name : str
        Output path without the ``.type`` extension.
    columns_to_use : list of str
        Columns that each form one layer, in layer order.
    connect_genre : bool
        Whether to append the genre layer.
    debug : bool, default True
        Print the number of vertices of every layer.
    """
    # Compute all layer sizes first so a bad column name fails before the
    # output file is created or truncated.
    layer_sizes = []
    for column in columns_to_use:
        n_rows = df_ratings[column].nunique()
        if debug:
            print(column, ":", n_rows, "vertices")
        layer_sizes.append(n_rows)

    if connect_genre:
        n_rows = len(get_active_genre_list(df_ratings))
        if debug:
            print("Genres:", n_rows, "vertices")
        # Appending keeps the genre layer index well defined even when
        # columns_to_use is empty.
        layer_sizes.append(n_rows)

    with open(f'{file_name}.type', 'w') as file:
        for layer, n_rows in enumerate(layer_sizes):
            file.write(f"{layer}\n" * n_rows)

In [24]:
# TODO: Doing only user and movie and genre
def create_ncol_file(df_ratings, file_name, columns_to_use, connect_genre_to_movie, 
                     next_index, dict_indexes, debug=True):
    """Write ``<file_name>.ncol``: one ``"<source> <target> 1"`` line per edge.

    For every column in ``columns_to_use`` other than ``user_id``, each distinct
    (``user_id``, column) pair becomes an edge. When ``connect_genre_to_movie``
    is true, each genre column found in ``df_ratings`` is assigned the next free
    id (starting at ``next_index``) and every movie flagged ``1`` for that genre
    gets an edge to it.

    Parameters
    ----------
    df_ratings : pandas.DataFrame
        Table whose id columns were already re-encoded.
    file_name : str
        Output path without the ``.ncol`` extension.
    columns_to_use : list of str
        Layer columns; ``user_id`` is the hub the other columns connect to.
    connect_genre_to_movie : bool
        Whether to add the movie -> genre edges.
    next_index : int
        First id available for genre vertices.
    dict_indexes : dict
        ``{column: {new_id: original_value}}``; a ``'genre'`` entry is added to
        this same object when genres are connected.
    debug : bool, default True
        Currently unused; kept for signature compatibility.

    Returns
    -------
    dict
        The ``dict_indexes`` object that was passed in.
    """
    column_0 = 'user_id'
    movie_column = 'movie_id'
    dataset = df_ratings
    # Ordered and de-duplicated: iterating a set of strings would make the edge
    # order differ from one interpreter run to the next.
    other_columns = [col for col in dict.fromkeys(columns_to_use) if col != column_0]

    with open(f'{file_name}.ncol','w') as file:
        ## Ratings
        for col in other_columns:
            pairs = dataset[[column_0, col]].drop_duplicates()
            for source, target in zip(pairs[column_0], pairs[col]):
                file.write(f"{source} {target} 1\n")

        ## Genre
        if connect_genre_to_movie:
            # Use the frame that was passed in, not a notebook-level global.
            selected_genres = get_active_genre_list(dataset)
            df_genres = dataset[[movie_column] + selected_genres].drop_duplicates()
            dict_indexes['genre'] = {}
            for genre in selected_genres:
                dict_indexes['genre'][next_index] = genre
                for movie in df_genres.loc[df_genres[genre] == 1, movie_column]:
                    file.write(f"{movie} {next_index} 1\n")
                next_index += 1
    return dict_indexes

In [25]:
def generate_all_files(dataset, filename, columns_to_use, connect_genre_to_movie):
    """Re-encode the ids of ``dataset`` and write its ``.type`` and ``.ncol`` files.

    Returns a dict of dicts: one entry per column, in which each key is the new
    id and each value is the old value it replaced.
    """
    transformed, first_free_id, id_lookup = update_ids_according_to_layers(dataset, columns_to_use)

    # Vertex -> layer assignment.
    create_type_file(transformed, filename, columns_to_use, connect_genre_to_movie)

    # Edge list; the lookup comes back extended with the genre ids when requested.
    return create_ncol_file(transformed, filename, columns_to_use,
                            connect_genre_to_movie, first_free_id, id_lookup)

## Bipartite 

User -> movie

In [26]:
filename = 'movie_lens_small_a_bipartite-1'
filepath = f'../outputs/output_bnoc/{filename}/{filename}'
# Pass a copy: the id re-encoding updates the frame it receives, and
# ratings_small_a is reused by later cells. Keeping the original untouched makes
# this cell re-runnable and keeps the returned lookup tied to the original ids.
dict_ids_small_a_bipartite1 = generate_all_files(ratings_small_a.copy(), filepath, 
                               columns_to_use=['user_id', 'movie_id'], 
                               connect_genre_to_movie=False)

user_id : 932 vertices
movie_id : 645 vertices


## Tripartite
User -> movie -> genre

In [27]:
filename = 'movie_lens_small_a_tripartite-1'
filepath = f'../outputs/output_bnoc/{filename}/{filename}'
# Pass a copy: the id re-encoding updates the frame it receives, so handing over
# the shared ratings_small_a would leave it modified for any later cell or re-run.
dict_ids_small_a_tripartite1 = generate_all_files(ratings_small_a.copy(), filepath, 
                               columns_to_use=['user_id', 'movie_id'], 
                               connect_genre_to_movie=True)

user_id : 932 vertices
movie_id : 645 vertices
Genres: 18 vertices


## 4-partite - old

### Small: only movies from selected directors
Is it able to cluster according to the directors?

In [52]:
# NOTE(review): `s5partite` is not defined in the cells shown here; it has to
# exist in the kernel already for this cell to run.
# Drop every row that has a missing value in any column, then display the result.
g_5partite1 = s5partite.dropna()
g_5partite1

Unnamed: 0,id,title,genres,director,cast,userId
0,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",15
1,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",62
2,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",231
3,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",348
4,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",834
...,...,...,...,...,...,...
43385,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",219172
43386,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",230119
43387,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",248087
43388,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",250643


In [53]:
# Split genres: turn the comma-separated `genres` string into one row per genre.
# The other kept columns go into the index so they are repeated for every genre;
# columns not listed here (e.g. `title`) are dropped by selecting only `genres`.
g_5partite1 = (g_5partite1
               .set_index(['id', 'director', 'cast', 'userId'])['genres']
               .apply(lambda x: [s.strip() for s in x.split(',')])
               .apply(pd.Series)
               .stack()
              )
g_5partite1 = g_5partite1.reset_index()
# 'sample_num' is the position column created by stack(); it is only named so it can be dropped.
g_5partite1.columns = ['id', 'director', 'cast', 'userId', 'sample_num', 'category']
g_5partite1 = g_5partite1.drop(['sample_num'], axis=1)

In [54]:
# Split cast: same procedure as for the genres, now turning the comma-separated
# `cast` string into one row per cast member (rows multiply per genre x cast member).
g_5partite1 = (g_5partite1
               .set_index(['id', 'director', 'category', 'userId'])['cast']
               .apply(lambda x: [s.strip() for s in x.split(',')])
               .apply(pd.Series)
               .stack()
              )
g_5partite1 = g_5partite1.reset_index()
# 'sample_num' is the position column created by stack(); it is only named so it can be dropped.
g_5partite1.columns = ['id', 'director', 'category', 'userId', 'sample_num', 'cast']
g_5partite1 = g_5partite1.drop(['sample_num'], axis=1)
g_5partite1

Unnamed: 0,id,director,category,userId,cast
0,7980,Peter Jackson,Fantasy,15,Rachel Weisz
1,7980,Peter Jackson,Fantasy,15,Mark Wahlberg
2,7980,Peter Jackson,Fantasy,15,Susan Sarandon
3,7980,Peter Jackson,Fantasy,15,Saoirse Ronan
4,7980,Peter Jackson,Fantasy,15,Stanley Tucci
...,...,...,...,...,...
11850207,679,James Cameron,Science Fiction,251429,Alibe Parsons
11850208,679,James Cameron,Science Fiction,251429,Blain Fairman
11850209,679,James Cameron,Science Fiction,251429,Barbara Coles
11850210,679,James Cameron,Science Fiction,251429,Eddie Powell


In [55]:
# Count, for each cast member, the number of distinct movies they appear in
# (duplicates of the same (id, cast) pair are removed before counting).
df_cast = g_5partite1[['id', 'cast']].drop_duplicates().cast.value_counts().reset_index()
df_cast.columns = ['cast', 'times']
# Keep only cast members that appear in at least two movies, and only their names.
df_cast = df_cast[df_cast.times >= 2]
df_cast = df_cast[['cast']]
df_cast

Unnamed: 0,cast
0,Andy Serkis
1,Michael Caine
2,Elijah Wood
3,Liv Tyler
4,Paul Dooley
...,...
58,Marton Csokas
59,Jenifer Lewis
60,William Wisher Jr.
61,David Wenham


In [56]:
# Distinct ids of the movies that feature at least one of the cast members kept
# in df_cast; the second line shows how many such movies there are.
df_cast_ids = g_5partite1.merge(df_cast, how='inner', on='cast')['id'].drop_duplicates()
df_cast_ids.count()

17

In [57]:
# Keep only the movies that share at least one actor with another movie.
g_5partite1 = g_5partite1.merge(df_cast_ids, how='inner', on='id')
g_5partite1

Unnamed: 0,id,director,category,userId,cast
0,597,James Cameron,Drama,17,Kate Winslet
1,597,James Cameron,Drama,17,Leonardo DiCaprio
2,597,James Cameron,Drama,17,Frances Fisher
3,597,James Cameron,Drama,17,Billy Zane
4,597,James Cameron,Drama,17,Kathy Bates
...,...,...,...,...,...
11824251,679,James Cameron,Science Fiction,251429,Alibe Parsons
11824252,679,James Cameron,Science Fiction,251429,Blain Fairman
11824253,679,James Cameron,Science Fiction,251429,Barbara Coles
11824254,679,James Cameron,Science Fiction,251429,Eddie Powell


In [58]:
# Keep only the actors that are shared by at least two movies (those listed in df_cast).
g_5partite1 = g_5partite1.merge(df_cast, how='inner', on='cast').reset_index(drop=True)

In [59]:
# Reorder the columns so that `cast` comes before `userId`.
g_5partite1 = g_5partite1[['id', 'director', 'category', 'cast', 'userId']]

In [60]:
# Number of distinct movies per director in the filtered table.
g_5partite1.drop_duplicates(['id','director']).director.value_counts()

Christopher Nolan    5
James Cameron        5
Peter Jackson        4
John Lasseter        3
Name: director, dtype: int64

In [61]:
# Display the filtered table.
g_5partite1

Unnamed: 0,id,director,category,cast,userId
0,597,James Cameron,Drama,Bill Paxton,17
1,597,James Cameron,Romance,Bill Paxton,17
2,597,James Cameron,Thriller,Bill Paxton,17
3,597,James Cameron,Drama,Bill Paxton,25
4,597,James Cameron,Romance,Bill Paxton,25
...,...,...,...,...,...
1334838,49013,John Lasseter,Comedy,Thomas Kretschmann,267779
1334839,49013,John Lasseter,Animation,Thomas Kretschmann,270887
1334840,49013,John Lasseter,Family,Thomas Kretschmann,270887
1334841,49013,John Lasseter,Adventure,Thomas Kretschmann,270887


In [62]:
# final_set = s5partite.merge(g_5partite1[['id']], on='id', how='inner').drop_duplicates()

In [63]:
# final_set.director.value_counts()

In [64]:
# final_set

In [65]:
# filename = 'real_small_4partite_connected-1'
# filepath = f'../outputs/output_bnoc/{filename}/{filename}'
# generate_all_files(g_5partite1, filepath, column_0 = 'id', 
#                    columns_to_connect_all=['category', 'cast'], 
#                    columns_to_connect_only_column_0 = ['userId'],
#                   columns_to_use=['id', 'category', 'cast', 'userId'])
# # todo: column to membership: director

In [62]:
# 17,
# 13,
# 63,
# 33072

### Sanity check (prova real)

In [66]:
# Hard-coded cluster label per movie, paired positionally with the movie ids in
# their order of first appearance in g_5partite1.
# NOTE(review): presumably pasted from an external clustering run - confirm the
# source and that its ordering matches `movie_ids`. The DataFrame constructor
# requires both sequences to have the same length.
cluster = [2,2,0,1,1,2,2,2,3,1,1,2,1,0,3,1,0]
movie_ids = g_5partite1.loc[:,'id'].unique() 
df_clusters = pd.DataFrame(data={'id': movie_ids, 'cluster': cluster})

In [67]:
# Display the movie id -> cluster assignment.
df_clusters

Unnamed: 0,id,cluster
0,597,2
1,218,2
2,679,0
3,122,1
4,121,1
5,280,2
6,1124,2
7,920,2
8,49013,3
9,320,1


In [71]:
# Attach the cluster label to each movie's metadata.
# NOTE(review): `s4partite` is not defined in the cells shown here, and this cell
# overwrites it with the merged result, so running the cell a second time merges
# the `cluster` column again.
s4partite = s4partite.merge(df_clusters, how='inner', on='id') 
s4partite

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,862,Toy Story,"Animation, Comedy, Family",John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",5415.0,7.7,1
1,280,Terminator 2: Judgment Day,"Action, Thriller, Science Fiction",James Cameron,"Arnold Schwarzenegger, Linda Hamilton, Robert ...",4274.0,7.7,2
2,2756,The Abyss,"Adventure, Action, Thriller, Science Fiction",James Cameron,"Ed Harris, Mary Elizabeth Mastrantonio, Michae...",822.0,7.1,0
3,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",3282.0,7.7,0
4,218,The Terminator,"Action, Thriller, Science Fiction",James Cameron,"Arnold Schwarzenegger, Michael Biehn, Linda Ha...",4208.0,7.4,2
5,597,Titanic,"Drama, Romance, Thriller",James Cameron,"Kate Winslet, Leonardo DiCaprio, Frances Fishe...",7770.0,7.5,2
6,77,Memento,"Mystery, Thriller",Christopher Nolan,"Guy Pearce, Carrie-Anne Moss, Joe Pantoliano, ...",4168.0,8.1,0
7,120,The Lord of the Rings: The Fellowship of the Ring,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Cate Blanchett, Orl...",8892.0,8.0,3
8,320,Insomnia,"Crime, Mystery, Thriller",Christopher Nolan,"Al Pacino, Robin Williams, Hilary Swank, Maura...",1181.0,6.8,1
9,121,The Lord of the Rings: The Two Towers,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Viggo Mortensen, Li...",7641.0,8.0,1


In [69]:
# df_clusterizado = netflix_data.merge(df_clusters, how='inner', on='show_id') 
# df_clusterizado

In [72]:
# Number of movies assigned to each cluster.
s4partite.cluster.value_counts()

1    6
2    6
0    3
3    2
Name: cluster, dtype: int64

In [75]:
# Number of movies per director, for comparison with the cluster sizes.
s4partite.director.value_counts()

Christopher Nolan    5
James Cameron        5
Peter Jackson        4
John Lasseter        3
Name: director, dtype: int64

In [73]:
# Do not truncate long cell contents (e.g. the cast lists) when displaying frames.
pd.set_option('display.max_colwidth', None)

In [76]:
# Movies assigned to cluster 0, ordered by director and genres.
s4partite[s4partite.cluster==0].sort_values(by=['director','genres']).reset_index(drop=True)

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,77,Memento,"Mystery, Thriller",Christopher Nolan,"Guy Pearce, Carrie-Anne Moss, Joe Pantoliano, Mark Boone Junior, Stephen Tobolowsky, Harriet Sansom Harris, Callum Keith Rennie, Larry Holden, Jorja Fox, Russ Fega, Thomas Lennon, Kimberly Campbell, Marianne Muellerleile",4168.0,8.1,0
1,2756,The Abyss,"Adventure, Action, Thriller, Science Fiction",James Cameron,"Ed Harris, Mary Elizabeth Mastrantonio, Michael Biehn, Leo Burmester, Todd Graff, John Bedford Lloyd, Kimberly Scott, Chris Elliott, J.C. Quinn, Captain Kidd Brewer Jr., George Robert Klek, Christopher Murphy, Adam Nelson, Dick Warlock, Jimmie Ray Weeks, J. Kenneth Campbell, Peter Ratray, Michael Beach, Ken Jenkins, Michael Chapman",822.0,7.1,0
2,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, Paul Reiser, Lance Henriksen, Carrie Henn, Bill Paxton, William Hope, Jenette Goldstein, Al Matthews, Mark Rolston, Ricco Ross, Colette Hiller, Daniel Kash, Cynthia Dale Scott, Tip Tipping, Trevor Steedman, Paul Maxwell, Carl Toop, Valerie Colgan, Alan Polonsky, Alibe Parsons, Blain Fairman, Barbara Coles, Eddie Powell, Jay Benedict",3282.0,7.7,0


In [77]:
# Movies assigned to cluster 1, ordered by director and genres.
s4partite[s4partite.cluster==1].sort_values(by=['director','genres']).reset_index(drop=True)

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,320,Insomnia,"Crime, Mystery, Thriller",Christopher Nolan,"Al Pacino, Robin Williams, Hilary Swank, Maura Tierney, Martin Donovan, Nicky Katt, Paul Dooley, Crystal Lowe, Jay Brazeau, Larry Holden, Kerry Sandomirsky, Lorne Cardinal, Katharine Isabelle, Jonathan Jackson, Paula Shaw, Oliver 'Ole' Zemen, James Hutson, Andrew Campbell, Tasha Simms, Malcolm Boddington, Chris Gauthier, Ian Tracey, Kate Robbins, Emily Perkins, Dean Wray",1181.0,6.8,1
1,155,The Dark Knight,"Drama, Action, Crime, Thriller",Christopher Nolan,"Christian Bale, Michael Caine, Heath Ledger, Aaron Eckhart, Gary Oldman, Maggie Gyllenhaal, Morgan Freeman, Monique Gabriela Curnen, Ron Dean, Chin Han, Nestor Carbonell, Eric Roberts, Cillian Murphy, Ritchie Coster, Anthony Michael Hall, Keith Szarabajka, Colin McFarlane, Joshua Harto, Melinda McGraw, Nathan Gamble, William Fichtner, Michael Vieau, Michael Stoyanov, William Smillie, Michael Jai White, Danny Goldring, Matthew O'Neill, Olumiji Olawumi, Greg Beam, Erik Hellman, Beatrice Rosen, Vincenzo Nicoli, Edison Chen, Nydia Rodriguez Terracina, Andy Luther, James Farruggio, Tom McElroy, Will Zahrn, James Fierro, Patrick Leahy, Sam Derence, Jennifer Knox, Patrick Clear, Sarah Jayne Dunn, Charles Venn, Winston G. Ellis, David Dastmalchian, Sophia Hinshelwood, Keith Kupferer, Joseph Luis Caballero, Richard Dillane, Daryl Satcher, Chris Petschler, Aidan Feore, Philip Bulcock, Paul Birchard, Walter Lewis, Vincent Riotta, Nancy Crane, K. 
Todd Freeman, Matt Shallenberger, Michael Andrew Gorman, Lanny Lutz, Peter DeFaria, Matt Rippy, Andrew Bicknell, Ariyon Bakare, Doug Ballard, Helene Wilson, Tommy Campbell, Craig Heaney, Lorna Gayle, Lisa McAllister, Peter Brooke, Joshua Rollins, Dale Rivera, Matthew Leitch, Tom Lister Jr., Thomas Gaitsch, William Armstrong, Adam Kalesperis, Tristan Tait, Bronson Webb, David Ajala, Gertrude Kyles, Jonathan Ryland, James Scales, Nigel Carrington, Ian Pirie, Lateef Lovejoy, Grahame Edwards, Roger Monk, Ronan Summers, Wai Wong, Michael Corey Foster, Hannah Gunn, Brandon Lambdin, Jon Lee Brody, Debbi Burns, Maritza Cabrera, Shirin Caiola, Laura Chernicky, Henry Milton Chu, Kelli Clevenger, Richard Divizio, Tony Domino, David Fultz, Natalie Hallam, Jordon Hodges, Erron Jay, Daniel Jefferson, Nicky Katt, Thomas Kosik, Don Kress, Tim Krueger, Dan Latham, Tom McComas, James Mellor, Joseph Oliveira, Buster Reeves, Peter Rnic, Amit Shah, Michelle Shields, Sofiya Smirnova, Bruce Spielbauer, Robert Patrick Stern, Robert Stone, Richard Strobel, Tom Townsend, John Turk, John Warman, Erik A. Williams, Chris Wilson, Kevin Zaideman",12269.0,8.3,1
2,862,Toy Story,"Animation, Comedy, Family",John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney, Wallace Shawn, John Ratzenberger, Annie Potts, John Morris, Erik von Detten, Laurie Metcalf, R. Lee Ermey, Sarah Freeman, Penn Jillette",5415.0,7.7,1
3,254,King Kong,"Adventure, Drama, Action",Peter Jackson,"Naomi Watts, Jack Black, Adrien Brody, Thomas Kretschmann, Colin Hanks, Andy Serkis, Evan Parke, Jamie Bell, Lobo Chan, John Sumner, Craig Hall, Kyle Chandler, William Johnson, David Pittu, Mark Hadlow, Geraldine Brophy, David Dennis, Pip Mushin, Jim Knobeloch, Ric Herbert, Lee Donahue, Tom Hobbs, Tiriel Mora, Jed Brophy, John Wraight, William Wallace, Frank Edwards, Crawford Thomson, Richard Kavanagh, Stephen Hall, Joe Folau, Chic Littlewood, Samuel Taylor, Philip Ettington",2403.0,6.6,1
4,121,The Lord of the Rings: The Two Towers,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Viggo Mortensen, Liv Tyler, Orlando Bloom, John Rhys-Davies, Christopher Lee, Sean Astin, Billy Boyd, Dominic Monaghan, Andy Serkis, Hugo Weaving, Craig Parker, Bernard Hill, Brad Dourif, Miranda Otto, David Wenham, Karl Urban, Cate Blanchett, Olivia Tennet, Sean Bean, Jed Brophy, Calum Gittins, John Bach, Robbie Magasiva, John Noble, Robyn Malcolm, Bruce Phillips, Raymond Trickitt, Stephen Ure, Nathaniel Lees, Sam Comery, Bruce Hopkins, John Leigh",7641.0,8.0,1
5,122,The Lord of the Rings: The Return of the King,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Viggo Mortensen, Liv Tyler, Orlando Bloom, John Rhys-Davies, Sean Astin, Billy Boyd, Dominic Monaghan, Andy Serkis, Hugo Weaving, Ian Holm, Bernard Hill, Miranda Otto, David Wenham, Karl Urban, John Noble, Cate Blanchett, Lawrence Makoare, Paul Norell, Thomas Robins, Sarah McLeod, Sean Bean, Marton Csokas, Christopher Lee, David Aston, Sadwyn Brophy, Alistair Browning, Ian Hughes",8226.0,8.1,1


In [78]:
# Movies assigned to cluster 2, ordered by director and genres.
s4partite[s4partite.cluster==2].sort_values(by=['director','genres']).reset_index(drop=True)

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,272,Batman Begins,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Michael Caine, Liam Neeson, Katie Holmes, Gary Oldman, Cillian Murphy, Tom Wilkinson, Morgan Freeman, Rutger Hauer, Ken Watanabe, Mark Boone Junior, Linus Roache, Larry Holden, Gerard Murphy, Colin McFarlane, Jack Gleeson, T.J. Ramini, Kieran Hurley, Catherine Porter, Gus Lewis, Rade Serbedzija, Sara Stewart, Richard Brake, Emma Lockhart, Christine Adams, John Nolan, Karen David, Jonathan D. Ellis, Tamer Hassan, Ronan Leahy, Vincent Wong, Том Ву, Mark Chiu, Turbo Kong, Sai-Kit Yung, Chike Chan, Jamie Hayden, David Murray, Darragh Kelly, John Kazek, Joseph Rye, Kwaku Ankomah, Jo Martin, Charles Edwards, Lucy Russell, Mark Straker, Timothy Deenihan, Flavia Masetto, Emily Steven-Daly, David Bedella, Martin McDougall, Noah Lee Margetts, Joe Hanley, Karl Shiels, Roger Griffiths, Stephen Walters, Richard Laing, Matt Miller, Risteard Cooper, Shane Rimmer, Jeremy Theobald, Alexandra Bastedo, John Judd, Soo Hee Ding, Phill Curr, Sarah Wateridge, Charlie Kranz, Terry McMahon, Cedric Young, Tim Booth, Tom Nolan, Roger Yuan, Joe Sargent, Emmanuel Idowu, Mel Taylor, Ilyssa Fradin, Andrew Pleavin, Jeff Christian, Tenzin Gyurme, Tenzin Clive Ball, John Burke, Earlene Bentley, Alex Moggridge, Jay Buozzi, Rory Campbell, Poppy Tierney, Spencer Wilding, Mark Smith, Dave Legeno, Khan Bonfils, Ruben Halse, Jordan Shaw, Dominic Burgess, Nadia Cameron-Blakey, Jeff Tanner, Omar Mostafa, Leon Delroy Williams, Fabio Cardascia, Dean Alexandrou, Joey Ansah, Jon Foo, Emil Martirossian, Mark Strange, Lasco Atkins, Rick Avery, James Embree, Gil Kolirin, Jane Osborn, Dan Poole, Tommy Gunn, Philip Harvey, Russell Wilcox, Ray Donn",7511.0,7.5,2
1,1124,The Prestige,"Drama, Mystery, Thriller",Christopher Nolan,"Hugh Jackman, Christian Bale, Michael Caine, Scarlett Johansson, Andy Serkis, Samantha Mahurin, David Bowie, Piper Perabo, Rebecca Hall, Daniel Davis, Roger Rees, Ricky Jay, Jim Piddock, Christopher Neame, Mark Ryan, Jamie Harris, Monty Stuart, Ron Perkins, Anthony De Marco, Chao Li Chi, John B. Crye, William Morgan Sheppard, Ezra Buzzington, James Lancaster, Johnny Liska, Russ Fega, Kevin Will, Edward Hibbert, James Otis, Sam Menning, Brian Tahash, Jodi Bianca Wise, Enn Reitel, Robert W. Arbogast, Chris Cleveland, Rock Anthony, Basilina Butler, Erin Cipolletti, Tim Pilleri, Gary Sievers, Inna Swann",4510.0,8.0,2
2,280,Terminator 2: Judgment Day,"Action, Thriller, Science Fiction",James Cameron,"Arnold Schwarzenegger, Linda Hamilton, Robert Patrick, Edward Furlong, Michael Edwards, Joe Morton, Earl Boen, Jenette Goldstein, Xander Berkeley, S. Epatha Merkerson, Castulo Guerra, Danny Cooksey, Sven-Ole Thorsen, DeVaughn Nixon, Ken Gibbel, Robert Winley, Pete Schrum, Don Lake, Richard Vidan, Jim Palmer, Don Stanton, Dan Stanton, Colin Patrick Lynch, Nikki Cox, Tony Simotes, Abdul Salaam El Razzac, Mike Muscat, Dean Norris, Charles A. Tamburro, Terrence Evans, Denney Pierce, Mark Christopher Lawrence, Van Ling, Gerard G. Williams, Martin DeLuca, Scott Shaw, Joel Kramer, Richard Ruskin, Randy Walker, William Wisher Jr., Nancy Fish, Dalton Hamilton",4274.0,7.7,2
3,218,The Terminator,"Action, Thriller, Science Fiction",James Cameron,"Arnold Schwarzenegger, Michael Biehn, Linda Hamilton, Paul Winfield, Lance Henriksen, Bess Motta, Earl Boen, Rick Rossovich, Bill Paxton, Brian Thompson, Franco Columbu, Dick Miller, Joe Farago, Shawn Schepps, Bruce M. Kerner, Brad Rearden, William Wisher Jr., Ken Fritz, Hettie Lynne Hurtes, Philip Gordon, Stan Yale, Leslie Morris, Hugh Farrington, Harriet Medin, James Ralston, Wayne Stone, John E. Bristol, Patrick Pinney, Greg Robbins, Marianne Muellerleile, Marian Green, J. Randolph Harrison, Darrell Mapson",4208.0,7.4,2
4,597,Titanic,"Drama, Romance, Thriller",James Cameron,"Kate Winslet, Leonardo DiCaprio, Frances Fisher, Billy Zane, Kathy Bates, Gloria Stuart, Bill Paxton, Bernard Hill, David Warner, Victor Garber, Jonathan Hyde, Suzy Amis, Lewis Abernathy, Nicholas Cascone, Danny Nucci, Jason Barry, Lew Palter, Eric Braeden, Bernard Fox, Ewan Stewart, Ioan Gruffudd, Jonny Phillips, Edward Fletcher, Scott G. Anderson, Martin East, Gregory Cooke, Alexandrea Owens, Seth Adkins, Michael Ensign, Anatoly M. Sagalevitch, Martin Hub, Mark Lindsay Chapman, Richard Graham, Paul Brightwell, Ron Donachie, Charlotte Chatton, Fannie Brett, Jenette Goldstein, Camilla Overbye Roos, Linda Kerns, Amy Gaipa, Martin Jarvis, Rosalind Ayres, Rochelle Rose, Jonathan Evans-Jones, Brian Walsh, Rocky Taylor, Craig Kelly, Liam Tuohy, Simon Crane, James Lancaster, Elsa Raven, Reece P. Thompson III, Laramie Landis, Mark Rafael Truitt, John Walcutt, Terry Forrestal, Derek Lea, Richard Ashton, Sean Nepita, Brendan Connolly, David Cronnelly, Garth Wilton, Richard Fox, Nick Meaney, Kevin Owers, Mark Capri, Marc Cass, Paul Herbert, Emmett James, Chris Byrne, Oliver Page, James Garrett, Erik Holland, Erik Holland, Jari Kinnunen, Anders Falk, Barry Dennen, Vern Urich, Rebecca Klingler, Tricia O'Neil, Kathleen S. Dunn, Romeo Francis, Mandana Marino, Van Ling, Bjørn Olsen, Dan Pettersson, Shay Duffin, Greg Ellis, Diana Morgan, Kris Andersson, Bobbie Bates, Aaron James Cash, Anne Fletcher, Edmond Alan Forsyth, Andie Hicks, Scott Hislop, Stan Mazin, Lisa Ratzin, Julene Renee, Brian Baines, Ellie Bensinger, Alexandra Boyd, Mike Butters, James Cameron, Bruno Campolo, Chris Cragnotti, Kevyn Currie, Kevin De La Noy, Thomas Fiss, Griffin Howell, Sean Howse, Tony Kenny, Bret Aaron Knower, George Kosty III, George Kosty Jr., Geoffrey C. Kosty, Gregory Charles Kosty, Sean Lawlor, John Leonhardt, Miguel A. 
Lomelin, Don Lynch, Johnny Martin, Ryan McClurkin, Meghan McLeod, Mike O'Neal, Julian Oros, Phil Parlapiano, Judy Prestininzi, Steven Quale, Olivia Rosewood, John Slade, Stephen Wolfe Smith, R. Gern Trowbridge, Francisco Váldez, Lucie Zolcerova",7770.0,7.5,2
5,920,Cars,"Animation, Adventure, Comedy, Family",John Lasseter,"Owen Wilson, Paul Newman, Bonnie Hunt, Larry the Cable Guy, Tony Shalhoub, Cheech Marin, Michael Wallis, George Carlin, Paul Dooley, Jenifer Lewis, Guido Quaroni, Richard Petty, Michael Keaton, Katherine Helmond, John Ratzenberger, Joe Ranft, Jeremy Piven, Jeremy Clarkson, Dale Earnhardt Jr., Mario Andretti, Michael Schumacher, Jay Leno, Tom Hanks, Tim Allen, Billy Crystal, John Goodman, Dave Foley, Bob Costas, Darrell Waltrip, Richard Kind, Edie McClurg, Humpy Wheeler, Tom Magliozzi, Ray Magliozzi, Lynda Petty, Andrew Stanton, Sarah Clark, Mike Nelson, Joe Ranft, Jonas Rivera, Lou Romano, Adrian Ochoa, E.J. Holowicki, Elissa Knight, Lindsey Collins, Larry Benton, Douglas Keever, Vanness Wu",3991.0,6.6,2


In [79]:
# Movies assigned to cluster 3, ordered by director and genres.
s4partite[s4partite.cluster==3].sort_values(by=['director','genres']).reset_index(drop=True)

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,49013,Cars 2,"Animation, Family, Adventure, Comedy",John Lasseter,"Owen Wilson, Larry the Cable Guy, Michael Caine, Emily Mortimer, John Turturro, Eddie Izzard, Thomas Kretschmann, Joe Mantegna, Peter Jacobson, Bruce Campbell, Tony Shalhoub, Darrell Waltrip, Guido Quaroni, Brent Musburger, Jason Isaacs, David Hobbs, Stanley Townsend, Lloyd Sherr, Paul Dooley, Michel Michelis, Sig Hansen, Franco Nero, Vanessa Redgrave, Bonnie Hunt, Cheech Marin, Jenifer Lewis, Michael Wallis, Katherine Helmond, John Ratzenberger, Jeff Garlin, Patrick Walker, Lewis Hamilton, Velibor Topic, John Mainieri, Brad Lewis, Richard Kind, Edie McClurg, Teresa Gallagher, Jeff Gordon, John Lasseter, Mark Winterbottom, Fernando Alonso, Vitaly Petrov, Jan Nilsson, Memo Rojas, Jacques Villeneuve, Sebastian Vettel",2088.0,5.8,3
1,120,The Lord of the Rings: The Fellowship of the Ring,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Cate Blanchett, Orlando Bloom, Sean Bean, Viggo Mortensen, Hugo Weaving, Liv Tyler, John Rhys-Davies, Christopher Lee, Billy Boyd, Dominic Monaghan, Sean Astin, Andy Serkis, Ian Holm, Craig Parker, Lawrence Makoare, Sala Baker, Sarah McLeod, Marton Csokas, Alan Howard, Noel Appleby, Megan Edwards, Michael Elsworth, Mark Ferguson, Brent McIntyre",8892.0,8.0,3
