In [1]:
import pandas as pd
from ast import literal_eval
import numpy as np
from nltk.stem.snowball import SnowballStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
# from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

# Original Dataset

Kaggle link: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

movies_metadata.csv: The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. 
Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.

keywords.csv: Contains the movie plot keywords for our MovieLens movies. Available in the form of a stringified JSON Object.

credits.csv: Consists of Cast and Crew Information for all our movies. Available in the form of a stringified JSON Object.

links.csv: The file that contains the TMDB and IMDB IDs of all the movies featured in the Full MovieLens dataset.

links_small.csv: Contains the TMDB and IMDB IDs of a small subset of 9,000 movies of the Full Dataset.

ratings_small.csv: The subset of 100,000 ratings from 700 users on 9,000 movies.

Filtering data according to: https://www.kaggle.com/code/rounakbanik/movie-recommender-systems/notebook

## movies_metadata.csv

In [5]:
movies = pd.read_csv('datasets/the_movies_dataset/movies_metadata.csv')
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [7]:
# md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [8]:
movies = movies.drop([19730, 29503, 35587]) # Bug with this data
movies['id'] = movies['id'].astype('int')
movies.shape

(45463, 24)

## links_small.csv, credits.csv and keywords.csv

In [9]:
links_small = pd.read_csv('datasets/the_movies_dataset/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [10]:
credits = pd.read_csv('datasets/the_movies_dataset/credits.csv')
credits['id'] = credits['id'].astype('int')

In [11]:
keywords = pd.read_csv('datasets/the_movies_dataset/keywords.csv')
keywords['id'] = keywords['id'].astype('int')

In [12]:
movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')

## Preparing small_movies

In [13]:
small_movies = movies[movies['id'].isin(links_small)]
small_movies.shape

(9219, 27)

Crew: From the crew, we will only pick the director as our feature since the others don't contribute that much to the feel of the movie.

Cast: Choosing Cast is a little more tricky. Lesser known actors and minor roles do not really affect people's opinion of a movie. Therefore, we must only select the major characters and their respective actors. Arbitrarily we will choose the top 3 actors that appear in the credits list.

In [14]:
small_movies['cast'] = small_movies['cast'].apply(literal_eval)
small_movies['crew'] = small_movies['crew'].apply(literal_eval)
small_movies['keywords'] = small_movies['keywords'].apply(literal_eval)
small_movies['cast_size'] = small_movies['cast'].apply(lambda x: len(x))
small_movies['crew_size'] = small_movies['crew'].apply(lambda x: len(x))

In [15]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [16]:
small_movies['director'] = small_movies['crew'].apply(get_director)

In [17]:
small_movies['cast'] = small_movies['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [18]:
# # Revisitar essa decisao depois. Selecionando apenas os 5 principais atores
# n_cast = 5
# small_movies['cast'] = small_movies['cast'].apply(lambda x: x[:n_cast] if len(x) >=n_cast else x)

In [19]:
small_movies['genres'] = small_movies['genres'].apply(lambda x: ', '.join(x))
small_movies['cast'] = small_movies['cast'].apply(lambda x: ', '.join(x))

In [20]:
small_movies['director'] = small_movies['director'].astype('str')

In [21]:
small_movies['keywords'] = small_movies['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [22]:
s = small_movies.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [23]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

Keywords occur in frequencies ranging from 1 to 610. We do not have any use for keywords that occur only once. Therefore, these can be safely removed. Finally, we will convert every word to its stem so that words such as Dogs and Dog are considered the same.

In [24]:
s = s[s > 1]

In [25]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [26]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [27]:
small_movies['keywords'] = small_movies['keywords'].apply(filter_keywords)
small_movies['keywords'] = small_movies['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
small_movies['keywords'] = small_movies['keywords'].apply(lambda x: [str.lower(i) for i in x])

## Analyzing data
Using:
 - genres
 - id
 - status (filter only released)
 - title
 - video (filer only video = False)
 - cast
 - director
 - vote_average
 - vote_count
 
Maybe future: 
 - keywords
 - belongs_to_collection
 - original_language
 - popularity
 - production_companies
 - production_countries
 - spoken_languages
 - cast_size
 - crew_size
 - imdb_id
 
Problably no use:
 - adult (all False)
 - budget
 - homepage
 - original_title
 - overview
 - poster_path
 - release_date
 - revenue
 - runtime
 - tagline
 - crew

In [28]:
small_movies = small_movies[small_movies.status == 'Released']
small_movies = small_movies[small_movies.video == False]
small_movies = small_movies[small_movies.genres.str.len() != 0]
small_movies = small_movies[small_movies.director != 'nan']
small_movies = small_movies[small_movies.cast.str.len() != 0]

In [29]:
# small_movies.sort_values(by='vote_count', ascending = False)[['title', 'genres', 'vote_count', 'vote_average', 'director']][0:50]

In [30]:
s4partite = small_movies[['id', 'title', 'genres', 'director', 'cast', 'vote_count', 'vote_average']].reset_index(drop=True)
s4partite.head()

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average
0,862,Toy Story,"Animation, Comedy, Family",John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",5415.0,7.7
1,8844,Jumanji,"Adventure, Fantasy, Family",Joe Johnston,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",2413.0,6.9
2,15602,Grumpier Old Men,"Romance, Comedy",Howard Deutch,"Walter Matthau, Jack Lemmon, Ann-Margret, Soph...",92.0,6.5
3,31357,Waiting to Exhale,"Comedy, Drama, Romance",Forest Whitaker,"Whitney Houston, Angela Bassett, Loretta Devin...",34.0,6.1
4,11862,Father of the Bride Part II,Comedy,Charles Shyer,"Steve Martin, Diane Keaton, Martin Short, Kimb...",173.0,5.7


In [31]:
s4partite = s4partite[s4partite.director.isin(['John Lasseter', 'James Cameron', 'Christopher Nolan', 'Peter Jackson'])]
# s4partite = s4partite[s4partite.director.isin(['Woody Allen', 'Clint Eastwood', 'Steven Spielberg', 'Martin Scorsese', 'Tim Burton'])]

In [32]:
s4partite = s4partite[s4partite.vote_count>=500].reset_index(drop=True)

In [33]:
s4partite.director.value_counts()

Christopher Nolan    8
Peter Jackson        8
James Cameron        7
John Lasseter        5
Name: director, dtype: int64

In [34]:
# s4partite[s4partite.director == 'John Lasseter'].sort_values(by='vote_count')

In [35]:
s4partite.shape

(28, 7)

In [36]:
# small_movies[small_movies.director == 'Tim Burton'][['id', 'title', 'genres','vote_count', 'vote_average']].sort_values(by='vote_count', ascending=False)

In [37]:
# # 'Quentin Tarantino', 'Steven Spielberg', 'Martin Scorsese';
# small_movies.director.value_counts().head(20)

# # Woody Allen, Clint Eastwood, Steven Spielberg, Martin Scorsese, Tim Burton


# # Alfred Hitchcock, Quentin Tarantino, John Lasseter, Christopher Nolan, Robert Zemeckis

# ## Animation: John Lasseter,
# ## Science fiction: James Cameron
# ## Thriller: Christopher Nolan 
# ## Funtasy: Peter Jackson

# # Ridley Scott, Oliver Stone , Spike Lee, Francis Ford Coppola  

In [38]:
# genre = 'Animation'
# small_movies[small_movies['genres'].apply(lambda x: genre in x)
#             ].director.value_counts().head(20)

In [39]:
# genre = 'Music'
# small_movies[small_movies['genres'].apply(lambda x: genre in x)
#             ].sort_values(by='vote_count', ascending = False
#                          )[['title', 'genres', 'vote_count', 'vote_average', 'director']][0:50]

### Ratings

In [40]:
ratings = pd.read_csv('datasets/the_movies_dataset/ratings.csv')
ratings = ratings[['userId', 'movieId', 'rating']]
ratings['userId'] = ratings['userId'].astype('int')
ratings['movieId'] = ratings['movieId'].astype('int')
ratings['rating'] = ratings['rating'].astype('float')
ratings.columns = ['userId', 'id', 'rating']
ratings = ratings[ratings.rating>=4].reset_index(drop=True) # Only movies they liked
ratings.head()

Unnamed: 0,userId,id,rating
0,1,147,4.5
1,1,858,5.0
2,1,1221,5.0
3,1,1246,5.0
4,1,1968,4.0


In [41]:
s5partite = ratings.merge(s4partite, on='id', how='inner')#.drop_duplicates('title').director.value_counts()

In [42]:
s5partite = s5partite[['id','title', 'genres', 'director', 'cast', 'userId']]

In [43]:
df_count_ratings = s5partite.groupby(['id','director']).agg({'userId': 'count'}).reset_index().sort_values(by='userId', ascending=False)
df_count_ratings = df_count_ratings[df_count_ratings.userId >= 10]

In [44]:
s5partite = s5partite.merge(df_count_ratings[['id']], on='id', how='inner')#.drop_duplicates('title').director.value_counts()

In [45]:
s5partite.drop_duplicates('title').director.value_counts()

Christopher Nolan    5
Peter Jackson        5
James Cameron        5
John Lasseter        3
Name: director, dtype: int64

In [46]:
s5partite

Unnamed: 0,id,title,genres,director,cast,userId
0,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",15
1,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",62
2,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",231
3,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",348
4,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",834
...,...,...,...,...,...,...
43385,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",219172
43386,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",230119
43387,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",248087
43388,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",250643


## GRAPH

'movie' = 'id', 'title'


'movie' -> categorized as -> 'genres'

'movie' -> directed by -> 'director' 

'movie' -> interpreted by -> 'cast'

'movie' -> liked by -> 'userId'


'director' -> directed in -> 'genres'

'cast' -> interpreted in -> 'genres'

'director' -> worked with -> 'cast'

In [47]:
# Converting strings to ints so we can use the algorithm.
def transform_categories_in_indexes(dataset, columns_to_use=None):
    if columns_to_use:
        list_cols = columns_to_use
    else:
        list_cols = dataset.columns
    next_index = 0
    for col in list_cols:
        categories = dataset.loc[:, col].unique()
        indexes = range(next_index, next_index+len(categories))
        dict_categories = dict(zip(categories, indexes))
        dataset.update(dataset.loc[:, col].replace(dict_categories))
        next_index = indexes[-1] + 1
    return dataset

In [48]:
def create_type_file(dataset, file_name, columns_to_use=None, debug=True):
    with open(f'{file_name}.type','w') as file:
        if columns_to_use:
            list_cols = columns_to_use
        else:
            list_cols = dataset.columns
        for i in range(len(list_cols)):
            n_rows = dataset[list_cols[i]].nunique()
            if debug:
                print(list_cols[i], ":", n_rows, "vertices")
            for j in range(n_rows):
                file.write(str(i))
                file.write('\n')

In [49]:
def create_ncol_file(dataset, column_0, columns_to_connect_all, columns_to_connect_only_column_0, file_name):
    with open(f'{file_name}.ncol','w') as file:
        # Connect columns_to_connect_all with all columns inside it
        list_cols = columns_to_connect_all
        list_cols.append(column_0)
        for i in range(len(list_cols)):
            for j in range(i+1,len(list_cols)):
                df = dataset[[list_cols[i], list_cols[j]]].drop_duplicates()
                # for each row
                for index, row in df.iterrows():
                    file.write(f"{row[list_cols[i]]} {row[list_cols[j]]} 1")
                    file.write('\n')
                 
        # for each column in columns_to_connect_only_column_0, there is a connection with column_0   
        for col in columns_to_connect_only_column_0:
            df = dataset[[column_0, col]].drop_duplicates()
            # for each row
            for index, row in df.iterrows():
                file.write(f"{row[column_0]} {row[col]} 1")
                file.write('\n')

In [50]:
# def create_Cbipartite_membership_file(dataset, file_name):
# def create_director_membership_file():
#     with open(f'{file_name}.membership','w') as file:
#         other_column = dataset.columns[1]
#         ## Layer 0 - show_id
#         n_shows = dataset.shape[0]
#         for index, row in dataset.iterrows():
#             file.write(f"{row[other_column]-n_shows}")
#             file.write('\n')

#         ## Layer 1 - only works if there is no communities there
#         for i in range (dataset[other_column].nunique()):
#             file.write(str(i))
#             file.write('\n')

In [51]:
def generate_all_files(dataset, filename, column_0, columns_to_connect_all, 
                       columns_to_connect_only_column_0, columns_to_use=None, ):

    df_transformed = transform_categories_in_indexes(dataset, columns_to_use=columns_to_use)
    create_type_file(df_transformed, file_name=filename, columns_to_use=columns_to_use)
    
    create_ncol_file(df_transformed, column_0 = column_0,
                     columns_to_connect_all=columns_to_connect_all, 
                     columns_to_connect_only_column_0=columns_to_connect_only_column_0, 
                     file_name=filename)

## 4-partite

### Small: only movies from selected directors
Is it able to cluster according to the directors?

In [52]:
g_5partite1 = s5partite.dropna()
g_5partite1

Unnamed: 0,id,title,genres,director,cast,userId
0,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",15
1,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",62
2,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",231
3,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",348
4,7980,The Lovely Bones,"Fantasy, Drama",Peter Jackson,"Rachel Weisz, Mark Wahlberg, Susan Sarandon, S...",834
...,...,...,...,...,...,...
43385,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",219172
43386,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",230119
43387,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",248087
43388,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",250643


In [53]:
# Split genres 
g_5partite1 = (g_5partite1
               .set_index(['id', 'director', 'cast', 'userId'])['genres']
               .apply(lambda x: [s.strip() for s in x.split(',')])
               .apply(pd.Series)
               .stack()
              )
g_5partite1 = g_5partite1.reset_index()
g_5partite1.columns = ['id', 'director', 'cast', 'userId', 'sample_num', 'category']
g_5partite1 = g_5partite1.drop(['sample_num'], axis=1)

In [54]:
# Split cast 
g_5partite1 = (g_5partite1
               .set_index(['id', 'director', 'category', 'userId'])['cast']
               .apply(lambda x: [s.strip() for s in x.split(',')])
               .apply(pd.Series)
               .stack()
              )
g_5partite1 = g_5partite1.reset_index()
g_5partite1.columns = ['id', 'director', 'category', 'userId', 'sample_num', 'cast']
g_5partite1 = g_5partite1.drop(['sample_num'], axis=1)
g_5partite1

Unnamed: 0,id,director,category,userId,cast
0,7980,Peter Jackson,Fantasy,15,Rachel Weisz
1,7980,Peter Jackson,Fantasy,15,Mark Wahlberg
2,7980,Peter Jackson,Fantasy,15,Susan Sarandon
3,7980,Peter Jackson,Fantasy,15,Saoirse Ronan
4,7980,Peter Jackson,Fantasy,15,Stanley Tucci
...,...,...,...,...,...
11850207,679,James Cameron,Science Fiction,251429,Alibe Parsons
11850208,679,James Cameron,Science Fiction,251429,Blain Fairman
11850209,679,James Cameron,Science Fiction,251429,Barbara Coles
11850210,679,James Cameron,Science Fiction,251429,Eddie Powell


In [55]:
df_cast = g_5partite1[['id', 'cast']].drop_duplicates().cast.value_counts().reset_index()
df_cast.columns = ['cast', 'times']
df_cast = df_cast[df_cast.times >= 2]
df_cast = df_cast[['cast']]
df_cast

Unnamed: 0,cast
0,Andy Serkis
1,Michael Caine
2,Elijah Wood
3,Liv Tyler
4,Paul Dooley
...,...
58,Marton Csokas
59,Jenifer Lewis
60,William Wisher Jr.
61,David Wenham


In [56]:
df_cast_ids = g_5partite1.merge(df_cast, how='inner', on='cast')['id'].drop_duplicates()
df_cast_ids.count()

17

In [57]:
# Pegando apenas filmes que tem pelo menos algum ator de interseccao com outro
g_5partite1 = g_5partite1.merge(df_cast_ids, how='inner', on='id')
g_5partite1

Unnamed: 0,id,director,category,userId,cast
0,597,James Cameron,Drama,17,Kate Winslet
1,597,James Cameron,Drama,17,Leonardo DiCaprio
2,597,James Cameron,Drama,17,Frances Fisher
3,597,James Cameron,Drama,17,Billy Zane
4,597,James Cameron,Drama,17,Kathy Bates
...,...,...,...,...,...
11824251,679,James Cameron,Science Fiction,251429,Alibe Parsons
11824252,679,James Cameron,Science Fiction,251429,Blain Fairman
11824253,679,James Cameron,Science Fiction,251429,Barbara Coles
11824254,679,James Cameron,Science Fiction,251429,Eddie Powell


In [58]:
# Pegando apenas os atores que intersectam pelo menos uma vez
g_5partite1 = g_5partite1.merge(df_cast, how='inner', on='cast').reset_index(drop=True)

In [59]:
g_5partite1 = g_5partite1[['id', 'director', 'category', 'cast', 'userId']]

In [60]:
g_5partite1.drop_duplicates(['id','director']).director.value_counts()

Christopher Nolan    5
James Cameron        5
Peter Jackson        4
John Lasseter        3
Name: director, dtype: int64

In [61]:
g_5partite1

Unnamed: 0,id,director,category,cast,userId
0,597,James Cameron,Drama,Bill Paxton,17
1,597,James Cameron,Romance,Bill Paxton,17
2,597,James Cameron,Thriller,Bill Paxton,17
3,597,James Cameron,Drama,Bill Paxton,25
4,597,James Cameron,Romance,Bill Paxton,25
...,...,...,...,...,...
1334838,49013,John Lasseter,Comedy,Thomas Kretschmann,267779
1334839,49013,John Lasseter,Animation,Thomas Kretschmann,270887
1334840,49013,John Lasseter,Family,Thomas Kretschmann,270887
1334841,49013,John Lasseter,Adventure,Thomas Kretschmann,270887


In [62]:
# final_set = s5partite.merge(g_5partite1[['id']], on='id', how='inner').drop_duplicates()

In [63]:
# final_set.director.value_counts()

In [64]:
# final_set

In [65]:
# filename = 'real_small_4partite_connected-1'
# filepath = f'../outputs/output_bnoc/{filename}/{filename}'
# generate_all_files(g_5partite1, filepath, column_0 = 'id', 
#                    columns_to_connect_all=['category', 'cast'], 
#                    columns_to_connect_only_column_0 = ['userId'],
#                   columns_to_use=['id', 'category', 'cast', 'userId'])
# # todo: column to membership: director

In [62]:
# 17,
# 13,
# 63,
# 33072

### prova real

In [66]:
cluster = [2,2,0,1,1,2,2,2,3,1,1,2,1,0,3,1,0]
movie_ids = g_5partite1.loc[:,'id'].unique() 
df_clusters = pd.DataFrame(data={'id': movie_ids, 'cluster': cluster})

In [67]:
df_clusters

Unnamed: 0,id,cluster
0,597,2
1,218,2
2,679,0
3,122,1
4,121,1
5,280,2
6,1124,2
7,920,2
8,49013,3
9,320,1


In [71]:
s4partite = s4partite.merge(df_clusters, how='inner', on='id') 
s4partite

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,862,Toy Story,"Animation, Comedy, Family",John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",5415.0,7.7,1
1,280,Terminator 2: Judgment Day,"Action, Thriller, Science Fiction",James Cameron,"Arnold Schwarzenegger, Linda Hamilton, Robert ...",4274.0,7.7,2
2,2756,The Abyss,"Adventure, Action, Thriller, Science Fiction",James Cameron,"Ed Harris, Mary Elizabeth Mastrantonio, Michae...",822.0,7.1,0
3,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, ...",3282.0,7.7,0
4,218,The Terminator,"Action, Thriller, Science Fiction",James Cameron,"Arnold Schwarzenegger, Michael Biehn, Linda Ha...",4208.0,7.4,2
5,597,Titanic,"Drama, Romance, Thriller",James Cameron,"Kate Winslet, Leonardo DiCaprio, Frances Fishe...",7770.0,7.5,2
6,77,Memento,"Mystery, Thriller",Christopher Nolan,"Guy Pearce, Carrie-Anne Moss, Joe Pantoliano, ...",4168.0,8.1,0
7,120,The Lord of the Rings: The Fellowship of the Ring,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Cate Blanchett, Orl...",8892.0,8.0,3
8,320,Insomnia,"Crime, Mystery, Thriller",Christopher Nolan,"Al Pacino, Robin Williams, Hilary Swank, Maura...",1181.0,6.8,1
9,121,The Lord of the Rings: The Two Towers,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Viggo Mortensen, Li...",7641.0,8.0,1


In [69]:
# df_clusterizado = netflix_data.merge(df_clusters, how='inner', on='show_id') 
# df_clusterizado

In [72]:
s4partite.cluster.value_counts()

1    6
2    6
0    3
3    2
Name: cluster, dtype: int64

In [75]:
s4partite.director.value_counts()

Christopher Nolan    5
James Cameron        5
Peter Jackson        4
John Lasseter        3
Name: director, dtype: int64

In [73]:
pd.set_option('display.max_colwidth', None)

In [76]:
s4partite[s4partite.cluster==0].sort_values(by=['director','genres']).reset_index(drop=True)

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,77,Memento,"Mystery, Thriller",Christopher Nolan,"Guy Pearce, Carrie-Anne Moss, Joe Pantoliano, Mark Boone Junior, Stephen Tobolowsky, Harriet Sansom Harris, Callum Keith Rennie, Larry Holden, Jorja Fox, Russ Fega, Thomas Lennon, Kimberly Campbell, Marianne Muellerleile",4168.0,8.1,0
1,2756,The Abyss,"Adventure, Action, Thriller, Science Fiction",James Cameron,"Ed Harris, Mary Elizabeth Mastrantonio, Michael Biehn, Leo Burmester, Todd Graff, John Bedford Lloyd, Kimberly Scott, Chris Elliott, J.C. Quinn, Captain Kidd Brewer Jr., George Robert Klek, Christopher Murphy, Adam Nelson, Dick Warlock, Jimmie Ray Weeks, J. Kenneth Campbell, Peter Ratray, Michael Beach, Ken Jenkins, Michael Chapman",822.0,7.1,0
2,679,Aliens,"Horror, Action, Thriller, Science Fiction",James Cameron,"Sigourney Weaver, Michael Biehn, James Remar, Paul Reiser, Lance Henriksen, Carrie Henn, Bill Paxton, William Hope, Jenette Goldstein, Al Matthews, Mark Rolston, Ricco Ross, Colette Hiller, Daniel Kash, Cynthia Dale Scott, Tip Tipping, Trevor Steedman, Paul Maxwell, Carl Toop, Valerie Colgan, Alan Polonsky, Alibe Parsons, Blain Fairman, Barbara Coles, Eddie Powell, Jay Benedict",3282.0,7.7,0


In [77]:
s4partite[s4partite.cluster==1].sort_values(by=['director','genres']).reset_index(drop=True)

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,320,Insomnia,"Crime, Mystery, Thriller",Christopher Nolan,"Al Pacino, Robin Williams, Hilary Swank, Maura Tierney, Martin Donovan, Nicky Katt, Paul Dooley, Crystal Lowe, Jay Brazeau, Larry Holden, Kerry Sandomirsky, Lorne Cardinal, Katharine Isabelle, Jonathan Jackson, Paula Shaw, Oliver 'Ole' Zemen, James Hutson, Andrew Campbell, Tasha Simms, Malcolm Boddington, Chris Gauthier, Ian Tracey, Kate Robbins, Emily Perkins, Dean Wray",1181.0,6.8,1
1,155,The Dark Knight,"Drama, Action, Crime, Thriller",Christopher Nolan,"Christian Bale, Michael Caine, Heath Ledger, Aaron Eckhart, Gary Oldman, Maggie Gyllenhaal, Morgan Freeman, Monique Gabriela Curnen, Ron Dean, Chin Han, Nestor Carbonell, Eric Roberts, Cillian Murphy, Ritchie Coster, Anthony Michael Hall, Keith Szarabajka, Colin McFarlane, Joshua Harto, Melinda McGraw, Nathan Gamble, William Fichtner, Michael Vieau, Michael Stoyanov, William Smillie, Michael Jai White, Danny Goldring, Matthew O'Neill, Olumiji Olawumi, Greg Beam, Erik Hellman, Beatrice Rosen, Vincenzo Nicoli, Edison Chen, Nydia Rodriguez Terracina, Andy Luther, James Farruggio, Tom McElroy, Will Zahrn, James Fierro, Patrick Leahy, Sam Derence, Jennifer Knox, Patrick Clear, Sarah Jayne Dunn, Charles Venn, Winston G. Ellis, David Dastmalchian, Sophia Hinshelwood, Keith Kupferer, Joseph Luis Caballero, Richard Dillane, Daryl Satcher, Chris Petschler, Aidan Feore, Philip Bulcock, Paul Birchard, Walter Lewis, Vincent Riotta, Nancy Crane, K. Todd Freeman, Matt Shallenberger, Michael Andrew Gorman, Lanny Lutz, Peter DeFaria, Matt Rippy, Andrew Bicknell, Ariyon Bakare, Doug Ballard, Helene Wilson, Tommy Campbell, Craig Heaney, Lorna Gayle, Lisa McAllister, Peter Brooke, Joshua Rollins, Dale Rivera, Matthew Leitch, Tom Lister Jr., Thomas Gaitsch, William Armstrong, Adam Kalesperis, Tristan Tait, Bronson Webb, David Ajala, Gertrude Kyles, Jonathan Ryland, James Scales, Nigel Carrington, Ian Pirie, Lateef Lovejoy, Grahame Edwards, Roger Monk, Ronan Summers, Wai Wong, Michael Corey Foster, Hannah Gunn, Brandon Lambdin, Jon Lee Brody, Debbi Burns, Maritza Cabrera, Shirin Caiola, Laura Chernicky, Henry Milton Chu, Kelli Clevenger, Richard Divizio, Tony Domino, David Fultz, Natalie Hallam, Jordon Hodges, Erron Jay, Daniel Jefferson, Nicky Katt, Thomas Kosik, Don Kress, Tim Krueger, Dan Latham, Tom McComas, James Mellor, Joseph Oliveira, Buster Reeves, Peter Rnic, Amit Shah, Michelle Shields, Sofiya Smirnova, Bruce Spielbauer, Robert Patrick Stern, Robert Stone, Richard Strobel, Tom Townsend, John Turk, John Warman, Erik A. Williams, Chris Wilson, Kevin Zaideman",12269.0,8.3,1
2,862,Toy Story,"Animation, Comedy, Family",John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney, Wallace Shawn, John Ratzenberger, Annie Potts, John Morris, Erik von Detten, Laurie Metcalf, R. Lee Ermey, Sarah Freeman, Penn Jillette",5415.0,7.7,1
3,254,King Kong,"Adventure, Drama, Action",Peter Jackson,"Naomi Watts, Jack Black, Adrien Brody, Thomas Kretschmann, Colin Hanks, Andy Serkis, Evan Parke, Jamie Bell, Lobo Chan, John Sumner, Craig Hall, Kyle Chandler, William Johnson, David Pittu, Mark Hadlow, Geraldine Brophy, David Dennis, Pip Mushin, Jim Knobeloch, Ric Herbert, Lee Donahue, Tom Hobbs, Tiriel Mora, Jed Brophy, John Wraight, William Wallace, Frank Edwards, Crawford Thomson, Richard Kavanagh, Stephen Hall, Joe Folau, Chic Littlewood, Samuel Taylor, Philip Ettington",2403.0,6.6,1
4,121,The Lord of the Rings: The Two Towers,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Viggo Mortensen, Liv Tyler, Orlando Bloom, John Rhys-Davies, Christopher Lee, Sean Astin, Billy Boyd, Dominic Monaghan, Andy Serkis, Hugo Weaving, Craig Parker, Bernard Hill, Brad Dourif, Miranda Otto, David Wenham, Karl Urban, Cate Blanchett, Olivia Tennet, Sean Bean, Jed Brophy, Calum Gittins, John Bach, Robbie Magasiva, John Noble, Robyn Malcolm, Bruce Phillips, Raymond Trickitt, Stephen Ure, Nathaniel Lees, Sam Comery, Bruce Hopkins, John Leigh",7641.0,8.0,1
5,122,The Lord of the Rings: The Return of the King,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Viggo Mortensen, Liv Tyler, Orlando Bloom, John Rhys-Davies, Sean Astin, Billy Boyd, Dominic Monaghan, Andy Serkis, Hugo Weaving, Ian Holm, Bernard Hill, Miranda Otto, David Wenham, Karl Urban, John Noble, Cate Blanchett, Lawrence Makoare, Paul Norell, Thomas Robins, Sarah McLeod, Sean Bean, Marton Csokas, Christopher Lee, David Aston, Sadwyn Brophy, Alistair Browning, Ian Hughes",8226.0,8.1,1


In [78]:
s4partite[s4partite.cluster==2].sort_values(by=['director','genres']).reset_index(drop=True)

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,272,Batman Begins,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Michael Caine, Liam Neeson, Katie Holmes, Gary Oldman, Cillian Murphy, Tom Wilkinson, Morgan Freeman, Rutger Hauer, Ken Watanabe, Mark Boone Junior, Linus Roache, Larry Holden, Gerard Murphy, Colin McFarlane, Jack Gleeson, T.J. Ramini, Kieran Hurley, Catherine Porter, Gus Lewis, Rade Serbedzija, Sara Stewart, Richard Brake, Emma Lockhart, Christine Adams, John Nolan, Karen David, Jonathan D. Ellis, Tamer Hassan, Ronan Leahy, Vincent Wong, Том Ву, Mark Chiu, Turbo Kong, Sai-Kit Yung, Chike Chan, Jamie Hayden, David Murray, Darragh Kelly, John Kazek, Joseph Rye, Kwaku Ankomah, Jo Martin, Charles Edwards, Lucy Russell, Mark Straker, Timothy Deenihan, Flavia Masetto, Emily Steven-Daly, David Bedella, Martin McDougall, Noah Lee Margetts, Joe Hanley, Karl Shiels, Roger Griffiths, Stephen Walters, Richard Laing, Matt Miller, Risteard Cooper, Shane Rimmer, Jeremy Theobald, Alexandra Bastedo, John Judd, Soo Hee Ding, Phill Curr, Sarah Wateridge, Charlie Kranz, Terry McMahon, Cedric Young, Tim Booth, Tom Nolan, Roger Yuan, Joe Sargent, Emmanuel Idowu, Mel Taylor, Ilyssa Fradin, Andrew Pleavin, Jeff Christian, Tenzin Gyurme, Tenzin Clive Ball, John Burke, Earlene Bentley, Alex Moggridge, Jay Buozzi, Rory Campbell, Poppy Tierney, Spencer Wilding, Mark Smith, Dave Legeno, Khan Bonfils, Ruben Halse, Jordan Shaw, Dominic Burgess, Nadia Cameron-Blakey, Jeff Tanner, Omar Mostafa, Leon Delroy Williams, Fabio Cardascia, Dean Alexandrou, Joey Ansah, Jon Foo, Emil Martirossian, Mark Strange, Lasco Atkins, Rick Avery, James Embree, Gil Kolirin, Jane Osborn, Dan Poole, Tommy Gunn, Philip Harvey, Russell Wilcox, Ray Donn",7511.0,7.5,2
1,1124,The Prestige,"Drama, Mystery, Thriller",Christopher Nolan,"Hugh Jackman, Christian Bale, Michael Caine, Scarlett Johansson, Andy Serkis, Samantha Mahurin, David Bowie, Piper Perabo, Rebecca Hall, Daniel Davis, Roger Rees, Ricky Jay, Jim Piddock, Christopher Neame, Mark Ryan, Jamie Harris, Monty Stuart, Ron Perkins, Anthony De Marco, Chao Li Chi, John B. Crye, William Morgan Sheppard, Ezra Buzzington, James Lancaster, Johnny Liska, Russ Fega, Kevin Will, Edward Hibbert, James Otis, Sam Menning, Brian Tahash, Jodi Bianca Wise, Enn Reitel, Robert W. Arbogast, Chris Cleveland, Rock Anthony, Basilina Butler, Erin Cipolletti, Tim Pilleri, Gary Sievers, Inna Swann",4510.0,8.0,2
2,280,Terminator 2: Judgment Day,"Action, Thriller, Science Fiction",James Cameron,"Arnold Schwarzenegger, Linda Hamilton, Robert Patrick, Edward Furlong, Michael Edwards, Joe Morton, Earl Boen, Jenette Goldstein, Xander Berkeley, S. Epatha Merkerson, Castulo Guerra, Danny Cooksey, Sven-Ole Thorsen, DeVaughn Nixon, Ken Gibbel, Robert Winley, Pete Schrum, Don Lake, Richard Vidan, Jim Palmer, Don Stanton, Dan Stanton, Colin Patrick Lynch, Nikki Cox, Tony Simotes, Abdul Salaam El Razzac, Mike Muscat, Dean Norris, Charles A. Tamburro, Terrence Evans, Denney Pierce, Mark Christopher Lawrence, Van Ling, Gerard G. Williams, Martin DeLuca, Scott Shaw, Joel Kramer, Richard Ruskin, Randy Walker, William Wisher Jr., Nancy Fish, Dalton Hamilton",4274.0,7.7,2
3,218,The Terminator,"Action, Thriller, Science Fiction",James Cameron,"Arnold Schwarzenegger, Michael Biehn, Linda Hamilton, Paul Winfield, Lance Henriksen, Bess Motta, Earl Boen, Rick Rossovich, Bill Paxton, Brian Thompson, Franco Columbu, Dick Miller, Joe Farago, Shawn Schepps, Bruce M. Kerner, Brad Rearden, William Wisher Jr., Ken Fritz, Hettie Lynne Hurtes, Philip Gordon, Stan Yale, Leslie Morris, Hugh Farrington, Harriet Medin, James Ralston, Wayne Stone, John E. Bristol, Patrick Pinney, Greg Robbins, Marianne Muellerleile, Marian Green, J. Randolph Harrison, Darrell Mapson",4208.0,7.4,2
4,597,Titanic,"Drama, Romance, Thriller",James Cameron,"Kate Winslet, Leonardo DiCaprio, Frances Fisher, Billy Zane, Kathy Bates, Gloria Stuart, Bill Paxton, Bernard Hill, David Warner, Victor Garber, Jonathan Hyde, Suzy Amis, Lewis Abernathy, Nicholas Cascone, Danny Nucci, Jason Barry, Lew Palter, Eric Braeden, Bernard Fox, Ewan Stewart, Ioan Gruffudd, Jonny Phillips, Edward Fletcher, Scott G. Anderson, Martin East, Gregory Cooke, Alexandrea Owens, Seth Adkins, Michael Ensign, Anatoly M. Sagalevitch, Martin Hub, Mark Lindsay Chapman, Richard Graham, Paul Brightwell, Ron Donachie, Charlotte Chatton, Fannie Brett, Jenette Goldstein, Camilla Overbye Roos, Linda Kerns, Amy Gaipa, Martin Jarvis, Rosalind Ayres, Rochelle Rose, Jonathan Evans-Jones, Brian Walsh, Rocky Taylor, Craig Kelly, Liam Tuohy, Simon Crane, James Lancaster, Elsa Raven, Reece P. Thompson III, Laramie Landis, Mark Rafael Truitt, John Walcutt, Terry Forrestal, Derek Lea, Richard Ashton, Sean Nepita, Brendan Connolly, David Cronnelly, Garth Wilton, Richard Fox, Nick Meaney, Kevin Owers, Mark Capri, Marc Cass, Paul Herbert, Emmett James, Chris Byrne, Oliver Page, James Garrett, Erik Holland, Erik Holland, Jari Kinnunen, Anders Falk, Barry Dennen, Vern Urich, Rebecca Klingler, Tricia O'Neil, Kathleen S. Dunn, Romeo Francis, Mandana Marino, Van Ling, Bjørn Olsen, Dan Pettersson, Shay Duffin, Greg Ellis, Diana Morgan, Kris Andersson, Bobbie Bates, Aaron James Cash, Anne Fletcher, Edmond Alan Forsyth, Andie Hicks, Scott Hislop, Stan Mazin, Lisa Ratzin, Julene Renee, Brian Baines, Ellie Bensinger, Alexandra Boyd, Mike Butters, James Cameron, Bruno Campolo, Chris Cragnotti, Kevyn Currie, Kevin De La Noy, Thomas Fiss, Griffin Howell, Sean Howse, Tony Kenny, Bret Aaron Knower, George Kosty III, George Kosty Jr., Geoffrey C. Kosty, Gregory Charles Kosty, Sean Lawlor, John Leonhardt, Miguel A. Lomelin, Don Lynch, Johnny Martin, Ryan McClurkin, Meghan McLeod, Mike O'Neal, Julian Oros, Phil Parlapiano, Judy Prestininzi, Steven Quale, Olivia Rosewood, John Slade, Stephen Wolfe Smith, R. Gern Trowbridge, Francisco Váldez, Lucie Zolcerova",7770.0,7.5,2
5,920,Cars,"Animation, Adventure, Comedy, Family",John Lasseter,"Owen Wilson, Paul Newman, Bonnie Hunt, Larry the Cable Guy, Tony Shalhoub, Cheech Marin, Michael Wallis, George Carlin, Paul Dooley, Jenifer Lewis, Guido Quaroni, Richard Petty, Michael Keaton, Katherine Helmond, John Ratzenberger, Joe Ranft, Jeremy Piven, Jeremy Clarkson, Dale Earnhardt Jr., Mario Andretti, Michael Schumacher, Jay Leno, Tom Hanks, Tim Allen, Billy Crystal, John Goodman, Dave Foley, Bob Costas, Darrell Waltrip, Richard Kind, Edie McClurg, Humpy Wheeler, Tom Magliozzi, Ray Magliozzi, Lynda Petty, Andrew Stanton, Sarah Clark, Mike Nelson, Joe Ranft, Jonas Rivera, Lou Romano, Adrian Ochoa, E.J. Holowicki, Elissa Knight, Lindsey Collins, Larry Benton, Douglas Keever, Vanness Wu",3991.0,6.6,2


In [79]:
s4partite[s4partite.cluster==3].sort_values(by=['director','genres']).reset_index(drop=True)

Unnamed: 0,id,title,genres,director,cast,vote_count,vote_average,cluster
0,49013,Cars 2,"Animation, Family, Adventure, Comedy",John Lasseter,"Owen Wilson, Larry the Cable Guy, Michael Caine, Emily Mortimer, John Turturro, Eddie Izzard, Thomas Kretschmann, Joe Mantegna, Peter Jacobson, Bruce Campbell, Tony Shalhoub, Darrell Waltrip, Guido Quaroni, Brent Musburger, Jason Isaacs, David Hobbs, Stanley Townsend, Lloyd Sherr, Paul Dooley, Michel Michelis, Sig Hansen, Franco Nero, Vanessa Redgrave, Bonnie Hunt, Cheech Marin, Jenifer Lewis, Michael Wallis, Katherine Helmond, John Ratzenberger, Jeff Garlin, Patrick Walker, Lewis Hamilton, Velibor Topic, John Mainieri, Brad Lewis, Richard Kind, Edie McClurg, Teresa Gallagher, Jeff Gordon, John Lasseter, Mark Winterbottom, Fernando Alonso, Vitaly Petrov, Jan Nilsson, Memo Rojas, Jacques Villeneuve, Sebastian Vettel",2088.0,5.8,3
1,120,The Lord of the Rings: The Fellowship of the Ring,"Adventure, Fantasy, Action",Peter Jackson,"Elijah Wood, Ian McKellen, Cate Blanchett, Orlando Bloom, Sean Bean, Viggo Mortensen, Hugo Weaving, Liv Tyler, John Rhys-Davies, Christopher Lee, Billy Boyd, Dominic Monaghan, Sean Astin, Andy Serkis, Ian Holm, Craig Parker, Lawrence Makoare, Sala Baker, Sarah McLeod, Marton Csokas, Alan Howard, Noel Appleby, Megan Edwards, Michael Elsworth, Mark Ferguson, Brent McIntyre",8892.0,8.0,3
