In [2]:
import pandas as pd
import os

# Define the path to your raw data folder
RAW_DATA_PATH = "../data/raw/"
PROCESSED_DATA_PATH = "../data/processed/"

print("Loading title.basics.tsv.gz...")
# Load the movie titles dataset
# We use a tsv reader (tab-separated values) and specify the compression
title_basics_df = pd.read_csv(os.path.join(RAW_DATA_PATH, "title.basics.tsv.gz"), sep='\t', low_memory=False)

print("Data loaded successfully!")
# Display the first 5 rows and the shape of the dataframe
print(f"Shape of the dataframe: {title_basics_df.shape}")
display(title_basics_df.head())

Loading title.basics.tsv.gz...
Data loaded successfully!
Shape of the dataframe: (12058793, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short


In [3]:
# --- Step 2: Filter for movies ---
print(f"Original shape of titles dataframe: {title_basics_df.shape}")

# Keep only the rows where titleType is 'movie'
movies_df = title_basics_df[title_basics_df['titleType'] == 'movie'].copy()

# The .copy() is important to avoid a common Pandas warning.
print(f"Shape after filtering for movies: {movies_df.shape}")

# We can also drop columns we won't need to save memory
movies_df = movies_df.drop(columns=['titleType', 'originalTitle', 'isAdult', 'endYear'])
print("Dropped unnecessary columns.")

display(movies_df.head())

Original shape of titles dataframe: (12058793, 9)
Shape after filtering for movies: (731489, 9)
Dropped unnecessary columns.


Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
8,tt0000009,Miss Jerry,1894,45,Romance
144,tt0000147,The Corbett-Fitzsimmons Fight,1897,100,"Documentary,News,Sport"
331,tt0000335,Soldiers of the Cross,1900,40,"Biography,Drama"
498,tt0000502,Bohemios,1905,100,\N
570,tt0000574,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography"


In [4]:
# --- Step 3: Load and filter the principals (actors/actresses) data ---
print("Loading title.principals.tsv.gz...")
principals_df = pd.read_csv(os.path.join(RAW_DATA_PATH, "title.principals.tsv.gz"), sep='\t')
print(f"Original shape of principals dataframe: {principals_df.shape}")

# We only care about actors and actresses for our graph
# The 'category' column tells us the person's job in the movie
actor_principals_df = principals_df[
    (principals_df['category'] == 'actor') | 
    (principals_df['category'] == 'actress')
].copy()

print(f"Shape after filtering for actors/actresses: {actor_principals_df.shape}")
display(actor_principals_df.head())

Loading title.principals.tsv.gz...
Original shape of principals dataframe: (95835978, 6)
Shape after filtering for actors/actresses: (39922192, 6)


Unnamed: 0,tconst,ordering,nconst,category,job,characters
14,tt0000005,1,nm0443482,actor,\N,"[""Blacksmith""]"
15,tt0000005,2,nm0653042,actor,\N,"[""Assistant""]"
17,tt0000007,1,nm0179163,actor,\N,\N
18,tt0000007,2,nm0183947,actor,\N,\N
24,tt0000008,1,nm0653028,actor,\N,"[""Sneezing Man""]"


In [5]:
# --- Step 4: Create the final edge list by merging ---

# We rename 'tconst' in principals to avoid confusion after the merge
actor_principals_df.rename(columns={'tconst': 'movieId', 'nconst': 'actorId'}, inplace=True)

# We only want the edges (actor-movie connections) for the movies we've already filtered.
# A "merge" is like a SQL join. 'inner' means we only keep rows where the movieId exists in BOTH dataframes.
edges_df = pd.merge(
    movies_df[['tconst']], # We only need the ID column from movies_df
    actor_principals_df[['movieId', 'actorId']],
    left_on='tconst',
    right_on='movieId'
)

# Drop the redundant movie ID column
edges_df = edges_df.drop(columns=['tconst'])

print(f"Total number of actor-movie connections (edges): {len(edges_df)}")
display(edges_df.head())

Total number of actor-movie connections (edges): 4025496


Unnamed: 0,movieId,actorId
0,tt0000009,nm0063086
1,tt0000009,nm0183823
2,tt0000009,nm1309758
3,tt0000335,nm1010955
4,tt0000335,nm1012612


In [6]:
# --- Step 5: Load the actor names ---
print("Loading name.basics.tsv.gz...")
names_df = pd.read_csv(os.path.join(RAW_DATA_PATH, 'name.basics.tsv.gz'), sep='\t')
print(f"Loaded {len(names_df)} names.")

# We only need the names for the actors that are actually in our edge list.
# First, get a unique list of actor IDs from our edges_df
unique_actor_ids = edges_df['actorId'].unique()

# Now, filter the names_df to only keep the actors we need
actors_df = names_df[names_df['nconst'].isin(unique_actor_ids)].copy()
actors_df = actors_df[['nconst', 'primaryName']] # Keep only the relevant columns
actors_df.rename(columns={'nconst': 'actorId', 'primaryName': 'name'}, inplace=True)

print(f"Filtered down to {len(actors_df)} unique actors present in our movies.")
display(actors_df.head())

Loading name.basics.tsv.gz...
Loaded 14870233 names.
Filtered down to 1314337 unique actors present in our movies.


Unnamed: 0,actorId,name
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman


In [7]:
# --- Step 6: Final cleanup of the movies list ---
# Get a unique list of movie IDs that are definitely in our final edge list
movies_with_edges = edges_df['movieId'].unique()

# Filter the main movies_df to only include these movies
final_movies_df = movies_df[movies_df['tconst'].isin(movies_with_edges)].copy()
final_movies_df.rename(columns={'tconst': 'movieId'}, inplace=True)


print("Final counts for graph construction:")
print(f" - Movies (nodes): {len(final_movies_df)}")
print(f" - Actors (nodes): {len(actors_df)}")
print(f" - Edges (connections): {len(edges_df)}")

Final counts for graph construction:
 - Movies (nodes): 528728
 - Actors (nodes): 1314337
 - Edges (connections): 4025496


In [10]:
import networkx as nx
import pickle # <-- Step 1: Import the standard pickle library

# --- Step 7: Build the Bipartite Graph ---
# (This part is the same and correct)
B = nx.Graph()
print("Adding movie nodes...")
for i, row in final_movies_df.iterrows():
    B.add_node(row['movieId'], type='movie', title=row['primaryTitle'], year=row['startYear'], runtime=row['runtimeMinutes'], genres=row['genres'])
print("Adding actor nodes...")
for i, row in actors_df.iterrows():
    B.add_node(row['actorId'], type='actor', name=row['name'])
print("Adding edges...")
B.add_edges_from(edges_df[['movieId', 'actorId']].values)
print("\n--- Graph Construction Complete ---")
num_nodes = B.number_of_nodes()
num_edges = B.number_of_edges()
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
if num_nodes > 0:
    avg_degree = (2 * num_edges) / num_nodes
    print(f"Average degree: {avg_degree:.2f}")

# --- Step 8: Save the Graph and Clean Dataframes ---
print("\nSaving the processed data...")

# Save the final, clean dataframes to CSV files for future use
final_movies_df.to_csv(os.path.join(PROCESSED_DATA_PATH, 'clean_movies.csv'), index=False)
actors_df.to_csv(os.path.join(PROCESSED_DATA_PATH, 'clean_actors.csv'), index=False)

# --- THIS IS THE FIX ---
# Step 2: Define the file path
graph_path = os.path.join(PROCESSED_DATA_PATH, "movie_actor_graph.gpickle")
# Step 3: Use pickle to save the graph object
with open(graph_path, 'wb') as f:
    pickle.dump(B, f)

print("All processed files have been saved successfully to the 'data/processed/' folder.")

Adding movie nodes...
Adding actor nodes...
Adding edges...

--- Graph Construction Complete ---
Number of nodes: 1843257
Number of edges: 3957929
Average degree: 4.29

Saving the processed data...
All processed files have been saved successfully to the 'data/processed/' folder.
