In [14]:
# --- Step 1: Import Libraries and Verify GPU ---

import pandas as pd
import numpy as np
import networkx as nx
import pickle
import os

# Import our new deep learning libraries
import torch
import torch_geometric
from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected

print(f"PyTorch version: {torch.__version__}")
print(f"PyG version: {torch_geometric.__version__}")

# --- GPU VERIFICATION ---
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"\nGPU is available! Using device: {device}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("\nGPU not available, using CPU. Training will be slower.")

# Define our data paths
PROCESSED_DATA_PATH = "../data/processed/"
RAW_DATA_PATH = "../data/raw/"

# Load the main bipartite graph we created in Notebook 1
print("\nLoading the graph from the pickle file...")
graph_path = os.path.join(PROCESSED_DATA_PATH, "movie_actor_graph.gpickle")
with open(graph_path, 'rb') as f:
    B = pickle.load(f)

# Load the MovieLens ratings data, which will be our prediction target
print("Loading MovieLens ratings data...")
ratings_df = pd.read_csv(os.path.join(RAW_DATA_PATH, "ml-latest", "ratings.csv"))

print("\n--- Data Loaded Successfully ---")
print(f"Graph has {B.number_of_nodes()} nodes and {B.number_of_edges()} edges.")
print(f"Loaded {len(ratings_df)} user ratings.")

display(ratings_df.head())

PyTorch version: 2.5.1
PyG version: 2.6.1

GPU is available! Using device: cuda
Device name: NVIDIA GeForce GTX 1650

Loading the graph from the pickle file...
Loading MovieLens ratings data...

--- Data Loaded Successfully ---
Graph has 1843257 nodes and 3957929 edges.
Loaded 33832162 user ratings.


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [15]:
# --- Step 2 Sub-sample Ratings and Create Mappings ---

# To make training faster, let's work with a subset of the ratings.
# Let's start with 10% of the users.
all_user_ids = ratings_df['userId'].unique()
sample_user_ids = np.random.choice(all_user_ids, size=int(len(all_user_ids) * 0.10), replace=False)

# Filter the ratings dataframe to only include these users
ratings_subset_df = ratings_df[ratings_df['userId'].isin(sample_user_ids)].copy()

print(f"Original number of ratings: {len(ratings_df)}")
print(f"Using a subset of {len(ratings_subset_df)} ratings from {len(sample_user_ids)} users for faster training.")

# --- Create Mappings ---
# We need a continuous integer index for each node type (0, 1, 2, ...)

# The 'links.csv' file is the key to mapping between MovieLens movieId and IMDb tconst
links_df = pd.read_csv(os.path.join(RAW_DATA_PATH, "ml-latest", "links.csv"), dtype={'imdbId': str})
# Prepend 'tt' to imdbId to match the format in our graph B (e.g., 114709 -> tt0114709)
links_df['imdbId'] = 'tt' + links_df['imdbId'].str.zfill(7)
# Create a dictionary to map from MovieLens ID -> IMDb ID
ml_to_imdb_map = links_df.set_index('movieId')['imdbId'].to_dict()


# 1. Identify valid movies present in BOTH our graph AND the ratings subset
graph_movie_ids = {node for node, data in B.nodes(data=True) if data.get('type') == 'movie'}
rated_movie_ids_ml = set(ratings_subset_df['movieId'].unique())

valid_imdb_ids_from_graph = set(ml_to_imdb_map.values()).intersection(graph_movie_ids)
valid_ml_ids = {ml_id for ml_id, imdb_id in ml_to_imdb_map.items() if imdb_id in valid_imdb_ids_from_graph}
final_valid_ml_ids = valid_ml_ids.intersection(rated_movie_ids_ml)


# 2. Filter our ratings dataframe one last time to ensure all movies/users are valid
ratings_final_df = ratings_subset_df[ratings_subset_df['movieId'].isin(final_valid_ml_ids)].copy()
final_valid_user_ids = ratings_final_df['userId'].unique()


# 3. Create the final mappings
user_mapping = {user_id: i for i, user_id in enumerate(final_valid_user_ids)}
movie_mapping = {movie_id: i for i, movie_id in enumerate(final_valid_ml_ids)}
# A robust way to get actor nodes
actor_nodes = {node for node, data in B.nodes(data=True) if data.get('type') == 'actor'}
actor_mapping = {actor_id: i for i, actor_id in enumerate(actor_nodes)}


print(f"\nFinal counts for GNN data structure:")
print(f" - Users: {len(user_mapping)}")
print(f" - Movies: {len(movie_mapping)}")
print(f" - Actors: {len(actor_mapping)}")
print(f" - Ratings (edges): {len(ratings_final_df)}")

Original number of ratings: 33832162
Using a subset of 3377660 ratings from 33097 users for faster training.

Final counts for GNN data structure:
 - Users: 33081
 - Movies: 33452
 - Actors: 1314337
 - Ratings (edges): 3304655


In [16]:
# --- Step 3  Create the HeteroData Object ---

data = HeteroData()

# Add the nodes.
data['user'].num_nodes = len(user_mapping)
data['movie'].num_nodes = len(movie_mapping)
data['actor'].num_nodes = len(actor_mapping)

print("Created HeteroData object with user, movie, and actor nodes.")

# --- Define the Edges ---

# 1. User-Movie Edges (from ratings)
print("Adding User-Movie edges...")
user_indices = [user_mapping[uid] for uid in ratings_final_df['userId']]
movie_indices_for_rating = [movie_mapping[mid] for mid in ratings_final_df['movieId']]

data['user', 'rates', 'movie'].edge_index = torch.tensor([user_indices, movie_indices_for_rating])
data['user', 'rates', 'movie'].edge_attr = torch.tensor(ratings_final_df['rating'].values, dtype=torch.float)


# 2. Actor-Movie Edges (from our graph B)
print("Adding Actor-Movie edges...")
imdb_to_ml_map = {v: k for k, v in ml_to_imdb_map.items()}

actor_edge_indices = []
movie_edge_indices = []

# --- THIS IS THE FIX ---
# nx.bipartite.edges is deprecated. We iterate through all edges and check node types.
# This is the modern, robust way to do it.
for u, v in B.edges():
    # Check if u is a movie and v is an actor
    if B.nodes[u].get('type') == 'movie' and B.nodes[v].get('type') == 'actor':
        movie_id_str, actor_id_str = u, v
    # Check if v is a movie and u is an actor
    elif B.nodes[v].get('type') == 'movie' and B.nodes[u].get('type') == 'actor':
        movie_id_str, actor_id_str = v, u
    else:
        continue # Skip if it's not a movie-actor edge

    # Now, proceed with the same logic as before
    if movie_id_str in imdb_to_ml_map:
        ml_movie_id = imdb_to_ml_map[movie_id_str]
        if ml_movie_id in movie_mapping:
            movie_idx = movie_mapping[ml_movie_id]
            if actor_id_str in actor_mapping:
                actor_idx = actor_mapping[actor_id_str]
                actor_edge_indices.append(actor_idx)
                movie_edge_indices.append(movie_idx)

# Create the final edge tensors for the graph structure
data['actor', 'acted_in', 'movie'].edge_index = torch.tensor([actor_edge_indices, movie_edge_indices], dtype=torch.long)
data['movie', 'has_actor', 'actor'].edge_index = torch.tensor([movie_edge_indices, actor_edge_indices], dtype=torch.long)

# --- Final Step: Send the data to the GPU ---
print("\nMoving data to the GPU...")
data = data.to(device)

print("\nAdded edges and moved data to the GPU successfully:")
print(data)
print(f"\nIs the data on the GPU? {data.is_cuda}")

Created HeteroData object with user, movie, and actor nodes.
Adding User-Movie edges...
Adding Actor-Movie edges...

Moving data to the GPU...

Added edges and moved data to the GPU successfully:
HeteroData(
  user={ num_nodes=33081 },
  movie={ num_nodes=33452 },
  actor={ num_nodes=1314337 },
  (user, rates, movie)={
    edge_index=[2, 3304655],
    edge_attr=[3304655],
  },
  (actor, acted_in, movie)={ edge_index=[2, 323257] },
  (movie, has_actor, actor)={ edge_index=[2, 323257] }
)

Is the data on the GPU? True


In [25]:
# --- Step 4 Define the Explicit HeteroGNN Model ---

from torch_geometric.nn import SAGEConv, HeteroConv

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = HeteroConv({
            ('user', 'rates', 'movie'): SAGEConv((-1, -1), hidden_channels),
            ('actor', 'acted_in', 'movie'): SAGEConv((-1, -1), hidden_channels),
            ('movie', 'has_actor', 'actor'): SAGEConv((-1, -1), hidden_channels),
            ('movie', 'rev_rates', 'user'): SAGEConv((-1, -1), hidden_channels),
        }, aggr='sum')
        self.conv2 = HeteroConv({
            ('user', 'rates', 'movie'): SAGEConv((-1, -1), hidden_channels),
            ('actor', 'acted_in', 'movie'): SAGEConv((-1, -1), hidden_channels),
            ('movie', 'has_actor', 'actor'): SAGEConv((-1, -1), hidden_channels),
            ('movie', 'rev_rates', 'user'): SAGEConv((-1, -1), hidden_channels),
        }, aggr='sum')

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, edge_index_dict)
        return x_dict

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.user_emb = torch.nn.Embedding(data['user'].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data['movie'].num_nodes, hidden_channels)
        self.actor_emb = torch.nn.Embedding(data['actor'].num_nodes, hidden_channels)
        self.gnn = HeteroGNN(hidden_channels)
        self.decoder = lambda x_user, x_movie: (x_user * x_movie).sum(dim=-1)

    def forward(self, data):
        # 1. Get initial embeddings
        x_dict = {
          "user": self.user_emb(data["user"].node_id),
          "movie": self.movie_emb(data["movie"].node_id),
          "actor": self.actor_emb(data["actor"].node_id),
        } 
        
        # 2. Run GNN encoder to get final embeddings for ALL nodes
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        
        # --- THIS IS THE FIX ---
        # 3. Decode predictions ONLY for the edges we are interested in
        #    The splitter puts the target edges in 'edge_label_index'
        edge_label_index = data['user', 'rates', 'movie'].edge_label_index
        
        pred = self.decoder(
            x_dict['user'][edge_label_index[0]],
            x_dict['movie'][edge_label_index[1]],
        )
        
        return pred

print("Final Corrected HeteroGNN Model architecture defined successfully.")

Final Corrected HeteroGNN Model architecture defined successfully.


In [22]:
# --- Step 5 Create Initial Node Features ---

# All our model needs are the unique IDs for each node, which the
# Embedding layers will use to look up the feature vectors.

data['user'].node_id = torch.arange(data['user'].num_nodes, device=device)
data['movie'].node_id = torch.arange(data['movie'].num_nodes, device=device)
data['actor'].node_id = torch.arange(data['actor'].num_nodes, device=device)

# We no longer need the '.x' attribute for movies. Let's delete it to be clean.
del data['movie'].x

print("Initial node IDs created and assigned.")
print(data)

Initial node IDs created and assigned.
HeteroData(
  user={
    num_nodes=33081,
    node_id=[33081],
  },
  movie={
    num_nodes=33452,
    node_id=[33452],
  },
  actor={
    num_nodes=1314337,
    node_id=[1314337],
  },
  (user, rates, movie)={
    edge_index=[2, 3304655],
    edge_attr=[3304655],
  },
  (actor, acted_in, movie)={ edge_index=[2, 323257] },
  (movie, has_actor, actor)={ edge_index=[2, 323257] }
)


In [23]:
# --- Step 6 Split Edges for Training, Validation, and Testing ---
import torch_geometric.transforms as T

# We will split the 'user' -> 'rates' -> 'movie' edges.
# By setting is_undirected=False and providing the reverse edge type,
# we ensure the transform creates the necessary reverse edges for GNN message passing.
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=False, # We will handle the reverse edges explicitly
    add_negative_train_samples=False,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')] # Tell the splitter to create these reverse edges
)

# Apply the transform to our data
train_data, val_data, test_data = transform(data)


# --- CRUCIAL FIX: Make the actor-movie connections undirected manually ---
# The splitter only works on the edge type we give it. We need to ensure
# the other structural edges are also treated as undirected for message passing.
train_data = ToUndirected()(train_data)
val_data = ToUndirected()(val_data)
test_data = ToUndirected()(test_data)


print("--- Data Splitting Complete ---")
print("\nTraining Data:")
print(train_data)
print("\nValidation Data:")
print(val_data)
print("\nTest Data:")
print(test_data)

--- Data Splitting Complete ---

Training Data:
HeteroData(
  user={
    num_nodes=33081,
    node_id=[33081],
  },
  movie={
    num_nodes=33452,
    node_id=[33452],
  },
  actor={
    num_nodes=1314337,
    node_id=[1314337],
  },
  (user, rates, movie)={
    edge_index=[2, 2643725],
    edge_attr=[2643725],
    edge_label=[2643725],
    edge_label_index=[2, 2643725],
  },
  (actor, acted_in, movie)={ edge_index=[2, 323257] },
  (movie, has_actor, actor)={ edge_index=[2, 323257] },
  (movie, rev_rates, user)={
    edge_index=[2, 2643725],
    edge_attr=[2643725],
    edge_label=[2643725],
  },
  (movie, rev_acted_in, actor)={ edge_index=[2, 323257] },
  (actor, rev_has_actor, movie)={ edge_index=[2, 323257] },
  (user, rev_rev_rates, movie)={
    edge_index=[2, 2643725],
    edge_attr=[2643725],
    edge_label=[2643725],
  }
)

Validation Data:
HeteroData(
  user={
    num_nodes=33081,
    node_id=[33081],
  },
  movie={
    num_nodes=33452,
    node_id=[33452],
  },
  actor={
    n

In [28]:
# --- Step 7 (Final & Robust): Initialize and Train the Model (Full Run) ---
from tqdm.notebook import tqdm 

# --- Model Initialization ---
hidden_channels = 64 
model = Model(hidden_channels=hidden_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_function = torch.nn.MSELoss()


# --- Training Loop ---
# We are running the full 50 epochs for the best performance.
epochs = 50 

# --- Explicit Save Path ---
# We are using an explicit relative path to go UP from the 'notebooks' folder
# and then DOWN into the correct 'models' folder.
MODELS_DIR = "../models/"
os.makedirs(MODELS_DIR, exist_ok=True) # Create the directory if it's not there
model_save_path = os.path.join(MODELS_DIR, "gnn_recommendation_model.pt")


print(f"Starting the full training run for {epochs} epochs...")
print(f"Model will be saved to the absolute path: {os.path.abspath(model_save_path)}")

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    pred = model(train_data)
    ground_truth = train_data['user', 'rates', 'movie'].edge_label
    loss = loss_function(pred, ground_truth)
    loss.backward()
    optimizer.step()
    
    # Validation
    model.eval()
    with torch.no_grad():
        val_pred = model(val_data)
        val_ground_truth = val_data['user', 'rates', 'movie'].edge_label
        val_loss = loss_function(val_pred, val_ground_truth)
        
    val_rmse = torch.sqrt(val_loss)

    # Print progress every 5 epochs
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1:02d}, Train Loss: {loss.item():.4f}, Validation RMSE: {val_rmse.item():.4f}")


print("\n--- Full Training Complete ---")

# --- Save the Trained Model to the Correct Location ---
torch.save(model.state_dict(), model_save_path)
print(f"\nTrained model state saved to: {model_save_path}")

# Final verification to be 100% sure the file exists
if os.path.exists(model_save_path):
    print("\nSUCCESS: File has been verified and exists in the correct 'models' folder!")
else:
    print("\nCRITICAL ERROR: File still not found after saving. Please check folder permissions or disk space.")

Starting the full training run for 50 epochs...
Model will be saved to the absolute path: C:\Users\rahul\OneDrive\Documents\Movie_GNN_Project\models\gnn_recommendation_model.pt
Epoch 05, Train Loss: 2.4819, Validation RMSE: 2.4650
Epoch 10, Train Loss: 10.1088, Validation RMSE: 2.1785
Epoch 15, Train Loss: 2.1751, Validation RMSE: 2.1038
Epoch 20, Train Loss: 0.1227, Validation RMSE: 0.8993
Epoch 25, Train Loss: 0.3314, Validation RMSE: 0.7854
Epoch 30, Train Loss: 0.3916, Validation RMSE: 0.7802
Epoch 35, Train Loss: 0.1891, Validation RMSE: 0.7877
Epoch 40, Train Loss: 0.0471, Validation RMSE: 0.9600
Epoch 45, Train Loss: 0.0774, Validation RMSE: 1.1250
Epoch 50, Train Loss: 0.0664, Validation RMSE: 1.0678

--- Full Training Complete ---

Trained model state saved to: ../models/gnn_recommendation_model.pt

SUCCESS: File has been verified and exists in the correct 'models' folder!
