In [None]:
from torch_geometric.datasets import AmazonBook, MovieLens
from torch_geometric.transforms import Compose, ToDevice, ToUndirected
import torch
from torch_geometric.data import Data
import torch
from torch_geometric.utils import train_test_split_edges
from torch_sparse import SparseTensor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("You are using device: %s" % device)
transform = Compose([ToDevice(device)])
movielens_dataset = MovieLens(root="./data/MovieLens", transform=transform, model_name='all-MiniLM-L6-v2')
print(f"Dataset: {movielens_dataset}")
print(f"Number of graphs in dataset: {len(movielens_dataset)}")
print(f"Number of features of dataset: {movielens_dataset.num_features}")

You are using device: cpu
Dataset: MovieLens()
Number of graphs in dataset: 1
Number of features of dataset: {'movie': 404, 'user': 0}


In [2]:
import torch
from torch_geometric.datasets import MovieLens
from torch_geometric.transforms import Compose, ToDevice, ToUndirected

# Set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Create transform pipeline
transform = Compose([
    ToDevice(device),  # Move data to GPU if available
])

# Load MovieLens dataset
movielens_dataset = MovieLens(
    root="./data/MovieLens",
    transform=transform,
    model_name='all-MiniLM-L6-v2'  # This model will be used to create movie features
)

# Basic dataset information
print("\n=== Dataset Overview ===")
print(f"Dataset: {movielens_dataset}")
print(f"Number of graphs: {len(movielens_dataset)}")

# Get the first (and only) graph
data = movielens_dataset[0]

# Detailed graph information
print("\n=== Graph Structure ===")
print(f"Node types: {data.node_types}")  # Should show ['movie', 'user']
print(f"Edge types: {data.edge_types}")  # Should show [('user', 'rates', 'movie')]

# Get specific node information
print("\n=== Node Information ===")
print(f"Number of users: {data['user'].num_nodes}")
print(f"Number of movies: {data['movie'].num_nodes}")
print(f"Movie feature dimension: {data['movie'].x.size(1)}")  # From sentence transformer
print(f"User feature dimension: {data['user'].x.size(1) if 'x' in data['user'] else 'No features'}")

# Get edge information
print("\n=== Edge Information ===")
edge_index = data[('user', 'rates', 'movie')].edge_index
ratings = data[('user', 'rates', 'movie')].edge_label
print(f"Number of ratings: {ratings.size(0)}")
print(f"Rating range: [{ratings.min():.1f}, {ratings.max():.1f}]")

# Rating distribution
print("\n=== Rating Distribution ===")
for rating in torch.arange(0.5, 5.5, 0.5):
    count = (ratings == rating).sum().item()
    print(f"Rating {rating:.1f}: {count} ratings")

# Device information
print("\n=== Device Information ===")
print(f"Data is on: {'CUDA' if data.is_cuda else 'CPU'}")

# Let's prepare the data for LightGCN
# Filter ratings >= 4 for positive interactions
mask = ratings >= 4.0
filtered_edge_index = edge_index[:, mask]
filtered_ratings = ratings[mask]

print("\n=== Filtered Data for LightGCN ===")
print(f"Original number of edges: {ratings.size(0)}")
print(f"Filtered edges (ratings >= 4.0): {filtered_ratings.size(0)}")
print(f"Percentage of positive interactions: {(filtered_ratings.size(0)/ratings.size(0)*100):.2f}%")

# Save important values for model initialization
num_users = data['user'].num_nodes
num_movies = data['movie'].num_nodes
embedding_dim = 64  # You can adjust this

print("\n=== Model Parameters ===")
print(f"Number of users: {num_users}")
print(f"Number of movies: {num_movies}")
print(f"Embedding dimension: {embedding_dim}")

Using device: cpu

=== Dataset Overview ===
Dataset: MovieLens()
Number of graphs: 1

=== Graph Structure ===
Node types: ['movie', 'user']
Edge types: [('user', 'rates', 'movie')]

=== Node Information ===
Number of users: 610
Number of movies: 9742
Movie feature dimension: 404
User feature dimension: No features

=== Edge Information ===
Number of ratings: 100836
Rating range: [0.0, 5.0]

=== Rating Distribution ===
Rating 0.5: 0 ratings
Rating 1.0: 4602 ratings
Rating 1.5: 0 ratings
Rating 2.0: 13101 ratings
Rating 2.5: 0 ratings
Rating 3.0: 33183 ratings
Rating 3.5: 0 ratings
Rating 4.0: 35369 ratings
Rating 4.5: 0 ratings
Rating 5.0: 13211 ratings

=== Device Information ===
Data is on: CPU

=== Filtered Data for LightGCN ===
Original number of edges: 100836
Filtered edges (ratings >= 4.0): 48580
Percentage of positive interactions: 48.18%

=== Model Parameters ===
Number of users: 610
Number of movies: 9742
Embedding dimension: 64


In [3]:
import pandas as pd

# Read the CSV files
movies_df = pd.read_csv('data/MovieLens/raw/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/MovieLens/raw/ml-latest-small/ratings.csv')
links_df = pd.read_csv('data/MovieLens/raw/ml-latest-small/links.csv')
tags_df = pd.read_csv('data/MovieLens/raw/ml-latest-small/tags.csv')

print("Movies Sample:")
print(movies_df.head())
print("\nRatings Sample:")
print(ratings_df.head())

Movies Sample:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings Sample:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [4]:
dataset = MovieLens(root='data/MovieLens')
data = dataset[0]
edge_index = data[('user', 'rates', 'movie')].edge_index 

In [5]:
# we have a problem here: Notice that the 
print(edge_index)

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    2,    5,  ..., 9462, 9463, 9503]])


In [6]:
print(edge_index[:, :10]) 


tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  0,   2,   5,  43,  46,  62,  89,  97, 124, 130]])


In [7]:
import pandas as pd 
ratings_df = pd.read_csv('data/MovieLens/raw/ml-latest-small/ratings.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [8]:
user_ids, user_mapping = pd.factorize(ratings_df['userId'])
movie_ids, movie_mapping = pd.factorize(ratings_df['movieId'])

In [9]:
ratings_df['user_idx'] = user_ids
ratings_df['movie_idx'] = movie_ids

In [10]:
ratings_df['positive'] = (ratings_df['rating'] >= 4).astype(int)


In [11]:
print("Original vs Mapped IDs (first 5 rows):")
print(ratings_df[['userId', 'user_idx', 'movieId', 'movie_idx', 'rating', 'positive']].head())
print(f"\nNumber of unique users: {len(user_mapping)}")
print(f"Number of unique movies: {len(movie_mapping)}")

Original vs Mapped IDs (first 5 rows):
   userId  user_idx  movieId  movie_idx  rating  positive
0       1         0        1          0     4.0         1
1       1         0        3          1     4.0         1
2       1         0        6          2     4.0         1
3       1         0       47          3     5.0         1
4       1         0       50          4     5.0         1

Number of unique users: 610
Number of unique movies: 9724


In [12]:
print("\nAll columns in our DataFrame:")
print(ratings_df.columns)  

positive_interactions = ratings_df[ratings_df['positive'] == 1]
print("\nPositive interactions (first 5):")
print(positive_interactions[['user_idx', 'movie_idx']].head())

user0_ratings = ratings_df[ratings_df['user_idx'] == 0]
print("\nAll ratings by user_idx 0:")
print(user0_ratings[['movie_idx', 'rating']].head())

print("\nDataset Statistics:")
print(f"Total ratings: {len(ratings_df)}")
print(f"Positive ratings (≥4): {ratings_df['positive'].sum()}")
print(f"Average rating: {ratings_df['rating'].mean():.2f}")


All columns in our DataFrame:
Index(['userId', 'movieId', 'rating', 'timestamp', 'user_idx', 'movie_idx',
       'positive'],
      dtype='object')

Positive interactions (first 5):
   user_idx  movie_idx
0         0          0
1         0          1
2         0          2
3         0          3
4         0          4

All ratings by user_idx 0:
   movie_idx  rating
0          0     4.0
1          1     4.0
2          2     4.0
3          3     5.0
4          4     5.0

Dataset Statistics:
Total ratings: 100836
Positive ratings (≥4): 48580
Average rating: 3.50


In [13]:

edge_index = torch.tensor([
    ratings_df[ratings_df['positive'] == 1]['user_idx'].values,      
    ratings_df[ratings_df['positive'] == 1]['movie_idx'].values      
], dtype=torch.long)

num_users = len(user_mapping)         
num_movies = len(movie_mapping)        
embedding_dim = 64                     
print(edge_index)
print(f"Edge index shape: {edge_index.shape}")
print(f"Number of users: {num_users}")
print(f"Number of movies: {num_movies}")

edge_index[1] += num_users # since we have been shifting the idxs, we have to make sure that node idxs and movies idxs don't overlap. Shift again accordingly
print(edge_index)
print("\nAfter shifting movie indices:")
print(f"User index range: [0, {num_users-1}]")
print(f"Movie index range: [{num_users}, {num_users + num_movies-1}]")

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 2035, 3121, 1392]])
Edge index shape: torch.Size([2, 48580])
Number of users: 610
Number of movies: 9724
tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [ 610,  611,  612,  ..., 2645, 3731, 2002]])

After shifting movie indices:
User index range: [0, 609]
Movie index range: [610, 10333]


  edge_index = torch.tensor([


In [14]:
# #example : 
# userId  movieId  rating
# 1       50       4.5
# 1       22       5.0
# 2       50       4.0

# edge_index = [
#     [0,    0,    1],      # mapped user IDs
#     [610,  611,  610]     # mapped movie IDs (shifted by num_users)
# ]

In [15]:
nodes = num_users + num_movies
nodes

10334

In [16]:
edge_index

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [ 610,  611,  612,  ..., 2645, 3731, 2002]])

In [17]:
data = data = Data(
    edge_index=edge_index,
    num_nodes=num_users + num_movies  # total number of nodes
)

In [None]:

data = train_test_split_edges(data, val_ratio=0.1, test_ratio=0.1)




In [19]:
# use SparseTensor to build the adjacency matrix efficiently
train_sparse = SparseTensor(
    row=data.train_pos_edge_index[0],
    col=data.train_pos_edge_index[1],
    sparse_sizes=(num_users + num_movies, num_users + num_movies)
)

val_sparse = SparseTensor(
    row=data.val_pos_edge_index[0],
    col=data.val_pos_edge_index[1],
    sparse_sizes=(num_users + num_movies, num_users + num_movies)
)

test_sparse = SparseTensor(
    row=data.test_pos_edge_index[0],
    col=data.test_pos_edge_index[1],
    sparse_sizes=(num_users + num_movies, num_users + num_movies)
)

print(f"Train edges: {data.train_pos_edge_index.shape[1]}")
print(f"Val edges: {data.val_pos_edge_index.shape[1]}")
print(f"Test edges: {data.test_pos_edge_index.shape[1]}")


Train edges: 77728
Val edges: 4858
Test edges: 4858


In [20]:
train_sparse

SparseTensor(row=tensor([    0,     0,     0,  ..., 10317, 10320, 10322]),
             col=tensor([610, 611, 612,  ..., 609, 609, 609]),
             size=(10334, 10334), nnz=77728, density=0.07%)

In [21]:
torch.save({
    'train_sparse': train_sparse,
    'val_sparse': val_sparse,
    'test_sparse': test_sparse
}, 'sparse_tensors.pt')