In [4]:
!pip install pandas

import pandas as pd

# Download MovieLens data
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

# Load the data into pandas DataFrames
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
movies_df = pd.read_csv('ml-latest-small/movies.csv')

[0m--2023-07-20 07:54:59--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.1’


2023-07-20 07:55:01 (941 KB/s) - ‘ml-latest-small.zip.1’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [5]:
from sklearn.preprocessing import LabelEncoder

# Initialize encoders
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

# Fit encoders and transform data
ratings_df['userId'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['movieId'] = movie_encoder.fit_transform(ratings_df['movieId'])

In [6]:
ratings_df,\
ratings_df['userId'].max(),\
ratings_df['movieId'].max()

(        userId  movieId  rating   timestamp
 0            0        0     4.0   964982703
 1            0        2     4.0   964981247
 2            0        5     4.0   964982224
 3            0       43     5.0   964983815
 4            0       46     5.0   964982931
 ...        ...      ...     ...         ...
 100831     609     9416     4.0  1493848402
 100832     609     9443     5.0  1493850091
 100833     609     9444     5.0  1494273047
 100834     609     9445     5.0  1493846352
 100835     609     9485     3.0  1493846415
 
 [100836 rows x 4 columns],
 609,
 9723)

In [8]:
import torch
from torch_geometric.data import Data

user_num, movie_num = ratings_df['userId'].max()+1, ratings_df['movieId'].max()+1
ratings_df['movieId'] += user_num

# Build edge index (source and target nodes for each edge)
edge_index = torch.tensor(ratings_df[['userId', 'movieId']].values, dtype=torch.long).t().contiguous()

# Create node features (here just one-hot encoding)
x = torch.eye( user_num+movie_num, dtype=torch.float)

# Create edge features (ratings)
edge_attr = torch.tensor(ratings_df[['rating']].values, dtype=torch.float)

# Create graph data
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)


ModuleNotFoundError: No module named 'torch_geometric'

In [21]:
data,\
x, x.shape, \
edge_index, edge_index.shape,\
edge_attr, edge_attr.shape

(Data(x=[10334, 10334], edge_index=[2, 100836], edge_attr=[100836, 1]),
 tensor([[1., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.]]),
 torch.Size([10334, 10334]),
 tensor([[    0,     0,     0,  ...,   609,   609,   609],
         [  610,   612,   615,  ..., 10054, 10055, 10095]]),
 torch.Size([2, 100836]),
 tensor([[4.],
         [4.],
         [4.],
         ...,
         [5.],
         [5.],
         [3.]]),
 torch.Size([100836, 1]))

In [22]:
from torch_geometric.nn import GCNConv

class RecommenderSystem(torch.nn.Module):
    def __init__(self):
        super(RecommenderSystem, self).__init__()
        self.conv1 = GCNConv(data.num_node_features, 128)
        self.conv2 = GCNConv(128, 64)
        self.embed = torch.nn.Linear(64, 32)  # output embedding of size 32

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = torch.nn.functional.relu(x)
        x = self.conv2(x, edge_index)
        x = torch.nn.functional.relu(x)
        x = self.embed(x)  # output a dense embedding for each node

        return x  # now x has shape [num_nodes, 32]

In [24]:
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
model = RecommenderSystem().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-3)

# Build adjusted edge index
edge_index = torch.tensor(ratings_df[['userId', 'movieId']].values, dtype=torch.long).t().contiguous()

# All the other parts are the same...

# Split the edges into train and test
edge_train, edge_test, edge_attr_train, edge_attr_test = train_test_split(
    edge_index.T, edge_attr, test_size=0.2, random_state=42)

# Create train and test Data objects
data_train = Data(x=x, edge_index=edge_train.T, edge_attr=edge_attr_train)
data_test = Data(x=x, edge_index=edge_test.T, edge_attr=edge_attr_test)

# Move data to device
data_train = data_train.to(device)
data_test = data_test.to(device)

# For the model, adjust embedding fetching 
for epoch in range(10000):
    optimizer.zero_grad()
    embeddings = model(data_train)
    user_embeddings = embeddings[data_train.edge_index[0]]
    movie_embeddings = embeddings[data_train.edge_index[1]]
    predictions = (user_embeddings * movie_embeddings).sum(dim=1)
    loss = torch.nn.functional.mse_loss(predictions, data_train.edge_attr.squeeze(1))
    loss.backward()
    optimizer.step()

    # print training loss every 20 epochs
    if epoch % 100 == 0:
        print(f"Epoch: {epoch}, Training MSE: {loss.item()}")

# Evaluation
model.eval()
with torch.no_grad():
    embeddings = model(data_test)
    user_embeddings = embeddings[data_test.edge_index[0]]
    movie_embeddings = embeddings[data_test.edge_index[1]]
    predictions = (user_embeddings * movie_embeddings).sum(dim=1)
    loss = torch.nn.functional.mse_loss(predictions, data_test.edge_attr.squeeze(1))
    print(f"Test MSE: {loss.item()}")
model.train()


Epoch: 0, Training MSE: 12.040426254272461
Epoch: 100, Training MSE: 1.3198031187057495
Epoch: 200, Training MSE: 0.7992702126502991
Epoch: 300, Training MSE: 0.7833868861198425
Epoch: 400, Training MSE: 0.7688291668891907
Epoch: 500, Training MSE: 0.7556441426277161
Epoch: 600, Training MSE: 0.7491211295127869
Epoch: 700, Training MSE: 0.743732213973999
Epoch: 800, Training MSE: 0.7376899123191833
Epoch: 900, Training MSE: 0.7319514751434326
Epoch: 1000, Training MSE: 0.7270088791847229
Epoch: 1100, Training MSE: 0.7241324186325073
Epoch: 1200, Training MSE: 0.7227562665939331
Epoch: 1300, Training MSE: 0.7215970158576965
Epoch: 1400, Training MSE: 0.7214637398719788
Epoch: 1500, Training MSE: 0.7215844988822937
Epoch: 1600, Training MSE: 0.7200896739959717
Epoch: 1700, Training MSE: 0.7199994325637817
Epoch: 1800, Training MSE: 0.7201172113418579
Epoch: 1900, Training MSE: 0.7192979454994202
Epoch: 2000, Training MSE: 0.7201182842254639
Epoch: 2100, Training MSE: 0.7198350429534912
E

RecommenderSystem(
  (conv1): GCNConv(10334, 128)
  (conv2): GCNConv(128, 64)
  (embed): Linear(in_features=64, out_features=32, bias=True)
)

In [25]:
def get_recommendations(user_id, top_n=5):
    model.eval()
    with torch.no_grad():
        embeddings = model(data)
        
        # Compute scores for each movie for the given user
        user_embedding = embeddings[user_id]
        movie_embeddings = embeddings[user_num:]  # only consider movie nodes
        scores = (user_embedding * movie_embeddings).sum(dim=1)
        
        # Exclude movies the user has already rated
        rated_movies = data.edge_index[1][data.edge_index[0] == user_id]
        scores[rated_movies] = -float('inf')

        # Get the indices of the top N scores
        top_movie_ids = scores.topk(top_n).indices

        print(f"Top {top_n} recommendations for user {user_id}: {top_movie_ids.tolist()}")

# Get recommendations for user 0
get_recommendations(0)

Top 5 recommendations for user 0: [694, 686, 862, 602, 277]
