In [1]:
import ast
import os
import random

from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, Tensor

from torch_sparse import SparseTensor, matmul

from torch_geometric.utils import structured_negative_sampling
from torch_geometric.data import Data, Dataset, download_url, extract_zip
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# defines LightGCN model
class LightGCN(MessagePassing):
    """LightGCN recommender, following https://arxiv.org/abs/2002.02126.

    Holds one learnable embedding table for users and one for items. The
    forward pass propagates the concatenated tables K times over the
    normalized adjacency matrix and averages the K + 1 resulting layers.
    """

    def __init__(self, num_users, num_items, embedding_dim=64, K=3, add_self_loops=False):
        """Builds the embedding tables and stores the propagation settings.

        Args:
            num_users (int): Number of users
            num_items (int): Number of items
            embedding_dim (int, optional): Dimensionality of embeddings. Defaults to 64.
            K (int, optional): Number of message passing layers. Defaults to 3.
            add_self_loops (bool, optional): Forwarded to gcn_norm when the
                adjacency matrix is normalized. Defaults to False.
        """
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.K = K
        self.add_self_loops = add_self_loops

        # e_u^0: layer-0 user embeddings
        self.users_emb = nn.Embedding(self.num_users, self.embedding_dim)
        # e_i^0: layer-0 item embeddings
        self.items_emb = nn.Embedding(self.num_items, self.embedding_dim)

        # Both tables start from N(0, 0.1^2); users are initialized first.
        for table in (self.users_emb, self.items_emb):
            nn.init.normal_(table.weight, std=0.1)

    def forward(self, edge_index: SparseTensor):
        """Runs K propagation steps and averages the per-layer embeddings.

        Args:
            edge_index (SparseTensor): adjacency matrix

        Returns:
            tuple (Tensor): e_u_k, e_u_0, e_i_k, e_i_0 -- the averaged user
            embeddings, the raw user table, the averaged item embeddings and
            the raw item table.
        """
        # \tilde{A}: symmetrically normalized adjacency matrix
        adj_norm = gcn_norm(edge_index, add_self_loops=self.add_self_loops)

        # E^0: user rows first, item rows after them
        layer_emb = torch.cat([self.users_emb.weight, self.items_emb.weight])
        per_layer = [layer_emb]

        # multi-scale diffusion: every step feeds on the previous layer
        for _ in range(self.K):
            layer_emb = self.propagate(adj_norm, x=layer_emb)
            per_layer.append(layer_emb)

        # E^K: mean over the K + 1 layers (layer 0 included)
        emb_final = torch.stack(per_layer, dim=1).mean(dim=1)

        # undo the concatenation: e_u^K and e_i^K
        users_emb_final, items_emb_final = torch.split(
            emb_final, [self.num_users, self.num_items])

        return users_emb_final, self.users_emb.weight, items_emb_final, self.items_emb.weight

    def message(self, x_j: Tensor) -> Tensor:
        # neighbours pass their embedding on unchanged
        return x_j

    def message_and_aggregate(self, adj_t: SparseTensor, x: Tensor) -> Tensor:
        # \tilde{A} @ x as a single sparse matmul
        return matmul(adj_t, x)

In [3]:
def load_node_csv(path, index_col):
    """Builds a node-id lookup from one column of a csv file.

    Args:
        path (str): path to csv file
        index_col (str): column name of index column

    Returns:
        dict: maps each distinct value of ``index_col`` to a consecutive
        integer id (0, 1, 2, ...) in order of first appearance
    """
    frame = pd.read_csv(path, index_col=index_col)
    distinct_ids = frame.index.unique()
    return dict(zip(distinct_ids, range(len(distinct_ids))))


# Folder holding the extracted MovieLens "ml-latest-small" csv files.
# Export MOVIELENS_DIR to point somewhere else instead of editing this cell;
# the default is the location this notebook was originally run with.
DATA_DIR = os.environ.get('MOVIELENS_DIR', '/Users/samanthatan/Desktop/ml-latest-small')
movie_path = os.path.join(DATA_DIR, 'movies.csv')
rating_path = os.path.join(DATA_DIR, 'ratings.csv')

# raw userId / movieId -> consecutive node id (0, 1, 2, ...)
user_mapping = load_node_csv(rating_path, index_col='userId')
movie_mapping = load_node_csv(movie_path, index_col='movieId')

In [4]:
def load_edge_csv(path, src_index_col, src_mapping, dst_index_col, dst_mapping, link_index_col, rating_threshold=4):
    """Loads csv containing edges between users and items

    Only rows whose ``link_index_col`` value is at least ``rating_threshold``
    become edges.

    Args:
        path (str): path to csv file
        src_index_col (str): column name of users
        src_mapping (dict): mapping from raw user id to user node id
        dst_index_col (str): column name of items
        dst_mapping (dict): mapping from raw item id to item node id
        link_index_col (str): column name of user item interaction
        rating_threshold (int or float, optional): Threshold to determine positivity of edge. Defaults to 4.

    Returns:
        torch.Tensor: 2 by N int64 matrix containing the node ids of N user-item edges
            (row 0: users, row 1: items), in csv row order

    Raises:
        KeyError: if a kept row refers to an id missing from the mappings
    """
    df = pd.read_csv(path)

    # Compare the ratings as stored. Truncating them to integers first (as an
    # earlier version did) drops e.g. a 3.5 rating when the threshold is 3.5.
    positive = df[df[link_index_col] >= rating_threshold]

    src = [src_mapping[index] for index in positive[src_index_col]]
    dst = [dst_mapping[index] for index in positive[dst_index_col]]

    # Explicit dtype keeps the result int64 even when no row passes the filter.
    return torch.tensor([src, dst], dtype=torch.long)

In [5]:
# helper function to get N_u
def get_user_positive_items(edge_index):
    """Groups the item end of every edge under its user.

    Args:
        edge_index (torch.Tensor): 2 by N list of edges (row 0: users, row 1: items)

    Returns:
        dict: user -> list of that user's items, in edge order
    """
    users = edge_index[0].tolist()
    items = edge_index[1].tolist()

    positives_by_user = {}
    for user, item in zip(users, items):
        positives_by_user.setdefault(user, []).append(item)
    return positives_by_user

In [6]:
# Build the 2 x N tensor of positive user-movie edges: one column per rating
# row that passes the threshold, expressed in node ids via the two mappings.
edge_index = load_edge_csv(
    rating_path,
    src_index_col='userId',
    src_mapping=user_mapping,
    dst_index_col='movieId',
    dst_mapping=movie_mapping,
    link_index_col='rating',
    rating_threshold=4,
)

In [7]:
# Load the trained LightGCN model
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code -- only load checkpoints that come from a trusted source.
# The result is used directly as a module below, so the checkpoint presumably
# stores the whole model object rather than a state_dict -- confirm.
PATH2 = "lightgcn2.pt"
model = torch.load(PATH2)
model.eval()  # Set model to evaluation mode

LightGCN()

In [8]:
def make_predictions(user_id, num_recs):
    """Returns the titles of the best-scoring movies the user has no positive edge to.

    Every movie is scored by the dot product between its row of
    ``model.items_emb.weight`` and the user's row of ``model.users_emb.weight``.
    Movies listed for the user in ``user_pos_items`` are left out.

    Reads the notebook globals ``model``, ``user_mapping``, ``movie_mapping``,
    ``user_pos_items`` and ``movieid_title``.

    NOTE(review): the scores come from the stored embedding tables, not from
    the propagated embeddings returned by ``model.forward`` -- confirm that
    this is intended.

    Args:
        user_id: raw user id (a key of ``user_mapping``)
        num_recs (int): maximum number of titles to return

    Returns:
        list: movie titles, best score first; shorter than ``num_recs`` when
        there are not enough unseen movies

    Raises:
        KeyError: if ``user_id`` is not in ``user_mapping``
    """
    user = user_mapping[user_id]
    # A user without any positive edge has no entry in user_pos_items.
    seen = set(user_pos_items.get(user, ()))

    with torch.no_grad():
        scores = model.items_emb.weight @ model.users_emb.weight[user]
        # Request enough candidates to still have num_recs left once the seen
        # movies are removed; torch.topk raises if k exceeds the item count.
        k = min(len(seen) + num_recs, scores.numel())
        _, indices = torch.topk(scores, k=k)

    # node id -> raw movieId, built once per call; the first key wins if a
    # node id were ever mapped twice.
    raw_id_of_node = {}
    for raw_id, node in movie_mapping.items():
        raw_id_of_node.setdefault(node, raw_id)

    recommended = [node for node in indices.tolist() if node not in seen][:num_recs]
    return [movieid_title[raw_id_of_node[node]] for node in recommended]

In [9]:
model.eval()
df = pd.read_csv(movie_path)

# raw movieId -> title / genres lookup tables
movieid_title = dict(zip(df['movieId'], df['title']))
movieid_genres = dict(zip(df['movieId'], df['genres']))

# user node id -> item node ids on that user's positive edges
user_pos_items = get_user_positive_items(edge_index)

In [10]:
# Example: titles suggested for one user.
USER_ID = 1    # raw userId, i.e. a key of user_mapping
NUM_RECS = 50  # maximum number of titles to return

make_predictions(USER_ID, NUM_RECS)

['Shawshank Redemption, The (1994)',
 'Pulp Fiction (1994)',
 'Godfather, The (1972)',
 'Terminator 2: Judgment Day (1991)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
 'Apollo 13 (1995)',
 'Lord of the Rings: The Two Towers, The (2002)',
 'Lord of the Rings: The Return of the King, The (2003)',
 'Eternal Sunshine of the Spotless Mind (2004)',
 'Sixth Sense, The (1999)',
 'Aladdin (1992)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Godfather: Part II, The (1974)',
 'Memento (2000)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Dark Knight, The (2008)',
 'Shrek (2001)',
 'Finding Nemo (2003)',
 'Good Will Hunting (1997)',
 'Lion King, The (1994)',
 'Inception (2010)',
 'Kill Bill: Vol. 1 (2003)',
 'Die Hard (1988)',
 'Independence Day (a.k.a. ID4) (1996)',
 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
 'Speed (1994)',
 'Departed, The (2006)',
 'Monsters, Inc. (2001)',
 'Casablanca (1942)',
 'Beautiful Mind, A (2001)',
 'Beauty and the Beast (1

In [11]:
import google.generativeai as genai
import os
import json

# The key is read from the environment so that it never lands in the notebook
# file or its history. The key that used to be hard-coded here has been
# published with the notebook and should be revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    print("WARNING: GEMINI_API_KEY is not set; export it before calling the Gemini API.")

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import re

In [13]:
# Load the datasets from the same files the graph mappings were built from
# (movie_path / rating_path are defined once, near the top of the notebook).
movies_df = pd.read_csv(movie_path)
ratings_df = pd.read_csv(rating_path)

# One row per rating, with the movie's columns attached; ratings whose
# movieId has no match in movies_df are dropped by the inner join.
merged_df = pd.merge(ratings_df, movies_df, on='movieId', how='inner')

In [14]:
# Hold out 20% of the rating rows. The split is over rows, not over users, so
# one user's ratings can land on both sides. random_state pins the shuffle.
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

In [15]:
# Get unique test users: every userId with at least one row in the held-out split
test_users = test_df['userId'].unique()
len(test_users)

610

In [16]:
# Creating a dictionary of user histories:
# userId -> list of {'movieId', 'title', 'rating', 'genres'} dicts, one per
# training row of that user, in train_df row order
user_histories = {}

history_columns = ('userId', 'movieId', 'title', 'rating', 'genres')

# Walk the needed columns in lockstep instead of materializing a Series per row
for user_id, movie_id, title, rating, genres in zip(*(train_df[name] for name in history_columns)):
    # setdefault creates the user's list on first sight, then we append to it
    user_histories.setdefault(user_id, []).append({
        'movieId': movie_id,
        'title': title,
        'rating': rating,
        'genres': genres,
    })

# Display the number of users with a history
len(user_histories)

610

In [51]:
def _parse_recommendation_titles(response_text):
    """Pulls the list of titles out of the model's raw reply.

    Args:
        response_text (str): text returned by the model

    Returns:
        list or None: the titles with a trailing " (YYYY)" removed, or None
        when the reply holds no parsable [...] list
    """
    # Search after the first "{" (the opening of the JSON object); a reply
    # without any brace is searched as a whole.
    payload = response_text.split("{", 1)[-1]
    found = re.search(r'\[.*?\]', payload, re.DOTALL)
    if found is None:
        return None

    literal = found.group(0)
    try:
        titles = json.loads(literal)
    except ValueError:
        # Not strict JSON (single quotes, trailing comma, ...). literal_eval
        # only accepts literals, so unlike eval() it cannot run code that the
        # model put in its reply.
        try:
            titles = ast.literal_eval(literal)
        except (ValueError, SyntaxError, TypeError):
            return None

    if not isinstance(titles, (list, tuple)):
        return None
    return [re.sub(r"\s\(\d{4}\)$", "", title) for title in titles if isinstance(title, str)]


# Results are kept across runs of this cell, so that a run cut short (for
# example by an API quota error) can simply be started again: users that
# already have recommendations are skipped instead of being queried twice.
all_recommendations = globals().get("all_recommendations", {})

for user in test_users:
    if user in all_recommendations:
        continue

    ratings = user_histories.get(user, [])
    prompt = f"Based on the user's previous movie ratings: {ratings}, a LightGCN model has recommended the following movies: {make_predictions(user, 50)}. The model is performing with a precision and recall of 10%. Remove movies that you think does not fit the user, and recommend new movies from the popular MovieLens Latest dataset from https://files.grouplens.org/datasets/movielens/ml-latest-small.zip for the user to watch."
    prompt += """Respond with only the titles in json format: OUTPUT = {
"recommendation": [/* array of movies */]
}"""
    recommendation = get_gemini_recommendations(prompt)

    recommendations_without_years = _parse_recommendation_titles(recommendation)
    if recommendations_without_years is None:
        # Leave the user out so that a later run asks again.
        print(f"Skipping user {user}: no recommendation list found in the model reply")
        continue
    all_recommendations[user] = recommendations_without_years

ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

In [52]:
# Number of users that have recommendations stored so far
len(all_recommendations)

117

In [53]:
def get_high_rated_movies_by_user(movies_df, ratings_df):
    """Lists, per user, the titles of the movies that user rated 4 or higher.

    Args:
        movies_df (pd.DataFrame): needs the columns 'movieId' and 'title'
        ratings_df (pd.DataFrame): needs the columns 'userId', 'movieId' and 'rating'

    Returns:
        dict: userId -> list of titles; users without such a rating are absent
    """
    # attach the title to every rating row
    rated = ratings_df.merge(movies_df, on='movieId', how='inner')

    # keep the ratings >= 4 only
    liked = rated.loc[rated['rating'] >= 4]

    # one list of titles per user
    titles_by_user = liked.groupby('userId')['title'].apply(list)
    return titles_by_user.to_dict()

# Usage
# NOTE(review): this uses the full ratings_df, so the lists also contain the
# movies whose ratings went into train_df -- the same ratings that are shown
# to the LLM as the user's history. Confirm whether the ground truth should be
# limited to the held-out test_df rows instead.
user_high_rated_movies = get_high_rated_movies_by_user(movies_df, ratings_df)

In [54]:
# Initialize counters for cumulative precision@k and recall@k
total_precision_at_k = 0
total_recall_at_k = 0
# Counts only the users that were actually scored. Dividing by all test users
# would treat every user without stored recommendations as a zero.
user_count = 0

# Set the value of k
k = 50  # You can adjust this to any desired value of k

# Iterate through each user to calculate precision@k and recall@k
for user in test_users:
    if user not in all_recommendations:
        continue  # nothing was generated for this user, so nothing to score
    user_count += 1

    test = user_high_rated_movies.get(user, [])
    titles = all_recommendations[user][:k]  # Take only the top k recommendations

    # A recommendation counts as relevant when its title and a liked title
    # contain one another (liked titles carry the year, recommendations do not).
    relevant_recommended = set()
    for recommended_movie in titles:
        if not recommended_movie:
            continue  # "" is a substring of every title and would always match
        for liked_movie in test:
            if recommended_movie in liked_movie or liked_movie in recommended_movie:
                relevant_recommended.add(recommended_movie)
                break  # Break to avoid duplicate addition

    # Calculate precision@k and recall@k for the current user
    precision_at_k = len(relevant_recommended) / k if titles else 0
    recall_at_k = len(relevant_recommended) / len(test) if test else 0

    # Accumulate precision@k and recall@k
    total_precision_at_k += precision_at_k
    total_recall_at_k += recall_at_k

    print(f"User: {user}")
    print("Relevant Recommendations (top k):", relevant_recommended)
    print("Precision@k:", precision_at_k)
    print("Recall@k:", recall_at_k)
    print("-" * 40)

# Calculate average precision@k and recall@k across the scored users
average_precision_at_k = total_precision_at_k / user_count if user_count else 0
average_recall_at_k = total_recall_at_k / user_count if user_count else 0

print(f"Users evaluated: {user_count} of {len(test_users)}")
print(f"Average Precision@{k} across all users:", average_precision_at_k)
print(f"Average Recall@{k} across all users:", average_recall_at_k)

User: 432
Relevant Recommendations (top k): {'Toy Story'}
Precision@k: 0.02
Recall@k: 0.006993006993006993
----------------------------------------
User: 288
Relevant Recommendations (top k): {'Forrest Gump', 'Pulp Fiction'}
Precision@k: 0.04
Recall@k: 0.007246376811594203
----------------------------------------
User: 599
Relevant Recommendations (top k): {'Pulp Fiction', 'Fight Club'}
Precision@k: 0.04
Recall@k: 0.010526315789473684
----------------------------------------
User: 42
Relevant Recommendations (top k): {'Goodfellas', 'Forrest Gump', 'Pulp Fiction'}
Precision@k: 0.06
Recall@k: 0.011583011583011582
----------------------------------------
User: 75
Relevant Recommendations (top k): set()
Precision@k: 0.0
Recall@k: 0.0
----------------------------------------
User: 51
Relevant Recommendations (top k): set()
Precision@k: 0.0
Recall@k: 0.0
----------------------------------------
User: 354
Relevant Recommendations (top k): {'Forrest Gump', 'Pulp Fiction', 'Fight Club'}
Precisi