In [2]:
import json
from llamaapi import LlamaAPI

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [10]:
# Load the datasets
movies_df = pd.read_csv(MOVIES_FILE_PATH)
ratings_df = pd.read_csv(RATINGS_FILE_PATH)

merged_df = pd.merge(ratings_df, movies_df, on='movieId', how='inner')
merged_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [11]:
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

In [112]:
# Get unique test users
test_users = test_df['userId'].unique()
len(test_users)

610

In [13]:
# Creating a dictionary of user histories
user_histories = {}

# Iterate over each row in the merged DataFrame
for _, row in train_df.iterrows():
    user_id = row['userId']
    movie_info = {
        'movieId': row['movieId'],
        'title': row['title'],
        'rating': row['rating'],
        'genres': row['genres']
    }
    
    # Initialize list if user_id is not yet in dictionary
    if user_id not in user_histories:
        user_histories[user_id] = []
    
    # Append the movie info to the user's history
    user_histories[user_id].append(movie_info)

# Display the user histories
len(user_histories)

610

In [113]:
# Initialize the SDK
llama = LlamaAPI(LLAMA_API_KEY)

def get_llama_recommendations(prompt):
    # Build the API request
    api_request_json = {
        "model": "llama3.2-3b",
        "max_tokens": 2048,
        "temperature": 0,
        "top_p": 1.0,
        "messages": [
            {
                "role": "system",
                "content": f"You are a movie recommendation assistant.",
            },
            {"role": "user", "content": prompt},
        ],
    }

    # Execute the Request
    response = llama.run(api_request_json)
    return (response.json())
    # print(json.dumps(response.json(), indent=2))

In [121]:
all_recommendations = {}

for user in test_users:
    ratings = user_histories.get(user)
    prompt = f"Based on the user's previous movie ratings: {ratings}, recommend 50 new movies from the popular MovieLens Latest dataset for the user to watch." 
    prompt += """Respond with only the titles in json format: OUTPUT = {
"recommendation": [/* array of movies */]
}"""
    recommendation = get_llama_recommendations(prompt)
    content = recommendation['choices'][0]['message']['content']
    movies_with_year = re.findall(r'"([^"]+)"', content)
    movies_without_year = [re.sub(r'\s*\(\d{4}\)', '', movie) for movie in movies_with_year]
    all_recommendations[user] = movies_without_year

In [122]:
def get_high_rated_movies_by_user(movies_df, ratings_df):
    # Merge movies and ratings data to include movie titles
    merged_df = pd.merge(ratings_df, movies_df, on='movieId', how='inner')
    
    # Filter for ratings >= 4
    high_rated_df = merged_df[merged_df['rating'] >= 4]
    
    # Group by userId and collect titles in a list for each user
    user_high_rated_movies = high_rated_df.groupby('userId')['title'].apply(list).to_dict()
    
    return user_high_rated_movies

# Usage
user_high_rated_movies = get_high_rated_movies_by_user(movies_df, ratings_df)

In [127]:
# Initialize counters for cumulative precision@k and recall@k
total_precision_at_k = 0
total_recall_at_k = 0
user_count = len(test_users)

# Set the value of k
k = 10  # You can adjust this to any desired value of k

# Iterate through each user to calculate precision@k and recall@k
for user in test_users:
    test = user_high_rated_movies.get(user, [])
    titles = all_recommendations.get(user, [])[:k]  # Take only the top k recommendations

    # Find relevant recommendations based on fuzzy matching (limited to top k recommendations)
    relevant_recommended = set()
    for recommended_movie in titles:
        for liked_movie in test:
            if recommended_movie in liked_movie or liked_movie in recommended_movie:
                relevant_recommended.add(recommended_movie)
                break  # Break to avoid duplicate addition

    # Calculate precision@k and recall@k for the current user
    precision_at_k = len(relevant_recommended) / k if titles else 0
    recall_at_k = len(relevant_recommended) / len(test) if test else 0

    # Accumulate precision@k and recall@k
    total_precision_at_k += precision_at_k
    total_recall_at_k += recall_at_k

    print(f"User: {user}")
    print("Relevant Recommendations (top k):", relevant_recommended)
    print("Precision@k:", precision_at_k)
    print("Recall@k:", recall_at_k)
    print("-" * 40)

# Calculate average precision@k and recall@k across all users
average_precision_at_k = total_precision_at_k / user_count if user_count else 0
average_recall_at_k = total_recall_at_k / user_count if user_count else 0

print(f"Average Precision@{k} across all users:", average_precision_at_k)
print(f"Average Recall@{k} across all users:", average_recall_at_k)

User: 432
Relevant Recommendations (top k): {"Pan's Labyrinth"}
Precision@k: 0.1
Recall@k: 0.006993006993006993
----------------------------------------
User: 288
Relevant Recommendations (top k): set()
Precision@k: 0.0
Recall@k: 0.0
----------------------------------------
User: 599
Relevant Recommendations (top k): {'Pulp Fiction'}
Precision@k: 0.1
Recall@k: 0.005263157894736842
----------------------------------------
User: 42
Relevant Recommendations (top k): {"Schindler's List", 'Pulp Fiction'}
Precision@k: 0.2
Recall@k: 0.007722007722007722
----------------------------------------
User: 75
Relevant Recommendations (top k): set()
Precision@k: 0.0
Recall@k: 0.0
----------------------------------------
User: 51
Relevant Recommendations (top k): set()
Precision@k: 0.0
Recall@k: 0.0
----------------------------------------
User: 354
Relevant Recommendations (top k): set()
Precision@k: 0.0
Recall@k: 0.0
----------------------------------------
User: 416
Relevant Recommendations (top k)