## Tutorial: From Traditional to LLM-Based Recommendations Using MovieLens Dataset

#### Objective
- Transitioning from traditional recommendation methods to LLM-based approaches.
- Challenges: Scalability, cold start problem, lack of context-awareness.

#### Setting Up the Environment

**Install Required Libraries**

In [34]:
# %pip install numpy pandas scikit-learn matplotlib surprise transformers torch
# %pip install rapidfuzz

In [35]:
# ### Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

import json
import yaml
import openai
from openai import OpenAI

# Load API keys from YAML file
with open('./../../../Curify/curify_api.yaml', 'r') as yaml_file:
    data = yaml.safe_load(yaml_file)

openai_api_key = data.get('openai').get('api_key')  # Assuming OpenAI API key is stored under 'openai'
client = OpenAI(api_key=openai_api_key)

#### Step 1: Load MovieLens Dataset


In [36]:
import pandas as pd

# Load MovieLens Dataset
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
movielens_zip = "movielens.zip"
movielens_folder = "./../Data/ml-1m/"

# Load the dataset
ratings = pd.read_csv(f"{movielens_folder}ratings.dat", 
                      sep="::", 
                      engine="python", 
                      names=["userId", "movieId", "rating", "timestamp"])

movies = pd.read_csv(f"{movielens_folder}movies.dat", 
                     sep="::", 
                     engine="python", 
                     encoding="ISO-8859-1", 
                     header=None, 
                     names=["movieId", "title", "genres"])

# Create title-to-ID lookup dictionary
movie_lookup = movies.set_index('title')['movieId'].to_dict()

# Merge ratings with movie titles
ratings = ratings.merge(movies, on="movieId")

# Convert timestamp to datetime for easier handling
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

# ====== Temporal Split (Recommended) ======
# Use a fixed percentile (e.g., 80%) as cutoff
cutoff_time = ratings['timestamp'].quantile(0.9)

# Split by timestamp
train_data = ratings[ratings['timestamp'] <= cutoff_time].copy()
test_data = ratings[ratings['timestamp'] > cutoff_time].copy()

# Optionally filter test users that also exist in training set
test_data = test_data[test_data['userId'].isin(train_data['userId'].unique())]

# Optional: reset index
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# ====== Summary Statistics (optional) ======
print("Train samples:", len(train_data))
print("Test samples :", len(test_data))
print("Time cutoff  :", cutoff_time)
print("Number of test users  :", len(test_data['userId'].unique()))


Train samples: 900188
Test samples : 95812
Time cutoff  : 2000-12-29 23:42:56.400000
Number of test users  : 1180


In [37]:
test_data['userId'].value_counts()

userId
1088    1014
1447     985
678      936
424      885
531      860
        ... 
997        1
2583       1
2330       1
2505       1
2059       1
Name: count, Length: 1180, dtype: int64

#### Step 2: Traditional Model (Collaborative Filtering and Hybrid)


In [38]:
from surprise import Dataset, Reader, SVD
import pandas as pd

# Train SVD model
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
model = SVD()
model.fit(trainset)

def traditional_recommendation(train_data, user_id, top_k=5, use_genres=False):
    """
    Optimized traditional recommendation using collaborative filtering (SVD).
    Efficiently filters unseen movies and returns top-k recommendations.

    Args:
        train_data (pd.DataFrame): ['userId', 'movieId', 'rating', 'genres'].
        user_id (int): Target user.
        top_k (int): Number of recommendations.
        use_genres (bool): Placeholder flag (not used in this version).

    Returns:
        list: Top-k recommended movie IDs.
    """

    # Get all movie IDs not rated by the user
    rated_movies = train_data[train_data['userId'] == user_id]['movieId'].unique()
    all_movies = train_data['movieId'].unique()
    unseen_movies = list(set(all_movies) - set(rated_movies))

    # Predict ratings only for unseen movies
    predictions = [(movie_id, model.predict(user_id, movie_id).est) for movie_id in unseen_movies]

    # Get top-k movie IDs
    top_k_movies = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_k]
    return [movie_id for movie_id, _ in top_k_movies]

#### Step 3: LLM-Based Model (IDs Only and IDs with Genres)

In [39]:
import openai
from rapidfuzz import process

# Build title-to-ID dictionary
movie_lookup = movies.set_index('title')['movieId'].to_dict()

# Fuzzy matching function
def map_title_to_id(title, threshold=80):
    match, score, _ = process.extractOne(title, movie_lookup.keys())
    return movie_lookup[match] if score >= threshold else None

def llm_recommendation(train_data, user_id, top_k=5, openai_api_key=None, model_name="gpt-4o-mini"):
    """
    LLM-based recommendation using GPT-4o-mini API with structured prompts.
    
    Args:
        train_data (pd.DataFrame): DataFrame with ['userId', 'movieId', 'title', 'rating'].
        user_id (int): ID of the target user.
        top_k (int): Number of recommendations.
        openai_api_key (str): Your OpenAI API key.
        model_name (str): Model name (default: gpt-4o-mini).
    
    Returns:
        list: Top-k recommended movie IDs.
    """
    openai.api_key = openai_api_key

    # Filter user's ratings
    user_ratings = train_data[train_data['userId'] == user_id]
    liked = user_ratings[user_ratings['rating'] >= 4].nlargest(10, 'rating')
    disliked = user_ratings[user_ratings['rating'] <= 2].nsmallest(10, 'rating')

    # Get movie titles
    liked_titles = [row['title'] for _, row in liked.iterrows()]
    disliked_titles = [row['title'] for _, row in disliked.iterrows()]

    # Construct prompt with role-play
    prompt = (
        f"You are a helpful movie recommendation assistant.\n"
        f"The user USER_{user_id} liked the following movies: {', '.join(liked_titles)}.\n"
        f"The user USER_{user_id} disliked these movies: {', '.join(disliked_titles)}.\n"
        f"Please recommend exactly {top_k} movies that are similar to the liked ones and different from the disliked ones.\n"
        f"Output only the recommended movie titles, sorted by their relevance to the user's preferences, separated by commas."
    )

    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.9,
        max_tokens=1024
    )
    # print(prompt)
    # Extract and parse output
    generated_text = response.choices[0].message.content.strip()
    recommended_titles = [title.strip() for title in generated_text.split(",")]
    # print(generated_text)
    # Map recommended titles to movie IDs
    recommended_ids = [map_title_to_id(title) for title in recommended_titles]
    return [movie_id for movie_id in recommended_ids if movie_id is not None]


#### Step 4: Evaluating Model Performance

In [40]:
def generate_recommendations(train_data, test_data, model_type="traditional", use_genres=False, top_k=5):
    """
    Generate recommendations for all users in the test data.
    
    Args:
        train_data (pd.DataFrame): Training data with columns ['userId', 'movieId', 'rating', 'genres'].
        test_data (pd.DataFrame): Test data with columns ['userId', 'movieId', 'rating', 'genres'].
        model_type (str): Type of model to use ("traditional" or "llm").
        use_genres (bool): Whether to use genres in the model.
        top_k (int): Number of recommendations to generate.
    
    Returns:
        dict: A dictionary mapping user IDs to their top-k recommendations.
    """
    user_recommendations = {}
    unique_users = test_data['userId'].unique()
    
    for user_id in unique_users:
        if model_type == "traditional":
            recommendations = traditional_recommendation(train_data, user_id, top_k, use_genres)
        elif model_type == "llm":
            recommendations = llm_recommendation(train_data, user_id, top_k, use_genres)
        else:
            raise ValueError("Invalid model_type. Choose 'traditional' or 'llm'.")
        
        user_recommendations[user_id] = recommendations
    
    return user_recommendations

In [41]:
from sklearn.metrics import ndcg_score
from collections import Counter
import numpy as np
import math

def evaluate_recommendations(test_data, user_recommendations, top_k=5):
    """
    Evaluate recommendations using Recall@k, Precision@k, NDCG@k,
    Effective Catalog Ratio, and Entropy-based Diversity.
    
    Args:
        test_data (pd.DataFrame): Test data with ['userId', 'movieId', 'rating'].
        user_recommendations (dict): {userId: [recommended_movieIds]}
        movies (pd.DataFrame): Movie catalog with 'movieId'.
        top_k (int): Number of recommendations per user.
    
    Returns:
        dict: Evaluation metrics.
    """
    recall_scores = []
    precision_scores = []
    ndcg_scores = []

    all_recommended_items = []

    for user_id, recommendations in user_recommendations.items():
        # Ground truth positives
        user_positive = test_data[
            (test_data['userId'] == user_id) & (test_data['rating'] >= 4)
        ]['movieId'].tolist()

        if len(user_positive) == 0:
            continue

        # Accumulate recommended items for diversity metrics
        all_recommended_items.extend(recommendations)

        # Recall and Precision
        relevant = len(set(recommendations) & set(user_positive))
        recall_scores.append(relevant / len(user_positive))
        precision_scores.append(relevant / top_k)

        # NDCG
        relevance = [1 if movie_id in user_positive else 0 for movie_id in recommendations]
        # Pad relevance vector if fewer than top_k
        if len(relevance) < top_k:
            relevance += [0] * (top_k - len(relevance))

        # Simulated predicted scores from top_k to 1
        predicted_scores = list(range(top_k, 0, -1))  # simulate rank positions as scores
        try:
            ndcg = ndcg_score([relevance], [predicted_scores], k=len(relevance))
            ndcg_scores.append(ndcg)
        except Exception as e:
            print(f"Skipping NDCG for user {user_id}: {e}")

    # Catalog Coverage Ratio
    unique_recommended = len(set(all_recommended_items))
    total_catalog_size = movies['movieId'].nunique()
    catalog_coverage_ratio = unique_recommended / total_catalog_size

    # Entropy-based Diversity
    item_freq = Counter(all_recommended_items)
    total_recs = sum(item_freq.values())
    entropy = -sum((freq / total_recs) * math.log2(freq / total_recs) for freq in item_freq.values())
    normalized_entropy = entropy / math.log2(total_catalog_size) if total_catalog_size > 0 else 0

    # Final Metrics
    if len(recall_scores) == 0:
        return {
            "Recall@k": 0.0,
            "Precision@k": 0.0,
            "NDCG@k": 0.0,
            "Catalog Coverage Ratio": 0.0,
            "Entropy Diversity": 0.0,
            "Total users": 0
        }

    metrics = {
        "Recall@k": np.mean(recall_scores),
        "Precision@k": np.mean(precision_scores),
        "NDCG@k": np.mean(ndcg_scores),
        "Catalog Coverage Ratio": catalog_coverage_ratio,
        "Entropy Diversity": normalized_entropy,
        "Total users": len(user_recommendations.items())
    }

    return metrics


In [42]:
import time
from sklearn.metrics import ndcg_score
import numpy as np

# Function to evaluate all four methods
def evaluate_all_methods(train_data, test_data, top_k=5):
    """
    Evaluate all four recommendation methods and time the entire operation.
    
    Args:
        train_data (pd.DataFrame): Training data with columns ['userId', 'movieId', 'rating', 'genres'].
        test_data (pd.DataFrame): Test data with columns ['userId', 'movieId', 'rating', 'genres'].
        top_k (int): Number of recommendations to evaluate.
    
    Returns:
        dict: A dictionary containing evaluation metrics and execution times for all methods.
    """
    results = {}
    
    # Method 1: Traditional CF (User and Item IDs Only)
    start_time = time.time()
    user_recommendations = generate_recommendations(train_data, test_data, model_type="traditional", use_genres=False, top_k=top_k)
    metrics = evaluate_recommendations(test_data, user_recommendations, top_k=top_k)
    execution_time = time.time() - start_time
    results["traditional_cf"] = {"metrics": metrics, "execution_time": execution_time}
    
    # Method 2: LLM Prompt with titles Only
    start_time = time.time()
    user_recommendations = generate_recommendations(train_data, test_data, model_type="llm", use_genres=False, top_k=top_k)
    metrics = evaluate_recommendations(test_data, user_recommendations, top_k=top_k)
    execution_time = time.time() - start_time
    results["llm_titles_only"] = {"metrics": metrics, "execution_time": execution_time}
    
    return results

In [43]:
import json

# test_data = test_data[test_data['userId'].isin([1088, 1447,678, 531, 424, 997, 2583, 2330, 2505, 2059])]  # Filter test data for specific users

results_5 = evaluate_all_methods(train_data, test_data, top_k=5)
results_10 = evaluate_all_methods(train_data, test_data, top_k=10)
results_20 = evaluate_all_methods(train_data, test_data, top_k=20)

# Convert metrics and execution time to serializable format
def serialize_results(results_dict):
    serializable = {}
    for method, result in results_dict.items():
        serializable[method] = {
            "metrics": {k: float(v) for k, v in result["metrics"].items()},
            "execution_time": float(result["execution_time"])
        }
    return serializable

# Serialize both results
results_combined = {
    "results_top_5": serialize_results(results_5),
    "results_top_10": serialize_results(results_10),
    "results_top_20": serialize_results(results_20)
}

# Save to JSON file
with open("recommendation_results.json", "w") as f:
    json.dump(results_combined, f, indent=4)

print("Saved results to recommendation_results.json")

results_combined

Skipping NDCG for user 1447: array is not broadcastable to correct shape
Saved results to recommendation_results.json


{'results_top_5': {'traditional_cf': {'metrics': {'Recall@k': 0.002812690210883351,
    'Precision@k': 0.275,
    'NDCG@k': 0.3756426636604775,
    'Catalog Coverage Ratio': 0.007468452227659027,
    'Entropy Diversity': 0.3928710063979196,
    'Total users': 10.0},
   'execution_time': 0.2215566635131836},
  'llm_titles_only': {'metrics': {'Recall@k': 0.0017093730935911914,
    'Precision@k': 0.15000000000000002,
    'NDCG@k': 0.35081236846854846,
    'Catalog Coverage Ratio': 0.004893123873293845,
    'Entropy Diversity': 0.3189637876362132,
    'Total users': 10.0},
   'execution_time': 9.944148063659668}},
 'results_top_10': {'traditional_cf': {'metrics': {'Recall@k': 0.006710938339513937,
    'Precision@k': 0.3125,
    'NDCG@k': 0.41960966989938564,
    'Catalog Coverage Ratio': 0.013391707442698944,
    'Entropy Diversity': 0.4538098552229504,
    'Total users': 10.0},
   'execution_time': 0.21613454818725586},
  'llm_titles_only': {'metrics': {'Recall@k': 0.12675740854765627,
  

In [56]:
print(1)

1
