## Tutorial: From Traditional to LLM-Based Recommendations Using MovieLens Dataset

#### Objective
- Goal: transition from traditional recommendation methods to LLM-based approaches.
- Challenges of traditional methods: scalability, the cold-start problem, and lack of context-awareness.
- Why LLMs matter: contextual understanding, multimodal data handling, and enhanced personalization.

#### Setting Up the Environment

**Install Required Libraries**

In [1]:
!pip install numpy pandas scikit-learn matplotlib surprise transformers torch



In [3]:
# ### Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ### Load MovieLens Dataset
# Cell-local imports: only this cell needs them.
import urllib.request
import zipfile
from pathlib import Path

url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
movielens_zip = "movielens.zip"
movielens_folder = "ml-latest-small/"

# Download and extract the dataset.
# Uses the standard library instead of the `wget` / `unzip` binaries so the
# cell also runs where those tools are not installed, and skips the network
# round-trip when the extracted CSV files are already present.
required_files = [Path(movielens_folder, name) for name in ("ratings.csv", "movies.csv")]
if not all(path.exists() for path in required_files):
    urllib.request.urlretrieve(url, movielens_zip)  # overwrites any stale archive
    with zipfile.ZipFile(movielens_zip) as archive:
        archive.extractall("./")

# Load the dataset
ratings = pd.read_csv(f"{movielens_folder}ratings.csv")
movies = pd.read_csv(f"{movielens_folder}movies.csv")
# Attach title and genres to every rating row (inner join on movieId).
ratings = ratings.merge(movies, on="movieId")

# Display dataset
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


#### Traditional Methods - Collaborative Filtering


In [None]:
# #### Prepare Dataset
# Split BEFORE building the user-item matrix so the factorization never sees
# the held-out ratings it is later evaluated on. A fixed seed keeps the
# reported RMSE reproducible between runs.
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)
user_item_matrix = train_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# #### Implement SVD for Collaborative Filtering
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(user_item_matrix)  # one row per user in the matrix
item_factors = svd.components_.T                    # one row per movie column in the matrix

# Raw ids are not matrix positions: the pivot keeps only ids that occur in the
# training data, in sorted order, so translate ids to row/column offsets.
user_position = {user_id: pos for pos, user_id in enumerate(user_item_matrix.index)}
item_position = {movie_id: pos for pos, movie_id in enumerate(user_item_matrix.columns)}

# #### Evaluate Collaborative Filtering
# Each entry is a (true rating, predicted rating) pair.
predictions = []
for user_id, movie_id, rating in zip(test_data['userId'], test_data['movieId'], test_data['rating']):
    user_idx = user_position.get(user_id)
    item_idx = item_position.get(movie_id)
    if user_idx is None or item_idx is None:
        # User or movie never appears in the training split (cold start):
        # no factors exist for it, so it cannot be scored.
        continue
    pred = np.dot(user_factors[user_idx], item_factors[item_idx])
    predictions.append((rating, pred))

if not predictions:
    raise ValueError("No test rating could be scored: no overlap between train and test ids.")

true_ratings, predicted_ratings = zip(*predictions)
# Take the square root explicitly; the `squared=False` keyword is no longer
# accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
print(f"RMSE: {rmse}")


##### Discuss Limitations

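- Cold Start: no interaction history exists for new users or new movies, so nothing can be predicted for them.
- Sparsity: each user rates only a small fraction of the catalogue, leaving most of the user–item matrix empty.
- Lack of Context: the rating matrix alone cannot use textual or metadata features such as titles and genres.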


#### Transitioning to LLM-Based Recommendations

In [None]:
# ### Semantic Matching with LLMs

# #### Load Pre-Trained Transformer Model (e.g., DistilBERT)
# Tokenizer and encoder are loaded from the same checkpoint name so that the
# vocabulary used for tokenization matches the model weights.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# #### Generate Semantic Embeddings
def get_embedding(text):
    """Return mean-pooled encoder embeddings for *text*.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array of shape ``(n_texts, hidden_size)``; a single string
        yields one row.
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)
    hidden = outputs.last_hidden_state
    # Average over real tokens only. With `padding=True`, shorter texts in a
    # batch are padded; a plain mean over dim=1 would mix the padding vectors
    # into the embedding and make it depend on the longest text in the batch.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return ((hidden * mask).sum(dim=1) / token_counts).numpy()

# Embed every movie title (one call to get_embedding per row).
movies['embedding'] = movies['title'].apply(get_embedding)

# #### User Query Matching
user_query = "I love sci-fi movies like Star Wars."
user_embedding = get_embedding(user_query)


# Compute Cosine Similarity
def _similarity_to_query(emb):
    """Return the cosine similarity between the user query and one title embedding."""
    as_row = emb.reshape(1, -1)
    return cosine_similarity(user_embedding, as_row)[0][0]


movies['similarity'] = movies['embedding'].apply(_similarity_to_query)

# Recommend Top-5 Movies
ranked_by_similarity = movies.sort_values(by='similarity', ascending=False)
ranked_by_similarity.head(5)


#### Building a Simple LLM-Powered Recommendation System

In [None]:
# Preprocess Movie Descriptions (Example Metadata)
# The dataset ships no plot summaries, so a stand-in description is built by
# prefixing each movie's genre string with a fixed phrase.
description_prefix = "A great movie about "
movies['description'] = description_prefix + movies['genres']

# Tokenize and Generate Embeddings
def generate_embeddings(texts, batch_size=64):
    """Embed *texts* with the loaded encoder using mask-aware mean pooling.

    Args:
        texts: A string or a sequence of strings.
        batch_size: Number of texts encoded per forward pass. Encoding in
            chunks keeps peak memory bounded when the whole catalogue is
            passed in at once.

    Returns:
        A ``torch.Tensor`` of shape ``(n_texts, hidden_size)``, rows in the
        same order as *texts*.

    Raises:
        ValueError: If *texts* is empty.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string")

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            tokens = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            hidden = model(**tokens).last_hidden_state
            # Exclude padded positions from the average so an embedding does
            # not depend on which other texts share its batch.
            mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(pooled_batches, dim=0)

# Embed every description. A 2-D tensor cannot be assigned to a single
# DataFrame column, so keep the full matrix for scoring and store one 1-D
# vector per row in the column.
description_matrix = generate_embeddings(movies['description'].tolist()).detach().cpu().numpy()
movies['description_embedding'] = list(description_matrix)

# Implement Retrieval
user_description = "A fan of adventure and fantasy movies."
user_description_embedding = generate_embeddings([user_description])
# One vectorized call scores the query against all movies; row 0 of the
# result holds the similarities for the single query.
movies['retrieval_score'] = cosine_similarity(
    user_description_embedding.detach().cpu().numpy(), description_matrix
)[0]

# Recommend Movies
movies.sort_values(by='retrieval_score', ascending=False).head(5)

#### Evaluating Model Performance

In [None]:
# ### Define Metrics for Comparison
# - Precision@K
# - Recall@K
# - F1 Score (listed for completeness; not computed in this cell — derive it as 2PR / (P + R))

# ### Compute Metrics
def precision_at_k(true_ratings, predicted_ratings, k=5, threshold=4.0):
    """Return the fraction of the top-*k* predicted items that are relevant.

    Args:
        true_ratings: Observed ratings, aligned position-by-position with
            *predicted_ratings*.
        predicted_ratings: Predicted scores used to rank the items.
        k: Size of the recommendation list; must be positive.
        threshold: Minimum true rating for an item to count as relevant.

    Returns:
        ``hits / k`` as a float, where ``hits`` is the number of relevant
        items among the *k* highest-scored ones; ``0.0`` for empty input.

    Raises:
        ValueError: If *k* is not positive or the inputs differ in length.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    actual = np.asarray(true_ratings, dtype=float)
    scores = np.asarray(predicted_ratings, dtype=float)
    if actual.shape != scores.shape:
        raise ValueError("true_ratings and predicted_ratings must have the same length")
    if scores.size == 0:
        return 0.0
    # Positions of the k highest predicted scores, best first.
    top_k = np.argsort(scores)[::-1][:k]
    # Relevance is judged from the TRUE rating at those positions; the
    # positions themselves are not rating values.
    hits = np.count_nonzero(actual[top_k] >= threshold)
    return hits / k

def recall_at_k(true_ratings, predicted_ratings, k=5, threshold=4.0):
    """Return the fraction of all relevant items that appear in the top *k*.

    Args:
        true_ratings: Observed ratings, aligned position-by-position with
            *predicted_ratings*.
        predicted_ratings: Predicted scores used to rank the items.
        k: Size of the recommendation list; must be positive.
        threshold: Minimum true rating for an item to count as relevant.

    Returns:
        ``hits / n_relevant`` as a float; ``0.0`` when the input is empty or
        no item is relevant.

    Raises:
        ValueError: If *k* is not positive or the inputs differ in length.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    actual = np.asarray(true_ratings, dtype=float)
    scores = np.asarray(predicted_ratings, dtype=float)
    if actual.shape != scores.shape:
        raise ValueError("true_ratings and predicted_ratings must have the same length")
    is_relevant = actual >= threshold
    n_relevant = np.count_nonzero(is_relevant)
    if n_relevant == 0:
        # Nothing to retrieve: define recall as 0 rather than dividing by zero.
        return 0.0
    # Positions of the k highest predicted scores, best first.
    top_k = np.argsort(scores)[::-1][:k]
    hits = np.count_nonzero(is_relevant[top_k])
    return hits / n_relevant

def evaluate_models(true_ratings, predicted_ratings_cf, predicted_ratings_llm):
    """Print Precision@5 and Recall@5 for both recommenders.

    Args:
        true_ratings: Observed ratings shared by both models.
        predicted_ratings_cf: Predictions from collaborative filtering.
        predicted_ratings_llm: Predictions from the LLM-based approach.
    """
    k = 5
    contenders = (
        ("Collaborative Filtering:", predicted_ratings_cf),
        ("LLM-Based Recommendation:", predicted_ratings_llm),
    )
    # Score every model first so nothing is printed if a metric call fails.
    scored = [
        (
            heading,
            precision_at_k(true_ratings, model_predictions, k),
            recall_at_k(true_ratings, model_predictions, k),
        )
        for heading, model_predictions in contenders
    ]

    for heading, precision, recall in scored:
        print(heading)
        print(f"Precision@{k}: {precision:.2f}, Recall@{k}: {recall:.2f}")

# Simulate Predictions for Evaluation
# `predictions` holds (true rating, predicted rating) pairs; split it back
# into two aligned lists.
true_ratings = [actual for actual, _ in predictions]
# Using the same for simplicity: the collaborative-filtering scores stand in
# for LLM-based predictions here.
predicted_ratings_llm = [estimate for _, estimate in predictions]
evaluate_models(true_ratings, predicted_ratings, predicted_ratings_llm)
