# Personalized Profile Generation from Book Reviews and Ratings

##### Objectives

- **Profile Generation:** Use a text generation model to produce a personalized profile summary for each user.
- **Evaluation:** Assess the quality of generated profiles by measuring coherence, relevance, and personalization. Optionally, use an LLM-as-a-judge approach for evaluation.


In [6]:
import pandas as pd
from datasets import load_dataset
import logging
logging.getLogger("datasets").setLevel(logging.ERROR)


def get_user_aggregated_reviews(
    dataset_str="McAuley-Lab/Amazon-Reviews-2023",
    review_key="raw_review_Books",
    meta_key="raw_meta_Books",
    sampling_percent=0.1,
    min_reviews_per_user=5,
    n_users=5000,
    n_reviews_per_user=5,
    random_state=42,
    max_reviews=5000,
):
    """
    Build a mapping from user_id to that user's most recent reviews, joined with book metadata.

    Args:
        dataset_str: Dataset identifier passed to ``load_dataset``.
        review_key: Dataset configuration holding the reviews.
        meta_key: Dataset configuration holding the item metadata.
        sampling_percent: Currently unused; kept so existing callers keep working.
            The amount of data loaded is controlled by ``max_reviews`` instead.
        min_reviews_per_user: Users with fewer loaded reviews than this are dropped.
        n_users: Maximum number of users to keep; if more are eligible, a random
            sample of this size is drawn.
        n_reviews_per_user: Maximum number of reviews kept per user (newest first
            by ``timestamp``).
        random_state: Seed for the user sample.
        max_reviews: Number of leading review rows to load from the ``full`` split.

    Returns:
        dict mapping user_id to a list of dicts with keys ``title``, ``genre``,
        ``rating``, ``review`` and ``description``. ``genre`` is the metadata's
        ``main_category``; metadata fields default to ``""`` when the item has
        no metadata record.
    """
    # Load a fixed-size prefix of the reviews split.
    split_str = f"full[:{max_reviews}]"
    reviews = load_dataset(dataset_str, review_key, split=split_str, trust_remote_code=True)
    reviews_df = pd.DataFrame(reviews)

    reviews_df = reviews_df[['user_id', 'parent_asin', 'timestamp', 'rating', 'text']]
    reviews_df = reviews_df.rename(columns={'parent_asin': 'item_id'})

    # Stream the metadata and keep only records for items that were reviewed.
    metadata_stream = load_dataset(dataset_str, meta_key, split="full", streaming=True, trust_remote_code=True)
    pending_ids = set(reviews_df['item_id'].unique())
    meta_columns = ('title', 'main_category', 'categories', 'description')

    # Build the lookup directly as a dict: this works when nothing matches and
    # keeps the first record if an item id appears more than once in the stream.
    book_metadata = {}
    if pending_ids:
        for example in metadata_stream:
            item_id = example['parent_asin']
            if item_id not in pending_ids:
                continue
            record = {column: example[column] for column in meta_columns}
            record['genre'] = record['main_category']
            book_metadata[item_id] = record
            pending_ids.discard(item_id)
            if not pending_ids:
                # Every reviewed item has been found; stop reading the stream.
                break

    # Filter users with at least min_reviews_per_user reviews
    user_review_counts = reviews_df.groupby('user_id').size()
    eligible_users = user_review_counts[user_review_counts >= min_reviews_per_user].index

    # Sample n_users users
    if len(eligible_users) > n_users:
        selected_users = pd.Series(eligible_users).sample(n=n_users, random_state=random_state)
    else:
        selected_users = pd.Series(eligible_users)
        if len(selected_users) < n_users:
            print(f"Warning: Only {len(selected_users)} users with at least {min_reviews_per_user} reviews.")

    # Sort once (newest first) and group once, instead of re-filtering the
    # whole frame for every user.
    filtered_reviews = reviews_df[reviews_df['user_id'].isin(selected_users)]
    newest_first = filtered_reviews.sort_values(by='timestamp', ascending=False)
    reviews_by_user = newest_first.groupby('user_id', sort=False)

    # Aggregate reviews per user, preserving the order of selected_users.
    user_aggregated = {}
    for user_id in selected_users:
        user_reviews = reviews_by_user.get_group(user_id).head(n_reviews_per_user)
        aggregated = []
        for row in user_reviews.itertuples(index=False):
            meta = book_metadata.get(row.item_id, {})
            aggregated.append({
                "title": meta.get("title", ""),
                "genre": meta.get("genre", ""),
                "rating": row.rating,
                "review": row.text,
                "description": meta.get("description", ""),
            })
        user_aggregated[user_id] = aggregated

    return user_aggregated

# Example usage: build the per-user data, then preview the first three users.
user_books_data = get_user_aggregated_reviews()
print(f"Total users with aggregated data: {len(user_books_data)}")
for position, (user_id, books) in enumerate(user_books_data.items()):
    if position >= 3:
        # Only the first three users are shown.
        break
    print(f"\nUser ID: {user_id}")
    for book in books:
        print(f"  Title: {book['title']}")
        print(f"  Genre: {book['genre']}")
        print(f"  Rating: {book['rating']}")
        # Review and description are truncated to 60 items for display.
        print(f"  Review: {book['review'][:60]}...")
        print(f"  Description: {book['description'][:60]}...")
        print("---")


{
  "title": "Thinking, Fast and Slow",
  "genre": "Behavioral Economics",
  "rating": 5,
  "review": "Fascinating insights into human decision-making. Changed how I view choices and biases.",
  "description": "A groundbreaking exploration of human cognition by Nobel laureate Daniel Kahneman."
}


In [None]:
import json
import openai
import yaml

# Read the YAML configuration file (an empty file parses to None, so fall
# back to an empty mapping).
with open('./../../../Curify/curify_api.yaml', 'r', encoding='utf-8') as yaml_file:
    data = yaml.safe_load(yaml_file) or {}

# Access the OpenAI API key; fail with a clear message if the section is
# absent rather than with an AttributeError on None.
openai_config = data.get('openai')
if openai_config is None:
    raise KeyError("'openai' section not found in curify_api.yaml")
api_key = openai_config.get('api_key')

def generate_profile_with_llm(reviews_structured, prompt_template, model, tokenizer=None, api_key=None):
    """
    Generate a user profile summary from structured reviews using the specified LLM.

    Args:
        reviews_structured: JSON-serialisable review data; it is dumped to
            indented JSON and substituted for ``{reviews}`` in the template.
        prompt_template: Format string containing a ``{reviews}`` placeholder.
        model: Either an OpenAI chat model name given as a string (e.g. 'gpt-4o'),
            or an object exposing ``generate(**inputs, max_new_tokens=...)``
            (e.g. a HuggingFace seq2seq model such as FLAN-T5).
        tokenizer: Required when ``model`` is not a string. It is called as
            ``tokenizer(prompt, return_tensors="pt")`` and must provide ``decode``.
        api_key: Optional OpenAI API key. When omitted, any key already
            configured on the ``openai`` module is left untouched.

    Returns:
        The generated profile text.

    Raises:
        ValueError: If ``model`` is not a string and no tokenizer was supplied.
    """
    input_str = json.dumps(reviews_structured, ensure_ascii=False, indent=2)
    prompt = prompt_template.format(reviews=input_str)

    if isinstance(model, str):
        # Any model name string is routed to OpenAI, not only "gpt-4o".
        if api_key is not None:
            openai.api_key = api_key
        # NOTE(review): ``openai.ChatCompletion`` is the pre-1.0 SDK interface;
        # confirm the installed openai version still provides it.
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )
        return response.choices[0].message.content.strip()

    # Local model path: needs a tokenizer to encode the prompt and decode the output.
    if tokenizer is None:
        raise ValueError("tokenizer is required when model is not an OpenAI model name")
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Prompt template for profile generation. The ``{reviews}`` placeholder is the
# field that ``str.format(reviews=...)`` fills with the serialised review data.
PROFILE_PROMPT = (
    "Given the following books, reviews, and genres, "
    "summarize the user's interests, personality traits, "
    "and reading preferences in 3–4 sentences."
    "\n\n{reviews}"
)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def genre_recall(profile_text, genres):
    """Return the fraction of genres that appear (case-insensitively, as substrings) in the profile."""
    if not genres:
        # No genres to look for: recall is defined as 0.0.
        return 0.0
    haystack = profile_text.lower()
    hits = [genre for genre in genres if genre.lower() in haystack]
    return len(hits) / len(genres)

def embedding_similarity(profile_text, reviews, embed_fn):
    """
    Cosine similarity between the profile and the space-joined reviews.

    ``embed_fn`` is called once with ``[profile_text, joined_reviews]`` and must
    return an indexable pair of embedding vectors (e.g. from OpenAI or
    SentenceTransformer).
    """
    joined_reviews = " ".join(reviews)
    vectors = embed_fn([profile_text, joined_reviews])
    # cosine_similarity expects 2-D inputs, so each vector is wrapped in a list.
    similarity_matrix = cosine_similarity([vectors[0]], [vectors[1]])
    return similarity_matrix[0][0]

def keyword_overlap(profile_text, reviews):
    """
    TF-IDF keyword overlap between profile and reviews.

    Both texts are vectorised with a TF-IDF model fitted on the two of them.
    The score is the sum of the element-wise product of the two vectors,
    divided by the sum of the profile's TF-IDF weights.

    Returns 0.0 when the profile has no weighted terms, or when neither text
    contains any token the vectoriser can use.
    """
    corpus = [profile_text, " ".join(reviews)]
    try:
        tfidf = TfidfVectorizer().fit_transform(corpus)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when
        # both texts are empty; there is nothing to overlap in that case.
        return 0.0

    profile_weight = tfidf[0].sum()
    if not profile_weight:
        return 0.0
    overlap = tfidf[0].multiply(tfidf[1]).sum()
    return overlap / profile_weight


def llm_as_judge(profile_text, reference_reviews, model="gpt-4o", api_key=None):
    """
    Ask an OpenAI chat model to rate the profile for relevance, personalization,
    and coherence (1-5 each) and to provide a short comment.

    Args:
        profile_text: The generated profile summary to be judged.
        reference_reviews: The user's reviews, interpolated into the prompt as text.
        model: OpenAI chat model name.
        api_key: Optional OpenAI API key. When omitted, any key already
            configured on the ``openai`` module is left untouched.

    Returns:
        The judge model's raw text response, stripped of surrounding whitespace.
    """
    prompt = (
        "Given the following user reviews:\n"
        f"{reference_reviews}\n\n"
        "And the following profile summary:\n"
        f"{profile_text}\n\n"
        "Rate the profile on a scale of 1-5 for:\n"
        "- Relevance\n- Personalization\n- Coherence\n"
        "Also provide a short comment."
    )
    # Do not overwrite a previously configured key with None.
    if api_key is not None:
        openai.api_key = api_key
    # NOTE(review): ``openai.ChatCompletion`` is the pre-1.0 SDK interface;
    # confirm the installed openai version still provides it.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0  # temperature 0 for the judging call
    )
    return response.choices[0].message.content.strip()


In [None]:
def _structure_user_reviews(user_reviews, book_metadata, n_reviews=5):
    """Normalise up to n_reviews review records into title/genre/rating/review/description dicts."""
    lookup = book_metadata or {}
    structured = []
    for record in list(user_reviews)[:n_reviews]:
        # Metadata is only consulted for fields the record does not already carry.
        meta = lookup.get(record.get("item_id"), {})
        structured.append({
            "title": record.get("title", meta.get("title", "")),
            "genre": record.get("genre", meta.get("genre", "")),
            "rating": record.get("rating"),
            "review": record.get("review", record.get("text", "")),
            "description": record.get("description", meta.get("description", "")),
        })
    return structured


def run_experiment_for_user(user_reviews, book_metadata, prompt_template, gpt_api_key, flan_model, flan_tokenizer, embed_fn):
    """
    Generate GPT-4o and FLAN-T5 profiles for one user and evaluate both.

    Args:
        user_reviews: Sequence of review dicts for a single user.
            NOTE(review): assumed to be either already-aggregated records
            (keys ``title``/``genre``/``rating``/``review``/``description``) or
            raw review rows (keys ``item_id``/``rating``/``text``) — confirm
            against the caller. Only the first 5 records are used.
        book_metadata: Optional mapping from item_id to a metadata dict, used to
            fill title/genre/description for records that lack them.
        prompt_template: Format string with a ``{reviews}`` placeholder.
        gpt_api_key: OpenAI API key used for generation and judging.
        flan_model: Local model object passed to ``generate_profile_with_llm``.
        flan_tokenizer: Tokenizer matching ``flan_model``.
        embed_fn: Callable mapping a list of texts to embedding vectors.

    Returns:
        dict with the two profiles, a ``metrics`` dict keyed by model label
        (``"gpt-4o"``, ``"flan-t5"``), and the two LLM-judge responses.
    """
    # Structure the reviews locally: get_user_aggregated_reviews loads a whole
    # dataset and does not accept per-user arguments, so it cannot be called here.
    structured = _structure_user_reviews(user_reviews, book_metadata, n_reviews=5)

    # Keep only non-empty string genres so the later ``.lower()`` cannot fail.
    genres = sorted({b["genre"] for b in structured if isinstance(b.get("genre"), str) and b["genre"]})

    # Generate profiles
    gpt_profile = generate_profile_with_llm(structured, prompt_template, "gpt-4o", api_key=gpt_api_key)
    flan_profile = generate_profile_with_llm(structured, prompt_template, flan_model, tokenizer=flan_tokenizer)

    # Evaluate; non-string reviews (e.g. missing values) are skipped so the
    # texts can be joined.
    reviews_texts = [r["review"] for r in structured if isinstance(r["review"], str)]
    profiles = {"gpt-4o": gpt_profile, "flan-t5": flan_profile}
    metrics = {
        label: {
            "genre_recall": genre_recall(profile, genres),
            "embedding_similarity": embedding_similarity(profile, reviews_texts, embed_fn),
            "keyword_overlap": keyword_overlap(profile, reviews_texts),
        }
        for label, profile in profiles.items()
    }

    # LLM-as-a-judge (GPT-4o judges both profiles against the same review text)
    joined_reviews = " ".join(reviews_texts)
    judge_gpt = llm_as_judge(gpt_profile, joined_reviews, model="gpt-4o", api_key=gpt_api_key)
    judge_flan = llm_as_judge(flan_profile, joined_reviews, model="gpt-4o", api_key=gpt_api_key)

    return {
        "gpt_profile": gpt_profile,
        "flan_profile": flan_profile,
        "metrics": metrics,
        "judge_gpt": judge_gpt,
        "judge_flan": judge_flan
    }
