In [None]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Checkpoint identifier shared by the tokenizer and the encoder so the two
# always come from the same pretrained model.
model_name = "jinaai/jina-embeddings-v2-base-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): trust_remote_code=True presumably lets transformers execute the
# modelling code shipped with the checkpoint - confirm the source is trusted.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
# Put the module in evaluation mode; this notebook only runs inference.
model.eval()

def get_embedding(text, model=model, tokenizer=tokenizer, max_length=8192):
    """Return a mean-pooled embedding of ``text`` as a NumPy array.

    The text is tokenized (truncated to ``max_length`` tokens), passed through
    ``model`` with gradient tracking disabled, and the model's
    ``last_hidden_state`` is averaged over ``dim=1``.

    Args:
        text: Input handed straight to ``tokenizer``.
        model: Encoder to run; defaults to the module-level ``model``.
        tokenizer: Tokenizer to use; defaults to the module-level ``tokenizer``.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        The pooled hidden state after ``squeeze()``, converted with ``.numpy()``.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length,
    )

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # assumes dim=1 is the token axis - TODO confirm for this checkpoint
        pooled = hidden_states.mean(dim=1)
        vector = pooled.squeeze().numpy()

    return vector

In [None]:
# Spreadsheet with one row per (user profile, business, aspect) match.
file_path = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/serendipity_scoring/restaurants/system_user_profiles_restaurants_reviews_match_rag.xlsx"
df = pd.read_excel(file_path)

# Build business -> set of unique aspects (missing aspect values are ignored).
business_aspect_dict = {
    business_id: set(aspect_column.dropna())
    for business_id, aspect_column in df.groupby("business_id")["true_strong_weak"]
}

def compute_serendipity(user_business_aspects, utility_values, business_aspects, verbose=True):
    """Compute a serendipity score for one user/business pair.

    Every user aspect is embedded and compared (cosine similarity) against the
    embeddings of all business aspects. The aspect's contribution is its
    utility divided by the sum of those similarities, and the score is the sum
    of all contributions.

    Args:
        user_business_aspects: Sequence of aspect texts for the user.
        utility_values: Sequence of utilities, index-aligned with
            ``user_business_aspects`` (``utility_values[i]`` belongs to
            ``user_business_aspects[i]``).
        business_aspects: Iterable of aspect texts for the business (the
            notebook passes a set).
        verbose: When true (the default), print the per-aspect similarity
            breakdown; pass ``False`` to compute silently.

    Returns:
        The accumulated score, or ``0.0`` when either aspect collection is
        empty.

    Raises:
        IndexError: If ``utility_values`` is shorter than
            ``user_business_aspects``.
    """
    if not business_aspects or not user_business_aspects:
        return 0.0

    log = print if verbose else (lambda *args, **kwargs: None)

    # Materialise once so the printed list and the embedding order are
    # guaranteed to refer to the same sequence, even for unordered inputs.
    business_aspect_list = list(business_aspects)

    user_embeddings = [get_embedding(a) for a in user_business_aspects]
    business_embeddings = [get_embedding(a) for a in business_aspect_list]

    log("\n--- Serendipity Calculation ---")
    log(f"Business Aspects ({len(business_aspect_list)}): {business_aspect_list}")

    score = 0.0

    for idx, user_emb in enumerate(user_embeddings):
        user_aspect = user_business_aspects[idx]
        utility = utility_values[idx]

        sim_vector = cosine_similarity([user_emb], business_embeddings)[0]
        sim_sum = np.sum(sim_vector)

        log(f"\nUser Aspect {idx+1}: '{user_aspect}'")
        log(f" - Utility: {utility:.4f}")
        log(" - Similarities to Business Aspects:")
        for i, sim in enumerate(sim_vector):
            log(f"     {i+1:>2}: {sim:.4f}")
        log(f" - Sum of Similarities: {sim_sum:.4f}")

        # A zero similarity sum would turn the division below into inf/nan and
        # poison the whole score, so such an aspect contributes nothing.
        if sim_sum == 0:
            log(" - Contribution to Serendipity: skipped (similarity sum is zero)")
            continue

        contribution = utility / sim_sum
        log(f" - Contribution to Serendipity: {contribution:.4f}")

        score += contribution

    log(f"\n>>> Total Serendipity Score: {score:.4f}")

    return score

# Score every (user_profile, business_id) pair and collect one result row each.
results = []
grouped = df.groupby(["user_profile", "business_id"])

for (user, business), group in grouped:
    print(f"\nProcessing: User = {str(user)[:50]}..., Business ID = {business}")

    # Discard rows without an aspect *before* reading any column. Filtering
    # only the aspect column would shorten it relative to the utility columns
    # and pair each aspect with the utility of a different row.
    aspect_rows = group.dropna(subset=["true_strong_weak"])

    user_business_aspects = aspect_rows["true_strong_weak"].tolist()
    business_aspects = business_aspect_dict.get(business, set())

    utility_values = aspect_rows["numeric_utility"].tolist()
    baseline_values = aspect_rows["numeric_utility_baseline"].tolist()

    # The business name column is optional in the input sheet.
    business_name = group["name"].iloc[0] if "name" in group.columns else ""

    numeric_serendipity = compute_serendipity(user_business_aspects, utility_values, business_aspects)
    numeric_serendipity_baseline = compute_serendipity(user_business_aspects, baseline_values, business_aspects)

    results.append({
        "user_profile": user,
        "business_id": business,
        "name": business_name,
        "numeric_serendipity": numeric_serendipity,
        "numeric_serendipity_baseline": numeric_serendipity_baseline
    })

# Save results
output_df = pd.DataFrame(results)
output_path = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/serendipity_scoring/restaurants/system_rag_serendipity_scores.xlsx"
output_df.to_excel(output_path, index=False)
print(f"\nResults saved to '{output_path}'")