## Tutorial: Content Embedding and Retrieval

#### Objective
- Understand the role of embeddings in capturing semantic relationships between content.
- Generate embeddings for textual data using pre-trained LLMs.
- Perform content retrieval using Approximate Nearest Neighbor (ANN) search with these embeddings.

In [4]:
# %pip install seaborn
# %pip install bm25s rank_bm25
# %pip install transformers
# %pip install datasets
# %pip install faiss-cpu
# %pip install numpy scikit-learn bm25s

#### Step 1: Generate Content Embedding
Content embedding is the process of transforming input data (e.g., text, images) into dense vectors that capture semantic meaning. These embeddings are essential for tasks like:

- Similarity Search: Finding similar items based on their embeddings.
- Recommendation: Matching user preferences with item embeddings.


In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
from datasets import load_dataset
import pandas as pd
import numpy as np

# Load pre-trained model and tokenizer.
# The same checkpoint identifier is passed to both loaders so the tokenizer
# and the encoder are taken from the same pre-trained source.
# `model` and `tokenizer` are module-level names: retrieve_all() further down
# reads them as globals instead of taking them as arguments.
# NOTE(review): generate_embeddings() defaults to embedding_dim=384, which
# agrees with the (2225, 384) shape printed later in this notebook — confirm
# again if the checkpoint is changed.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def generate_embeddings(texts, model, tokenizer, embedding_dim=384, batch_size=64):
    """
    Generate dense embeddings for a list of texts using a pre-trained model.

    Each text is embedded as the mean of its last-layer token states. Padding
    positions are excluded from the mean (using the tokenizer's attention
    mask), so the embedding of a text does not depend on how long the other
    texts in the same batch are.

    Args:
        texts (list): Texts to embed. ``None`` entries are embedded as "" and
            any other non-string entry is converted with ``str()``.
        model: Encoder, called as ``model(**encoded_input)``; its result must
            expose a ``last_hidden_state`` tensor (batch, tokens, hidden).
        tokenizer: Tokenizer, called with ``padding=True, truncation=True,
            return_tensors='pt'``.
        embedding_dim (int): Width of the zero-filled fallback array. It is
            not used to shape real embeddings.
        batch_size (int): Number of texts per forward pass. ``None`` or 0
            processes all texts in a single pass.

    Returns:
        np.ndarray: One row per text. If ``texts`` is not a list or is empty,
        a single zero row of shape (1, embedding_dim) is returned. If any
        error occurs while embedding, the error is printed and zeros of shape
        (len(texts), embedding_dim) are returned.
    """
    # Kept from the original contract: unusable input yields one zero row.
    if not isinstance(texts, list) or len(texts) == 0:
        return np.zeros((1, embedding_dim))

    texts = [str(t) if t is not None else "" for t in texts]  # None -> ""

    try:
        step = max(1, int(batch_size)) if batch_size else len(texts)
        pooled_batches = []

        for start in range(0, len(texts), step):
            encoded_input = tokenizer(
                texts[start:start + step],
                padding=True,
                truncation=True,
                return_tensors='pt',
            )

            with torch.no_grad():
                model_output = model(**encoded_input)
            hidden = model_output.last_hidden_state

            if "attention_mask" in encoded_input:
                # Masked mean: zero out padding positions and divide by the
                # number of real tokens instead of the padded length.
                mask = encoded_input["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                token_counts = mask.sum(dim=1).clamp(min=1e-9)
                pooled = (hidden * mask).sum(dim=1) / token_counts
            else:
                # No mask available: fall back to a plain mean over tokens.
                pooled = hidden.mean(dim=1)

            # .cpu() so the conversion also works for non-CPU tensors.
            pooled_batches.append(pooled.detach().cpu().numpy())

        return np.concatenate(pooled_batches, axis=0)

    except Exception as e:
        # Deliberate best-effort: report the problem and return zero vectors
        # so downstream notebook cells keep running.
        print(f"Error in generate_embeddings: {e}")  # Log the error
        return np.zeros((len(texts), embedding_dim))  # Return zero embeddings

  from .autonotebook import tqdm as notebook_tqdm


#### Step 2: Visualizing Embeddings

Visualizing embeddings can help understand how content is semantically clustered.

**Code: 2D Visualization with t-SNE**

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

def plot_tsne_embeddings(embeddings_df, category_column='category', perplexity=30, figsize=(8, 6), title="t-SNE Visualization of Embeddings"):
    """
    Plot t-SNE of embeddings colored by a category column.

    All columns other than ``category_column`` are treated as embedding
    dimensions. Inputs are validated before the t-SNE fit so that a bad call
    fails immediately instead of after the projection has been computed.

    Args:
        embeddings_df (pd.DataFrame): DataFrame containing embedding vectors and a category column.
        category_column (str): Column name indicating item categories for coloring.
        perplexity (int): t-SNE perplexity parameter; lowered to
            ``n_samples - 1`` when the data set is smaller than requested.
        figsize (tuple): Size of the output plot.
        title (str): Title of the plot.

    Raises:
        ValueError: If ``embeddings_df`` is not a DataFrame, or it has fewer
            than 2 rows (no valid perplexity exists in that case).
        KeyError: If ``category_column`` is not a column of ``embeddings_df``.
    """
    if not isinstance(embeddings_df, pd.DataFrame):
        raise ValueError("Expected a DataFrame for embeddings_df.")
    if category_column not in embeddings_df.columns:
        # Without this check the failure only surfaced after the t-SNE fit.
        raise KeyError(category_column)

    embeddings = embeddings_df.drop(columns=[category_column]).values
    n_samples = embeddings.shape[0]
    print(f"Number of samples: {n_samples}")

    if n_samples < 2:
        raise ValueError(
            f"t-SNE needs at least 2 samples to plot, got {n_samples}."
        )

    # Adjust perplexity if needed: it has to stay below the sample count.
    perplexity = min(perplexity, n_samples - 1)
    print(f"Using perplexity: {perplexity}")

    # Run t-SNE (fixed seed so repeated runs give the same layout)
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings)

    # Create visualization DataFrame
    df_vis = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
    df_vis[category_column] = embeddings_df[category_column].values

    # One color per distinct category, in order of first appearance
    unique_topics = df_vis[category_column].unique()
    color_palette = sns.color_palette("tab10", len(unique_topics))
    color_map = dict(zip(unique_topics, color_palette))

    plt.figure(figsize=figsize)
    for topic, color in color_map.items():
        subset = df_vis[df_vis[category_column] == topic]
        # c=[color] passes the RGB tuple as a single color, not as values
        plt.scatter(subset['x'], subset['y'], c=[color], label=topic, s=5, alpha=0.6)

    plt.title(title)
    plt.legend(markerscale=2, fontsize=8, loc='best', frameon=True)
    plt.xlabel("t-SNE Dim 1")
    plt.ylabel("t-SNE Dim 2")
    plt.tight_layout()
    plt.show()


#### Optional Step: LLM-based Query Rewriting


In [7]:
from openai import OpenAI
import yaml

def rewrite_query_with_llm(query, model="gpt-4o-mini", config_path='./../../../Curify/curify_api.yaml'):
    """
    Ask a chat model to rewrite a query so it is more descriptive for retrieval.

    Args:
        query (str): The original user query.
        model (str): Name of the chat-completion model to call.
        config_path (str): YAML file holding the API key under
            ``openai: {api_key: ...}``.

    Returns:
        str: The rewritten query with surrounding whitespace removed, or ""
        if the response carries no text content.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    prompt = f"""Rewrite the following query to make it more descriptive and retrieval-friendly.
Original query: "{query}"
Improved query:"""

    # Read the YAML file (safe_load: never executes arbitrary YAML tags)
    with open(config_path, 'r', encoding='utf-8') as yaml_file:
        data = yaml.safe_load(yaml_file)

    # Tolerate an empty file or a missing `openai` section instead of failing
    # with an AttributeError on None.
    openai_section = data.get('openai') if isinstance(data, dict) else None
    openai_api_key = openai_section.get('api_key') if isinstance(openai_section, dict) else None

    # NOTE(review): when api_key is None the OpenAI client is documented to
    # fall back to the OPENAI_API_KEY environment variable — confirm for the
    # installed client version.
    client = OpenAI(api_key=openai_api_key)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that rewrites user queries."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )
    # `content` can be None; normalise to "" before stripping.
    rewritten = (response.choices[0].message.content or "").strip()
    return rewritten


#### Step 3: Content Retrieval Using ANN

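To retrieve content efficiently at scale, Approximate Nearest Neighbor (ANN) libraries such as FAISS are the usual choice. To keep this tutorial simple, the code below instead scores every document exactly (brute-force cosine similarity over the embeddings) and compares the following retrieval strategies:

- BM25 retrieval (sparse, keyword-based).
- Dense retrieval (cosine similarity between query and document embeddings).
- Hybrid retrieval (weighted combination of the BM25 and dense scores).
- Hybrid retrieval with document expansion (not implemented in the code below).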

In [8]:
import time
import numpy as np
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_all(query_text, corpus, embedding_dim=384, topn=40, alpha=0.5, document_embeddings=None):
    """
    Retrieve relevant documents from a corpus using BM25, Dense, and Hybrid methods.

    The embedding model is not a parameter: this function uses the
    module-level ``model`` and ``tokenizer`` through ``generate_embeddings``.

    Args:
        query_text (str): Query string.
        corpus (list of str): List of documents.
        embedding_dim (int): Embedding dimensionality, forwarded to
            ``generate_embeddings``.
        topn (int): Number of top documents to retrieve.
        alpha (float): Weight of the dense score in the hybrid score; the BM25
            score gets ``1 - alpha``.
        document_embeddings (np.ndarray, optional): Precomputed embeddings of
            ``corpus``, one row per document in the same order. When given,
            the corpus is not re-embedded, and the reported dense latency then
            covers only the query embedding and scoring.

    Returns:
        dict: Keys "bm25", "dense" and "hybrid"; each maps to a dict with
        "indices" (positions into ``corpus``, best first), "scores" (in the
        same order) and "latency" (seconds).

    Raises:
        ValueError: If ``corpus`` is empty.
    """
    if len(corpus) == 0:
        raise ValueError("corpus must contain at least one document.")

    results = {}

    # === BM25 Retrieval ===
    # perf_counter is a monotonic clock meant for measuring elapsed time.
    start = time.perf_counter()
    tokenized_corpus = [doc.split() for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    bm25_scores = bm25.get_scores(query_text.split())
    bm25_indices = np.argsort(bm25_scores)[::-1][:topn]
    bm25_latency = time.perf_counter() - start
    results["bm25"] = {
        "indices": bm25_indices,
        "scores": [bm25_scores[i] for i in bm25_indices],
        "latency": bm25_latency,
    }

    # === Dense Embedding Retrieval ===
    start = time.perf_counter()
    query_embedding = generate_embeddings([query_text], model, tokenizer, embedding_dim=embedding_dim)
    if document_embeddings is None:
        # Embedding the corpus dominates the cost; callers issuing many
        # queries can pass `document_embeddings` to do it only once.
        document_embeddings = generate_embeddings(corpus, model, tokenizer, embedding_dim=embedding_dim)
    cosine_scores = cosine_similarity(query_embedding, document_embeddings)[0]
    dense_indices = np.argsort(cosine_scores)[::-1][:topn]
    dense_latency = time.perf_counter() - start
    results["dense"] = {
        "indices": dense_indices,
        "scores": [cosine_scores[i] for i in dense_indices],
        "latency": dense_latency,
    }

    # === Hybrid Retrieval ===
    # Latency here covers only the score fusion, not the two retrievals above.
    start = time.perf_counter()
    # Scale both score vectors by their maximum before combining; the small
    # epsilon avoids a division by zero when every score is 0.
    # NOTE(review): this rescaling assumes the maximum score is positive.
    bm25_norm = np.array(bm25_scores) / (np.max(bm25_scores) + 1e-8)
    dense_norm = cosine_scores / (np.max(cosine_scores) + 1e-8)
    hybrid_scores = alpha * dense_norm + (1 - alpha) * bm25_norm
    hybrid_indices = np.argsort(hybrid_scores)[::-1][:topn]
    hybrid_latency = time.perf_counter() - start
    results["hybrid"] = {
        "indices": hybrid_indices,
        "scores": [hybrid_scores[i] for i in hybrid_indices],
        "latency": hybrid_latency,
    }
    return results

#### Step 4: Evaluating Retrieval
To assess the performance:

1. Use a metric like Precision@K: How many retrieved items are relevant?
2. Create labeled test sets with query-relevant pairs.

**Code: Precision@K, Recall@K, F1@K, and NDCG@K**

In [9]:
import numpy as np
from sklearn.metrics import ndcg_score

def evaluate_retrieval_metrics(retrieved_results, relevant_documents, ks=(10,)):
    """
    Compute Precision@k, Recall@k, NDCG@k, and F1@k for one ranked result list.

    Relevance is binary: a retrieved item counts as relevant when it appears
    in ``relevant_documents``. NDCG@k is normalised against the best ranking
    achievable given the total number of relevant documents (an ideal list
    with ``min(k, n_relevant)`` hits at the top), so missing relevant
    documents lowers the score.

    Args:
        retrieved_results (sequence): Retrieved item IDs, sorted best first.
        relevant_documents (iterable): Relevant item IDs.
        ks (iterable of int): Cutoff values (e.g., [5, 10, 20]).

    Returns:
        dict: {k: {'Precision': float, 'Recall': float, 'NDCG': float, 'F1': float}}
        for each k. All metrics are 0.0 when there are no relevant documents.

    Raises:
        ValueError: If a cutoff in ``ks`` is not positive.
    """
    # Set for O(1) membership tests while scanning the ranked list
    relevant_items = set(relevant_documents)
    if not relevant_items:
        return {k: {'Precision': 0.0, 'Recall': 0.0, 'NDCG': 0.0, 'F1': 0.0} for k in ks}

    metrics = {}
    for k in ks:
        if k <= 0:
            raise ValueError(f"Cutoff k must be a positive integer, got {k}.")

        top_k_items = retrieved_results[:k]
        relevance = np.array(
            [1.0 if item in relevant_items else 0.0 for item in top_k_items]
        )
        hits = float(relevance.sum())

        # Precision divides by k even when fewer than k items were retrieved.
        precision = hits / k
        recall = hits / len(relevant_items)
        if precision + recall == 0:
            f1 = 0.0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)

        # Rank r (1-based) is discounted by 1 / log2(r + 1).
        discounts = 1.0 / np.log2(np.arange(2, k + 2))
        dcg = float(np.sum(relevance * discounts[:len(relevance)]))
        # Ideal ranking: as many relevant items as exist, up to k, at the top.
        ideal_hits = min(k, len(relevant_items))
        idcg = float(np.sum(discounts[:ideal_hits]))
        ndcg = dcg / idcg  # idcg > 0 because ideal_hits >= 1 here

        metrics[k] = {
            'Precision': precision,
            'Recall': recall,
            'NDCG': ndcg,
            'F1': f1,
        }

    return metrics

import pandas as pd

def evaluate_queries(queries, corpus, df, ks=(20,), retrieve_all_func=None):
    """
    Evaluate retrieval performance for a list of queries against a corpus.

    A document counts as relevant to a query when the query text occurs
    (case-insensitively, as a literal substring) in its ``category`` value.
    Relevant documents are identified by row position, which is what the
    retrieval results contain, so ``df`` rows must be in the same order as
    ``corpus`` but ``df`` does not need a default integer index.

    Args:
        queries (list): List of query strings.
        corpus: The corpus to retrieve documents from.
        df (pd.DataFrame): DataFrame containing the ground truth in a
            'category' column, one row per corpus document.
        ks (iterable of int): Cutoff values (e.g., [5, 10, 20]).
        retrieve_all_func (function, optional): Called as
            ``retrieve_all_func(query, corpus)``; must return a dict with
            "bm25", "dense" and "hybrid" entries, each holding "indices" and
            "latency". Defaults to ``retrieve_all``.

    Returns:
        dict: Results keyed by query. Each value maps "bm25", "dense" and
        "hybrid" to that method's metrics per cutoff plus a 'latency' entry.
    """
    retrieve_fn = retrieve_all if retrieve_all_func is None else retrieve_all_func

    # Loop-invariant: stringify the category column once, not per query.
    category_text = df['category'].astype(str)

    results = {}
    for query in queries:
        # Ground truth as row positions. regex=False keeps characters such as
        # "+" or "(" in a query from being interpreted as a pattern.
        is_relevant = category_text.str.contains(query, case=False, na=False, regex=False)
        relevant_documents = np.flatnonzero(is_relevant.to_numpy()).tolist()

        # Retrieve results for the query
        retrieval_results = retrieve_fn(query, corpus)

        # Evaluate each method and attach its latency next to the metrics
        per_method = {}
        for method in ("bm25", "dense", "hybrid"):
            outcome = retrieval_results[method]
            method_eval = evaluate_retrieval_metrics(outcome['indices'], relevant_documents, ks=ks)
            method_eval['latency'] = outcome['latency']
            per_method[method] = method_eval

        results[query] = per_method

    return results

In [10]:
import json
# Load the tab-separated news summary file
df = pd.read_csv('./../Data/news-recommendation/news_summary.tsv', sep='\t')

# Use each article's summary as the document text. Missing values have to be
# filled *before* the string cast: casting first turns NaN into the literal
# text "nan", after which fillna has nothing left to fill.
corpus = df['summary'].fillna("").astype(str).tolist()

# Generate embeddings
embeddings = generate_embeddings(corpus, model, tokenizer)

print("Embeddings shape:", embeddings.shape)

# Create embeddings DataFrame (one column per embedding dimension)
embedding_df = pd.DataFrame(embeddings)

# Append category column; .values aligns by position rather than by index
embedding_df['category'] = df['category'].values

plot_tsne_embeddings(embedding_df)

Embeddings shape: (2225, 384)
Number of samples: 2225
Using perplexity: 30


In [None]:
# The category names double as the evaluation queries.
eval_queries = ['business', 'tech', 'entertainment', 'sport', 'politics']
results_combined = evaluate_queries(eval_queries, corpus, df, ks=[10, 20])

# Persist the metrics as indented JSON; default=str stringifies any value the
# JSON encoder cannot handle natively.
with open("retrieval_eval.json", "w") as out_file:
    json.dump(results_combined, out_file, indent=2, default=str)

# Bare expression so the notebook displays the result of the cell.
results_combined