In [1]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset


# Fetch the BEIR HotpotQA documents and queries from the Hugging Face Hub.
# NOTE(review): no `split=` is given, so `load_dataset` may return a
# DatasetDict keyed by split instead of a single Dataset. Later cells index
# `docs['text']` / `queries['text']` directly -- confirm the returned type.
docs = load_dataset(path='irds/beir_hotpotqa', name='docs')
queries = load_dataset(path='irds/beir_hotpotqa', name='queries')



  table = cls._concat_blocks(blocks, axis=0)


In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
from tqdm import tqdm

# Load two embedding models
def load_model(model_name):
    """Load a pretrained encoder together with its tokenizer.

    Args:
        model_name: Identifier handed to `from_pretrained` for both the
            model and the tokenizer.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    encoder = AutoModel.from_pretrained(model_name)
    return encoder, AutoTokenizer.from_pretrained(model_name)

# Two encoders of different size; both are loaded once at cell level and
# reused by the functions and cells below.
small_model, small_tokenizer = load_model(model_name='bert-base-uncased')
large_model, large_tokenizer = load_model(model_name='bert-large-uncased')

def get_embeddings(texts, model, tokenizer, max_length=512, batch_size=32):
    """Encode texts into [CLS] embeddings, one row per text.

    Texts are tokenized and run through the model in mini-batches instead of
    one forward pass per text, and each batch is padded only up to its longest
    member (truncated at `max_length`) rather than always to `max_length`.
    NOTE(review): this assumes the attention mask produced by the tokenizer
    makes the [CLS] output independent of how much padding follows -- confirm
    for the models in use.

    Args:
        texts: Iterable of strings to embed.
        model: Encoder called as `model(**inputs)` whose result exposes
            `last_hidden_state`.
        tokenizer: Callable that turns a list of strings into model inputs.
        max_length: Maximum number of tokens kept per text.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text, in input order. For
        empty input the array has zero rows.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    if not texts:
        # np.vstack([]) raises, so hand back an explicit zero-row result.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    model.eval()
    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", max_length=max_length, truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of every sequence is the [CLS] token.
        embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())
    return np.vstack(embeddings)

def get_top_k_passages(query, model, tokenizer, doc_embeddings, docs, k=10):
    """Return the `k` documents most similar to `query`, best match first.

    The query is encoded with the model's [CLS] output, L2-normalised, and
    compared with every row of `doc_embeddings` by dot product. The dot
    product is a cosine similarity only if the rows of `doc_embeddings` are
    already L2-normalised.

    NOTE: a later cell defines another function with this same name that
    returns `(passages, similarities)`; whichever definition ran last is the
    one in effect.

    Args:
        query: Query string.
        model: Encoder called as `model(**inputs)` whose result exposes
            `last_hidden_state`.
        tokenizer: Callable that turns the query into model inputs.
        doc_embeddings: 2-D array with one embedding row per document.
        docs: Indexable collection aligned with the rows of `doc_embeddings`.
        k: Number of documents to return.

    Returns:
        A list with at most `k` entries of `docs`, ordered from most to least
        similar. Empty when `k` is zero or negative.
    """
    if k <= 0:
        # `[-0:]` would select *every* document and a negative k would slice
        # from the wrong end, so answer "nothing requested" explicitly.
        return []

    inputs = tokenizer(query, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    query_embedding = outputs.last_hidden_state[:, 0, :].numpy()
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    similarities = np.dot(doc_embeddings, query_embedding.T).flatten()
    # argsort is ascending: keep the last k entries and flip to best-first.
    top_k_indices = np.argsort(similarities)[-k:][::-1]

    return [docs[i] for i in top_k_indices]


# Main execution
def main(docs, queries):
    """Embed every document with both encoders and L2-normalise the rows.

    Args:
        docs: Object whose `'text'` entry holds the document strings.
        queries: Not used here; kept so the call signature stays the same.

    Returns:
        `(small_doc_embeddings, large_doc_embeddings)`: two 2-D arrays with
        one unit-length row per document, produced by the small and the large
        encoder respectively.
    """
    doc_texts = docs['text']

    print("Generating embeddings...")
    small_doc_embeddings = get_embeddings(doc_texts, small_model, small_tokenizer)
    large_doc_embeddings = get_embeddings(doc_texts, large_model, large_tokenizer)

    # Unit-length rows let a plain dot product act as cosine similarity.
    small_doc_embeddings = small_doc_embeddings / np.linalg.norm(small_doc_embeddings, axis=1, keepdims=True)
    large_doc_embeddings = large_doc_embeddings / np.linalg.norm(large_doc_embeddings, axis=1, keepdims=True)

    return small_doc_embeddings, large_doc_embeddings


# Later cells read both arrays as notebook-level variables, so bind them here
# instead of letting them vanish with the function's local scope.
small_doc_embeddings, large_doc_embeddings = main(docs, queries)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm

def load_ranking_model(model_name):
    """Load a pretrained sequence-classification model and its tokenizer.

    Args:
        model_name: Identifier handed to `from_pretrained` for both the
            model and the tokenizer.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    ranker = AutoModelForSequenceClassification.from_pretrained(model_name)
    return ranker, AutoTokenizer.from_pretrained(model_name)

# Rerankers loaded once at cell level and shared by the reranking code below.
small_rank_model, small_rank_tokenizer = load_ranking_model(
    model_name='cross-encoder/ms-marco-MiniLM-L-12-v2'
)

# NOTE(review): confirm this checkpoint id can be loaded through
# AutoModelForSequenceClassification / AutoTokenizer.
large_rank_model, large_rank_tokenizer = load_ranking_model(
    model_name='nvidia/nv-rerankqa-mistral-4b-v3'
)

def _score_passages(query, passages, model, tokenizer, batch_size=8):
    """Score every passage against the query; result follows the order of `passages`."""
    model.eval()
    scores = []

    for start in range(0, len(passages), batch_size):
        batch = passages[start:start + batch_size]
        # Each passage is paired with the same query text.
        inputs = tokenizer([query] * len(batch), batch, truncation=True, padding=True, return_tensors="pt", max_length=512)

        with torch.no_grad():
            outputs = model(**inputs)

        # Drop a trailing size-1 dimension so each passage maps to one value.
        scores.extend(outputs.logits.squeeze(-1).tolist())

    return scores


def _rank_by_score(passages, scores):
    """Sort passages by descending score and return `(passages, scores)` lists."""
    ranked = sorted(zip(passages, scores), key=lambda pair: pair[1], reverse=True)
    return [passage for passage, _ in ranked], [score for _, score in ranked]


def rerank_passages(query, passages, model, tokenizer, batch_size=8):
    """Rerank passages for a query with a single scoring model.

    Args:
        query: Query string.
        passages: Sliceable sequence of passage strings.
        model: Model called as `model(**inputs)` whose result exposes `logits`.
        tokenizer: Callable that encodes (query, passage) pairs.
        batch_size: Number of pairs scored per forward pass.

    Returns:
        `(ranked_passages, ranked_scores)`, both ordered from highest to
        lowest score.
    """
    return _rank_by_score(passages, _score_passages(query, passages, model, tokenizer, batch_size))


def rerank_with_multiple_models(query, passages, models_and_tokenizers, weights=None):
    """Rerank passages by a weighted sum of the scores from several models.

    Each model's scores are collected in the original passage order before
    they are summed. Summing the already-sorted score lists returned by
    `rerank_passages` would add up scores that belong to different passages.

    Args:
        query: Query string.
        passages: Sliceable sequence of passage strings.
        models_and_tokenizers: Sequence of `(model, tokenizer)` pairs.
        weights: Optional per-model multipliers; defaults to 1 for each model.

    Returns:
        `(ranked_passages, combined_scores)`, both ordered from highest to
        lowest combined score.

    Raises:
        ValueError: If `weights` is given but its length differs from
            `models_and_tokenizers`.
    """
    if weights is None:
        weights = [1] * len(models_and_tokenizers)
    else:
        weights = list(weights)
        if len(weights) != len(models_and_tokenizers):
            # zip() would otherwise silently ignore the surplus models/weights.
            raise ValueError("weights must have one entry per (model, tokenizer) pair")

    weighted_scores = []
    for (model, tokenizer), weight in zip(models_and_tokenizers, weights):
        per_passage = _score_passages(query, passages, model, tokenizer)
        weighted_scores.append([score * weight for score in per_passage])

    combined_scores = [sum(column) for column in zip(*weighted_scores)]
    return _rank_by_score(passages, combined_scores)

def main1(docs, queries):
    """Retrieve candidates for the first query and rerank them with both rerankers.

    Reads the notebook-level `small_model`, `small_tokenizer`,
    `small_doc_embeddings` and the two reranker pairs, all of which must have
    been created by earlier cells. Expects `get_top_k_passages` to return a
    plain list of passages.

    Args:
        docs: Object whose `'text'` entry holds the document strings, aligned
            with the rows of `small_doc_embeddings`.
        queries: Indexable collection whose first item has a `'text'` entry.

    Returns:
        `(reranked_passages, scores)` ordered from highest to lowest combined
        reranker score.
    """
    models_and_tokenizers = [
        (small_rank_model, small_rank_tokenizer),
        (large_rank_model, large_rank_tokenizer)
    ]

    example_query = queries[0]['text']
    # The rerankers tokenize raw strings, so retrieve from the text column
    # rather than from whole row records.
    initial_passages = get_top_k_passages(example_query, small_model, small_tokenizer, small_doc_embeddings, docs['text'], k=20)
    reranked_passages, scores = rerank_with_multiple_models(example_query, initial_passages, models_and_tokenizers)
    return reranked_passages, scores


# Keep the result so it can be inspected in a following cell.
reranked_passages, rerank_scores = main1(docs, queries)

In [None]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import ndcg_score
from typing import List, Dict

def calculate_ndcg(relevance_scores: List[float], k: int = 10) -> float:
    """Compute NDCG@k for a ranked list of relevance labels.

    Args:
        relevance_scores: Relevance label of each returned item, in the order
            the items were ranked (best-ranked item first).
        k: Rank cut-off passed to `ndcg_score`.

    Returns:
        The NDCG@k of the given ordering. `0.0` for an empty list; for a
        single item, `1.0` if its relevance is positive and `0.0` otherwise.
    """
    n_items = len(relevance_scores)
    if n_items == 0:
        return 0.0
    if n_items == 1:
        # NOTE(review): some scikit-learn releases reject single-document
        # input, so answer directly -- one item is always ideally ordered.
        return 1.0 if relevance_scores[0] > 0 else 0.0

    # `ndcg_score` ranks by *descending* predicted score, so the first
    # position must receive the largest value for the list order to be kept.
    predicted_scores = list(range(n_items, 0, -1))
    return ndcg_score([relevance_scores], [predicted_scores], k=k)

def get_top_k_passages(query, model, tokenizer, doc_embeddings, docs, k=10):
    """Return the `k` documents most similar to `query` plus their similarities.

    The query is encoded with the model's [CLS] output, L2-normalised, and
    compared with every row of `doc_embeddings` by dot product. The dot
    product is a cosine similarity only if the rows of `doc_embeddings` are
    already L2-normalised.

    NOTE: this replaces an earlier definition with the same name that returned
    only the passage list; code written against that version (`main1`) must
    not be re-run after this cell without adapting to the tuple.

    Args:
        query: Query string.
        model: Encoder called as `model(**inputs)` whose result exposes
            `last_hidden_state`.
        tokenizer: Callable that turns the query into model inputs.
        doc_embeddings: 2-D array with one embedding row per document.
        docs: Indexable collection aligned with the rows of `doc_embeddings`.
        k: Number of documents to return.

    Returns:
        `(passages, similarities)`: at most `k` entries of `docs`, best match
        first, and a 1-D array with the matching similarity values. Both are
        empty when `k` is zero or negative.
    """
    if k <= 0:
        # `[-0:]` would select *every* document and a negative k would slice
        # from the wrong end, so answer "nothing requested" explicitly.
        return [], np.empty(0)

    inputs = tokenizer(query, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    query_embedding = outputs.last_hidden_state[:, 0, :].numpy()
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    similarities = np.dot(doc_embeddings, query_embedding.T).flatten()
    # argsort is ascending: keep the last k entries and flip to best-first.
    top_k_indices = np.argsort(similarities)[-k:][::-1]

    top_k_passages = [docs[i] for i in top_k_indices]
    return top_k_passages, similarities[top_k_indices]

def evaluate_retrieval(queries: Dict, docs: Dict, model, tokenizer, doc_embeddings, k: int = 10):
    """Average NDCG@k of embedding-only retrieval over all queries.

    For every entry of `queries['text']` the top-`k` passages are retrieved
    from `docs['text']`, each (query, passage) pair is looked up in
    `queries['relevance']` (missing pairs count as 0), and the resulting
    relevance list is scored with `calculate_ndcg`.

    Returns:
        The mean of the per-query NDCG values.
    """
    def _ndcg_for(query_text):
        # The similarity values returned alongside the passages are not needed.
        retrieved, _ = get_top_k_passages(query_text, model, tokenizer, doc_embeddings, docs['text'], k)
        gains = []
        for passage in retrieved:
            gains.append(queries['relevance'].get((query_text, passage), 0))
        return calculate_ndcg(gains, k)

    per_query = [_ndcg_for(text) for text in tqdm(queries['text'], desc="Evaluating queries")]
    return np.mean(per_query)

def simple_ranking_model(query: str, passages: List[str]) -> List[float]:
    """Score passages by how many distinct query words they share.

    Both query and passage are lower-cased and split on whitespace; the score
    of a passage is the size of the intersection of the two word sets.

    Returns:
        One overlap count per passage, in the order of `passages`.
    """
    query_terms = set(query.lower().split())
    return [len(query_terms & set(text.lower().split())) for text in passages]

def evaluate_with_ranking(queries: Dict, docs: Dict, model, tokenizer, doc_embeddings, ranking_model, k: int = 10):
    """Average NDCG@k of retrieval followed by reranking over all queries.

    For every entry of `queries['text']` the top-`k` passages are retrieved
    from `docs['text']`, rescored by `ranking_model(query, passages)`, and
    reordered by descending score. Passages with equal scores keep their
    retrieval order, so the reranker only moves items it actually
    distinguishes. Relevance of each (query, passage) pair comes from
    `queries['relevance']`, with missing pairs counted as 0.

    Args:
        queries: Mapping with `'text'` (query strings) and `'relevance'`
            (mapping from `(query, passage)` to a relevance label).
        docs: Mapping whose `'text'` entry holds the document strings.
        model: Encoder used for first-stage retrieval.
        tokenizer: Tokenizer matching `model`.
        doc_embeddings: One embedding row per document.
        ranking_model: Callable returning one score per passage.
        k: Number of passages retrieved and NDCG cut-off.

    Returns:
        The mean of the per-query NDCG values.
    """
    ndcg_scores = []
    for query in tqdm(queries['text'], desc="Evaluating queries with ranking"):
        top_k_passages, _ = get_top_k_passages(query, model, tokenizer, doc_embeddings, docs['text'], k)
        ranking_scores = ranking_model(query, top_k_passages)
        # Stable sort on the negated scores: descending order without
        # reversing the relative order of tied passages.
        reranked_indices = np.argsort(-np.asarray(ranking_scores, dtype=float), kind="stable")
        reranked_passages = [top_k_passages[i] for i in reranked_indices]
        relevance_scores = [queries['relevance'].get((query, passage), 0) for passage in reranked_passages]
        ndcg_scores.append(calculate_ndcg(relevance_scores, k))
    return np.mean(ndcg_scores)

# Evaluation code
# Runs at cell level and reads names created by earlier cells: `queries`,
# `docs`, both encoders with their tokenizers, and `small_doc_embeddings` /
# `large_doc_embeddings`.
# NOTE(review): the evaluators look up `queries['relevance']`; nothing visible
# in this notebook creates that entry -- confirm where the relevance
# judgements are supposed to come from.
print("Evaluating retrieval performance without ranking:")
small_ndcg = evaluate_retrieval(queries, docs, small_model, small_tokenizer, small_doc_embeddings)
print(f"NDCG@10 for BERT-base: {small_ndcg:.4f}")
large_ndcg = evaluate_retrieval(queries, docs, large_model, large_tokenizer, large_doc_embeddings)
print(f"NDCG@10 for BERT-large: {large_ndcg:.4f}")

# Same two encoders, now followed by the word-overlap reranker.
print("\nEvaluating retrieval performance with ranking:")
small_ndcg_ranked = evaluate_with_ranking(queries, docs, small_model, small_tokenizer, small_doc_embeddings, simple_ranking_model)
print(f"NDCG@10 for BERT-base with ranking: {small_ndcg_ranked:.4f}")
large_ndcg_ranked = evaluate_with_ranking(queries, docs, large_model, large_tokenizer, large_doc_embeddings, simple_ranking_model)
print(f"NDCG@10 for BERT-large with ranking: {large_ndcg_ranked:.4f}")

# Summary of the four (embedding model, ranking strategy, score) combinations.
print("\nAnalyzing impact of different embedding and ranking model combinations:")
combinations = [
    ("BERT-base", "No ranking", small_ndcg),
    ("BERT-large", "No ranking", large_ndcg),
    ("BERT-base", "Simple ranking", small_ndcg_ranked),
    ("BERT-large", "Simple ranking", large_ndcg_ranked)
]

# `ranking_model` here is just the label string from the tuple above.
for emb_model, ranking_model, score in combinations:
    print(f"{emb_model} with {ranking_model}: NDCG@10 = {score:.4f}")