Upload these files to Google Drive, into the `MyDrive/RAG_Evaluation/` folder that the notebook reads from:
- `docs_ada_with_embeddings_*.parquet` (2.2GB) - Ada embeddings (1536 dims)
- `docs_e5large_with_embeddings_*.parquet` (1.7GB) - E5-Large embeddings (1024 dims)  
- `docs_mpnet_with_embeddings_*.parquet` (1.4GB) - MPNet embeddings (768 dims)
- `docs_minilm_with_embeddings_*.parquet` (1.0GB) - MiniLM embeddings (384 dims)
- `questions_with_links.json` - Questions dataset (JSON with ground truth links)

## 🔧 Setup and Installation

In [None]:
# Install required packages
!pip install sentence-transformers pandas numpy scikit-learn tqdm

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import json
from datetime import datetime
import gc
from typing import List, Dict, Tuple
from tqdm import tqdm

## 📁 Mount Google Drive

In [ ]:
# Colab-only: attach the user's Google Drive so the uploaded files are visible.
from google.colab import drive

drive.mount('/content/drive')

# Drive folder holding the uploaded inputs. The trailing slash matters:
# file names are appended directly to this prefix.
BASE_PATH = '/content/drive/MyDrive/RAG_Evaluation/'

# Pre-computed document embeddings, one Parquet file per embedding model.
EMBEDDING_FILES = {
    'ada': f'{BASE_PATH}docs_ada_with_embeddings_20250721_123712.parquet',
    'e5-large': f'{BASE_PATH}docs_e5large_with_embeddings_20250721_124918.parquet',
    'mpnet': f'{BASE_PATH}docs_mpnet_with_embeddings_20250721_125254.parquet',
    'minilm': f'{BASE_PATH}docs_minilm_with_embeddings_20250721_125846.parquet',
}

# Questions together with their ground-truth links.
QUESTIONS_FILE = f'{BASE_PATH}questions_with_links.json'

# Echo the configuration so a wrong path is spotted before any heavy loading.
print("📁 File paths configured:")
for model, path in EMBEDDING_FILES.items():
    print(f"  {model}: {path}")
print(f"  questions: {QUESTIONS_FILE}")

## 🔍 Real Embedding Retriever Class

In [None]:
class RealEmbeddingRetriever:
    """Cosine-similarity retriever over pre-computed document embeddings.

    Loads a Parquet file whose rows carry an ``embedding`` vector plus the
    ``document``, ``link``, ``title``, ``summary`` and ``content`` columns,
    and ranks the documents against a query embedding.
    """

    def __init__(self, parquet_file: str):
        """Load the documents and stack their embeddings into one matrix.

        Args:
            parquet_file: Path to the Parquet file containing the embeddings.

        Raises:
            ValueError: If the ``embedding`` column does not form a 2-D
                matrix (empty file, or vectors of differing length).
        """
        print(f"🔄 Loading real embeddings from {parquet_file}...")
        self.df = pd.read_parquet(parquet_file)

        # Stack the per-row vectors into a (num_docs, embedding_dim) matrix.
        print("🔄 Converting embeddings to numpy matrix...")
        self.embeddings_matrix = np.array(self.df['embedding'].tolist())
        if self.embeddings_matrix.ndim != 2:
            raise ValueError(
                f"Embeddings in {parquet_file} do not form a 2-D matrix "
                f"(got shape {self.embeddings_matrix.shape}); the file is empty "
                "or its vectors have inconsistent lengths."
            )

        # Dataset information
        self.num_docs = len(self.df)
        self.embedding_dim = self.embeddings_matrix.shape[1]

        # Row norms are computed once so that each query costs a single
        # matrix-vector product instead of re-normalizing every document.
        self._doc_norms = self._safe_norms(self.embeddings_matrix)

        print(f"✅ Loaded {self.num_docs:,} documents")
        print(f"📐 Embedding dimensions: {self.embedding_dim}")
        print(f"💾 Memory usage: {self.embeddings_matrix.nbytes / (1024**3):.2f} GB")

        # Per-document metadata, index-aligned with the rows of the matrix.
        self.documents = self.df[['document', 'link', 'title', 'summary', 'content']].to_dict('records')

    @staticmethod
    def _safe_norms(matrix: np.ndarray) -> np.ndarray:
        """Return the L2 norm of every row, with zero norms replaced by 1."""
        norms = np.linalg.norm(matrix, axis=1)
        # An all-zero vector then gets similarity 0 instead of NaN.
        norms[norms == 0.0] = 1.0
        return norms

    def search_documents(self, query_embedding: np.ndarray, top_k: int = 10) -> List[Dict]:
        """Return the ``top_k`` documents most similar to the query.

        Args:
            query_embedding: Query vector; it is flattened to 1-D and must
                have as many components as the document embeddings.
            top_k: Maximum number of results. Zero or negative yields ``[]``.

        Returns:
            Copies of the document metadata dicts, best match first, each
            extended with ``cosine_similarity`` (float) and ``rank`` (1-based).

        Raises:
            ValueError: If the query dimension differs from the corpus.
        """
        matrix = self.embeddings_matrix
        query = np.asarray(query_embedding).reshape(-1)
        if query.shape[0] != matrix.shape[1]:
            raise ValueError(
                f"Query embedding has {query.shape[0]} dimensions but the document "
                f"embeddings have {matrix.shape[1]}; queries must be embedded with "
                "the same model as the documents."
            )
        if top_k is not None and top_k <= 0:
            return []

        # Recompute the cached norms if they are missing or stale (e.g. the
        # matrix was assigned directly rather than through __init__).
        doc_norms = getattr(self, '_doc_norms', None)
        if doc_norms is None or doc_norms.shape[0] != matrix.shape[0]:
            doc_norms = self._doc_norms = self._safe_norms(matrix)

        # cos(d, q) = (d . q) / (|d| * |q|); a zero query scores 0 everywhere.
        query_norm = float(np.linalg.norm(query)) or 1.0
        similarities = (matrix @ query) / (doc_norms * query_norm)

        # Indices of the highest-scoring documents, best first.
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for rank, idx in enumerate(top_indices, start=1):
            # Copy so the stored metadata is never mutated by a search.
            doc = self.documents[idx].copy()
            doc['cosine_similarity'] = float(similarities[idx])
            doc['rank'] = rank
            results.append(doc)

        return results

## 📊 Real Metrics Calculation

In [None]:
def _normalize_link(link) -> str:
    """Return *link* without fragment, query string or trailing slash.

    Anything that is not a non-empty string (``None``, NaN, ...) maps to
    ``""`` so that callers can filter it out with a truthiness test.
    """
    if not isinstance(link, str) or not link:
        return ""
    return link.split('#')[0].split('?')[0].rstrip('/')


def calculate_real_retrieval_metrics(
    question: str,
    query_embedding: np.ndarray,
    retriever: "RealEmbeddingRetriever",
    ground_truth_links: List[str],
    top_k_values: List[int] = [1, 3, 5, 10]
) -> Dict:
    """Compute link-based retrieval metrics for a single question.

    Documents are retrieved once at the largest requested cutoff and their
    normalized links are compared with the normalized ground-truth links.

    Args:
        question: The question text. Not used in the computation; kept so
            existing callers keep working.
        query_embedding: Embedding of the question, forwarded unchanged to
            ``retriever.search_documents``.
        retriever: Object exposing ``search_documents(query_embedding, top_k=...)``
            and returning dicts that may contain a ``'link'`` key.
        ground_truth_links: Links considered relevant for this question.
            Empty or non-string entries are ignored.
        top_k_values: Cutoffs to evaluate. The default list is only read,
            never mutated. When empty, 10 documents are still retrieved but
            no per-cutoff metrics are produced.

    Returns:
        Dict with ``precision@k``, ``recall@k`` and ``f1@k`` for every
        cutoff, plus ``mrr`` (the reciprocal rank of the first relevant
        document for this question, 0.0 when none is retrieved),
        ``ground_truth_count`` and ``retrieved_count``.
    """
    # One retrieval at the deepest cutoff serves every smaller k.
    max_k = max(top_k_values) if top_k_values else 10
    retrieved_docs = retriever.search_documents(query_embedding, top_k=max_k)

    # Normalize each retrieved link once; "" marks a missing/unusable link.
    retrieved_links = [_normalize_link(doc.get('link', '')) for doc in retrieved_docs]

    # Drop unusable ground-truth entries so they neither inflate the recall
    # denominator nor match documents that simply have no link.
    gt_normalized = {
        normalized
        for normalized in map(_normalize_link, ground_truth_links or [])
        if normalized
    }

    metrics = {}
    for k in top_k_values:
        # Links are de-duplicated, so several retrieved documents sharing
        # one link count as a single hit.
        unique_links = {link for link in retrieved_links[:k] if link}
        hits = len(unique_links & gt_normalized)

        precision_k = hits / k if k > 0 else 0.0
        recall_k = hits / len(gt_normalized) if gt_normalized else 0.0
        pr_sum = precision_k + recall_k
        f1_k = (2 * precision_k * recall_k) / pr_sum if pr_sum > 0 else 0.0

        metrics[f'precision@{k}'] = precision_k
        metrics[f'recall@{k}'] = recall_k
        metrics[f'f1@{k}'] = f1_k

    # Reciprocal rank of the first relevant document; averaging over
    # questions (the "mean" in MRR) is left to the caller.
    metrics['mrr'] = next(
        (
            1.0 / rank
            for rank, link in enumerate(retrieved_links, 1)
            if link and link in gt_normalized
        ),
        0.0,
    )
    metrics['ground_truth_count'] = len(gt_normalized)
    metrics['retrieved_count'] = len(retrieved_docs)

    return metrics

## 📂 Load Questions Dataset

In [None]:
# Load the questions together with their ground-truth links.
print("📥 Loading questions dataset...")
with open(QUESTIONS_FILE, 'r', encoding='utf-8') as f:
    questions_data = json.load(f)

print(f"✅ Loaded {len(questions_data)} questions")

# Preview the first entry so the expected keys can be checked by eye.
if questions_data:
    sample_q = questions_data[0]
    # `or` also covers a present-but-empty (or null) value, which would
    # otherwise make the slice / first-element lookups below raise.
    sample_text = sample_q.get('question') or ''
    sample_links = sample_q.get('ms_links') or []
    print(f"\n🔍 Sample question:")
    print(f"  Question: {sample_text[:100]}...")
    print(f"  Ground truth links: {len(sample_links)} links")
    print(f"  Sample link: {sample_links[0] if sample_links else 'N/A'}")

## 🎯 Run Real Embedding Evaluation

In [ ]:
# Configuration
EMBEDDING_MODEL_TO_EVALUATE = 'ada'  # Change to 'e5-large', 'mpnet', or 'minilm'
NUM_QUESTIONS_TO_EVALUATE = 100  # Set to None for all questions

# Model used to embed the questions for each document corpus.
# NOTE: all-MiniLM-L6-v2 yields 384-dim vectors, while the Ada corpus is listed
# as 1536-dim, so the 'ada' proxy below cannot be compared against that corpus.
# The dimension check further down rejects the pairing with an explicit error;
# evaluating 'ada' requires query embeddings produced by the Ada model itself.
# NOTE(review): E5 models are documented to expect a "query: " prefix on
# queries - confirm how the e5-large corpus was embedded before adding one.
QUERY_MODELS = {
    'ada': 'sentence-transformers/all-MiniLM-L6-v2',  # Proxy for Ada
    'e5-large': 'intfloat/e5-large-v2',
    'mpnet': 'sentence-transformers/multi-qa-mpnet-base-dot-v1',
    'minilm': 'sentence-transformers/all-MiniLM-L6-v2'
}

print(f"🚀 Starting Real Embedding Evaluation")
print(f"📊 Embedding model: {EMBEDDING_MODEL_TO_EVALUATE}")
print(f"📄 Document corpus: {EMBEDDING_FILES[EMBEDDING_MODEL_TO_EVALUATE]}")
print(f"❓ Questions to evaluate: {NUM_QUESTIONS_TO_EVALUATE or 'ALL'}")
print("="*80)

# Load retriever with real embeddings
retriever = RealEmbeddingRetriever(EMBEDDING_FILES[EMBEDDING_MODEL_TO_EVALUATE])

# Load query embedding model
query_model_name = QUERY_MODELS[EMBEDDING_MODEL_TO_EVALUATE]
print(f"\n🔤 Loading query model: {query_model_name}")
query_model = SentenceTransformer(query_model_name)
print(f"✅ Query model loaded")

# Fail fast: cosine similarity is only defined between vectors of equal
# length, so a mismatched query model would otherwise crash on the first
# question with an opaque shape error.
query_dim = query_model.get_sentence_embedding_dimension()
if query_dim is not None and query_dim != retriever.embedding_dim:
    raise ValueError(
        f"Query model '{query_model_name}' produces {query_dim}-dimensional embeddings, "
        f"but the '{EMBEDDING_MODEL_TO_EVALUATE}' corpus holds "
        f"{retriever.embedding_dim}-dimensional embeddings. Queries must be embedded "
        "with the same model that produced the document embeddings."
    )

# Select questions to evaluate
questions_to_eval = questions_data[:NUM_QUESTIONS_TO_EVALUATE] if NUM_QUESTIONS_TO_EVALUATE else questions_data
print(f"\n📊 Evaluating {len(questions_to_eval)} questions...")

# Run evaluation
all_metrics = []

for i, qa_item in enumerate(tqdm(questions_to_eval, desc="Evaluating questions")):
    question = qa_item.get('question', '')
    ms_links = qa_item.get('ms_links', [])

    # Entries without a question or without ground truth cannot be scored.
    if not question or not ms_links:
        continue

    # Generate query embedding
    query_embedding = query_model.encode(question)

    # Calculate real metrics
    metrics = calculate_real_retrieval_metrics(
        question=question,
        query_embedding=query_embedding,
        retriever=retriever,
        ground_truth_links=ms_links,
        top_k_values=[1, 3, 5, 10]
    )

    # Position within questions_to_eval (skipped entries keep their index).
    metrics['question_index'] = i
    metrics['question'] = question
    all_metrics.append(metrics)

print(f"\n✅ Evaluation completed: {len(all_metrics)} questions processed")

## 📈 Results Analysis

In [None]:
# Average the per-question metrics.
# Bound unconditionally so later cells can reference `avg_metrics` even when
# no question was evaluated (and never pick up a value from an earlier run).
avg_metrics = {}

if all_metrics:
    # Same key order as the report below: precision, recall, f1, then mrr.
    metric_keys = [
        f'{name}@{k}'
        for name in ('precision', 'recall', 'f1')
        for k in (1, 3, 5, 10)
    ] + ['mrr']
    for key in metric_keys:
        values = [m[key] for m in all_metrics if key in m]
        # float() keeps plain Python numbers in the saved results.
        avg_metrics[f'avg_{key}'] = float(np.mean(values)) if values else 0.0

    # Display results
    print(f"📊 REAL EMBEDDING EVALUATION RESULTS")
    print(f"📄 Model: {EMBEDDING_MODEL_TO_EVALUATE}")
    print(f"📚 Documents: {retriever.num_docs:,}")
    print(f"📐 Dimensions: {retriever.embedding_dim}")
    print(f"❓ Questions: {len(all_metrics)}")
    print("="*60)

    # Precision metrics
    print("🎯 PRECISION METRICS:")
    for k in [1, 3, 5, 10]:
        precision = avg_metrics[f'avg_precision@{k}'] * 100
        print(f"  Precision@{k:2d}: {precision:6.2f}%")

    # Recall metrics
    print("\n🔍 RECALL METRICS:")
    for k in [1, 3, 5, 10]:
        recall = avg_metrics[f'avg_recall@{k}'] * 100
        print(f"  Recall@{k:2d}:    {recall:6.2f}%")

    # F1 metrics
    print("\n⚖️  F1-SCORE METRICS:")
    for k in [1, 3, 5, 10]:
        f1 = avg_metrics[f'avg_f1@{k}'] * 100
        print(f"  F1@{k:2d}:        {f1:6.2f}%")

    # MRR
    mrr = avg_metrics['avg_mrr']
    print(f"\n🥇 MEAN RECIPROCAL RANK: {mrr:.4f}")

    print("\n" + "="*60)
    print("✅ THESE ARE REAL METRICS - NO SIMULATION!")
    print("🔬 Based on actual cosine similarity with real embeddings")
    print("📊 Ground truth: Microsoft Learn documentation links")
else:
    print("❌ No metrics calculated")

## 🔬 Detailed Analysis

In [None]:
# Show the best and worst performing questions, plus summary statistics.
if all_metrics:
    # Best first for the top list; a separate ascending sort for the bottom
    # list so that entry 1 really is the worst question.
    sorted_by_mrr = sorted(all_metrics, key=lambda x: x['mrr'], reverse=True)
    worst_by_mrr = sorted(all_metrics, key=lambda x: x['mrr'])

    print("🏆 TOP 5 BEST PERFORMING QUESTIONS (by MRR):")
    for i, metric in enumerate(sorted_by_mrr[:5]):
        print(f"  {i+1}. MRR: {metric['mrr']:.3f} | P@5: {metric['precision@5']:.3f} | Q: {metric['question'][:60]}...")

    print("\n📉 TOP 5 WORST PERFORMING QUESTIONS (by MRR):")
    for i, metric in enumerate(worst_by_mrr[:5]):
        print(f"  {i+1}. MRR: {metric['mrr']:.3f} | P@5: {metric['precision@5']:.3f} | Q: {metric['question'][:60]}...")

    # Statistics
    mrr_values = [m['mrr'] for m in all_metrics]
    precision5_values = [m['precision@5'] for m in all_metrics]

    print(f"\n📊 STATISTICS:")
    print(f"  MRR - Min: {min(mrr_values):.3f}, Max: {max(mrr_values):.3f}, Std: {np.std(mrr_values):.3f}")
    print(f"  P@5 - Min: {min(precision5_values):.3f}, Max: {max(precision5_values):.3f}, Std: {np.std(precision5_values):.3f}")

    # MRR is exactly 1.0 when the first result is relevant and exactly 0.0
    # when no relevant document was retrieved; everything else is "partial".
    total = len(all_metrics)
    perfect_mrr = sum(1 for value in mrr_values if value == 1.0)
    zero_mrr = sum(1 for value in mrr_values if value == 0.0)
    partial_mrr = total - perfect_mrr - zero_mrr

    print(f"\n🎯 PERFORMANCE DISTRIBUTION:")
    print(f"  Perfect matches (MRR=1.0): {perfect_mrr} ({perfect_mrr/total*100:.1f}%)")
    print(f"  No matches (MRR=0.0):      {zero_mrr} ({zero_mrr/total*100:.1f}%)")
    print(f"  Partial matches:           {partial_mrr} ({partial_mrr/total*100:.1f}%)")

## 💾 Save Results

In [None]:
# Save detailed results
# A single clock reading keeps the file name and the stored timestamp in sync.
run_time = datetime.now()

results = {
    'evaluation_config': {
        'embedding_model': EMBEDDING_MODEL_TO_EVALUATE,
        'query_model': query_model_name,
        'document_corpus': EMBEDDING_FILES[EMBEDDING_MODEL_TO_EVALUATE],
        'num_questions_evaluated': len(all_metrics),
        'total_documents': retriever.num_docs,
        'embedding_dimensions': retriever.embedding_dim,
        'evaluation_timestamp': run_time.isoformat()
    },
    # Averages only exist when at least one question was evaluated; the
    # conditional avoids touching an undefined `avg_metrics` otherwise.
    'average_metrics': avg_metrics if all_metrics else {},
    'individual_metrics': all_metrics
}

# Save to Google Drive
timestamp = run_time.strftime("%Y%m%d_%H%M%S")
output_file = f"{BASE_PATH}real_evaluation_{EMBEDDING_MODEL_TO_EVALUATE}_{timestamp}.json"

# Serialize once and reuse the text for both the file and the size report,
# so the reported size matches what was actually written.
payload = json.dumps(results, indent=2, ensure_ascii=False)
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(payload)

print(f"💾 Results saved to: {output_file}")
print(f"📊 File size: {len(payload.encode('utf-8')) / (1024*1024):.1f} MB")

# Cleanup memory: drop the large objects before collecting.
del retriever
del query_model
gc.collect()

print("\n✅ Evaluation completed and saved!")
print("🎉 You now have real retrieval metrics based on actual embeddings!")