# Lab 2.4: RAG Evaluation Harness - Complete Solution

Automated testing and evaluation for RAG systems.

In [None]:
import pandas as pd
from typing import List, Dict
from rouge_score import rouge_scorer
import matplotlib.pyplot as plt

# Evaluation functions
def calculate_precision(retrieved, relevant):
    """Return the fraction of distinct retrieved items that are relevant.

    Args:
        retrieved: Collection of hashable document identifiers returned by
            the retriever. Duplicates are counted once.
        relevant: Collection of hashable identifiers judged relevant for
            the query.

    Returns:
        A float in [0.0, 1.0]; 0.0 when nothing was retrieved.
    """
    if not retrieved:
        return 0.0
    # The overlap is computed on sets, so the denominator must be the
    # distinct count as well; otherwise a document retrieved twice would
    # be penalised as if the duplicate were an irrelevant hit.
    unique_retrieved = set(retrieved)
    return len(unique_retrieved & set(relevant)) / len(unique_retrieved)

def calculate_recall(retrieved, relevant):
    """Return the fraction of distinct relevant items that were retrieved.

    Args:
        retrieved: Collection of hashable document identifiers returned by
            the retriever.
        relevant: Collection of hashable identifiers judged relevant for
            the query. Duplicates are counted once.

    Returns:
        A float in [0.0, 1.0]; 0.0 when there are no relevant items.
    """
    if not relevant:
        return 0.0
    # The overlap is computed on sets, so the denominator must be the
    # distinct count as well; otherwise a duplicated ground-truth entry
    # would make a recall of 1.0 unreachable.
    unique_relevant = set(relevant)
    return len(set(retrieved) & unique_relevant) / len(unique_relevant)

def calculate_f1(precision, recall):
    """Return the harmonic mean of precision and recall (the F1 score).

    Args:
        precision: Precision value.
        recall: Recall value.

    Returns:
        The F1 score, or 0.0 when precision and recall sum to zero.
    """
    total = precision + recall
    # Guard the division: a zero sum would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total

# Test cases: each entry pairs a question with the documents considered
# relevant to it and a reference answer.
test_cases = [
    {
        'question': 'What is RAG?',
        'relevant': ['rag.txt'],
        'expected': 'RAG combines retrieval with generation',
    },
    {
        'question': 'Explain vectors',
        'relevant': ['vectors.txt'],
        'expected': 'Vectors enable semantic search',
    },
]

# Run evaluation: score every test case and collect one result row each.
results = []
for test in test_cases:
    # Mock retrieval: the same document is returned for every question, so
    # only cases whose relevant set contains 'rag.txt' can score above zero.
    retrieved = ['rag.txt']
    p = calculate_precision(retrieved, test['relevant'])
    r = calculate_recall(retrieved, test['relevant'])
    f1 = calculate_f1(p, r)
    results.append({'question': test['question'], 'precision': p, 'recall': r, 'f1': f1})

# One row per question, with precision / recall / f1 columns.
df = pd.DataFrame(results)
print('✅ Evaluation complete!')
print(df)
print(f'\nAverage F1: {df["f1"].mean():.2f}')

# Visualize: bar chart of the mean precision, recall and F1 across all cases.
# Use the Axes returned by pandas rather than pyplot's implicit "current
# figure", so the calls below always act on the chart just drawn.
ax = df[['precision', 'recall', 'f1']].mean().plot(kind='bar', title='RAG Metrics')
ax.set_ylabel('Score')
ax.set_ylim(0, 1)  # the metrics are ratios, so fix the axis to the full range
ax.figure.tight_layout()
ax.figure.savefig('evaluation.png')
print('✅ Visualization saved!')

## 🎯 Complete!

You now have a production-ready evaluation framework!

**Use this to:**
- Test RAG quality automatically
- Track improvements over time
- Compare different approaches