# RAG evaluation: 
- Retrieval: Primarly examine the recall of the RAG system.
- Generation: Secondarily examine the generation quality of the RAG system.


### Retrieval
Context Precision & Context Recall

### Generation
Faithfulness


# 1. Evaluating Retrieval

In [5]:
import sys
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()

current_dir = Path.cwd().parent
parent_dir = current_dir.parent

sys.path.append(str(parent_dir))
from backend.brain.document_processing import load_pdf, chunk_docs
from utils import QuestionAnswerPair, generate_question
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import InMemoryVectorStore


# load & chunk data 
pdf_path = (
        parent_dir / "backend" / "docs" / "Bitcoin - A Peer-to-Peer Electronic Cash System.pdf"
    )

pages = load_pdf(pdf_path)
docs = chunk_docs(pages)

# create embeddings 
vector_store = InMemoryVectorStore(
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
)
vector_store.add_documents(docs)

# generate questions
qa_pairs = []

for doc in docs[:10]:
    question = generate_question(doc.page_content)
    qa_pairs.append(
        QuestionAnswerPair(
            question=question,
            source_chunk=doc.page_content,
            chunk_index=doc.metadata["page"],
        )
    )

In [6]:
from utils import calculate_metric_avg, evaluate_retrieval

k = 3
retriever = vector_store.as_retriever(k=k)

metrics = evaluate_retrieval(qa_pairs, retriever, k=k)
results = calculate_metric_avg(metrics)

print(metrics)
print(results)

{'precision': [0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.0, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333], 'recall': [1, 1, 1, 1, 0, 1, 1, 1, 1, 1], 'mrr': [1.0, 1.0, 0.3333333333333333, 1.0, 0.25, 1.0, 1.0, 1.0, 1.0, 0.5]}
{'avg_precision': 0.3, 'avg_recall': 0.9, 'avg_mrr': 0.8083333333333332}


# 2. Evaluating Generation

1. Correctness: Response vs. Reference Answer
2. Relevance: Response vs. Input Question
3. Groundedness: Response vs. Retrieved Chunks
4. Retrieval Relevance: Retrieved Chunks vs. Input Question