# RAG pipeline and RAGAs evaluation
This notebook builds a minimal RAG system over the abstract of 'Attention Is All You Need' and evaluates it using RAGAs on faithfulness, answer relevancy, context precision, and context recall.

In [None]:
# Setup: imports and environment check
import os
import pandas as pd
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas.llms import LangchainLLM
from ragas.embeddings import LangchainEmbeddings

from src.pipeline import (
    load_and_chunk_document,
    get_embeddings,
    create_faiss_vectorstore,
    get_default_llm,
    make_rag_chain,
    get_contexts_for_questions,
)

# Path (relative to the working directory) of the text file the notebook reads.
# Stop immediately if it is not there.
DATA_PATH = 'data/raw/attention_is_all_you_need.txt'
assert os.path.exists(DATA_PATH), f'Expected data at {DATA_PATH} not found'

# GROQ_API_KEY must be present in the environment; an unset or empty value is rejected.
if os.environ.get('GROQ_API_KEY') in (None, ''):
    raise RuntimeError('Please set GROQ_API_KEY in your environment before running this notebook.')

print('Environment OK. Proceeding...')


## Build RAG chain
We load and chunk the document, build a FAISS vector store with bge-small-en-v1.5 embeddings, and create a simple RAG chain backed by a Llama 3 model served through Groq.

In [None]:
# Chunking parameters forwarded to the loader.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 60

# Step 1: read the document at DATA_PATH and split it into chunks.
docs = load_and_chunk_document(
    DATA_PATH,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
print(f'Loaded and split into {len(docs)} chunks')

# Step 2: embed the chunks; the helper returns both the vector store and a retriever.
embeddings = get_embeddings()
vectorstore, retriever = create_faiss_vectorstore(docs, embeddings)

# Step 3: combine the retriever with the default LLM into the RAG chain.
llm = get_default_llm()
rag_chain = make_rag_chain(retriever, llm)
print('RAG chain ready.')


## Create a small evaluation set
We define questions and hand-written reference answers (ground truth) based on the abstract. The contexts column is populated by retrieving the top-k chunks (k=4) for each question.

In [None]:
# Evaluation set: each entry is (question, reference answer).
# Keeping the two together means inserting, removing, or reordering an entry
# can never leave a question paired with the wrong reference answer.
eval_pairs = [
    (
        'What new architecture does the paper propose?',
        'The Transformer, a simple architecture based solely on attention mechanisms.',
    ),
    (
        'Which types of neural networks does the Transformer dispense with?',
        'Recurrent and convolutional neural networks.',
    ),
    (
        'What BLEU score does the model achieve on WMT 2014 English-to-German?',
        '28.4 BLEU on the WMT 2014 English-to-German task.',
    ),
    (
        'What BLEU score does it achieve on WMT 2014 English-to-French?',
        '41.8 BLEU on the WMT 2014 English-to-French task.',
    ),
    (
        'How long did training take and on what hardware?',
        'About 3.5 days of training on eight GPUs.',
    ),
    (
        'Does the Transformer generalize to other tasks? If so, which and with what score?',
        'Yes. It generalizes to English constituency parsing, achieving 92.7 F1 with pretraining.',
    ),
    (
        'Why are the proposed models faster to train?',
        'They are more parallelizable, leading to significantly less training time.',
    ),
]

# Parallel lists (same order) under the names the rest of the notebook uses.
questions = [question for question, _reference in eval_pairs]
ground_truth = [reference for _question, reference in eval_pairs]

# Number of chunks requested per question from the vector store.
RETRIEVAL_K = 4
contexts = get_contexts_for_questions(vectorstore, questions, k=RETRIEVAL_K)

# These three sequences become columns of a single table, so they must have
# the same length; check here, where a mismatch is easiest to diagnose.
assert len(contexts) == len(questions) == len(ground_truth), (
    f'Evaluation columns are misaligned: {len(questions)} questions, '
    f'{len(ground_truth)} reference answers, {len(contexts)} context lists'
)
print(f'Prepared contexts for {len(questions)} questions.')


## Generate answers with the RAG chain

In [None]:
# One input dict per question, all sent through the chain in a single batch call.
inputs = [{'question': text} for text in questions]
answers = rag_chain.batch(inputs)

# Show only the first two question/answer pairs as a quick sanity check.
PREVIEW_COUNT = 2
divider = '-'*60
preview_pairs = list(zip(questions, answers))[:PREVIEW_COUNT]
for q, a in preview_pairs:
    print('Q:', q)
    print('A:', a)
    print(divider)


## Evaluate with RAGAs
We evaluate on four metrics: faithfulness, answer_relevancy, context_precision, context_recall.

In [None]:
# Assemble the evaluation table from the questions, generated answers,
# retrieved contexts, and reference answers, then convert it to a Dataset.
data = {
    'question': questions,
    'answer': answers,
    'contexts': contexts,
    'ground_truth': ground_truth,
}
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# Metrics to compute, plus their names for selecting the score columns afterwards.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
metric_names = [metric.name for metric in metrics]

# Wrap the pipeline's own LLM and embeddings so the evaluation uses the same models.
# NOTE(review): these wrapper class names differ between ragas releases —
# confirm they match the installed version.
ragas_llm = LangchainLLM(llm=llm)
ragas_embeddings = LangchainEmbeddings(embeddings=embeddings)

result = evaluate(
    dataset=dataset,
    metrics=metrics,
    llm=ragas_llm,
    embeddings=ragas_embeddings,
)

# Convert the result to a DataFrame and average each metric column.
scores_df = result.to_pandas()
per_metric_means = scores_df[metric_names].mean()
mean_scores = per_metric_means.to_dict()

print('RAGAs metrics (mean over eval set):')
for k, v in mean_scores.items():
    print(f'- {k}: {v:.3f}')
