# RAG Pipeline Evaluation
- Create LLM generated set of Q&A about a document.
- Use an LLM to critique and quality-control these Q&A pairs.
- Run another LLM on the pipeline with these Q&A pairs and study output to validate.

In [None]:
!pip install -q torch transformers transformers sentence-transformers tqdm openpyxl pandas datasets ragatouille

In [None]:
!pip install -q langchain langchain-huggingface langchain-community langchain-huggingface langchain-openai

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets
import os
import re
import torch

In [None]:
# Read the API credentials from the Kaggle secret store instead of hard-coding them
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")  # passed to huggingface_hub.login and to the reader LLM
OPENAI_API_KEY = user_secrets.get_secret("OPENAI_RAG_KEY")  # note: the secret label differs from the variable name

In [None]:
# Authenticate against the Hugging Face Hub non-interactively with the token read above
from huggingface_hub import login
#notebook_login()
login(token=HF_TOKEN)

### Sample Data for Evaluating RAG
- Import and preprocess a document

In [None]:
from langchain.document_loaders import PyPDFLoader, UnstructuredMarkdownLoader, UnstructuredHTMLLoader  # Assumes both loaders exist
#from langchain.docstore.document import Document
from langchain.schema import Document

# Function to write outputs to file
def write_output_to_file(output, filename, out_dir="/kaggle/working/"):
    """Write ``str(output)`` to ``filename`` inside ``out_dir``.

    Args:
        output: Any object; it is converted with ``str()`` before writing.
        filename: Name of the file to create (or overwrite) inside ``out_dir``.
        out_dir: Destination directory. Defaults to the Kaggle working
            directory and is created when it does not exist yet.
    """
    # Ensure the output directory exists
    os.makedirs(out_dir, exist_ok=True)

    # Define the full file path
    file_path = os.path.join(out_dir, filename)

    # Explicit encoding so non-ASCII characters in extracted text do not
    # depend on the platform default
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(str(output))

# Function to clean text (to remove unwanted line breaks within sentences)
def clean_text(text):
    """Turn every isolated newline in ``text`` into a single space.

    A newline counts as isolated when it is neither preceded nor followed by
    another newline, so runs of two or more newlines are left untouched.
    """
    isolated_newline = re.compile(r'(?<!\n)\n(?!\n)')
    return isolated_newline.sub(' ', text)

# Function to load documents based on file type
def load_documents(file_path):
    """Load a PDF, Markdown or HTML file and return its cleaned documents.

    Args:
        file_path: Path of the file to load. The loader is selected from the
            file extension (case-insensitive).

    Returns:
        A list of ``Document`` objects whose ``page_content`` has been passed
        through ``clean_text`` and whose ``metadata`` is carried over from the
        loader output.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.md`` or ``.html``.
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == '.pdf':
        loader = PyPDFLoader(file_path)
        print("Loading PDF document...")
    elif file_extension == '.md':
        loader = UnstructuredMarkdownLoader(file_path)
        print("Loading Markdown document...")
    elif file_extension == '.html':
        loader = UnstructuredHTMLLoader(file_path)
        print("Loading HTML document...")
    else:
        raise ValueError("Unsupported file format. Please provide a PDF, Markdown or HTML file.")

    documents = loader.load()
    # Return the cleaned copies, and keep the loader metadata on them:
    # the Q&A generation step reads metadata["source"].
    return [
        Document(page_content=clean_text(doc.page_content), metadata=doc.metadata)
        for doc in documents
    ]

In [None]:
# Load the source document that the synthetic Q&A pairs will be generated from
file_path = "/kaggle/input/course-bot-data/documents/rbain_syllabus.pdf"  # Change this to the path of your PDF or Markdown file
documents = load_documents(file_path)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Set some params
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
split_docs = text_splitter.split_documents(documents)

# Same chunks under the name used by the Q&A generation cell; copy the list
# instead of running the splitter over every document a second time
docs_processed = list(split_docs)

# Write the chunks to file for manual study
docs = [f"Chunk {i + 1}:\n{doc.page_content}\n" for i, doc in enumerate(split_docs)]

write_output_to_file('\n'.join(docs), 'chunks.txt')

### Create Dataset of Q&A Pairs
- Build synthetic dataset for testing the RAG pipeline
- Mistral AI's Mixtral is a good fit for this; see https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
- It is a high-performing model with 46.7 billion parameters that has been used for synthetic question generation.

In [None]:
# Define LLM to generate synthetic Q&A pairs
from huggingface_hub import InferenceClient

# Hub id of the model used to write (and later critique) the synthetic Q&A pairs
qa_creation_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Remote inference client for that model; timeout is passed as 120 (presumably seconds — confirm)
llm_client = InferenceClient(model=qa_creation_model, timeout=120,)

def call_llm(inference_client: InferenceClient, prompt: str):
    """Send ``prompt`` as a text-generation request and return the generated text.

    The raw response bytes are decoded, parsed as JSON, and the
    ``"generated_text"`` field of the first list element is returned.
    """
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 1000},
        "task": "text-generation",
    }
    raw_response = inference_client.post(json=payload)
    first_result = json.loads(raw_response.decode())[0]
    return first_result["generated_text"]

NOTE: I tried to download the model locally below, but it exceeded maximum allowed disk space on Kaggle. On my local disk, I don't have access to a GPU, which is the best way to make use of it. HuggingFace obviously rate limits these large high performing models, so this part was quite tricky.

In [None]:
# Alternate method, download LLM locally
#from transformers import AutoTokenizer, AutoModelForCausalLM

#qa_creation_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
#tokenizer = AutoTokenizer.from_pretrained(qa_creation_model)
#model = AutoModelForCausalLM.from_pretrained(qa_creation_model)
#model.to("cuda")

#def call_local_llm(prompt: str):
#    inputs = tokenizer(prompt, return_tensors="pt")
#    outputs = model.generate(inputs.input_ids, max_new_tokens=1000)
#    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Below, we write a "factoid" question prompt (see https://www-cs-faculty.stanford.edu/people/mengqiu/publication/LSII-LitReview.pdf) to generate synthetic data. Such questions ask for a specific, concise fact as the answer. This is a straightforward task for an LLM, and for a syllabus it is a better fit than questions that require reasoning or advanced analysis.

In [None]:
# Prompt template for synthetic Q&A generation. `{context}` is filled with one chunk's
# text via str.format; the generation loop parses the reply by splitting on the
# "Factoid question: " and "Answer: " markers defined here.
qa_creation_prompt = """
### Instructions
Your task is to generate a factoid question and its corresponding answer based on the provided context below.
The factoid question should
1. Be answerable with a specific, concise piece of factual information from the context.
2. Be written in a natural, user-friendly style, similar to what users might input in a search engine.
3. Avoid mentioning terms like "context," "passage," or "according to the text."

The answer to the factoid questions should
1. Be short, precise, and derived directly from the context.
2. Avoid adding any information that is not explicitly present in the context.

### Formatting
Provide your response exactly as follows:

Output:::
Factoid question: (insert your factoid question here)
Answer: (insert your answer to the factoid question here)

### Provided Context
Below is the context upon which to base the factoid question and its corresponding answer

Context: {context}\n
Output:::"""

In [None]:
import random
import time
#N_GENERATIONS = 30
# Never ask random.sample for more chunks than exist
N_GENERATIONS = min(30, len(docs_processed))
# Answers of this many characters or more are treated as failed generations
MAX_ANSWER_CHARS = 300

print(f"Generating {N_GENERATIONS} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, N_GENERATIONS)):
    # Generate QA couple
    time.sleep(2)  # pause between requests (presumably to stay under the hosted API rate limit)
    output_QA_couple = call_llm(llm_client, qa_creation_prompt.format(context=sampled_context.page_content))
    #output_QA_couple = call_local_llm(qa_creation_prompt.format(context=sampled_context.page_content))

    if "Factoid question: " not in output_QA_couple or "Answer: " not in output_QA_couple:
        print("Skipping chunk: model output does not follow the expected format")
        continue

    # Split on the LAST occurrence of each marker, and strip the surrounding
    # whitespace so stored questions/answers carry no trailing newlines
    question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0].strip()
    answer = output_QA_couple.split("Answer: ")[-1].strip()

    if not question or not answer:
        print("Skipping chunk: empty question or answer")
        continue
    if len(answer) >= MAX_ANSWER_CHARS:
        print("Skipping chunk: answer is too long")
        continue

    outputs.append(
        {
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            # A missing source should not discard an otherwise valid pair
            "source_doc": sampled_context.metadata.get("source", ""),
        }
    )

In [None]:
# Persist the raw (not yet critiqued) Q&A pairs as CSV in the current working directory for manual review
outputs_df = pd.DataFrame(outputs)
outputs_df.to_csv('generated_qa.csv', index=False)

In [None]:
outputs_df.head()

In [None]:
# Spot-check the first generated pair (requires at least one row in outputs_df)
print(f'Question: {outputs_df.iloc[0,:].question}')
print(f'Answer: {outputs_df.iloc[0,:].answer}')

### Build Critique Models
- Establish some evaluation metrics for Q&A set, create prompts
- Use LLM to evaluate the Q&A and create scores
- Filter our Q&A set quality based on those scores.

#### Metrics for Critiquing Questions
Ref. https://docs.ragas.io/en/latest/concepts/metrics/index.html 
- Groundedness: can the question be answered from the given context?
- Relevance: is the question relevant to users? For instance, "What is the date when transformers 4.29.1 was released?" is not relevant for ML practitioners.
- Stand-alone: is the question understandable free of any context, for someone with domain knowledge/Internet access? The opposite of this would be What is the function used in this article? for a question generated from a specific blog article.
- Faithfulness: number of claims in the generated answer that can be inferred from given context / total number of claims in generated answer
-   Set of claims from generated answer identified
-   Each claim cross checked within the context.
- https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/faithfulness/

In [None]:
# Faithfulness critique prompt, with `{question}` and `{context}` placeholders.
# NOTE(review): currently unused (its call in the critique loop is commented out).
# Before enabling it: the template has no placeholder for the generated answer although
# the instructions refer to one, and it defines a ratio score while asking for a
# rating between 1 and 5.
faithfulness_prompt = """
Your task is to evaluate the **faithfulness** of a question based on the provided context.

### Faithfulness Definition:
Faithfulness is defined as the proportion of claims in the generated answer that can be directly inferred from the context. Specifically:
- **Faithful Claims**: Claims in the answer that are explicitly stated or logically inferable from the given context.
- **Unfaithful Claims**: Claims in the answer that are not supported by the context, including hallucinated or extraneous information.

The faithfulness score is calculated as:
`Faithfulness Score = (Number of Faithful Claims) / (Total Number of Claims in the Answer)`

### Instructions:
1. Carefully review the provided context, question, and generated answer.
2. Identify all individual claims made in the generated answer.
3. For each claim, determine if it is **faithful** (supported by the context) or **unfaithful** (unsupported or hallucinated).
4. Compute the faithfulness score and provide a brief explanation for the score.

### Format:
Provide your response in the following format:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

### Provided Question and Context
Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: 

"""
# Groundedness critique prompt: asks for a 1-5 rating of how unambiguously `{question}`
# can be answered from `{context}`. The critique loop parses the reply by splitting on
# the "Evaluation: " and "Total rating: " markers requested here.
groundedness_prompt = """
### Instructions
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

# Relevance critique prompt: asks for a 1-5 rating of how useful `{question}` is.
# NOTE(review): the target audience named in the prompt is ML developers using the
# Hugging Face ecosystem, while the document loaded in this notebook is a course
# syllabus — consider adapting the audience description to the actual users.
relevance_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

# Stand-alone critique prompt: asks for a 1-5 rating of how understandable `{question}`
# is without any accompanying context.
# NOTE(review): the examples inside the prompt are Hugging Face specific, and
# "independant" is misspelled twice in the prompt text.
standalone_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [None]:
print("Generating critique for each QA couple...")


def _parse_critique(evaluation):
    """Split one critique completion into ``(score, rationale)``.

    The text after the last "Total rating: " marker must start with an
    integer; trailing text after the number is tolerated. The rationale is
    the text following "Evaluation: " in the segment before that marker.

    Raises:
        ValueError: If no integer follows the last "Total rating: " marker.
        IndexError: If either marker is missing from the completion.
    """
    segments = evaluation.split("Total rating: ")
    rationale = segments[-2].split("Evaluation: ")[1]
    rating = re.match(r"\s*(\d+)", segments[-1])
    if rating is None:
        raise ValueError("no integer rating found after 'Total rating: '")
    return int(rating.group(1)), rationale


# Loop over the generated pairs and attach a score and rationale per criterion
for output in tqdm(outputs):
    evaluations = {
        "groundedness": call_llm(llm_client,groundedness_prompt.format(context=output["context"], question=output["question"])),
        "relevance": call_llm(llm_client,relevance_prompt.format(question=output["question"])),
        "standalone": call_llm(llm_client,standalone_prompt.format(question=output["question"])),
        #"faithfulness": call_llm(llm_client, faithfulness_prompt.format(context=output['context'], question=output['question'])),
    }
    for criterion, evaluation in evaluations.items():
        # Parse each criterion on its own so one malformed completion does not
        # discard the scores of the remaining criteria
        try:
            score, rationale = _parse_critique(evaluation)
        except (ValueError, IndexError) as parse_error:
            print(f"{criterion=}")
            print(f"{evaluation=}")
            print(f"Could not parse critique: {parse_error}")
            continue
        output.update({f"{criterion}_score": score,
                       f"{criterion}_eval": rationale,})

In [None]:
# Filter q&A pairs based on critique scores and log
pd.set_option("display.max_colwidth", None)

generated_questions = pd.DataFrame.from_dict(outputs)

# Save before filtering
#generated_questions.to_csv('generated_qa_critique_no_filter.csv', index=False)

# Preview the dataset
print("Evaluation dataset before filtering:")
display(generated_questions[["question","answer","groundedness_score","relevance_score","standalone_score"]].iloc[0:5,:])

# Minimum acceptable score for every criterion. With ratings on the 1-5 scale a
# threshold of 1 effectively only rejects rows whose score is missing; raise it
# for a stricter dataset.
MIN_CRITIQUE_SCORE = 1
score_columns = ["groundedness_score", "relevance_score", "standalone_score"]  # add 'faithfulness_score' once that critique is enabled

# A missing score (NaN) compares as False, so such rows count as invalid
is_valid = (generated_questions[score_columns] >= MIN_CRITIQUE_SCORE).all(axis=1)

# Split on one mask, taken before the frame is overwritten, so every row lands
# in exactly one of the two frames
invalid_questions = generated_questions.loc[~is_valid]
generated_questions = generated_questions.loc[is_valid]

# Save filtered and killed data
generated_questions.to_csv('generated_qa_critique_filtered.csv', index=False)
invalid_questions.to_csv('generated_qa_critique_invalid.csv', index=False)

# Preview filtered dataset
print("============================================")
print("Final evaluation dataset:")
display(generated_questions[["question","answer","groundedness_score","relevance_score","standalone_score"]].iloc[0:5,:])

eval_dataset = datasets.Dataset.from_pandas(generated_questions, split="train", preserve_index=False)

### Import/Build RAG Pipeline for Evaluation
Create methods to:
1. Split documents and preprocess
2. Create vector store and fill with embeddings
3. Build retriever with prompt to get the context and feed to LLM
4. Run tests to validate

In [None]:
# Implement splitting of docs via text_splitter
from transformers import AutoTokenizer

def split_documents(chunk_size: int, knowledge_base: List[Document],tokenizer_name: str,) -> List[Document]:
    """Split ``knowledge_base`` into chunks and drop chunks with repeated text.

    Args:
        chunk_size: Maximum chunk length handed to the splitter; the overlap
            between consecutive chunks is a tenth of it.
        knowledge_base: Documents to split.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``; the
            resulting tokenizer is given to the splitter.

    Returns:
        The chunks in splitting order, keeping only the first chunk for each
        distinct ``page_content``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    all_chunks = []
    for source_doc in knowledge_base:
        all_chunks.extend(splitter.split_documents([source_doc]))

    # Remove duplicates, first occurrence wins
    seen_texts = set()
    unique_chunks = []
    for chunk in all_chunks:
        if chunk.page_content in seen_texts:
            continue
        seen_texts.add(chunk.page_content)
        unique_chunks.append(chunk)

    return unique_chunks

Next, define a method to create or load the embeddings using a FAISS vector store.

In [None]:
# Method to load embeddings and create the vector store
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import os

def load_embeddings(langchain_docs, chunk_size, embedding_model_name: Optional[str] = "thenlper/gte-small",) -> FAISS:
    """Return a FAISS index for a chunk size / embedding model pair, building it if needed.

    The index is stored in a folder under ``./data/indexes/`` whose name is
    derived from ``chunk_size`` and ``embedding_model_name``. If that folder
    exists the index is loaded from it; otherwise ``langchain_docs`` are split
    with ``split_documents``, embedded, saved to the folder and returned.

    NOTE(review): the folder name does not depend on ``langchain_docs``, so an
    existing index is reused even when the documents have changed.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        #model_kwargs={"device": "cuda"},
        encode_kwargs={"normalize_embeddings": True},  # set True to compute cosine similarity
    )

    # One folder per (chunk size, embedding model) combination
    safe_model_name = embedding_model_name.replace('/', '~')
    index_name = f"index_chunk:{chunk_size}_embeddings:{safe_model_name}"
    index_folder_path = f"./data/indexes/{index_name}/"

    if not os.path.isdir(index_folder_path):
        print("Index not found, generating it...")
        chunks = split_documents(chunk_size, langchain_docs, embedding_model_name)
        built_index = FAISS.from_documents(
            chunks,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
            #allow_dangerous_deserialization=True
        )
        built_index.save_local(index_folder_path)
        return built_index

    # The deserialization flag is explicitly enabled for loading from disk;
    # only point this at index folders written by this notebook
    return FAISS.load_local(
        index_folder_path,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
        allow_dangerous_deserialization=True,
    )

In [None]:
# Reader prompt with `{context}` and `{question}` placeholders, filled in answer_with_rag.
# NOTE(review): the <|system|>/<|user|>/<|assistant|> and </s> markers presumably follow
# the Zephyr chat format — confirm they still fit if the reader model is changed.
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

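Next, we'll create a "reader" model that answers the questions from the retrieved context and is used to test our RAG pipeline. It should have mostly standard parameters but a very low temperature, since we want its answers to be reproducible and to stay close to the retrieved context so that they can be evaluated objectively by another LLM.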
Models: 
1. https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
2. https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
3. https://huggingface.co/colbert-ir/colbertv2.0

In [None]:
from langchain_community.llms import HuggingFaceHub
from langchain_huggingface import HuggingFaceEndpoint

# Zephyr 7B beta, published under the HuggingFaceH4 organisation (see repo id)
repo_id = "HuggingFaceH4/zephyr-7b-beta"
READER_MODEL_NAME = "zephyr-7b-beta"  # used in the settings/output file names of the experiment loop

# LLama 3.1 8B Instruct from Meta
#repo_id = 'meta-llama/Llama-3.1-8B-Instruct'
#READER_MODEL_NAME = 'Llama-3.1-8B-Instruct'

# Declare some parameters for the RAG Q&A LLM
reader_model_params = {"max_new_tokens": 512, "top_k": 30,"temperature": 0.1,"repetition_penalty": 1.03,}

# NOTE(review): HuggingFaceHub appears to be deprecated in favour of HuggingFaceEndpoint
# (imported above but unused) — confirm against the installed langchain version.
reader_llm = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation", 
    huggingfacehub_api_token=HF_TOKEN,
    model_kwargs = reader_model_params,
)

In [None]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM

def answer_with_rag(question: str, llm: LLM, knowledge_index: VectorStore, 
                    reranker: Optional[RAGPretrainedModel] = None, num_retrieved_docs: int = 30,
                    num_docs_final: int = 7,) -> Tuple[str, List[str]]:
    """Answer ``question`` with retrieval-augmented generation.

    Args:
        question: The user question.
        llm: Reader model; called through ``invoke`` with the final prompt.
        knowledge_index: Vector store queried with ``similarity_search``.
        reranker: Optional reranker; when given, the retrieved texts are
            reranked and the top ``num_docs_final`` are kept.
        num_retrieved_docs: Number of documents fetched from the vector store.
        num_docs_final: Number of document texts placed in the prompt.

    Returns:
        ``(answer, relevant_docs)`` where ``relevant_docs`` is the list of
        document texts (plain strings) that were put into the prompt.
    """
    # Gather documents with retriever
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in retrieved]  # keep only the text

    # Optionally rerank results using the RAGatouille ColBERT model
    if reranker:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt; documents are newline-separated so one document's
    # text does not run straight into the next "Document i:::" header
    context = "\nExtracted documents:\n"
    context += "\n".join(f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs))

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer (invoke replaces the deprecated direct call llm(prompt))
    answer = llm.invoke(final_prompt)

    return answer, relevant_docs

In [None]:
from langchain_core.language_models import BaseChatModel
# Method to run rag on set of questions
def run_rag_tests(eval_dataset, llm, knowledge_index, output_file,
                  reranker: Optional[RAGPretrainedModel] = None, verbose: Optional[bool] = True,
                  test_settings: Optional[str] = None,):
    """Run the RAG pipeline on every example of ``eval_dataset`` and save the results.

    Results are stored as a JSON list in ``output_file``. The file is rewritten
    after every answered question, and questions already present in it are
    skipped, so an interrupted run can be resumed.

    Args:
        eval_dataset: Iterable of examples with "question", "answer" and
            "source_doc" keys.
        llm: Reader model forwarded to ``answer_with_rag``.
        knowledge_index: Vector store forwarded to ``answer_with_rag``.
        output_file: Path of the JSON results file.
        reranker: Optional reranker forwarded to ``answer_with_rag``.
        verbose: When true, print question, generated answer and true answer.
        test_settings: Optional label stored with every result.
    """
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # No previous run, or an unreadable results file: start from scratch
        outputs = []

    # Set of already answered questions, built once instead of once per example
    answered_questions = {output["question"] for output in outputs}

    # loop over the q&a pairs in the eval dataset
    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index, reranker=reranker)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        # Optionally include the details about the model settings throughout the pipeline
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every example so partial progress survives an interruption
        with open(output_file, "w") as f:
            json.dump(outputs, f)

In [None]:
# Judge prompt with `{instruction}`, `{response}` and `{reference_answer}` placeholders.
# The doubled braces in step 3 are escapes that render as literal braces after formatting.
# The "[RESULT]" marker requested here is what evaluate_answers splits the reply on.
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain.prompts.chat import (ChatPromptTemplate,HumanMessagePromptTemplate,)
from langchain.schema import SystemMessage

# Chat template: a fixed system message followed by the judge prompt as the human message
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."), 
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

#### Choose Judge Agent
Next, we declare a judge agent to evaluate the results and a function to evaluate the Q&A pairs for various metrics like faithfulness. We'll look at:
1. OpenAI GPT 4 1106 (Proprietary) - https://community.openai.com/t/gpt-4-1106-preview-vs-gpt-4/588424
2. FlowAI Judge - https://huggingface.co/flowaicom/Flow-Judge-v0.1-AWQ

In [None]:
# Choose a model for judge agent
#from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline, HuggingFaceEndpoint

# Default judge: OpenAI GPT-4 1106 preview, with temperature 0 for repeatable grading
eval_chat_model = ChatOpenAI(model='gpt-4-1106-preview', temperature=0, openai_api_key=OPENAI_API_KEY)
# Suffix of the eval_score_/eval_feedback_ keys written by evaluate_answers
evaluator_name = 'gpt4'

# Uncomment to use open source much smaller FlowAI judge

#model_kwargs = {'temperature': 0.1, 'repetition_penalty':1.03}
#eval_chat_model = ChatHuggingFace(
#    llm= HuggingFacePipeline.from_model_id(
#        #endpoint_url = "https://api-inference.huggingface.co/models/flowaicom/Flow-Judge-v0.1",
#        model_id='flowaicom/Flow-Judge-v0.1',
#        task='text-generation',
#        model_kwargs=model_kwargs,
#        #temperature=0.1,
#        #reptition_penalty=1.03,
#    )
#)    
#evaluator_name = "flow-judge"


In [None]:
# Make function to evaluate qa pairs
def evaluate_answers(answer_path, eval_chat_model, evaluator_name, 
                     evaluation_prompt_template: ChatPromptTemplate,) -> None:
    """Grade the generated answers stored in ``answer_path`` with a judge model.

    For every entry without an ``eval_score_<evaluator_name>`` key, the judge
    is invoked and its reply is split on the last "[RESULT]" marker into
    feedback and score. Both are stored on the entry (the score as a string)
    and the file is rewritten after each graded entry.

    Args:
        answer_path: JSON file written by ``run_rag_tests``; nothing happens
            if it does not exist.
        eval_chat_model: Chat model exposing ``invoke``.
        evaluator_name: Suffix used in the score/feedback keys.
        evaluation_prompt_template: Template taking ``instruction``,
            ``response`` and ``reference_answer``.
    """
    answers = []
    # Only work on files where answer_path exists (i.e. where the q&a pairs are)
    if os.path.isfile(answer_path):
        with open(answer_path, "r") as f:
            answers = json.load(f)

    for experiment in tqdm(answers):
        # Move on if score is already recorded
        if f"eval_score_{evaluator_name}" in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)

        # rpartition tolerates a repeated marker (the last one wins) and lets us
        # detect a missing one instead of failing on tuple unpacking
        feedback, marker, score = eval_result.content.rpartition("[RESULT]")
        if not marker:
            # Leave the entry ungraded so a later run can retry it
            print(f"No [RESULT] marker in judge output, skipping: {experiment['question']!r}")
            continue

        experiment[f"eval_score_{evaluator_name}"] = score.strip()
        experiment[f"eval_feedback_{evaluator_name}"] = feedback.strip()

        # Checkpoint after every graded entry
        with open(answer_path, "w") as f:
            json.dump(answers, f)

### Loop Over Methods to Test + Finetune
- Loop over hyperparameters and run.
1. Chunk size
2. Embedding models and/or critique/evaluation models
3. Rerank of retrieved contexts or not

In [None]:
# Knowledge base for the experiments: the documents returned by load_documents, before any chunking
RAW_KNOWLEDGE_BASE = documents

In [None]:
# Results folder; no error when it already exists
os.makedirs("./output", exist_ok=True)

for chunk_size in [200]:  # Add other chunk sizes (in tokens) as needed
    for embeddings in ["thenlper/gte-small"]:  # Add other embeddings as needed
        # The index depends only on chunk size and embedding model, so it is
        # loaded/built once here rather than once per rerank setting
        print("Loading knowledge base embeddings...")
        knowledge_index = load_embeddings(
            RAW_KNOWLEDGE_BASE,
            chunk_size=chunk_size,
            embedding_model_name=embeddings,
        )

        for rerank in [True, False]:
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print(f"Running evaluation for {settings_name}:")

            print("Running RAG...")
            reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0") if rerank else None
            run_rag_tests(
                eval_dataset=eval_dataset,
                llm=reader_llm,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=False,
                test_settings=settings_name,
            )

            print("Running evaluation...")
            evaluate_answers(
                output_file_name,
                eval_chat_model,
                evaluator_name,
                evaluation_prompt_template,
            )

### EDA the Results

In [None]:
# Use glob to recursively go through all of the json output files we dumped
import glob

# One DataFrame per results file, tagged with the path of the file it came from
outputs = []
for file in glob.glob("./output/*.json"):
    # Context manager so the file handle is closed right after parsing
    with open(file, "r") as f:
        output = pd.DataFrame(json.load(f))
    output["settings"] = file
    outputs.append(output)

# Make dataframe of output results
result = pd.concat(outputs)

In [None]:
result.head(1)

In [None]:
def _to_int_score(raw):
    """Return the first integer found in a judge score string, or 1 if there is none.

    Non-string values (e.g. missing scores) also map to 1, the lowest rating.
    """
    if isinstance(raw, str):
        found = re.search(r"\d+", raw)
        if found:
            return int(found.group())
    return 1


# NOTE: run this cell once per load of `result` — after normalisation the values are
# floats, which a second run would treat as missing scores.
result["eval_score_gpt4"] = result["eval_score_gpt4"].apply(_to_int_score)
# Map the 1-5 rating onto the 0-1 range
result["eval_score_gpt4"] = (result["eval_score_gpt4"] - 1) / 4

In [None]:
# Mean normalised score per results file, lowest first (sort_values returns a new Series)
average_scores = result.groupby("settings")["eval_score_gpt4"].mean().sort_values()
# Keep the index in the CSV: it holds the settings label belonging to each score
average_scores.to_csv('average_scores.csv')

In [None]:
import matplotlib.pyplot as plt
# Distribution of the normalised judge scores over all result rows
plt.hist(result.eval_score_gpt4)

In [None]:
!zip -r 'rag_eval_results.zip' '/kaggle/working/'