### Experiment: Comparison of RAG models using the RAGAS framework

**Background:**  Generate RAGAS-based measurements for the proposed RAG API candidates.

**Test Approach:** Test RAG Parameters:

* RAG_0

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from dotenv import load_dotenv
# Read environment variables from a .env file (python-dotenv).
load_dotenv()


True

In [3]:
# Common import
from deh.assessment import QASetRetriever
from deh import settings
from deh.eval import generate_experiment_dataset

import pandas as pd
import json
import os
from pathlib import Path

# For RAGAS evaluation
from datasets import Dataset
from ragas import evaluate
import ragas.metrics as metrics
from ragas.run_config import RunConfig

  from .autonotebook import tqdm as notebook_tqdm


#### Test Configuration

In [4]:
# Experiment parameters: sample size, output folder and QA corpus location.
num_samples: int = 100
experiment_folder: str = "../../data/evaluation/rag_api_ragas_metrics/"
qa_data_set_file: str = "../../data/qas/squad_qas.tsv"

# Make sure the output folder exists before later cells write into it.
if not os.path.exists(experiment_folder):
    os.makedirs(experiment_folder, exist_ok=True)

#### Evaluation Model Setup

In [5]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper

# Either local (Ollama) or remote (OpenAI) evaluation models can be used:

# llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4"))
# embedding = OpenAIEmbeddings()


In [6]:
# Local (Ollama) evaluation models; server URL and model names come from project settings.
ollama_url = settings.OLLAMA_URL

embedding = OllamaEmbeddings(
    model=settings.ASSESSMENT_EMBEDDING_MODEL,
    base_url=ollama_url,
)

# Wrap the LangChain Ollama LLM so it can be handed to RAGAS.
evaluator_model = Ollama(
    model=settings.ASSESSMENT_LLM_MODEL,
    base_url=ollama_url,
)
llm = LangchainLLMWrapper(evaluator_model)

  embedding = OllamaEmbeddings(
  llm = LangchainLLMWrapper(Ollama(


#### Sample QA dataset

In [7]:
# Sample question/answer pairs from the QA corpus file.
qa_sampling_args = {
    "file_path": qa_data_set_file,
    "sample_size": num_samples,
}
qa_set = QASetRetriever.get_qasets(**qa_sampling_args)

print(f"{len(qa_set)} questions sampled from QA corpus ({qa_data_set_file})")

100 questions sampled from QA corpus (../../data/qas/squad_qas.tsv)


### NO_RAG Experimentation

#### Response Generation

In [8]:
def api_endpoint(**kwargs) -> str:
    """Build the answer-API URL for the NO_RAG configuration.

    Caller-supplied keyword arguments become query parameters (keeping their
    order). The fixed experiment parameters are then applied on top, so a
    caller-supplied ``h``, ``e``, ``k``, ``t``, ``m`` or ``lp`` is overwritten.

    Args:
        **kwargs: Query parameters supplied by the caller. Values are
            interpolated verbatim, so they must already be URL-encoded.

    Returns:
        The ``http://<API_ANSWER_ENDPOINT>/answer?<query>`` URL as a string.
    """
    # Fixed parameters for this experiment.
    # NOTE(review): the meaning of each short key is defined by the API, not
    # here -- `h` presumably toggles HyDE and `m` selects the model; confirm.
    kwargs.update(
        h=False,
        e=False,
        k=1,
        t=0.5,
        m="llama3.1:8b-instruct-q3_K_L",
        lp=2,    # no context, 10 words
    )

    query_params = "&".join(f"{key}={value}" for key, value in kwargs.items())
    print (query_params)
    return f"http://{settings.API_ANSWER_ENDPOINT}/answer?{query_params}"

def convert(response) -> pd.DataFrame:
    """Flatten one API JSON response into a DataFrame, one row per context entry.

    Each entry of ``response["response"]["context"]`` becomes a row; the
    answer, question, hyde flag and evaluation grade are repeated on every
    row, followed by the reference values and the raw JSON text.
    """
    meta_fields = ["answer", "question", "hyde", ["evaluation", "grade"]]
    context_rows = pd.json_normalize(
        data=response["response"],
        record_path="context",
        meta=meta_fields,
    )

    reference = response["reference"]
    extra_columns = {
        # Reference/evaluation values:
        "reference.ground_truth": reference["ground_truth"],
        "reference.is_impossible": reference["is_impossible"],
        # Full JSON response, in case it is needed later:
        "json": json.dumps(response),
    }
    return context_rows.assign(**extra_columns)

# Query the API for every sampled question and collect the converted responses.
exp_df = generate_experiment_dataset(qa_set, convert, api_endpoint)

# Persist the generated responses so the evaluation can be re-run without
# calling the API again.
no_rag_responses_path = f"{experiment_folder}/no_rag_b1.pkl"
exp_df.to_pickle(no_rag_responses_path)
exp_df.head(1)


Processing 1 of 100 question/answer pairs.
q=What%20is%20the%20power-to-weight%20ratio%20of%20a%20steam%20plant%20compared%20to%20that%20of%20an%20internal%20combustion%20engine%3F&h=False&e=False&k=1&t=0.5&m=llama3.1:8b-instruct-q3_K_L&lp=2
Processing 2 of 100 question/answer pairs.
q=Under%20which%20leader%20did%20the%20Huguenots%20fight%20in%20this%20conflict%3F&h=False&e=False&k=1&t=0.5&m=llama3.1:8b-instruct-q3_K_L&lp=2
Processing 3 of 100 question/answer pairs.
q=When%20did%20this%20attempt%20take%20place%3F&h=False&e=False&k=1&t=0.5&m=llama3.1:8b-instruct-q3_K_L&lp=2
Processing 4 of 100 question/answer pairs.
q=WHy%20was%20the%20Merit%20network%20formed%20in%20Michigan%20&h=False&e=False&k=1&t=0.5&m=llama3.1:8b-instruct-q3_K_L&lp=2
Processing 5 of 100 question/answer pairs.
q=Does%20the%20residential%20architecture%20of%20the%20Tower%20District%20compare%20or%20contrast%20with%20other%20part%20of%20Fresno%3F&h=False&e=False&k=1&t=0.5&m=llama3.1:8b-instruct-q3_K_L&lp=2
Processing

Unnamed: 0,id,page_content,type,metadata.source,metadata.similarity_score,answer,question,hyde,evaluation.grade,reference.ground_truth,reference.is_impossible,json,reference_id
0,,The weight of boilers and condensers generally...,Document,/data/contexts/context_1078.context,0.358998,Internal combustion engines have higher power-...,What is the power-to-weight ratio of a steam p...,False,,lower,False,"{""response"": {""question"": ""What is the power-t...",1


##### RAGAS Evaluation Responses

In [10]:
# Collapse the per-context rows into one record per question (reference_id):
# the contexts are gathered into a list, the other fields take the first value.
responses_df = (
    pd.read_pickle(f"{experiment_folder}/no_rag_b1.pkl")
    .groupby("reference_id")
    .agg(
        retrieved_contexts=("page_content", list),
        question=("question", "first"),
        ground_truth=("reference.ground_truth", "first"),
        answer=("answer", "first"),
    )
)

responses_df.head(1)


Unnamed: 0_level_0,retrieved_contexts,question,ground_truth,answer
reference_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,[The weight of boilers and condensers generall...,What is the power-to-weight ratio of a steam p...,lower,Internal combustion engines have higher power-...


In [None]:
# Single iteration for Exception management

from ragas import evaluate
from ragas.metrics import Faithfulness, FactualCorrectness

results_df = []     # one single-row evaluation DataFrame per successfully evaluated sample
failed_rows = []    # (position, error) for every sample whose evaluation raised

for i in range(len(responses_df)):
    print (f"Processing {i} of {len(responses_df)}")
    # Evaluate one sample at a time so a failure only costs that sample.
    ds = Dataset.from_pandas(responses_df.iloc[i:i+1])

    # Named `eval_metrics` (not `metrics`) so the `ragas.metrics` module alias
    # imported at the top of the notebook is not overwritten.
    eval_metrics = [FactualCorrectness(), Faithfulness()]
    try:
        evaluation_ds = evaluate(
            dataset=ds,
            metrics=eval_metrics,
            llm=llm,
            run_config=RunConfig(
                max_workers=1,
                max_retries=1,
            ),
        )
        results_df.append(evaluation_ds.to_pandas())
    except Exception as exc:
        # Best effort: skip the sample, but record and report it instead of
        # dropping it silently. `Exception` (rather than a bare `except`)
        # lets KeyboardInterrupt stop the loop.
        failed_rows.append((i, repr(exc)))
        print (f"  Skipped sample {i}: {exc!r}")

print (f"Evaluated {len(results_df)} of {len(responses_df)} samples; {len(failed_rows)} skipped.")

if not results_df:
    raise ValueError("No sample was evaluated successfully; nothing to save.")

appended_ds = pd.concat(results_df)
appended_ds.to_pickle( f"{experiment_folder}/no_rag_b1-results.pkl" )

appended_ds[0:5]

Processing 0 of 100


Evaluating: 100%|██████████| 2/2 [00:19<00:00,  9.71s/it]


Processing 1 of 100


Evaluating: 100%|██████████| 2/2 [00:17<00:00,  8.76s/it]


Processing 2 of 100


Evaluating: 100%|██████████| 2/2 [00:19<00:00,  9.94s/it]


Processing 3 of 100


Evaluating: 100%|██████████| 2/2 [00:21<00:00, 10.65s/it]


Processing 4 of 100


Evaluating: 100%|██████████| 2/2 [00:19<00:00,  9.84s/it]


Processing 5 of 100


Evaluating: 100%|██████████| 2/2 [00:15<00:00,  7.71s/it]


Processing 6 of 100


Evaluating: 100%|██████████| 2/2 [00:18<00:00,  9.41s/it]


Processing 7 of 100


Evaluating: 100%|██████████| 2/2 [00:20<00:00, 10.06s/it]


Processing 8 of 100


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt claim_decomposition_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)
Evaluating: 100%|██████████| 2/2 [00:24<00:00, 12.32s/it]


Processing 9 of 100


Evaluating: 100%|██████████| 2/2 [00:17<00:00,  8.97s/it]


Processing 10 of 100


Evaluating: 100%|██████████| 2/2 [00:17<00:00,  8.93s/it]


Processing 11 of 100


Evaluating: 100%|██████████| 2/2 [00:13<00:00,  6.66s/it]


Processing 12 of 100


Evaluating: 100%|██████████| 2/2 [00:17<00:00,  8.64s/it]


Processing 13 of 100


Evaluating: 100%|██████████| 2/2 [00:15<00:00,  7.50s/it]


Processing 14 of 100


Evaluating: 100%|██████████| 2/2 [00:18<00:00,  9.45s/it]


Processing 15 of 100


Evaluating: 100%|██████████| 2/2 [00:15<00:00,  7.89s/it]


Processing 16 of 100


Evaluating: 100%|██████████| 2/2 [00:18<00:00,  9.21s/it]


Processing 17 of 100


Evaluating: 100%|██████████| 2/2 [00:23<00:00, 11.51s/it]


Processing 18 of 100


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt claim_decomposition_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)
Evaluating: 100%|██████████| 2/2 [00:26<00:00, 13.41s/it]


Processing 19 of 100


Evaluating: 100%|██████████| 2/2 [00:17<00:00,  8.62s/it]


Processing 20 of 100


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt claim_decomposition_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)
Evaluating: 100%|██████████| 2/2 [00:31<00:00, 15.52s/it]


Processing 21 of 100


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt n_l_i_statement_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)
Evaluating: 100%|██████████| 2/2 [00:25<00:00, 12.84s/it]


Processing 22 of 100


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt claim_decomposition_prompt failed to parse output: The output parser failed to parse the output including retries.
Exception raised in Job[0]: RagasOutputParserException(The output parser failed to parse the output including retries.)
Evaluating: 100%|██████████| 2/2 [00:19<00:00,  9.97s/it]


Processing 23 of 100


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]Exception raised in Job[0]: ValueError(Chunk too big)
Evaluating: 100%|██████████| 2/2 [00:29<00:00, 14.63s/it]


Processing 24 of 100


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

### RAG_FINAL Experimentation

In [9]:
def api_endpoint(**kwargs) -> str:
    """Build the answer-API URL for the RAG_FINAL configuration.

    Caller-supplied keyword arguments become query parameters (keeping their
    order). The fixed experiment parameters are then applied on top, so a
    caller-supplied ``h``, ``e``, ``k``, ``t``, ``m`` or ``lp`` is overwritten.

    NOTE(review): this redefines the ``api_endpoint`` from the NO_RAG section
    with different fixed values; consider one parameterized builder.

    Args:
        **kwargs: Query parameters supplied by the caller. Values are
            interpolated verbatim, so they must already be URL-encoded.

    Returns:
        The ``http://<API_ANSWER_ENDPOINT>/answer?<query>`` URL as a string.
    """
    # Fixed parameters for this experiment.
    kwargs.update(
        h=False,
        e=False,
        k=2,
        t=0.5,
        m="qwen2.5:7b",
        # NOTE(review): the original comment here ("no context, 10 words") was
        # copied from the NO_RAG cell, where lp=2 -- confirm what lp=0 selects.
        lp=0,
    )

    query_params = "&".join(f"{key}={value}" for key, value in kwargs.items())
    print (query_params)
    return f"http://{settings.API_ANSWER_ENDPOINT}/answer?{query_params}"

def convert(response) -> pd.DataFrame:
    """Flatten one API JSON response into a DataFrame, one row per context entry.

    Each entry of ``response["response"]["context"]`` becomes a row; the
    answer, question, hyde flag and evaluation grade are repeated on every
    row, followed by the reference values and the raw JSON text.

    NOTE(review): duplicates the ``convert`` defined in the NO_RAG section;
    consider defining it once.
    """
    flattened = pd.json_normalize(
        data=response["response"],
        record_path="context",
        meta=["answer", "question", "hyde", ["evaluation", "grade"]],
    )

    reference_values = response["reference"]
    return flattened.assign(
        **{
            # Reference/evaluation values:
            "reference.ground_truth": reference_values["ground_truth"],
            "reference.is_impossible": reference_values["is_impossible"],
            # Full JSON response, in case it is needed later:
            "json": json.dumps(response),
        }
    )

# Query the API for every sampled question and collect the converted responses.
exp_df = generate_experiment_dataset(qa_set, convert, api_endpoint)

# Persist the generated responses so the evaluation can be re-run without
# calling the API again.
rag_final_responses_path = f"{experiment_folder}/rag_final_b1.pkl"
exp_df.to_pickle(rag_final_responses_path)
exp_df.head(1)


Processing 1 of 100 question/answer pairs.
q=What%20is%20the%20power-to-weight%20ratio%20of%20a%20steam%20plant%20compared%20to%20that%20of%20an%20internal%20combustion%20engine%3F&h=False&e=False&k=2&t=0.5&m=qwen2.5:7b&lp=0
Processing 2 of 100 question/answer pairs.
q=Under%20which%20leader%20did%20the%20Huguenots%20fight%20in%20this%20conflict%3F&h=False&e=False&k=2&t=0.5&m=qwen2.5:7b&lp=0
Processing 3 of 100 question/answer pairs.
q=When%20did%20this%20attempt%20take%20place%3F&h=False&e=False&k=2&t=0.5&m=qwen2.5:7b&lp=0
Processing 4 of 100 question/answer pairs.
q=WHy%20was%20the%20Merit%20network%20formed%20in%20Michigan%20&h=False&e=False&k=2&t=0.5&m=qwen2.5:7b&lp=0
Processing 5 of 100 question/answer pairs.
q=Does%20the%20residential%20architecture%20of%20the%20Tower%20District%20compare%20or%20contrast%20with%20other%20part%20of%20Fresno%3F&h=False&e=False&k=2&t=0.5&m=qwen2.5:7b&lp=0
Processing 6 of 100 question/answer pairs.
q=How%20do%20you%20pronounce%20Fresno%3F&h=False&e=F

Unnamed: 0,id,page_content,type,metadata.source,metadata.similarity_score,answer,question,hyde,evaluation.grade,reference.ground_truth,reference.is_impossible,json,reference_id
0,,The weight of boilers and condensers generally...,Document,/data/contexts/context_1078.context,0.358998,Lower due to boiler and condenser weight const...,What is the power-to-weight ratio of a steam p...,False,,lower,False,"{""response"": {""question"": ""What is the power-t...",1


In [None]:
# Collapse the per-context rows into one record per question (reference_id):
# the contexts are gathered into a list, the other fields take the first value.
responses_df = (
    pd.read_pickle(f"{experiment_folder}/rag_final_b1.pkl")
    .groupby("reference_id")
    .agg(
        retrieved_contexts=("page_content", list),
        question=("question", "first"),
        ground_truth=("reference.ground_truth", "first"),
        answer=("answer", "first"),
    )
)

responses_df.head(1)

In [None]:
# Single iteration for Exception management

from ragas import evaluate
from ragas.metrics import Faithfulness, FactualCorrectness

results_df = []     # one single-row evaluation DataFrame per successfully evaluated sample
failed_rows = []    # (position, error) for every sample whose evaluation raised

for i in range(len(responses_df)):
    print (f"Processing {i} of {len(responses_df)}")
    # Evaluate one sample at a time so a failure only costs that sample.
    ds = Dataset.from_pandas(responses_df.iloc[i:i+1])

    # Named `eval_metrics` (not `metrics`) so the `ragas.metrics` module alias
    # imported at the top of the notebook is not overwritten.
    eval_metrics = [FactualCorrectness(), Faithfulness()]
    try:
        evaluation_ds = evaluate(
            dataset=ds,
            metrics=eval_metrics,
            llm=llm,
            run_config=RunConfig(
                max_workers=1,
                max_retries=1,
            ),
        )
        results_df.append(evaluation_ds.to_pandas())
    except Exception as exc:
        # Best effort: skip the sample, but record and report it instead of
        # dropping it silently. `Exception` (rather than a bare `except`)
        # lets KeyboardInterrupt stop the loop.
        failed_rows.append((i, repr(exc)))
        print (f"  Skipped sample {i}: {exc!r}")

print (f"Evaluated {len(results_df)} of {len(responses_df)} samples; {len(failed_rows)} skipped.")

if not results_df:
    raise ValueError("No sample was evaluated successfully; nothing to save.")

appended_ds = pd.concat(results_df)
appended_ds.to_pickle( f"{experiment_folder}/rag_final_b1-results.pkl" )
appended_ds[0:5]

#### View Results

In [23]:
# Reload the stored NO_RAG evaluation results and export them as CSV.
no_rag_results_df = pd.read_pickle(f"{experiment_folder}/no_rag_b1-results.pkl")
no_rag_results_df.to_csv(f"{experiment_folder}/no_rag_b1-results.csv")
# The preview goes last: a notebook cell only renders its final expression.
no_rag_results_df[0:5]


In [25]:
# Reload the stored RAG_FINAL evaluation results and export them as CSV.
rag_final_results_df = pd.read_pickle(f"{experiment_folder}/rag_final_b1-results.pkl")
rag_final_results_df.to_csv(f"{experiment_folder}/rag_final_b1-results.csv")
# The preview goes last: a notebook cell only renders its final expression.
rag_final_results_df[0:5]


In [21]:
# NO_RAG: number of evaluated samples and mean score per RAGAS metric.
no_rag_summary = {
    "Count": len(no_rag_results_df),
    "Factual_Correctness": no_rag_results_df["factual_correctness"].mean(),
    "Faithfulness": no_rag_results_df["faithfulness"].mean(),
}
no_rag_report = "\n".join(f"{label}: {value}" for label, value in no_rag_summary.items())
print("\n" + no_rag_report + "\n")


Count: 51
Factual_Correctness: 0.42058823529411754
Faithfulness: 0.4852941176470588



In [22]:
# RAG_FINAL: number of evaluated samples and mean score per RAGAS metric.
rag_final_summary = {
    "Count": len(rag_final_results_df),
    "Factual_Correctness": rag_final_results_df["factual_correctness"].mean(),
    "Faithfulness": rag_final_results_df["faithfulness"].mean(),
}
rag_final_report = "\n".join(f"{label}: {value}" for label, value in rag_final_summary.items())
print("\n" + rag_final_report + "\n")


Count: 20
Factual_Correctness: 0.48250000000000004
Faithfulness: 0.7

