# Notebook to develop RAG evaluation methods

In [1]:
# Define API endpoints.
# Each value can be overridden with an environment variable (RAG_EVAL_LLM_URL,
# RAG_EVAL_LLM_NAME, RAG_EVAL_MARQO_URL) so the notebook can target a different
# deployment without editing this cell. An unset or empty variable falls back
# to the previously hard-coded default.
import os  # imported here because this cell runs before the shared import cell

LLM_URL = os.environ.get("RAG_EVAL_LLM_URL") or "http://10.103.251.104:8040/v1"
LLM_NAME = os.environ.get("RAG_EVAL_LLM_NAME") or "llama3"
MARQO_URL = os.environ.get("RAG_EVAL_MARQO_URL") or "http://10.103.251.104:8882"
# Old Marqo endpoint; version 1.5
# MARQO_URL="http://10.103.251.100:8882"


In [2]:
# Imports
import marqo
import re
import os
from langchain.text_splitter import (
    CharacterTextSplitter,  # need to install langchain
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
)
from datasets import load_dataset
import pprint
import time
import random
import requests
from components import VectorStore, RagPipe

In [3]:
# Hand-written sample record for trying out the evaluation methods on a
# single RAG element: one question together with its answer, the contexts
# (and their ids) and the ground-truth reference.
example_entry = dict(
    question="What is the capital of France?",
    answer="Paris",
    contexts=[
        "Paris is the capital of France and a major European city.",
        "Paris is located on the River Seine in northern France.",
        "Marseille is known for its art, culture, and landmarks.",
    ],
    context_ids=["1", "2", "3"],
    ground_truth="The capital of France is Paris.",
)

print(example_entry)





{'question': 'What is the capital of France?', 'answer': 'Paris', 'contexts': ['Paris is the capital of France and a major European city.', 'Paris is located on the River Seine in northern France.', 'Marseille is known for its art, culture, and landmarks.'], 'context_ids': ['1', '2', '3'], 'ground_truth': 'The capital of France is Paris.'}


In [5]:
# Evaluate the RAG metrics for a single example element.

# --- 1. Connect to the vector database ---------------------------------------
documentDB = VectorStore(MARQO_URL)  # connect to the Marqo client via the Python API
print(documentDB.getIndexes())  # print all indexes
documentDB.connectIndex("miniwikiindex")  # connect to the index named "miniwikiindex"

# --- 2. Create the pipeline object -------------------------------------------
pipe = RagPipe()
pipe.connectVectorStore(documentDB)
pipe.connectLLM(LLM_URL, LLM_NAME)

# Pull the fields of the example record into individual variables.
# Note: the record key is "context_ids", the variable is `contexts_ids`.
question, contexts, contexts_ids, answer, ground_truth = (
    example_entry[field]
    for field in ("question", "contexts", "context_ids", "answer", "ground_truth")
)

# --- 3. Context relevance ----------------------------------------------------
# Several question/context pairs can be evaluated in one call.
# pipe.evaluate_context_relevance([question], [contexts,contexts], [contexts_ids,contexts_ids])

# --- 4. Faithfulness ---------------------------------------------------------
# I.e. whether the answer is faithful to the context.
# pipe.evaluate_faithfulness(["Marseille is a nice city", "Paris is in the north of France"], [contexts, contexts])

# --- 5. Answer relevance -----------------------------------------------------
# I.e. whether the answer is relevant to the question.
# pipe.evaluate_answer_relevance([question], ["Berlin is the capital of France"])

# --- 6. Answer correctness ---------------------------------------------------
# I.e. whether the answer is correct compared to the ground truth.
# Left as the last expression so that its result is shown as the cell output.
pipe.evaluate_correctness([answer], [ground_truth])
# TODO: this needs more than a sentence-transformer comparison alone:
# either add an LLM judge, or use a ROUGE or BLEU score in addition to the
# semantic sentence similarity.
# E.g. "Paris is the capital of France" and "Paris" should be considered a
# perfect match, but their semantic sentence similarity is only about 0.56.






[{'indexName': 'miniwikiindex'}, {'indexName': 'ait-qm'}]
Index connected: miniwikiindex 
 Language model URL: http://10.103.251.104:8040/v1
 Language model connected: llama3
Answer: Paris
Query: Paris

 Similarity Score = tensor([[1.0000]]) 


[tensor([[1.0000]])]