# Notebook for developing RAG evaluation methods

In [3]:
# Define API endpoints.
# Every value can be overridden with an environment variable of the same name,
# so the notebook can be pointed at another deployment without editing this
# cell. The defaults are the endpoints this notebook was developed against.
import os  # imported here because this cell comes before the notebook's import cell

# Base URL of the LLM service; the recorded run output shows queries being
# sent to "<LLM_URL>/chat/completions".
LLM_URL = os.environ.get("LLM_URL", "http://10.103.251.104:8040/v1")
# Model name passed to the pipeline together with LLM_URL.
LLM_NAME = os.environ.get("LLM_NAME", "llama3")
# URL of the Marqo instance used as vector store.
MARQO_URL = os.environ.get("MARQO_URL", "http://10.103.251.104:8882")
# Old Marqo endpoint; version 1.5
# MARQO_URL="http://10.103.251.100:8882"


In [2]:
# Imports
import marqo
import re
import os
from langchain.text_splitter import (
    CharacterTextSplitter,  # need to install langchain
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
)
from datasets import load_dataset
import pprint
import time
import random
import requests
from components import VectorStore, RagPipe, DatasetHelpers

In [None]:
# Hand-written sample of a single RAG record: the question, the generated
# answer, the retrieved contexts with their ids, and the reference answer.
example_entry = dict(
    question="What is the capital of France?",
    answer="Paris",
    contexts=[
        "Paris is the capital of France and a major European city.",
        "Paris is located on the River Seine in northern France.",
        "Marseille is known for its art, culture, and landmarks.",
    ],
    context_ids=["1", "2", "3"],
    ground_truth="The capital of France is Paris.",
)

print(example_entry)



In [4]:
# End-to-end run in a few lines: dataset -> vector store -> RAG pipeline

# Dataset: corpus, queries and reference answers come from the helper class
dataset = DatasetHelpers()
corpus_list, queries, ground_truths = dataset.loadMiniWiki()

# Vector store: open the Marqo client (Python API) and pick the index to use
documentDB = VectorStore(MARQO_URL)
available_indexes = documentDB.getIndexes()
print(available_indexes)  # list every index known to the store
documentDB.connectIndex("miniwikiindex")

# Pipeline: wire the vector store and the language model together
pipe = RagPipe()
pipe.connectVectorStore(documentDB)
pipe.connectLLM(LLM_URL, LLM_NAME)

# Run the pipeline on a small sample. With newIngest=False the recorded run
# reported "Using already indexed documents" instead of ingesting again.
pipe.run(
    queries,
    ground_truths,
    corpus_list,
    newIngest=False,
    maxDocs=10,
    maxQueries=3,
)




Loading MiniWiki dataset
[{'indexName': 'miniwikiindex'}, {'indexName': 'ait-qm'}]
Index connected: miniwikiindex 
 Language model URL: http://10.103.251.104:8040/v1
 Language model connected: llama3
Using already indexed documents
Index Stats:  {'numberOfDocuments': 10, 'numberOfVectors': 15, 'backend': {'memoryUsedPercentage': 0.08814318706999999, 'storageUsedPercentage': 30.90243340992}}
Start answering queries. Please wait. 
Current Question: Was Abraham Lincoln the sixteenth President of the United States?
Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...
Current Question: Did Lincoln sign the National Banking Act of 1863?
Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...
Current Question: Did his mother die of pneumonia?
Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...


In [7]:
# Evaluate the RAG pipeline with the "correctness" method.
# In the recorded output this prints, for each query, the answer, the ground
# truth and a similarity score, and the cell's value is the list of scores.
# NOTE(review): in that output a correct long-form answer scored only ~0.13
# against the ground truth "yes", so these scores should not be read as
# accuracy on their own.
pipe.eval(method="correctness")

Answer: A very easy one!

Yes, Abraham Lincoln was indeed the 16th President of the United States! He served from March 1861 until his assassination in April 1865.
Ground truth: yes





 Similarity Score = tensor([[0.1328]]) 
Answer: No, Abraham Lincoln did not sign the National Banking Act of 1863.

The National Banking Act was actually signed into law by President Andrew Johnson on February 25, 1865. This act established a national banking system in the United States and created a new type of bank charter called a "national bank." The act also required national banks to invest a certain percentage of their capital in U.S. government securities.

Abraham Lincoln was assassinated on April 14, 1865, and died the next morning, so he did not have an opportunity to sign this legislation into law.
Ground truth: yes

 Similarity Score = tensor([[0.1200]]) 
Answer: I apologize, but this prompt doesn't seem to be related to the previous conversation about Uruguay. Could you please provide more context or clarify what you would like to know about someone's mother passing away from pneumonia? I'll do my best to help!
Ground truth: no

 Similarity Score = tensor([[0.0170]]) 


[tensor([[0.1328]]), tensor([[0.1200]]), tensor([[0.0170]])]

In [None]:
# Get data from an example dataset
# Code to prepare dataset
def prepare_mini_wiki(corpus, chunking_params=None):
    """Convert the mini-wiki corpus into a list of per-passage records.

    This function only reshapes the data; it does not upload anything and
    does not chunk the passages.

    Args:
        corpus: Object indexable by the keys "passage" and "id", each giving
            an iterable; the two are paired positionally.
        chunking_params: Currently unused. Kept (now optional) so existing
            two-argument calls continue to work.

    Returns:
        A list of dicts of the form {"text": <passage>, "id": <id>}, in
        corpus order. If the two iterables differ in length, the surplus
        entries of the longer one are ignored (zip semantics).
    """
    return [
        {"text": passage, "id": passage_id}
        for passage, passage_id in zip(corpus["passage"], corpus["id"])
    ]

# Chunking settings handed to prepare_mini_wiki
chunking_params = dict(chunk_size=1024, chunk_overlap=128, chunk_method="recursive")

# Text corpus: take the "passages" split and turn it into a list of records
mini_wiki_corpus = load_dataset("rag-datasets/mini_wikipedia", "text-corpus")["passages"]
corpus_list = prepare_mini_wiki(mini_wiki_corpus, chunking_params)
print(corpus_list[0])

# Question-answer data: keep only the first 5 rows of the "test" split
qa_test_split = load_dataset("rag-datasets/mini_wikipedia", "question-answer")["test"]
mini_wiki_qa = qa_test_split[0:5]
print(mini_wiki_qa)

# Pull out the question and answer columns; the answers serve as ground truths
questions = mini_wiki_qa["question"]
ground_truths = mini_wiki_qa["answer"]
print(questions)
print(ground_truths)





In [None]:
##
## 1. Connect to Vector database
##

documentDB = VectorStore(MARQO_URL) # Connect to marqo client via python API
print(documentDB.getIndexes()) # Print all indexes
documentDB.connectIndex("miniwikiindex") # Connect to the index named "miniwikiindex"
# NOTE(review): emptyIndex() presumably removes everything currently stored in
# the connected index before the re-indexing below - confirm before running
# this cell against an index whose contents are still needed.
documentDB.emptyIndex()


##
## 2. Index Documents
##
# This step needs prepare_mini_wiki and chunking_params, which are defined in
# the dataset preparation cell earlier in the notebook; run that cell first.
mini_wiki_corpus = load_dataset("rag-datasets/mini_wikipedia", "text-corpus")["passages"]
corpus_list = prepare_mini_wiki(mini_wiki_corpus, chunking_params)
print(corpus_list[0])

# Value passed as the maxDocs argument of indexDocuments
maxDocs = 1000
documentDB.indexDocuments(documents=corpus_list, maxDocs=maxDocs)
print(documentDB.getIndexStats())


In [None]:
# Create the elements for the RAG evaluation: each starts with a question and
# its ground truth; the contexts and the LLM answer are then filled in by the
# RAG pipeline.

# Create pipeline object
pipe = RagPipe()
pipe.connectVectorStore(documentDB)
pipe.connectLLM(LLM_URL, LLM_NAME)


# Get question - answer - (passages) dataset
mini_wiki_qa = load_dataset("rag-datasets/mini_wikipedia", "question-answer")["test"][0:5] # Load the first 5 question-answer-id triples

# Take the questions and ground truths from the slice loaded in this cell, so
# the cell does not depend on variables left behind by an earlier cell.
questions = mini_wiki_qa["question"]
ground_truths = mini_wiki_qa["answer"]

# Create a list of dictionaries with keys: question, answer, contexts, context_ids, ground_truth
rag_elements = [
    {"question": question, "answer": "", "contexts": [], "context_ids": [], "ground_truth": ground_truth}
    for question, ground_truth in zip(questions, ground_truths)
]

# Iterate over the rag elements and get the answer from the LLM model and the contexts from the Vector DB
for rag_element in rag_elements:
    llmanswer, contexts, context_ids = pipe.answerQuery(rag_element["question"])
    rag_element["answer"] = llmanswer
    rag_element["contexts"] = contexts
    rag_element["context_ids"] = context_ids
    



In [None]:
# Dump every RAG element: the raw dict first, then its fields in readable form.
for rag_element in rag_elements:
    print(rag_element)
    for label, field in (
        ("Question: ", "question"),
        ("Answer: ", "answer"),
        ("Ground Truth: ", "ground_truth"),
    ):
        print(label, rag_element[field])
    print("Contexts: ")
    # Contexts and their ids are paired positionally.
    context_pairs = zip(rag_element["contexts"], rag_element["context_ids"])
    for context, context_id in context_pairs:
        print("Context ID: ", context_id)
        print("Context: ", context)
    print("\n\n\n")

In [None]:
# Evaluate context relevance for a single element

## 
## 1. Connect to Vector database
## 

#documentDB = VectorStore(MARQO_URL) # Connect to marqo client via python API
#print(documentDB.getIndexes()) # Print all indexes
#documentDB.connectIndex("miniwikiindex") # Connect to index with name miniWikiIndex

##
## Create pipeline object 
## 

#pipe = RagPipe()
#pipe.connectVectorStore(documentDB)
#pipe.connectLLM(LLM_URL, LLM_NAME)

# question = example_entry["question"]
# contexts = example_entry["contexts"]
# contexts_ids = example_entry["context_ids"]
# answer = example_entry["answer"]
# ground_truth = example_entry["ground_truth"]

##
## Evaluate example context relevance. Possible to evaluate multiple questions-contexts at once
##

# Score the retrieved contexts of every RAG element, one element per call.
for rag_element in rag_elements:
    question, contexts, contexts_ids = (
        rag_element[field] for field in ("question", "contexts", "context_ids")
    )
    # Each argument is wrapped in a one-item list, so a single
    # question/contexts pair is submitted per call.
    scores = pipe.evaluate_context_relevance([question], [contexts], [contexts_ids])
    print("Context Relevance Scores: ", scores)
    
# pipe.evaluate_context_relevance([question], [contexts,contexts], [contexts_ids,contexts_ids])


## 
## Evaluate faithfulness. I.e. If answer is faithful to the context
##

# pipe.evaluate_faithfulness(["Marseille is a nice city", "Paris is in the north of France"], [contexts, contexts])

## 
## Evaluate answer relevance. I.e. If answer is relevant to the question
##
 
# pipe.evaluate_answer_relevance([question], ["Berlin is the capital of France"])


##
## Evaluate answer correctness. I.e. If answer is correct compared to ground truth
##

# pipe.evaluate_correctness([answer], [ground_truth]) 
# Correctness needs more than a sentence-transformer similarity comparison:
# either add an LLM judge, or use a ROUGE or BLEU score in addition to the semantic sentence similarity.
# E.g. "Paris is the capital of France" and "Paris" should be considered a perfect match,
# but their semantic sentence similarity is only about 0.56.

