# Notebook to develop RAG evaluation methods

In [1]:
# API endpoints used throughout the notebook.
# LLM_URL is the base URL of the chat-completion server; the judge helpers
# further down append "/chat/completions" to it.
LLM_URL="http://10.103.251.104:8040/v1"
# Model name sent in the "model" field of every chat-completion request.
LLM_NAME="mixtral"
# Marqo endpoint handed to VectorStore when the notebook connects to the index.
MARQO_URL="http://10.103.251.104:8882"
# Old Marqo endpoint; version 1.5
# MARQO_URL="http://10.103.251.100:8882"


In [2]:
# Imports
import marqo
import re
import os
from langchain.text_splitter import (
    CharacterTextSplitter,  # need to install langchain
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
)
from datasets import load_dataset
import pprint
import time
import random
import requests
from components import VectorStore, RagPipe


### Create or Connect to Vector Store

In [3]:
# Set Index Settings, docs: https://docs.marqo.ai/2.5/API-Reference/Indexes/create_index/
# Single source of truth for the index name. It equals the name that
# getIndexes() reports for the mini-wiki index; the previous value
# ("mini_wiki_index") was never used and did not match the connected index.
INDEX_NAME = "miniwikiindex"
index_params = {
    "split_method": "sentence",
    "distance_metric": "prenormalized-angular",
    "model": "hf/all_datasets_v4_MiniLM-L6",
    #"model" : 'flax-sentence-embeddings/all_datasets_v4_mpnet-base',
}

documentDB = VectorStore(MARQO_URL) # Connect to marqo client via python API
# One-off maintenance calls, intentionally left commented out so that a full
# re-run of the notebook never creates or deletes an index by accident.
#documentDB.createIndex("miniWikiIndex", index_params) # Create index with name miniWikiIndex
#documentDB.deleteIndex("mini_wiki_index") # Delete index with name mini_wiki_index
print(documentDB.getIndexes()) # Print all indexes
documentDB.connectIndex(INDEX_NAME) # Connect to the index named by INDEX_NAME

[{'indexName': 'miniwikiindex'}, {'indexName': 'ait-qm'}]
Index connected: miniwikiindex 


### Prepare Dataset 

In [4]:
# Code to prepare dataset
def prepare_mini_wiki(corpus, chunking_params):
    """Turn the column-oriented mini-wiki corpus into a list of records.

    Args:
        corpus: Mapping with the keys "passage" and "id"; both values are
            iterated in parallel and paired by position.
        chunking_params: Accepted as part of the signature but not read by
            this function.

    Returns:
        A list with one dict per (passage, id) pair, shaped as
        {"text": <passage>, "id": <id>}.
    """
    texts = corpus["passage"]
    identifiers = corpus["id"]
    return [
        {"text": text, "id": identifier}
        for text, identifier in zip(texts, identifiers)
    ]

# Prepare the mini wiki corpus
# NOTE(review): prepare_mini_wiki() accepts chunking_params but never reads
# it, so these values currently have no effect on the uploaded passages.
chunking_params = {
    "chunk_size": 1024,
    "chunk_overlap": 128,
    "chunk_method": "recursive",
}


# Load the "text-corpus" configuration and convert its "passages" split into
# a list of {"text", "id"} records.
mini_wiki_corpus = load_dataset("rag-datasets/mini_wikipedia", "text-corpus")
passages = mini_wiki_corpus["passages"]
corpus_list = prepare_mini_wiki(passages, chunking_params)
# Show the first record as a sanity check.
print(corpus_list[0])

{'text': 'Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.', 'id': 0}


### Index Documents

In [5]:
# Index the prepared passages and time the upload.
upload_start = time.time()
# Passed through to indexDocuments(); presumably caps the number of uploaded
# passages (the log line below assumes so) — confirm in VectorStore.
maxDocs = 100
documentDB.indexDocuments(documents=corpus_list, maxDocs=maxDocs)
upload_end = time.time()
print(f"Time taken to upload {min(len(corpus_list), maxDocs)} passages: {upload_end - upload_start} seconds")
# Check if index contains data
# NOTE(review): the recorded stats report 200 documents after a 100-passage
# upload, so the index presumably already held documents from an earlier
# run — confirm whether re-running this cell adds duplicates.
print(documentDB.getIndexStats())

Time taken to upload 100 passages: 7.095343112945557 seconds
{'numberOfDocuments': 200, 'numberOfVectors': 334, 'backend': {'memoryUsedPercentage': 0.08232219977, 'storageUsedPercentage': 30.8928638317}}


Great, now we have the documents indexed. Next, we retrieve passages based on a query.
Indexing 3200 short passages takes about 4 minutes.

### Get questions and answers from dataset

In [6]:
# Get question - answer - (passages) dataset
mini_wiki_qa = load_dataset("rag-datasets/mini_wikipedia", "question-answer")
# Keep only the first 5 question-answer-id triples of the "test" split.
# From here on the name refers to the sliced result (a dict of lists in the
# recorded output), no longer to the object returned by load_dataset().
mini_wiki_qa = mini_wiki_qa["test"][0:5]
print(mini_wiki_qa)
# Parallel lists: questions[i] belongs to answers[i].
questions = mini_wiki_qa["question"]
answers = mini_wiki_qa["answer"]
print(questions)
print(answers)

{'question': ['Was Abraham Lincoln the sixteenth President of the United States?', 'Did Lincoln sign the National Banking Act of 1863?', 'Did his mother die of pneumonia?', "How many long was Lincoln's formal education?", 'When did Lincoln begin his political career?'], 'answer': ['yes', 'yes', 'no', '18 months', '1832'], 'id': [0, 2, 4, 6, 8]}
['Was Abraham Lincoln the sixteenth President of the United States?', 'Did Lincoln sign the National Banking Act of 1863?', 'Did his mother die of pneumonia?', "How many long was Lincoln's formal education?", 'When did Lincoln begin his political career?']
['yes', 'yes', 'no', '18 months', '1832']


### Create Pipeline and connect to vector store

In [7]:
# Create the pipeline object and connect it to the vector store and the LLM.
pipe = RagPipe()
pipe.connectVectorStore(documentDB)
pipe.connectLLM(LLM_URL, LLM_NAME)
# Smoke test. This is the last expression of the cell, so the returned answer
# is displayed in addition to whatever the pipeline prints itself.
pipe.answerQuery("What is the capital of Uruguay?")

 Language model URL: http://10.103.251.104:8040/v1
 Language model connected: mixtral
["Montevideo, Uruguay's capital.",
 "Montevideo, Uruguay's capital.",
 'Map of Uruguay']
Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...
Response: 

(' The capital of Uruguay is Montevideo. It is the largest city in the country '
 'and serves as the political, economic, and cultural hub of Uruguay. '
 'Montevideo is known for its vibrant cultural scene, beautiful beaches, and '
 'historic architecture, including the iconic Salvo Palace and the '
 'Metropolitan Cathedral. The city has a rich history, having been founded in '
 "1726 by Spanish settlers, and it played an important role in the country's "
 'struggle for independence from Spain in the early 19th century. Today, '
 'Montevideo is a popular tourist destination and a thriving center of '
 'commerce and industry in Uruguay.')


" The capital of Uruguay is Montevideo. It is the largest city in the country and serves as the political, economic, and cultural hub of Uruguay. Montevideo is known for its vibrant cultural scene, beautiful beaches, and historic architecture, including the iconic Salvo Palace and the Metropolitan Cathedral. The city has a rich history, having been founded in 1726 by Spanish settlers, and it played an important role in the country's struggle for independence from Spain in the early 19th century. Today, Montevideo is a popular tourist destination and a thriving center of commerce and industry in Uruguay."

In [8]:
# Test cell: inspect the connected index.
# Only the last expression of a cell is displayed, so the value returned by
# this getIndexStats() call is discarded.
documentDB.getIndexStats()
# Per the original note this deletes all docs in the index; it is commented
# out, so nothing is deleted when the cell runs.
#documentDB.emptyIndex()
documentDB.getIndexSettings()

{'type': 'unstructured',
 'treatUrlsAndPointersAsImages': False,
 'filterStringMaxLength': 20,
 'model': 'hf/all_datasets_v4_MiniLM-L6',
 'normalizeEmbeddings': True,
 'textPreprocessing': {'splitLength': 2,
  'splitOverlap': 0,
  'splitMethod': 'sentence'},
 'imagePreprocessing': {},
 'vectorNumericType': 'float',
 'annParameters': {'spaceType': 'prenormalized-angular',
  'parameters': {'efConstruction': 512, 'm': 16}}}

In [9]:
# 1. Analyse Search Results - CONTEXT RELEVANCE
def evaluate_context_relevance(queries, dataBase , k, goldPassages=None, ):
    """Retrieve contexts for each query and record a placeholder score per hit.

    Args:
        queries: Iterable of query strings; each one becomes a key of the
            returned dict.
        dataBase: Object exposing ``retrieveDocuments(query=..., k=...)`` whose
            result has a "hits" list of dicts with "text" and "id" entries.
        k: Forwarded unchanged to ``retrieveDocuments``.
        goldPassages: Not used yet.

    Returns:
        dict mapping every query to a list with one measurement per hit. Until
        a real relevance metric is plugged in, the measurement is the hit's
        "id" value.
    """
    scores = {}
    for query in queries:
        hits = dataBase.retrieveDocuments(query=query, k=k)["hits"]
        # Collect both columns up front so a malformed hit fails before
        # anything is printed for this query.
        hit_texts = [hit["text"] for hit in hits]
        hit_ids = [hit["id"] for hit in hits]

        per_hit = []
        for doc_id, passage in zip(hit_ids, hit_texts):
            print(f"ID: {doc_id}")
            print(f"Context: {passage}")
            # Placeholder: replace with the actual context-relevance measure.
            per_hit.append(doc_id)

        scores[query] = per_hit

    return scores


# Two sample queries; each is sent to the connected index with k=3.
queries = ["What is the capital of Uruguay?", "Where is Washington?"]
evaluate_context_relevance(queries, documentDB, 3)

ID: 36
Context: Montevideo, Uruguay's capital.
ID: 36
Context: Montevideo, Uruguay's capital.
ID: 28
Context: Map of Uruguay
ID: 80
Context: Michael Faraday was born in Newington Butts, near present-day South London, England. His family was not well off. His father, James, was a member of the Sandemanian sect of Christianity. James Faraday had come to London ca 1790 from Outhgill in Westmorland, where he had been the village blacksmith. The young Michael Faraday, one of four children, having only the most basic of school educations, had to largely educate himself. "Michael Faraday."  History of Science and Technology. Houghton Mifflin Company, 2004. Answers.com 4 June 2007.  /ref> At fourteen he became apprenticed to a local bookbinder and bookseller George Riebau and, during his seven-year apprenticeship, he read many books, including Isaac Watts' The Improvement of the Mind, and he enthusiastically implemented the principles and suggestions contained therein. He developed an interest

{'What is the capital of Uruguay?': [36, 36, 28],
 'Where is Washington?': [80, 80, 2]}

In [24]:
# Test the LLM judge on rating the relevance of a context.
# `context` states the capital explicitly.
context = ("Uruguay's capital, Montevideo, was founded by the Spanish in the early 18th century "
"as a military stronghold; its natural "
"harbor soon developed into a commercial center competing with Argentina's capital, Buenos Aires.")
# `context1` only mentions the largest city, not the capital.
context1 = "Urugays largest city is Montevideo"
query = "What is the capital of Uruguay?"

def llm_binary_context_relevance(context, query, LLM_NAME, LLM_URL):
    """Ask the LLM judge whether `context` is sufficient to answer `query`.

    Sends one chat-completion request to ``LLM_URL + "/chat/completions"``
    (temperature 0, at most 2 output tokens) and prints progress messages and
    the result.

    Args:
        context: Passage to be rated.
        query: Question the passage should be able to answer.
        LLM_NAME: Value of the "model" field of the request.
        LLM_URL: Base URL of the chat-completion server.

    Returns:
        The message content of the first choice, returned unmodified (the
        prompt asks for "0" or "1"); the string "No result generated!" when the
        "choices" list is empty; or the whole decoded JSON body when it has no
        "choices" key.
    """
    messages = [
        {"role": "system", "content": "Given the following context and query,"
         " Give a binary rating, either 0 or 1."
         " 0 means the context is not sufficient for answering the query. "
         " 1 means the context is sufficient for answering the query. "
         ".Respond with a single integer and give no additional explaination. "
         'The output must strictly be "0" or "1"'},
        # Rate the `context` argument. An earlier version interpolated the
        # notebook-level variable `context1` here and ignored the argument.
        {"role": "user", "content": f"Context: {context} ; Query: {query}"},
    ]

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer N/A ",
    }

    data = {
        "model": LLM_NAME,
        "messages": messages,
        "temperature": 0,  # deterministic rating
        "max_tokens": 2,   # the judge must answer with a single digit
        # "presence_penalty": presence_pen,
        # "repeat_penalty": repeat_pen,
    }
    endpoint = LLM_URL + "/chat/completions"
    print("Sending query to OpenAI endpoint: " + endpoint)
    report = requests.post(endpoint, headers=headers, json=data).json()
    print("Received response...")
    if "choices" in report:
        if len(report["choices"]) > 0:  # Always take the first choice.
            result = report["choices"][0]["message"]["content"]
        else:
            result = "No result generated!"
    else:
        # No "choices" key (e.g. an error payload): hand the body back as is.
        result = report
    print("Response: \n")
    pprint.pprint(result)
    return result
# Run the judge once on the sample inputs and keep the returned rating.
score = llm_binary_context_relevance(context, query, LLM_NAME, LLM_URL)


Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...
Response: 

' 1'


In [25]:
# Test the LLM judge on rating the faithfulness of an answer to a context.
context = ("Uruguay's capital, Montevideo, was founded by the Spanish in the early 18th century "
"as a military stronghold; its natural "
"harbor soon developed into a commercial center competing with Argentina's capital, Buenos Aires.")
context1 = "Urugays largest city is Montevideo"
# Deliberately wrong answer: neither context names Buenos Aires as Uruguay's capital.
answer = "The capital of uraguay is Bueonos Aires"

def llm_binary_faithfullness(context, answer, LLM_NAME, LLM_URL):
    """Ask the LLM judge whether `answer` is grounded in `context`.

    Sends one chat-completion request to ``LLM_URL + "/chat/completions"``
    (temperature 0, at most 2 output tokens) and prints progress messages and
    the result.

    Args:
        context: Passage the answer should be grounded in.
        answer: Generated answer to be rated.
        LLM_NAME: Value of the "model" field of the request.
        LLM_URL: Base URL of the chat-completion server.

    Returns:
        The message content of the first choice, returned unmodified (the
        prompt asks for "0" or "1"); the string "No result generated!" when the
        "choices" list is empty; or the whole decoded JSON body when it has no
        "choices" key.
    """
    messages = [
        {"role": "system", "content": "Given the following context and answer,"
         " Give a binary rating, either 0 or 1."
         " 0 means the answer is not sufficiently grounded in the context. "
         " 1 means the answer is sufficiently grounded in the context "
         ".Respond with a single integer and give no additional explaination. "
         'The output must strictly be "0" or "1"'},
        # Rate against the `context` argument. An earlier version interpolated
        # the notebook-level variable `context1` here and ignored the argument.
        {"role": "user", "content": f"Context: {context} ; Answer: {answer}"},
    ]

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer N/A ",
    }

    data = {
        "model": LLM_NAME,
        "messages": messages,
        "temperature": 0,  # deterministic rating
        "max_tokens": 2,   # the judge must answer with a single digit
        # "presence_penalty": presence_pen,
        # "repeat_penalty": repeat_pen,
    }
    endpoint = LLM_URL + "/chat/completions"
    print("Sending query to OpenAI endpoint: " + endpoint)
    report = requests.post(endpoint, headers=headers, json=data).json()
    print("Received response...")
    if "choices" in report:
        if len(report["choices"]) > 0:  # Always take the first choice.
            result = report["choices"][0]["message"]["content"]
        else:
            result = "No result generated!"
    else:
        # No "choices" key (e.g. an error payload): hand the body back as is.
        result = report
    print("Response: \n")
    pprint.pprint(result)
    return result
# Run the faithfulness judge once on the sample inputs; this rebinds `score`.
score = llm_binary_faithfullness(context, answer, LLM_NAME, LLM_URL)


# 2. Analyse if answer built upon search results  - FAITHFULNESS
def evaluate_faithfulness(answers, contexts):
    """Record a placeholder faithfulness measurement per answer/context pair.

    Args:
        answers: Iterable of generated answers; each one becomes a key of the
            returned dict.
        contexts: Re-iterable collection (e.g. a list) of contexts. Every
            answer is paired with every context.

    Returns:
        dict mapping each answer to a list with one measurement per context.
        Until a real faithfulness metric is plugged in, the measurement is the
        position of the context within `contexts`.
    """
    scores = {}
    for answer in answers:
        measurements = []
        # Use the position as the context ID. The earlier code referenced the
        # builtin `id` function here, which printed and stored a function object.
        for context_id, context in enumerate(contexts):
            print(f"ID: {context_id}")
            print(f"Context: {context}")
            measure = context_id  # Insert evaluation measure here
            measurements.append(measure)

        # Key by the answer being rated. The earlier code used the global
        # `query`, so every answer overwrote the same entry.
        scores[answer] = measurements

    return scores
#faithfulness_scores = evaluate_faithfulness(answer, context)

Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...
Response: 

' 0'


In [28]:
# 3. Analyse if answer is relevant to the question - ANSWER RELEVANCE
def evaluate_answer_relevance(answers, queries):
    """Record a placeholder answer-relevance score per (answer, query) pair.

    Args:
        answers: Iterable of generated answers.
        queries: Iterable of queries, paired with `answers` by position; the
            longer input is truncated to the shorter one.

    Returns:
        A list with one score per pair. The score is the constant 42 until a
        real relevance measure is inserted.
    """
    placeholder_score = 42  # Insert evaluation measure here
    collected = []
    for pair in zip(answers, queries):
        reply, question = pair
        print(f"Answer: {reply}")
        print(f"Query: {question}")
        collected.append(placeholder_score)
    return collected


def llm_binary_answer_relevance(answer,query, LLM_NAME, LLM_URL):
    """Ask the LLM judge whether `answer` sufficiently answers `query`.

    Sends one chat-completion request to ``LLM_URL + "/chat/completions"``
    (temperature 0, at most 2 output tokens) and prints progress messages and
    the result.

    Args:
        answer: Generated answer to be rated.
        query: Question the answer is supposed to address.
        LLM_NAME: Value of the "model" field of the request.
        LLM_URL: Base URL of the chat-completion server.

    Returns:
        The message content of the first choice, returned unmodified; the
        string "No result generated!" when the "choices" list is empty; or the
        whole decoded JSON body when it has no "choices" key.
    """
    system_prompt = (
        "Given the following query and answer,"
        " Give a binary rating, either 0 or 1."
        " 0 means the answer is not sufficient in answering the question"
        " 1 means the answer is sufficient in answering the question"
        ".Respond with a single integer and give no additional explaination. "
        'The output must strictly be "0" or "1"'
    )
    user_prompt = f"Query: {query} ; Answer: {answer}"

    request_headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer N/A ",
    }
    payload = {
        "model": LLM_NAME,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0,
        "max_tokens": 2,
    }

    endpoint = LLM_URL + "/chat/completions"
    print("Sending query to OpenAI endpoint: " + endpoint)
    report = requests.post(endpoint, headers=request_headers, json=payload).json()
    print("Received response...")

    if "choices" not in report:
        # No "choices" key: return the decoded body itself.
        result = report
    elif len(report["choices"]) > 0:
        # Always take the first choice.
        result = report["choices"][0]["message"]["content"]
    else:
        result = "No result generated!"

    print("Response: \n")
    pprint.pprint(result)
    return result


# Sample inputs for the answer-relevance judge.
question = "Who is the best basketball player ever. "
# On-topic answer; defined for comparison but not rated in this cell.
answer1 = "The best basketball player ever is Michael Jordan."
# Off-topic answer (football instead of basketball); this is the one rated below.
answer2 = "The best football player ever is Lionel Messi"
llm_binary_answer_relevance(answer2, question, LLM_NAME, LLM_URL)


Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...
Response: 

' 0'


' 0'

In [32]:
# 4. Analyse if answer is correct - ANSWER CORRECTNESS
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer


def evaluate_correctness(answers,ground_truths):
    """Record a placeholder correctness score per (answer, ground truth) pair.

    Args:
        answers: Iterable of generated answers.
        ground_truths: Iterable of reference answers, paired with `answers` by
            position; the longer input is truncated to the shorter one.

    Returns:
        A list with one score per pair. The score is the constant 42 until a
        real correctness measure is inserted.
    """
    scores = []
    # Bind the reference answer to `ground_truth`. The earlier loop variable
    # was called `query`, so the print below read a global of that name.
    for answer, ground_truth in zip(answers, ground_truths):
        print(f"Answer: {answer}")
        # The label used to read "Query:" although a ground truth is printed.
        print(f"Ground truth: {ground_truth}")
        measure = 42 # Insert evaluation measure here
        scores.append(measure)
    return scores


# SentenceTransformer instances that were already loaded, keyed by model name,
# so repeated calls do not reload the weights from disk or the network.
_SIMILARITY_MODELS = {}


def semantic_similarity(sentence1,sentence2, model_name='all-mpnet-base-v2'):
    """Compute the embedding similarity of two sentences.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.
        model_name: Name passed to ``SentenceTransformer``. The default is the
            model this notebook has always used.

    Returns:
        Whatever ``model.similarity`` returns for the two single-sentence
        embeddings (a 1x1 tensor in the recorded run).
    """
    model = _SIMILARITY_MODELS.get(model_name)
    if model is None:
        # The first call for this model loads it; later calls reuse it.
        model = SentenceTransformer(model_name)
        _SIMILARITY_MODELS[model_name] = model

    sentence1_vec = model.encode([sentence1])
    sentence2_vec = model.encode([sentence2])
    similarity_score = model.similarity(sentence1_vec, sentence2_vec) # Default is cosine similarity
    print(f'\n Similarity Score = {similarity_score} ')

    return similarity_score

# Sample pair that differs only in the named player. Note that this rebinds
# `answer`, which an earlier cell used for the faithfulness example.
answer = "The best basketball player ever is Stephen Curry."
ground_truth = "The best basketball player ever is Michael Jordan."
# The recorded score is 0.8866 even though the two sentences name different
# players, so embedding similarity alone does not establish correctness.
semantic_similarity(answer,ground_truth)



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


 Similarity Score = tensor([[0.8866]]) 


tensor([[0.8866]])