# Notebook to develop rag evaluation methods

In [1]:
# Define API ENDPOINTS 
LLM_URL="http://10.103.251.104:8040/v1"
LLM_NAME="mixtral"
MARQO_URL="http://10.103.251.104:8882"
# Old Marqo endpoint; version 1.5
# MARQO_URL="http://10.103.251.100:8882"


In [43]:
# Imports
import marqo
import re
import os
from langchain.text_splitter import (
    CharacterTextSplitter,  # need to install langchain
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
)
from datasets import load_dataset
import pprint
import time
import random
import requests


In [None]:
# Function definitions

def token_estimate(text):  # OpenAI suggest a token consists of 3/4 words
    return len(re.findall(r"\w+", text)) * 4 / 3


def get_marqo_client(murl):
    # Create a connection with the marqo instance
    return marqo.Client(url=murl)


def createIndex(mq, indexName, settings):
    try:
        split_method = settings["split_method"]
        distance_metric = settings["distance_metric"]
        model = settings["model"]

    except:
        print(
            f"Settings could not be parsed to create a new index with name: {indexName}"
        )

    indexName = indexName.lower()
    if indexName in mq.get_indexes():
        print(f"Index already exists: {indexName} ")
    else:
        index_settings = {
                "model": model,
                "normalizeEmbeddings": True,
                "textPreprocessing": {
                    "splitLength": 2,
                    "splitOverlap": 0,
                    "splitMethod": split_method,
                },
                "annParameters": {
                    "spaceType": distance_metric,
                    # Tinker with this. Try increasing efConstruction along with m for better recall
                    # https://www.pinecone.io/learn/series/faiss/hnsw/
                    "parameters": {"efConstruction": 512, "m": 16},
                },
        }
        try:
            mq.create_index(indexName, settings_dict=index_settings)
            #mq.create_index(indexName,  model='flax-sentence-embeddings/all_datasets_v4_mpnet-base')
            print("New index created: " + indexName)
        except:
            print(
                "Failed to created new index: " + indexName + " - check marqo endpoint!"
            )


def deleteIndex(mq, indexName):
    # Delete index by indexName
    try:
        # now remove the marqo index
        mq.delete_index(indexName)
        # now remove the files -- For now, no need to delete the files locally.
        # output_directory = os.path.join(OUTPUT_DIRECTORY, indexName)
        # if os.path.exists(output_directory):
        #     shutil.rmtree(output_directory)
        # else:
        #     print("No associated file directories found and hence none deleted.")

        print(f" Sucessfuylly deleted Index: {indexName}")
    except:
        print("Unable to delete: " + indexName)


def printIndexes(marqoClient):
    for index in marqoClient.get_indexes()["results"]:
        print(index)


def upload_mini_wiki(corpus, chunking_params):
    # Upload a mini wiki corpus to the marqo instance
    # The corpus is a dictionary with two keys. Passages and id. 
    # Passage is a list of strings 
    # Id is a list of ints.
    # Open the PDF file
   
        for passage, iD in zip(corpus["passage"], corpus["id"]):

            tokens = token_estimate(passage)
            # TODO: Add a check and compare current file by id and timestamp
            # If file is already in the index, check if it has been modified
            # If the file has been modified, update the file in the index
            try:
                marqoClient.index(INDEX_NAME).add_documents(
                    [
                        {
                            "text": passage,
                            "tokens": tokens,
                            "id": iD,
                        }
                    ],
                    # Arguments in tensor_fields will have vectors generated for them. For best recall and performance, minimise the number of arguments in tensor_fields.
                    tensor_fields=["text"],
                )
            except:
                print(f"Ingest error for passage with id: {iD}")



In [22]:
# Set Index Settings, docs: https://docs.marqo.ai/2.5/API-Reference/Indexes/create_index/

# Open marqo with index name, if index not found create new index with the name
marqoClient = get_marqo_client(MARQO_URL)
INDEX_NAME = "mini_wiki_index"
index_params = {
    "split_method": "sentence",
    "distance_metric": "prenormalized-angular",
    "model": "hf/all_datasets_v4_MiniLM-L6",
    #"model" : 'flax-sentence-embeddings/all_datasets_v4_mpnet-base',
}

chunking_params = {
    "chunk_size": 1024,
    "chunk_overlap": 128,
    "chunk_method": "recursive",
}

# Test createIndex and deleteIndex
createIndex(marqoClient, INDEX_NAME, index_params)
printIndexes(marqoClient)


# Index chunks of data
# Get the mini wiki corpus
mini_wiki_corpus = load_dataset("rag-datasets/mini_wikipedia", "text-corpus")
passages = mini_wiki_corpus["passages"] # Load the first 5 passages
print(len(passages))
upload_start = time.time()
upload_mini_wiki(passages, chunking_params)
upload_end = time.time()
print(f"Time taken to upload {len(passages)} passages: {upload_end - upload_start} seconds")
# Check if index contains data
print(marqoClient.index(INDEX_NAME).get_stats())







Failed to created new index: mini_wiki_index - check marqo endpoint!
{'indexName': 'mini_wiki_index'}
{'indexName': 'ait-qm'}


3200
Time taken to upload 3200 passages: 238.87357354164124 seconds
{'numberOfDocuments': 3205, 'numberOfVectors': 6312, 'backend': {'memoryUsedPercentage': 0.066052599, 'storageUsedPercentage': 48.54578915951}}


Great, now we have the documents indexed. Now retrieve passages based on a query. 
Indexing 3200 short passages takes ~ 4minutes 

In [10]:
# Get question - answer - (passages) dataset
mini_wiki_qa = load_dataset("rag-datasets/mini_wikipedia", "question-answer")
mini_wiki_qa = mini_wiki_qa["test"][0:5] # Load the first 5 qestion-answer-id triples
print(mini_wiki_qa)
questions = mini_wiki_qa["question"]
answers = mini_wiki_qa["answer"]
print(questions)
print(answers)



{'question': ['Was Abraham Lincoln the sixteenth President of the United States?', 'Did Lincoln sign the National Banking Act of 1863?', 'Did his mother die of pneumonia?', "How many long was Lincoln's formal education?", 'When did Lincoln begin his political career?'], 'answer': ['yes', 'yes', 'no', '18 months', '1832'], 'id': [0, 2, 4, 6, 8]}
['Was Abraham Lincoln the sixteenth President of the United States?', 'Did Lincoln sign the National Banking Act of 1863?', 'Did his mother die of pneumonia?', "How many long was Lincoln's formal education?", 'When did Lincoln begin his political career?']
['yes', 'yes', 'no', '18 months', '1832']


In [36]:
# Send Questions to index to retrieve data from vector db
for question in questions[:1]:
    response = marqoClient.index(INDEX_NAME).search(
        q=question, 
        limit=2,
        attributes_to_retrieve=["text"],
    )
    # pprint.pprint(response)
    

# 1. Analyse Search Results - CONTEXT RELEVANCE
def evaluate_context_relevance(response):
    return random.random()

context_score = evaluate_context_relevance(response)
print(context_score)

# Get retrieved text. 
contexts = [response["hits"][i]["text"] for i in range(len(response["hits"]))]
pprint.pprint(contexts)
background = ""
for text in contexts:
    background += text + " "




0.9852319488424731
['Young Abraham Lincoln',
 'Abraham Lincoln (February 12, 1809 â\x80\x93 April 15, 1865) was the '
 'sixteenth President of the United States, serving from March 4, 1861 until '
 'his assassination. As an outspoken opponent of the expansion of slavery in '
 'the United States, "[I]n his short autobiography written for the 1860 '
 'presidential campaign, Lincoln would describe his protest in the Illinois '
 "legislature as one that 'briefly defined his position on the slavery "
 'question, and so far as it goes, it was then the same that it is now." This '
 'was in reference to the anti-expansion sentiments he had then expressed. '
 'Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham '
 'Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, '
 'Holzer notes, "Cooper Union proved a unique confluence of political culture, '
 'rhetorical opportunity, technological innovation, and human genius, and it '
 'brought Abraham Lincoln t

In [45]:
# Send retrieved passages along with question to the model to get the answer
prompt = "Use the following context to answer the question afterwards. Only use information from the context and no prior knowledge."
model = "mixtral"
model_temp = 0.0
answer_size = 500
presence_pen = 0.5
repeat_pen = 0.5

for question in questions[:1]: 
    messages = [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": background},
                    {"role": "user", "content": question},
                ]
    headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer N/A ",
               }
    data = {
        "model": model,
        "messages": messages,
        "temperature": model_temp,
        "max_tokens": answer_size,
        "presence_penalty": presence_pen,
        "repeat_penalty": repeat_pen,
    }
    endpoint = LLM_URL + "/chat/completions"
    print("Sending query to OpenAI endpoint: " + endpoint)
    report = requests.post(
        endpoint, headers=headers, json=data
    ).json()
    print("Received response...")
    if "choices" in report:
        if (
            len(report["choices"]) > 0
        ):  # Always take the first choice.
            result = report["choices"][0]["message"]["content"]
        else:
            result = "No result generated!"
    else:
        result = report
    print("Response: \n")
    pprint.pprint(result)


# 2. Analyse if answer built upon search results  - FAITHFULNESS
def evaluate_faithfulness(contexts, llm_response):
    return random.random()

faithfulness_score = evaluate_faithfulness(contexts, result)

# 3. Analyse if answer is relevant to the question - ANSWER RELEVANCE
def evaluate_answer_relevance(question, llm_response):
    return random.random()

answer_relevance_score = evaluate_answer_relevance(question, result)



# 4. Analyse if answer is correct - ANSWER CORRECTNESS

def evaluate_correctness(answer,llm_response):
    return random.random()

correctness_score = evaluate_correctness(answer,response):





Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...
Response: 

(' Yes, according to the context provided, Abraham Lincoln was indeed the '
 'sixteenth President of the United States.')
