# Notebook to develop RAG evaluation methods

In [1]:
# Define API endpoints (module-level constants read by later cells).
# NOTE(review): the addresses are hard-coded; consider making them configurable
# (e.g. via environment variables) so the notebook runs outside this network.
# LLM_URL and LLM_NAME are not referenced by any cell visible in this notebook section.
LLM_URL="http://10.103.251.104:8040/v1"
LLM_NAME="mixtral"
# MARQO_URL is passed to get_marqo_client() when the Marqo client is created.
MARQO_URL="http://10.103.251.104:8882"
# Old Marqo endpoint; version 1.5
# MARQO_URL="http://10.103.251.100:8882"


In [2]:
# Imports and Functions 
import marqo
import re
import os
from langchain.text_splitter import (
    CharacterTextSplitter,  # need to install langchain
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
)
from datasets import load_dataset


def token_estimate(text):
    """Estimate the number of tokens in `text` from its word count.

    Applies the rule of thumb that one token is roughly 3/4 of a word, so the
    estimate is `words * 4 / 3`. A word is any maximal run of word characters.

    Returns:
        The estimate as a float (0.0 for text without any word characters).
    """
    word_count = sum(1 for _ in re.finditer(r"\w+", text))
    return word_count * 4 / 3


def get_marqo_client(murl):
    """Build and return a `marqo.Client` for the Marqo instance at `murl`."""
    client = marqo.Client(url=murl)
    return client


def createIndex(mq, indexName, settings):
    """Create a Marqo index with this notebook's settings unless it already exists.

    Args:
        mq: Marqo client; must provide `get_indexes()` and `create_index()`.
        indexName: Desired index name. It is lower-cased before use.
        settings: Dict with the keys "split_method", "distance_metric" and "model".

    Returns:
        None. Progress and failures are reported with `print`.
    """
    try:
        split_method = settings["split_method"]
        distance_metric = settings["distance_metric"]
        model = settings["model"]
    except (KeyError, TypeError):
        print(
            f"Settings could not be parsed to create a new index with name: {indexName}"
        )
        # Stop here: continuing would fail with a NameError on the missing values.
        return

    indexName = indexName.lower()

    # get_indexes() returns {"results": [...]}; the names live inside the entries,
    # so a plain `indexName in mq.get_indexes()` would only test the dict's keys.
    existing_names = set()
    for entry in mq.get_indexes()["results"]:
        if isinstance(entry, dict):
            existing_names.add(entry.get("indexName"))
        else:
            # NOTE(review): older marqo clients presumably return Index objects
            # exposing `index_name` instead of dicts - confirm if 1.x is still used.
            existing_names.add(getattr(entry, "index_name", None))

    if indexName in existing_names:
        print(f"Index already exists: {indexName} ")
        return

    index_settings = {
        "model": model,
        "normalizeEmbeddings": True,
        "textPreprocessing": {
            "splitLength": 2,
            "splitOverlap": 0,
            "splitMethod": split_method,
        },
        "annParameters": {
            "spaceType": distance_metric,
            "parameters": {"efConstruction": 512, "m": 16},
        },
    }
    try:
        mq.create_index(indexName, settings_dict=index_settings)
        print("New index created: " + indexName)
    except Exception as exc:
        # Best effort: report and carry on, but do not swallow KeyboardInterrupt
        # or SystemExit, and show the underlying cause.
        print(
            "Failed to created new index: " + indexName + " - check marqo endpoint!"
        )
        print(f"Reason: {exc}")


def deleteIndex(mq, indexName):
    """Delete the Marqo index `indexName`, reporting the outcome with `print`.

    Only the index itself is removed; no local files are touched.

    Args:
        mq: Marqo client; must provide `delete_index()`.
        indexName: Name of the index to delete.

    Returns:
        None. A failed deletion is reported, not raised.
    """
    try:
        mq.delete_index(indexName)
    except Exception as exc:
        # Best effort: report and carry on, but let KeyboardInterrupt/SystemExit
        # propagate and show why the deletion failed.
        print("Unable to delete: " + indexName)
        print(f"Reason: {exc}")
    else:
        print(f" Sucessfuylly deleted Index: {indexName}")


def printIndexes(marqoClient):
    """Print each entry of the client's index listing on its own line.

    The entries are taken from the "results" key of `marqoClient.get_indexes()`.
    """
    listing = marqoClient.get_indexes()
    for entry in listing["results"]:
        print(entry)


def upload_mini_wiki(corpus, chunking_params, client=None, index_name=None):
    """Upload mini-wiki passages to a Marqo index, one document per passage.

    Args:
        corpus: Mapping with two parallel sequences: "passage" (the texts) and
            "id" (their identifiers).
        chunking_params: Currently unused; kept so existing calls keep working.
        client: Marqo client to upload with. Defaults to the notebook-level
            `marqoClient`, which must then be defined before the call.
        index_name: Name of the target index. Defaults to the notebook-level
            `INDEX_NAME`.

    Returns:
        None. A passage that fails to ingest is reported with `print` and
        skipped; the remaining passages are still uploaded.
    """
    # Fall back to the notebook globals so existing two-argument calls behave
    # as before; passing the values explicitly avoids the hidden dependency.
    if client is None:
        client = marqoClient
    if index_name is None:
        index_name = INDEX_NAME

    for passage, iD in zip(corpus["passage"], corpus["id"]):
        tokens = token_estimate(passage)
        # TODO: Add a check and compare current file by id and timestamp
        # If file is already in the index, check if it has been modified
        # If the file has been modified, update the file in the index
        try:
            client.index(index_name).add_documents(
                [
                    {
                        "text": passage,
                        "tokens": tokens,
                        # NOTE(review): Marqo's document id field is `_id`; "id"
                        # is presumably stored as an ordinary field - confirm
                        # this is intended.
                        "id": iD,
                    }
                ],
                # Only fields listed in tensor_fields get vectors generated;
                # keeping the list small helps recall and performance.
                tensor_fields=["text"],
            )
        except Exception as exc:
            # Best effort per passage, but keep the cause visible and let
            # KeyboardInterrupt/SystemExit stop a long upload.
            print(f"Ingest error for passage with id: {iD}")
            print(f"Reason: {exc}")



In [5]:
# Set Index Settings, docs: https://docs.marqo.ai/2.5/API-Reference/Indexes/create_index/
# This cell is an end-to-end smoke test: create the index, upload a few passages,
# print the index statistics, then delete the index again.

# Connect to Marqo. `marqoClient` and `INDEX_NAME` are module-level names that
# upload_mini_wiki() reads as globals, so they must be defined before it is called.
marqoClient = get_marqo_client(MARQO_URL)
INDEX_NAME = "mini_wiki_index"
# Settings consumed by createIndex(): it reads exactly these three keys.
index_params = {
    "split_method": "sentence",
    "distance_metric": "prenormalized-angular",
    #"model": "hf/all_datasets_v4_MiniLM-L6",
    "model" : 'flax-sentence-embeddings/all_datasets_v4_mpnet-base',
}

# Passed to upload_mini_wiki(), which does not use these values yet.
chunking_params = {
    "chunk_size": 1024,
    "chunk_overlap": 128,
    "chunk_method": "recursive",
}

# Test createIndex and deleteIndex
# createIndex() creates the index if it is missing; printIndexes() lists what exists.
createIndex(marqoClient, INDEX_NAME, index_params)
printIndexes(marqoClient)


# Index chunks of data
# Get the mini wiki corpus
mini_wiki_corpus = load_dataset("rag-datasets/mini_wikipedia", "text-corpus")
passages = mini_wiki_corpus["passages"][0:5] # Load the first 5 passages
# upload_mini_wiki() reads the "passage" and "id" entries of `passages`.
upload_mini_wiki(passages, chunking_params)
# Check if index contains data
print(marqoClient.index(INDEX_NAME).get_stats())
# Clean up: the index is removed again, so nothing persists after this cell.
deleteIndex(marqoClient, INDEX_NAME)
printIndexes(marqoClient)






New index created: mini_wiki_index
{'indexName': 'mini_wiki_index'}
{'indexName': 'ait-qm'}
{'numberOfDocuments': 5, 'numberOfVectors': 8, 'backend': {'memoryUsedPercentage': None, 'storageUsedPercentage': None}}
 Sucessfuylly deleted Index: mini_wiki_index
{'indexName': 'ait-qm'}


In [4]:
# Get question - answer - (passages) dataset

# Send Questions to index to retrieve data from vector db 

# 1. Analyse Search Results - CONTEXT RELEVANCE

# Send the retrieved passages along with the question to the model to get the answer

# 2. Analyse if the answer is built upon the search results - FAITHFULNESS

# 3. Analyse if answer is relevant to the question - ANSWER RELEVANCE

# 4. Analyse if answer is correct - ANSWER CORRECTNESS 

