# Notebook to develop rag evaluation methods

In [1]:
# Define API ENDPOINTS 
LLM_URL="http://10.103.251.104:8040/v1"
LLM_NAME="mixtral"
MARQO_URL="http://10.103.251.104:8882"
# Old Marqo endpoint; version 1.5
# MARQO_URL="http://10.103.251.100:8882"


In [2]:
# Imports
import marqo
import re
import os
from langchain.text_splitter import (
    CharacterTextSplitter,  # need to install langchain
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
)
from datasets import load_dataset
import pprint
import time
import random
import requests
from components import VectorStore, RagPipe


### Create or Connect to Vector Store

In [3]:
# Set Index Settings, docs: https://docs.marqo.ai/2.5/API-Reference/Indexes/create_index/
INDEX_NAME = "mini_wiki_index"
index_params = {
    "split_method": "sentence",
    "distance_metric": "prenormalized-angular",
    "model": "hf/all_datasets_v4_MiniLM-L6",
    #"model" : 'flax-sentence-embeddings/all_datasets_v4_mpnet-base',
}

documentDB = VectorStore(MARQO_URL) # Connect to marqo client via python API
#documentDB.createIndex("miniWikiIndex", index_params) # Create index with name miniWikiIndex
#print(documentDB.getIndexes()) # Print all indexes
#documentDB.deleteIndex("mini_wiki_index") # Delete index with name miniWikiIndex
print(documentDB.getIndexes()) # Print all indexes
documentDB.connectIndex("miniwikiindex") # Connect to index with name miniWikiIndex

[{'indexName': 'miniwikiindex'}, {'indexName': 'ait-qm'}]
Index connected: miniwikiindex 


### Prepare Dataset 

In [4]:
# Code to prepare dataset
def prepare_mini_wiki(corpus, chunking_params):
    # Upload a mini wiki corpus to the marqo instance
    # The corpus is a dictionary with two keys. Passages and id. 
    # Passage is a list of strings 
    # Id is a list of ints.
    # Open the PDF file

    # Create a list of dictionaries with keys: passage, id
    corpus_list = []
    for passage, iD in zip(corpus["passage"], corpus["id"]):
            corpus_list.append({"text": passage, "id": iD})
    return corpus_list

# Prepare the mini wiki corpus
chunking_params = {
    "chunk_size": 1024,
    "chunk_overlap": 128,
    "chunk_method": "recursive",
}


mini_wiki_corpus = load_dataset("rag-datasets/mini_wikipedia", "text-corpus")
passages = mini_wiki_corpus["passages"]
corpus_list = prepare_mini_wiki(passages, chunking_params)
print(corpus_list[0])

{'text': 'Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.', 'id': 0}


### Index Documents

In [10]:
# Code to Index data
upload_start = time.time()
maxDocs = 100
documentDB.indexDocuments(documents=corpus_list, maxDocs=maxDocs)
upload_end = time.time()
print(f"Time taken to upload {min(len(corpus_list), maxDocs)} passages: {upload_end - upload_start} seconds")
# Check if index contains data
print(documentDB.getIndexStats())

Time taken to upload 100 passages: 6.9903342723846436 seconds
{'numberOfDocuments': 100, 'numberOfVectors': 167, 'backend': {'memoryUsedPercentage': 0.08124080171, 'storageUsedPercentage': 30.892277273379996}}


Great, now we have the documents indexed. Now retrieve passages based on a query. 
Indexing 3200 short passages takes ~ 4minutes 

### Get questions and answers from dataset

In [6]:
# Get question - answer - (passages) dataset
mini_wiki_qa = load_dataset("rag-datasets/mini_wikipedia", "question-answer")
mini_wiki_qa = mini_wiki_qa["test"][0:5] # Load the first 5 qestion-answer-id triples
print(mini_wiki_qa)
questions = mini_wiki_qa["question"]
answers = mini_wiki_qa["answer"]
print(questions)
print(answers)

{'question': ['Was Abraham Lincoln the sixteenth President of the United States?', 'Did Lincoln sign the National Banking Act of 1863?', 'Did his mother die of pneumonia?', "How many long was Lincoln's formal education?", 'When did Lincoln begin his political career?'], 'answer': ['yes', 'yes', 'no', '18 months', '1832'], 'id': [0, 2, 4, 6, 8]}
['Was Abraham Lincoln the sixteenth President of the United States?', 'Did Lincoln sign the National Banking Act of 1863?', 'Did his mother die of pneumonia?', "How many long was Lincoln's formal education?", 'When did Lincoln begin his political career?']
['yes', 'yes', 'no', '18 months', '1832']


### Create Pipeline and connect to vector store

In [11]:
# Create pipeline object 
pipe = RagPipe()
pipe.connectVectorStore(documentDB)
pipe.connectLLM(LLM_URL, LLM_NAME)
pipe.answerQuery("What is the capital of Uruguay?")

 Language model URL: http://10.103.251.104:8040/v1
 Language model connected: mixtral
["Montevideo, Uruguay's capital.",
 'Map of Uruguay',
 "Uruguay's capital, Montevideo, was founded by the Spanish in the early 18th "
 'century as a military stronghold; its natural harbor soon developed into a '
 "commercial center competing with Argentina's capital, Buenos Aires. "
 "Uruguay's early 19th century history was shaped by ongoing conflicts between "
 'the British, Spanish, Portuguese, and colonial forces for dominance in the '
 'Argentina-Brazil-Uruguay region.  /ref> In 1806 and 1807, the British army '
 'attempted to seize Buenos Aires as part of their war with Spain. As a '
 'result, at the beginning of 1807, Montevideo was occupied by a 10,000-strong '
 'British force who held it until the middle of the year when they left to '
 'attack Buenos Aires.']
Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...
Response: 

' The capital of Ur

In [12]:
# Test cell 
# Delete all docs in index
documentDB.getIndexStats()
#documentDB.emptyIndex()
documentDB.getIndexSettings()

{'type': 'unstructured',
 'treatUrlsAndPointersAsImages': False,
 'filterStringMaxLength': 20,
 'model': 'hf/all_datasets_v4_MiniLM-L6',
 'normalizeEmbeddings': True,
 'textPreprocessing': {'splitLength': 2,
  'splitOverlap': 0,
  'splitMethod': 'sentence'},
 'imagePreprocessing': {},
 'vectorNumericType': 'float',
 'annParameters': {'spaceType': 'prenormalized-angular',
  'parameters': {'efConstruction': 512, 'm': 16}}}

In [16]:
# 1. Analyse Search Results - CONTEXT RELEVANCE
def evaluate_context_relevance(queries, dataBase , k, goldPassages=None, ):
    # Get retrieved text.
    # Get retrieved text.
    scores = {}
    for query in queries:
        response = dataBase.retrieveDocuments(query=query, k=k)
        contexts = [response["hits"][i]["text"] for i in range(len(response["hits"]))]
        ids = [response["hits"][i]["id"] for i in range(len(response["hits"]))]
        measurements = []
        for id, context in zip(ids,contexts):
            # Insert here evaluation measure of retrieved context
            print(f"ID: {id}")
            print(f"Context: {context}")
            measure = id # Insert evaluation measure here
            measurements.append(measure)
        
        scores[query] = measurements # Insert evaluation measure here

    return scores


queries = ["What is the capital of Uruguay?", "Where is Washington?"] # Query
evaluate_context_relevance(queries, documentDB, 3)

ID: 36
Context: Montevideo, Uruguay's capital.
ID: 28
Context: Map of Uruguay
ID: 15
Context: Uruguay's capital, Montevideo, was founded by the Spanish in the early 18th century as a military stronghold; its natural harbor soon developed into a commercial center competing with Argentina's capital, Buenos Aires. Uruguay's early 19th century history was shaped by ongoing conflicts between the British, Spanish, Portuguese, and colonial forces for dominance in the Argentina-Brazil-Uruguay region.  /ref> In 1806 and 1807, the British army attempted to seize Buenos Aires as part of their war with Spain. As a result, at the beginning of 1807, Montevideo was occupied by a 10,000-strong British force who held it until the middle of the year when they left to attack Buenos Aires.
ID: 80
Context: Michael Faraday was born in Newington Butts, near present-day South London, England. His family was not well off. His father, James, was a member of the Sandemanian sect of Christianity. James Faraday ha

{'What is the capital of Uruguay?': [36, 28, 15],
 'Where is Washington?': [80, 2, 71]}

In [38]:
# Test llm judge on rating the relevance of a context
context = ("Uruguay's capital, Montevideo, was founded by the Spanish in the early 18th century "
"as a military stronghold; its natural "
"harbor soon developed into a commercial center competing with Argentina's capital, Buenos Aires.")
context1 = "Map of Uruguay"
query = "What is the capital of Uruguay?"


messages = [
            {"role": "user", "content": "Give a rating of relevancy of this text: "
            f"{context} to the follwoing query: {query}."
            " Give a rating between 1 and 10. With 1 being completetly irrelevant"
            " and 10 meaning extremely relevant. Respond with a single integer and nothing else!" }
        ]

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer N/A ",
}

data = {
    "model": LLM_NAME,
    "messages": messages,
    "temperature": 0,
    "max_tokens": 3,
    # "presence_penalty": presence_pen,
    # "repeat_penalty": repeat_pen,
}
endpoint = LLM_URL + "/chat/completions"
print("Sending query to OpenAI endpoint: " + endpoint)
report = requests.post(endpoint, headers=headers, json=data).json()
print("Received response...")
if "choices" in report:
    if len(report["choices"]) > 0:  # Always take the first choice.
        result = report["choices"][0]["message"]["content"]
    else:
        result = "No result generated!"
else:
    result = report
print("Response: \n")
pprint.pprint(result)

Sending query to OpenAI endpoint: http://10.103.251.104:8040/v1/chat/completions
Received response...
Response: 

' 10'
