In [1]:
from datasets import load_dataset, Dataset
import random
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.embeddings import HuggingFaceEmbedding, OpenAIEmbedding, BedrockEmbedding
from llama_index.indices.postprocessor import SentenceTransformerRerank
from IPython.display import display
from llama_index.indices.query.schema import QueryBundle
import chromadb
from llama_index.evaluation import RetrieverEvaluator
import pandas as pd
import datetime
import torch
import os
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.embeddings.cohereai import CohereEmbedding
from api_key import oai_api_key, cohere_api_key
import openai
import boto3

dataset = load_dataset("Finnish-NLP/wikipedia_20230501_fi_cleaned")

# set up OpenAI
os.environ["OPENAI_API_KEY"] =  oai_api_key
openai.api_key = os.environ["OPENAI_API_KEY"]


# Set up Bedrock
session = boto3.Session(profile_name='default')

bedrock_client = session.client(
    service_name="bedrock-runtime",
    region_name="eu-central-1",
)

# Initial dataset processing

In [2]:

# Lets make 10 questions first for 100 samples and see how eval works with few simple embedding models and take if from there
samples = 50
dataset = dataset["train"].select([i for i in range(samples)])

# Lets reset index with pandas to simplify things later
dataset_pd = dataset.to_pandas()
dataset_pd["id"] = [i for i in range(samples)]
dataset = Dataset.from_pandas(dataset_pd)

# Some embedding models mighgt have smaller context size so lets limit size in first testing to 256 tokens or 200 words
def take_first_256_words(row):
    row["text"] = ' '.join(row["text"].split(' ')[:256])
    return row

dataset = dataset.map(take_first_256_words)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

# Create test object (Documents or Textnodes)

In [3]:
# Lets create test nodes/documents
# Nodes are first class citizens and will be used for testing
from llama_index.schema import Document, TextNode
documents_all = []
nodes_all = []
for i, sample in enumerate(dataset):
    documents_all.append(Document(
        text=sample["text"],
        id_=sample["id"],
    )
    )
    
for i, sample in enumerate(dataset):
    nodes_all.append(TextNode(
        text=sample["text"],
        id_=sample["id"],
    )
    )

corpus = {sample["id"]: sample["text"] for sample in dataset}

# Automatic Question creation with llm based on the dataset

In [4]:
# # Automatic question creation

from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.llms import OpenAI

# Prompt to generate questions
qa_generate_prompt_tmpl = """\
Saat seuraavaksi tietoa kontekstiksi.

---------------------
{context_str}
---------------------

Annettuna edeltävä teksti, tehtävänäsi on luoda kysymyksiä perustuen vain yllä annettuun tietoon.

Olet Professori. Tehtävänäsi on luoda \
{num_questions_per_chunk} kysymystä tulevaan \
kokeeseen/tentiin. Kysymysten tulisi olla monipuolisia \
ja kattavasti koko aineistosta. Kysymykset eivät saa sisältää vaihtoehtoja, eikä alkaa tekstillä kysymys 1 / kysymys 2. \
Kysymykset tulee rajoittua vain annettuun tekstiin. Tämä on erittäin tärkeä tehtävä.\
"""

In [5]:
# lets create up to x questions

amount_of_questions = 10
ids_picked = []
max_len = len(nodes_all)
assert amount_of_questions < max_len * 0.5
while len(ids_picked) < amount_of_questions:
    ids_picked.append(random.randint(0,max_len-1))
    ids_picked = list(set(ids_picked))
    

question_nodes = [nodes_all[i] for i in ids_picked]

llm = OpenAI(temperature=0, api_key=openai.api_key)

# qa_dataset = generate_question_context_pairs(
#     question_nodes, llm=llm, num_questions_per_chunk=1, qa_generate_prompt_tmpl=qa_generate_prompt_tmpl
# )

questions = []
for question_node in question_nodes:
    llm_response = llm.complete(qa_generate_prompt_tmpl.format(context_str=question_node.text, num_questions_per_chunk=1), max_tokens=256)
    questions.append(llm_response.text)
    
testing_questions_llm_created = {ids_picked[i]: questions[i] for i in range(len(ids_picked))}
relevant_docs_llm_created = {id_picked:  [id_picked] for id_picked in ids_picked}

assert len(questions) == amount_of_questions

In [6]:
print(testing_questions_llm_created)
print(relevant_docs_llm_created)

{35: 'Mikä on Ateenan väestötiheys?', 4: 'Mikä on AU-tunnus ja mitä se tarkoittaa astronomiassa?', 7: 'Mikä oli Nasan avaruussukkulalentojen kokonaismäärä ja kuinka monta lentoa päättyi onnettomuuteen?', 42: 'Miksi Alfred Hitchcockia pidetään jännityselokuvan mestarina ja mitkä ovat hänen tunnetuimmat elokuvansa?', 12: 'Mikä on fysikaalisen avaruuden määritelmä klassisessa fysiikassa?', 13: 'Miten avoimien joukkojen käsite liittyy topologian keskeisiin käsitteisiin, kuten raja-arvoon, jatkuvuuteen ja yhtenäisyyteen?', 25: 'Miten arkkitehtuurin toteuttaminen eroaa muista taiteista?', 26: 'Mikä on antropologian yhteys muihin tieteenlajeihin ja millä tavoin antropologia tutkii ihmisen biologiaa?', 28: 'Mikä on atomiytimen rakenne ja mitä hiukkasia se sisältää?', 29: 'Mikä on analogisen teknologian etu verrattuna digitaaliseen teknologiaan varhaisessa kehitysvaiheessa?'}
{35: [35], 4: [4], 7: [7], 42: [42], 12: [12], 13: [13], 25: [25], 26: [26], 28: [28], 29: [29]}


# Manual test questions creation

In [11]:
# # Now i run this one by one and try to come up with one question per sample to have at least one question for 10 documents that I am expecting to hit
# import random
# nodes_with_no_questions = [i for i in range(len(nodes_all)) if i not in ids_picked]

# i = random.choice(nodes_with_no_questions)
# i = 130
# print(dataset[i]["title"])
# print(int(nodes_all[i].id_))
# print(nodes_all[i].text)
# for line in dataset[i]["text"].split('.'):
#     print(line)

In [7]:
# MANUAL QUESTION CREATION
# id, question

manual=True

if manual:
    
    testing_questions = [(41, "Miten selittäisi mitä algoritmi tarkoittaa?"),
                        (73, "Missä maassa sijaitsee Gentin satama?"),
                        (100, "Paljonko dieselveturi painaa"),
                        (79, "Mitä Eteläisiä aikakausia tai siis dynastioita Kiinassa oli?"),
                        (130, "Millä keinotekoisella kielellä on noin tuhat puhujaa?"),
                        ]

    testing_questions = [testing_question for testing_question in testing_questions if testing_question[0] < samples]

    # Do same kind of structure as in the example at https://colab.research.google.com/drive/1TxDVA__uimVPOJiMEQgP5fwHiqgKqm4-?usp=sharing#
    filtered_questions_manual = {q[0]:  q[1] for q in testing_questions}
    filtered_relevant_docs_manual = {q[0]:  [q[0]] for q in testing_questions}


# Combine LLM created and manually created questions

In [8]:
# Lets combine
if manual:
    questions = {**filtered_questions_manual, **testing_questions_llm_created}
    relevant_docs= {**filtered_relevant_docs_manual, **relevant_docs_llm_created}

    # Define QA dataset
    qa_dataset = EmbeddingQAFinetuneDataset(
                queries=questions, # Dict jossa avaimena id, valuena kysymys jolla aineisto pitäisi löytyä
                corpus=corpus, # Dict jossa avaimena id, valuena teksti
                relevant_docs=relevant_docs # Dict jossa avaimena query id ja valuena sitä mätsäävien tekstien id:t listana
    )
else:
    questions = {**testing_questions_llm_created}
    relevant_docs= {**relevant_docs_llm_created}

    # Define QA dataset
    qa_dataset = EmbeddingQAFinetuneDataset(
                queries=questions, # Dict jossa avaimena id, valuena kysymys jolla aineisto pitäisi löytyä
                corpus=corpus, # Dict jossa avaimena id, valuena teksti
                relevant_docs=relevant_docs # Dict jossa avaimena query id ja valuena sitä mätsäävien tekstien id:t listana
    )

In [9]:
import numpy as np

def calculate_hit_rate(retrieval_dict):
    """
    Calculate the hit rate for each target document id and its corresponding list of retrieved ids.

    Parameters:
    - retrieval_dict (dict): Dictionary with keys as target document ids and values as lists of retrieved document ids.

    Returns:
    - hit_rates (dict): Dictionary with target document ids as keys and hit rates as values.
    """
    hit_rates = []
    for target_id, retrieved_ids in retrieval_dict.items():
        hit_rate = 1 if target_id in retrieved_ids else 0
        hit_rates.append(hit_rate)

    return np.mean(hit_rates)


def calculate_mrr(retrieval_dict):
    """
    Calculate the Mean Reciprocal Rank (MRR) for each target document id and its corresponding list of retrieved ids.

    Parameters:
    - retrieval_dict (dict): Dictionary with keys as target document ids and values as lists of retrieved document ids.

    Returns:
    - mrrs (dict): Dictionary with target document ids as keys and MRRs as values.
    """
    mrrs = []
    for target_id, retrieved_ids in retrieval_dict.items():
        if target_id in retrieved_ids:
            rank = retrieved_ids.index(target_id) + 1
            mrr = 1 / rank
        else:
            mrr = 0
        mrrs.append(mrr)
    mrr = np.mean(mrrs)
    return mrr



def get_query_results(qa_dataset, querymachine):
    targets = list(qa_dataset.queries.keys())
    querylist = list(qa_dataset.queries.values())
    retrieved_dict = {}

    query_time_start = datetime.datetime.now()
    for i, query in enumerate(querylist):
        q_retrieved_ids = []
        retrieved_nodes = querymachine.retrieve(QueryBundle(query))
        for node in retrieved_nodes:
            q_retrieved_ids.append(node.id_)
        retrieved_dict[targets[i]] = q_retrieved_ids
    
    query_time_end = datetime.datetime.now()
    query_time_total = (query_time_end-query_time_start).total_seconds()
    query_time_single = query_time_total / len(querylist)

    return retrieved_dict, query_time_total, query_time_single


def form_results_df(retrieval_results, model_name, reranker_name, embedding_time, embedding_time_single, retrieval_time, retrieval_time_single):
    result_dict = {}
    hit_rate = calculate_hit_rate(retrieval_results)
    mrr = calculate_mrr(retrieval_results)
    df = pd.DataFrame([[mrr, hit_rate]], columns = ['mrr', 'hit_rate'])
    df["model_name"] = model_name
    df["reranker_name"] = reranker_name
    df["embedding_time"] = embedding_time
    df["embedding_time_single"] = embedding_time_single
    df["retrieval_time"] = retrieval_time
    df["retrieval_time_single"] = retrieval_time_single
    result_dict["mrr"] = mrr
    result_dict["hit_rate"] = hit_rate
    result_dict["model_name"] = model_name
    result_dict["reranker_name"] = reranker_name
    result_dict["embedding_time"] = embedding_time
    result_dict["embedding_time_single"] = embedding_time_single
    result_dict["retrieval_time"] = retrieval_time
    result_dict["retrieval_time_single"] = retrieval_time_single
    
    return df, result_dict

In [10]:
# create client and a new collection
def embed_data_and_def_retriever(embed_model, reranker=None, embed_model_name=None):
    
    chroma_client = chromadb.EphemeralClient()
    try:
        chroma_collection = chroma_client.delete_collection(embed_model_name)
        chroma_collection = chroma_client.create_collection(embed_model_name)
        print("found existing collection")
    except Exception as e:
        chroma_collection = chroma_client.create_collection(embed_model_name)
        print("Created new collection")


    # set up ChromaVectorStore and load in data
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    

    # Define context etch objects for  retrieval
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)
    print(f"CREATING EMBEDDING DB WITH EMBEDDING MODEL: {embed_model_name}, reranker is {reranker}")
    start = datetime.datetime.now()
    index = VectorStoreIndex(nodes=nodes_all, storage_context=storage_context, service_context=service_context, show_progress=True)
    end = datetime.datetime.now()
    embedding_time = (end-start).total_seconds()
    embedding_time_single = embedding_time / len(nodes_all)

    
    # Define retriever
    if reranker != None:
        print(f"Reranker is not none, adding reranker")
        retriever = index.as_query_engine(similarity_top_k=5, node_postprocessors=[reranker])
        return retriever, index, embedding_time, embedding_time_single
    
    else:
        print(f"Reranker is None")
        retriever = index.as_query_engine(similarity_top_k=5)
        return retriever, index, embedding_time, embedding_time_single

In [11]:
def get_query_results(qa_dataset, querymachine):
    targets = list(qa_dataset.queries.keys())
    querylist = list(qa_dataset.queries.values())
    retrieved_dict = {}

    query_time_start = datetime.datetime.now()
    for i, query in enumerate(querylist):
        q_retrieved_ids = []
        retrieved_nodes = querymachine.retrieve(QueryBundle(query))
        for node in retrieved_nodes:
            q_retrieved_ids.append(node.id_)
        retrieved_dict[targets[i]] = q_retrieved_ids
    
    query_time_end = datetime.datetime.now()
    query_time_total = (query_time_end-query_time_start).total_seconds()
    query_time_single = query_time_total / len(querylist)

    return retrieved_dict, query_time_total, query_time_single

In [15]:
result_df = None

Huggingface_models_to_try =["intfloat/multilingual-e5-base", "TurkuNLP/sbert-cased-finnish-paraphrase"]
OpenAI_models_to_try = ["text-embedding-3-small"]
CohereEmbeddings = ["embed-multilingual-v3.0"]
HF_rerankers_to_try = ["BAAI/bge-reranker-base", None]


Eval_openAI = True
Eval_cohere_multilingual_embeddings = True
Eval_Bedrock_titan = True 
Eval_Cohere_rerank = True 


models_to_try = Huggingface_models_to_try
rerankers_to_try = HF_rerankers_to_try

if Eval_openAI:
    models_to_try.extend(OpenAI_models_to_try)

if Eval_cohere_multilingual_embeddings:
    models_to_try.extend(CohereEmbeddings)

if Eval_Bedrock_titan:
    models_to_try.append("Bedrock")

if Eval_Cohere_rerank:
    rerankers_to_try.append("Cohere")


# Outer loop loops over embedding models
# Inner loop loops over rerankers
# We need to embed data only once for inner loop,  so for when j != 0 we should not re-embed data but just change the reranker
print(f'Using {"cuda" if torch.cuda.is_available() else "cpu"} as device')
for i, model_to_try in enumerate(models_to_try):
    collection_name = model_to_try.replace('/','_').replace('-','_')
    print('-' * 50)
    print(f"model_to_try is: {model_to_try}")
    for j, reranker_name in enumerate(rerankers_to_try):
        if j == 0:
            print(f"reranker is: {reranker_name}")
            if model_to_try in OpenAI_models_to_try:
                embedding_model = OpenAIEmbedding(api_key=openai.api_key, model=model_to_try)
                if reranker_name != None:
                    if reranker_name == 'Cohere':
                        reranker = CohereRerank(api_key=cohere_api_key, top_n=3)
                    else:
                        reranker = SentenceTransformerRerank(model=reranker_name, top_n=3, device='cuda:0' if torch.cuda.is_available() else 'cpu', keep_retrieval_score=True)
                retriever, index, embedding_time, embedding_time_single = embed_data_and_def_retriever(embedding_model, reranker, collection_name)
                retrieval_results, retrieval_time, retrieval_time_single = get_query_results(qa_dataset, retriever)
                results_df, result_dict = form_results_df(retrieval_results, model_to_try, reranker_name, embedding_time, embedding_time_single, retrieval_time, retrieval_time_single)
            elif model_to_try in CohereEmbeddings:
                embedding_model = cohere_embed = CohereEmbedding(cohere_api_key=cohere_api_key, model_name="embed-multilingual-v3.0", input_type="search_query")
                if reranker_name != None:
                    if reranker_name == 'Cohere':
                        reranker = CohereRerank(api_key=cohere_api_key, top_n=3)
                    else:
                        reranker = SentenceTransformerRerank(model=reranker_name, top_n=3, device='cuda:0' if torch.cuda.is_available() else 'cpu', keep_retrieval_score=True)
                retriever, index, embedding_time, embedding_time_single = embed_data_and_def_retriever(embedding_model, reranker, collection_name)
                retrieval_results, retrieval_time, retrieval_time_single = get_query_results(qa_dataset, retriever)
                results_df, result_dict = form_results_df(retrieval_results, model_to_try, reranker_name, embedding_time, embedding_time_single, retrieval_time, retrieval_time_single)
            elif model_to_try == 'Bedrock':
                embedding_model = BedrockEmbedding(model_name='amazon.titan-embed-text-v1',client=bedrock_client)
                if reranker_name != None:
                    if reranker_name == 'Cohere':
                        reranker = CohereRerank(api_key=cohere_api_key, top_n=3)
                    else:
                        reranker = SentenceTransformerRerank(model=reranker_name, top_n=3, device='cuda:0' if torch.cuda.is_available() else 'cpu', keep_retrieval_score=True)
                retriever, index, embedding_time, embedding_time_single = embed_data_and_def_retriever(embedding_model, reranker, collection_name)
                retrieval_results, retrieval_time, retrieval_time_single = get_query_results(qa_dataset, retriever)
                results_df, result_dict = form_results_df(retrieval_results, model_to_try, reranker_name, embedding_time, embedding_time_single, retrieval_time, retrieval_time_single)
            else:
                embedding_model = HuggingFaceEmbedding(model_name=model_to_try, device='cuda:0' if torch.cuda.is_available() else 'cpu', max_length=512)
                if reranker_name != None:
                    if reranker_name == 'Cohere':
                        reranker = CohereRerank(api_key=cohere_api_key, top_n=3)
                    else:
                        reranker = SentenceTransformerRerank(model=reranker_name, top_n=3, device='cuda:0' if torch.cuda.is_available() else 'cpu', keep_retrieval_score=True)
                retriever, index, embedding_time, embedding_time_single = embed_data_and_def_retriever(embedding_model, reranker, collection_name)
                retrieval_results, retrieval_time, retrieval_time_single = get_query_results(qa_dataset, retriever)
                results_df, result_dict = form_results_df(retrieval_results, model_to_try, reranker_name, embedding_time, embedding_time_single, retrieval_time, retrieval_time_single)
            print(f'Results: hit_rate: {result_dict["hit_rate"]}, mrr: {result_dict["mrr"]}')
        else:
            # Now we just need to adjust the retriever
            print(f"model_to_try is: {model_to_try}")
            print(f"reranker is: {reranker_name}")
            if reranker_name != None:
                if reranker_name == 'Cohere':
                    reranker = CohereRerank(api_key=cohere_api_key, top_n=3)
                    retriever = index.as_query_engine(similarity_top_k=5, node_postprocessors=[reranker])
                else:
                    reranker = SentenceTransformerRerank(model=reranker_name, top_n=3, device='cuda:0' if torch.cuda.is_available() else 'cpu', keep_retrieval_score=True)
                    retriever = index.as_query_engine(similarity_top_k=5, node_postprocessors=[reranker])
                retrieval_results, retrieval_time, retrieval_time_single = get_query_results(qa_dataset, retriever)
                new_results, result_dict = form_results_df(retrieval_results, model_to_try, reranker_name, embedding_time, embedding_time_single, retrieval_time, retrieval_time_single)
            else:
                retriever = index.as_query_engine(similarity_top_k=5, node_postprocessors=None)
                retrieval_results, retrieval_time, retrieval_time_single = get_query_results(qa_dataset, retriever)
                new_results, result_dict = form_results_df(retrieval_results, model_to_try, reranker_name, embedding_time, embedding_time_single, retrieval_time, retrieval_time_single)
            results_df = pd.concat([results_df, new_results])
            
            print(f'Results: hit_rate: {result_dict["hit_rate"]}, mrr: {result_dict["mrr"]}')
            print('-' * 50)
    if i == 0:
        results_all = results_df
    else:
        results_all = pd.concat([results_all, results_df])
results_all

Using cuda as device
--------------------------------------------------
model_to_try is: intfloat/multilingual-e5-base
reranker is: BAAI/bge-reranker-base
found existing collection
LLM is explicitly disabled. Using MockLLM.
CREATING EMBEDDING DB WITH EMBEDDING MODEL: intfloat_multilingual_e5_base, reranker is callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001F554AA77C0> model='BAAI/bge-reranker-base' top_n=3 device='cuda:0' keep_retrieval_score=True


Generating embeddings:   0%|          | 0/50 [00:00<?, ?it/s]

Reranker is not none, adding reranker
Results: hit_rate: 1.0, mrr: 1.0
model_to_try is: intfloat/multilingual-e5-base
reranker is: None
Results: hit_rate: 1.0, mrr: 0.9545454545454546
--------------------------------------------------
model_to_try is: intfloat/multilingual-e5-base
reranker is: Cohere
Results: hit_rate: 1.0, mrr: 1.0
--------------------------------------------------
--------------------------------------------------
model_to_try is: TurkuNLP/sbert-cased-finnish-paraphrase
reranker is: BAAI/bge-reranker-base
found existing collection
LLM is explicitly disabled. Using MockLLM.
CREATING EMBEDDING DB WITH EMBEDDING MODEL: TurkuNLP_sbert_cased_finnish_paraphrase, reranker is callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001F28EDEAF50> model='BAAI/bge-reranker-base' top_n=3 device='cuda:0' keep_retrieval_score=True


Generating embeddings:   0%|          | 0/50 [00:00<?, ?it/s]

Reranker is not none, adding reranker
Results: hit_rate: 1.0, mrr: 1.0
model_to_try is: TurkuNLP/sbert-cased-finnish-paraphrase
reranker is: None
Results: hit_rate: 1.0, mrr: 0.9090909090909091
--------------------------------------------------
model_to_try is: TurkuNLP/sbert-cased-finnish-paraphrase
reranker is: Cohere
Results: hit_rate: 1.0, mrr: 1.0
--------------------------------------------------
--------------------------------------------------
model_to_try is: text-embedding-3-small
reranker is: BAAI/bge-reranker-base
found existing collection
LLM is explicitly disabled. Using MockLLM.
CREATING EMBEDDING DB WITH EMBEDDING MODEL: text_embedding_3_small, reranker is callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001F2736EDFC0> model='BAAI/bge-reranker-base' top_n=3 device='cuda:0' keep_retrieval_score=True


Generating embeddings:   0%|          | 0/50 [00:00<?, ?it/s]

Reranker is not none, adding reranker
Results: hit_rate: 1.0, mrr: 1.0
model_to_try is: text-embedding-3-small
reranker is: None
Results: hit_rate: 1.0, mrr: 1.0
--------------------------------------------------
model_to_try is: text-embedding-3-small
reranker is: Cohere
Results: hit_rate: 1.0, mrr: 1.0
--------------------------------------------------
--------------------------------------------------
model_to_try is: embed-multilingual-v3.0
reranker is: BAAI/bge-reranker-base
found existing collection
LLM is explicitly disabled. Using MockLLM.
CREATING EMBEDDING DB WITH EMBEDDING MODEL: embed_multilingual_v3.0, reranker is callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001F37E408E80> model='BAAI/bge-reranker-base' top_n=3 device='cuda:0' keep_retrieval_score=True


Generating embeddings:   0%|          | 0/50 [00:00<?, ?it/s]

Reranker is not none, adding reranker
Results: hit_rate: 1.0, mrr: 1.0
model_to_try is: embed-multilingual-v3.0
reranker is: None
Results: hit_rate: 1.0, mrr: 0.9545454545454546
--------------------------------------------------
model_to_try is: embed-multilingual-v3.0
reranker is: Cohere
Results: hit_rate: 1.0, mrr: 1.0
--------------------------------------------------
--------------------------------------------------
model_to_try is: Bedrock
reranker is: BAAI/bge-reranker-base
found existing collection
LLM is explicitly disabled. Using MockLLM.
CREATING EMBEDDING DB WITH EMBEDDING MODEL: Bedrock, reranker is callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x000001F37E37F130> model='BAAI/bge-reranker-base' top_n=3 device='cuda:0' keep_retrieval_score=True


Generating embeddings:   0%|          | 0/50 [00:00<?, ?it/s]

Reranker is not none, adding reranker
Results: hit_rate: 0.9090909090909091, mrr: 0.9090909090909091
model_to_try is: Bedrock
reranker is: None
Results: hit_rate: 0.9090909090909091, mrr: 0.7575757575757575
--------------------------------------------------
model_to_try is: Bedrock
reranker is: Cohere
Results: hit_rate: 0.9090909090909091, mrr: 0.9090909090909091
--------------------------------------------------


Unnamed: 0,mrr,hit_rate,model_name,reranker_name,embedding_time,embedding_time_single,retrieval_time,retrieval_time_single
0,1.0,1.0,intfloat/multilingual-e5-base,BAAI/bge-reranker-base,0.881297,0.017626,0.832286,0.075662
0,0.954545,1.0,intfloat/multilingual-e5-base,,0.881297,0.017626,0.180121,0.016375
0,1.0,1.0,intfloat/multilingual-e5-base,Cohere,0.881297,0.017626,11.931781,1.084707
0,1.0,1.0,TurkuNLP/sbert-cased-finnish-paraphrase,BAAI/bge-reranker-base,0.788571,0.015771,0.807556,0.073414
0,0.909091,1.0,TurkuNLP/sbert-cased-finnish-paraphrase,,0.788571,0.015771,0.187661,0.01706
0,1.0,1.0,TurkuNLP/sbert-cased-finnish-paraphrase,Cohere,0.788571,0.015771,4.71988,0.42908
0,1.0,1.0,text-embedding-3-small,BAAI/bge-reranker-base,2.033344,0.040667,4.105723,0.373248
0,1.0,1.0,text-embedding-3-small,,2.033344,0.040667,2.611783,0.237435
0,1.0,1.0,text-embedding-3-small,Cohere,2.033344,0.040667,7.966851,0.724259
0,1.0,1.0,embed-multilingual-v3.0,BAAI/bge-reranker-base,2.977586,0.059552,5.075453,0.461405
