# Evaluations on indexing and retrieval
<!-- We want to evaluate:
* The performance of different indexing and retrieval strategies, spanning sparse retrieval, classic dense embeddings, and advanced retrieval models.

* The influence of precise retrieval on the quality of LLM interpretation -->


## Models

We consider the following 5 indexing and retrieval methods:

1) BM-25, a lightweight sparse retrieval method without complex neural networks, ranking document segments based on how frequently the query terms appear in them.

2) all-MiniLM-L6, from SentenceTransformer, a prevalent dense embedding model, mapping sentences to a 384-dimensional dense vector space. 

3) all-mpnet-base, another widely utilized embedding model from SentenceTransformer, noted for its larger architecture and improved performance. 

4) text-embedding-3-large, the latest embedding model from OpenAI, with enhanced capability.

5) ColBERT, an advanced retrieval model, relying on token-level embeddings and fine-grained contextual late interaction.

In [1]:
# Preparation
import sys
import os
from pathlib import Path

# The project root is taken to be two directory levels above the current
# working directory.
# NOTE: the result depends on the current working directory, which the
# chdir below changes -- re-running this cell climbs two more levels.
root_dir = Path.cwd().resolve().parents[1]
# Work from the project root and expose it on the import path.
os.chdir(root_dir)
sys.path.append(str(root_dir))


In [2]:
# Set up the configs for this demo
DEMO_SIZE = 2
# Output directory for the retrieval results, relative to the current
# working directory. The trailing slash is kept so the value is unchanged.
res_dir = "experiment/retrieval/res/"
# exist_ok=True creates the directory tree when missing and is a no-op when
# it already exists, without a separate (race-prone) existence check.
os.makedirs(res_dir, exist_ok=True)

import warnings
# Suppress every warning raised from this point on.
warnings.filterwarnings('ignore')

Run the retrieval experiments, utilizing the functional implementation provided within the `uda.utils` module.

In [3]:
from uda.utils import retrieve as rt
from uda.utils import retrieve_exp as rt_exp
from uda.utils import preprocess as pre
import json

DATASET_NAME_LIST = ["fin", "paper_tab", "paper_text", "nq", "feta"]
RT_MODEL_LIST = [
    "bm25",
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "openai",
    "colbert",
]
# Heavier models can take a long time to run; adjust the slices below to run
# only a subset of the datasets and/or models.
DATASET_NAME_LIST = DATASET_NAME_LIST[:]
RT_MODEL_LIST = RT_MODEL_LIST[:1]


for DATASET_NAME in DATASET_NAME_LIST:
    for RT_MODEL in RT_MODEL_LIST:
        print(f"=== Start {DATASET_NAME} on {RT_MODEL} ===")
        # Per-(dataset, model) results file handed to rt_exp.log_score.
        res_file = os.path.join(res_dir, f"{DATASET_NAME}_{RT_MODEL}.jsonl")
        bench_json_file = pre.meta_data[DATASET_NAME]["bench_json_file"]
        with open(bench_json_file, "r") as f:
            bench_data = json.load(f)
        collection_name = f"{DATASET_NAME}_vector_db"
        doc_list = list(bench_data.keys())
        # Only the first document of each dataset is processed here.
        for doc in doc_list[:1]:
            pdf_path = pre.get_example_pdf_path(DATASET_NAME, doc)
            # Documents without an example PDF path are skipped.
            if pdf_path is not None:
                for qa_item in bench_data[doc]:
                    question, q_uid = qa_item["question"], qa_item["q_uid"]
                    # Prepare the index.
                    # NOTE(review): the arguments do not vary across the
                    # questions of one document; this call could probably be
                    # hoisted out of the question loop -- confirm that
                    # rt.prepare_collection has no per-call side effects.
                    collection = rt.prepare_collection(
                        pdf_path, collection_name, RT_MODEL
                    )
                    # Retrieve the contexts for this question.
                    contexts = rt.get_contexts(collection, question, RT_MODEL)
                    # Score the retrieved contexts and log the result.
                    rt_exp.log_score(
                        contexts, doc, q_uid, DATASET_NAME, res_file, bench_json_file
                    )
    print(f"=== Finish {DATASET_NAME} ===\n")


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda-11.8'


=== Start fin on bm25 ===
Retrieval-Match-Scores {'doc_name': 'GS_2016', 'q_uid': 'GS/2016/page_79.pdf-3', 'Top-1': 0.7555555555555555, 'Top-5': 0.9333333333333333, 'Top-10': 0.9333333333333333, 'Top-20': 0.9333333333333333, 'Top-30': 0.9333333333333333}
Retrieval-Match-Scores {'doc_name': 'GS_2016', 'q_uid': 'GS/2016/page_79.pdf-1', 'Top-1': 0.7555555555555555, 'Top-5': 0.9333333333333333, 'Top-10': 0.9333333333333333, 'Top-20': 0.9333333333333333, 'Top-30': 0.9333333333333333}
Retrieval-Match-Scores {'doc_name': 'GS_2016', 'q_uid': 'GS/2016/page_161.pdf-1', 'Top-1': 0.631578947368421, 'Top-5': 0.9836065573770492, 'Top-10': 0.9836065573770492, 'Top-20': 0.9836065573770492, 'Top-30': 0.9836065573770492}
Retrieval-Match-Scores {'doc_name': 'GS_2016', 'q_uid': 'GS/2016/page_183.pdf-3', 'Top-1': 0.2857142857142857, 'Top-5': 1.0, 'Top-10': 1.0, 'Top-20': 1.0, 'Top-30': 1.0}
Retrieval-Match-Scores {'doc_name': 'GS_2016', 'q_uid': 'GS/2016/page_186.pdf-2', 'Top-1': 0.9411764705882353, 'Top-5

Get the averaged retrieval matching scores

In [4]:
import json
import pandas as pd

def get_avg_score(file_path, top_ks=(1, 5, 10, 20)):
    """Average the per-question retrieval scores stored in a JSONL file.

    Each non-blank line of the file must be a JSON object that contains a
    ``"Top-<k>"`` entry for every requested ``k``.

    Args:
        file_path: Path of the JSONL results file.
        top_ks: Cutoffs to average. The default matches the columns this
            function has always reported; pass e.g. ``(1, 5, 10, 20, 30)``
            to include other logged cutoffs.

    Returns:
        A one-row ``pandas.DataFrame`` with one ``avg_<k>_score`` column per
        requested cutoff, in the order given by ``top_ks``.

    Raises:
        KeyError: If a requested ``Top-<k>`` entry is absent from the data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r") as f:
        # Stream the file line by line and skip blank lines (such as a
        # trailing empty line), which json.loads cannot parse.
        records = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(records)
    return pd.DataFrame(
        {f"avg_{k}_score": [df[f"Top-{k}"].mean()] for k in top_ks}
    )

# Available models: "bm25", "all-MiniLM-L6-v2", "all-mpnet-base-v2", "openai", "colbert"
dataset_name = "fin"
rt_model = "bm25"
# The results path is relative to the project root.
res_file_name = f"experiment/retrieval/res/{dataset_name}_{rt_model}.jsonl"
res_df = get_avg_score(res_file_name)
print(f"===== {rt_model} on {dataset_name} =====")
print(res_df)


===== bm25 on fin =====
   avg_1_score  avg_5_score  avg_10_score  avg_20_score
0     0.672708     0.918458      0.921382       0.92723
