In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules (e.g. `mirage`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from mirage import MirageIndex
from mirage.MIRAGE import MIRAGE
from mirage.embedders.HuggingFaceEmbedder import HuggingFaceEmbedder
from mirage.inference.MirageInfefrence import MirageInference
from mirage.inference.quorums import RusVectoresQuorum
from mirage.inference.rerankers.ReciprocalRankFusionReranker import ReciprocalRankFusionReranker
import os
import pandas as pd
from tqdm import tqdm

In [3]:
# Directory whose entries are the saved indexes loaded by the benchmark loop.
# NOTE(review): absolute, machine-specific path — consider a configurable constant.
path = 'E:/indexes2'
# os.listdir() returns entries in arbitrary order; sort them so the benchmark
# visits the indexes (and emits result rows) in a reproducible order.
files = [f'{path}/{name}' for name in sorted(os.listdir(path))]

In [4]:
# One embedder per model, keyed by the organisation prefix of the model id
# (the part before the '/'). The benchmark loop looks embedders up by this key.
embedder = {
    model_name.split('/')[0]: HuggingFaceEmbedder(model_name=model_name)
    for model_name in (
        'BAAI/bge-m3',
        'DeepPavlov/rubert-base-cased-sentence',
        'intfloat/e5-small-v2',
        'thenlper/gte-base',
    )
}

No sentence-transformers model found with name DeepPavlov/rubert-base-cased-sentence. Creating a new one with mean pooling.


In [5]:
# Reranker that the benchmark loop calls with both the full-text and the
# vector search results of each question.
# NOTE(review): the variable name is a misspelling of "reranker"; it is kept
# unchanged here because a later cell references it by this name.
rerancer = ReciprocalRankFusionReranker()

In [6]:
# Benchmark input; the benchmark loop reads its 'question' column.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
df = pd.read_pickle('cuad-df.pkl')

In [7]:
# Parallel accumulators filled by the benchmark loop, one entry per
# (index file, question) pair: the index file path, the question's position,
# and the retrieved texts.
dict_file, dict_idx, dict_text = [], [], []

In [8]:
# Benchmark: for every saved index, run every question through both searches
# (quorum full-text search and vector search), combine the two result lists
# with the reranker, and record the texts of the top results.
QUORUM_MAX_ENTRIES = 50  # value passed to the quorum as max_entries
VECTOR_TOP_K = 50        # value passed to the vector index as top_k
TOP_RESULTS_KEPT = 10    # reranked results kept per question

# Start from empty accumulators so re-running this cell does not append a
# second copy of every row to the lists.
dict_file, dict_idx, dict_text = [], [], []

# The question list is identical for every index; build it once.
questions = df['question'].to_list()

# Using the bar as a context manager closes it even if loading or querying
# an index raises part-way through the run.
with tqdm(total=len(questions) * len(files)) as progress_bar:
    for file in files:
        mirage_index = MirageIndex.load(file)
        quorum = RusVectoresQuorum(
            chunk_storage=mirage_index.chunk_storage,
            path_to_model='model.bin',
            visualize=False,
            max_entries=QUORUM_MAX_ENTRIES,
        )
        # The embedder key is the 5th '_'-separated token of the index file
        # name. Only the base name is parsed, so underscores in the directory
        # part of the path cannot shift the token position.
        file_embedder = embedder[os.path.basename(file).split('_')[4]]
        for idx, q in enumerate(questions):
            full_text = quorum.query(q)
            semantic = mirage_index.vector_index.query(file_embedder.embed(q), top_k=VECTOR_TOP_K)
            texts = rerancer(
                fulltext_search_results=full_text,
                vector_search_results=semantic,
            )[:TOP_RESULTS_KEPT]
            dict_file.append(file)
            dict_idx.append(idx)  # position of the question, not its text
            dict_text.append(mirage_index.chunk_storage.get_texts_for_search_results(texts))
            progress_bar.update()

100%|██████████| 19200/19200 [1:03:59<00:00,  5.75it/s]

In [9]:
# Assemble one row per (index file, question) pair.
# Building the frame from a dict of columns, rather than transposing a list of
# rows, keeps a proper dtype per column and still yields the three named
# columns when no results were collected.
# Note: the 'question' column holds the question's position in df['question'],
# not the question text.
new = pd.DataFrame({
    'file': dict_file,
    'question': dict_idx,
    'texts': dict_text,
})

In [10]:
# Persist the benchmark results. The target directory is created first so the
# save does not fail on a missing directory after the long-running benchmark.
# NOTE(review): absolute, machine-specific path — consider a configurable constant.
results_path = 'E:/results/bench2.pkl'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
new.to_pickle(results_path)

In [11]:
# mirage_index.chunk_storage.get_texts_for_search_results(texts)
# [mirage_index.chunk_storage.get_raw_index_of_document(result.chunk_storage_key) for result in texts]