In [1]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to project code (e.g. the
# `mirage` package imported below) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from mirage import MirageIndex
from mirage.MIRAGE import MIRAGE
from mirage.embedders.HuggingFaceEmbedder import HuggingFaceEmbedder
from mirage.inference.MirageInfefrence import MirageInference
from mirage.inference.quorums import RusVectoresQuorum
from mirage.inference.rerankers.ReciprocalRankFusionReranker import ReciprocalRankFusionReranker
import os
import pandas as pd
from tqdm import tqdm

In [3]:
# Names of the indexes to evaluate. The last "_"-separated token of each name
# is the embedder family used to pick the query encoder; the remaining tokens
# presumably describe the chunking setup (TODO confirm their exact meaning).
indexes = [
    'S_16_0.5_BAAI',
    'S_8_0.5_thenlper',
    'S_16_0.5_DeepPavlov',
    'S_16_0.5_intfloat',
    'S_32_0.5_BAAI',
    'W_256_0.25_BAAI',
    'S_8_0.5_DeepPavlov',
    'S_4_0.5_thenlper',
    'W_256_0.25_intfloat',
    'W_256_0.1_intfloat',
]

In [4]:
def get_files(index_signature: str, bench_dir: str = 'E:/indexes1', cuad_dir: str = 'E:/indexes2', known_indexes=None):
    """Locate the saved index files for one index name in both datasets.

    A file is a candidate when its name contains ``'l2'`` and, after dropping
    its trailing 16 characters, equals one of the known index names. File
    names in the CUAD directory additionally carry a prefix up to the first
    underscore, which is stripped before the comparison. Among the candidates
    the first one (in ``os.listdir`` order) whose full path contains
    ``index_signature`` is returned.

    Args:
        index_signature: Index name to look for, e.g. ``'S_16_0.5_BAAI'``.
        bench_dir: Directory holding the benchmark index files.
        cuad_dir: Directory holding the CUAD index files.
        known_indexes: Collection of accepted index names (a list or set of
            strings). Defaults to the notebook-level ``indexes`` list.

    Returns:
        Tuple ``(bench_path, cuad_path)`` of ``'<directory>/<file name>'``
        strings.

    Raises:
        IndexError: If a directory holds no matching file. The type is kept
            as ``IndexError`` for backward compatibility, but the message now
            names the missing index and the directory searched.
    """
    allowed = indexes if known_indexes is None else known_indexes

    def _first_match(directory, stem_of):
        # Return the first listed file that passes the filters, else fail loudly.
        for file_name in os.listdir(directory):
            if stem_of(file_name) in allowed and 'l2' in file_name:
                full_path = f'{directory}/{file_name}'
                if index_signature in full_path:
                    return full_path
        raise IndexError(
            f'no l2 index file matching {index_signature!r} found in {directory!r}'
        )

    bench_out = _first_match(bench_dir, lambda name: name[:-16])
    # str.partition (unlike str.index) tolerates names without an underscore:
    # such files produce an empty stem and are skipped instead of raising
    # ValueError and aborting the whole lookup.
    cuad_out = _first_match(cuad_dir, lambda name: name.partition('_')[2][:-16])
    return bench_out, cuad_out

In [5]:
# Query encoders keyed by embedder family, i.e. the last "_"-separated token
# of an index name. Each entry wraps one Hugging Face model id.
_MODEL_ID_BY_FAMILY = (
    ('BAAI', 'BAAI/bge-m3'),
    ('DeepPavlov', 'DeepPavlov/rubert-base-cased-sentence'),
    ('intfloat', 'intfloat/e5-small-v2'),
    ('thenlper', 'thenlper/gte-base'),
)
embedder = {
    family: HuggingFaceEmbedder(model_name=model_id)
    for family, model_id in _MODEL_ID_BY_FAMILY
}


No sentence-transformers model found with name DeepPavlov/rubert-base-cased-sentence. Creating a new one with mean pooling.


In [6]:
# Load the two question tables (CUAD first, then the benchmark); both are
# expected to expose a 'question' column, which the evaluation loop reads.
# NOTE: unpickling executes arbitrary code - only load files you trust.
df_cuad, df_bench = map(pd.read_pickle, ('cuad-df.pkl', 'bench-df.pkl'))

In [7]:
# Parallel accumulator lists, one entry per retrieved result row:
#   dict_index   - index name          dict_dataset - 'bench' or 'cuad'
#   dict_idx     - question position   dict_text    - retrieved chunk texts
#   dict_k       - K1 value used       dict_c       - reranker constant c used
dict_index, dict_dataset, dict_idx, dict_text, dict_k, dict_c = ([] for _ in range(6))

In [8]:
from loguru import logger

# Reranker constants (c) and K1 values swept by this run.
C_CANDIDATES = [16]
# C_CANDIDATES = [1, 2, 4, 8, 32, 60]
K1_CANDIDATES = [1, 1.2, 1.5, 1.75, 2]


def _retrieve_texts(mirage_index, quorum, reranker, query_embedder, question, top_k=50, keep=10):
    """Run hybrid retrieval for one question and return the texts of the top hits.

    The full-text query goes through ``quorum``, the vector query through
    ``mirage_index.vector_index`` (``top_k`` candidates); both result sets are
    fused by ``reranker`` and the first ``keep`` fused results are resolved to
    texts via the index's chunk storage.
    """
    full_text = quorum.query(question)
    semantic = mirage_index.vector_index.query(query_embedder.embed(question), top_k=top_k)
    fused = reranker(fulltext_search_results=full_text, vector_search_results=semantic)[:keep]
    return mirage_index.chunk_storage.get_texts_for_search_results(fused)


def _record(index, dataset, idx, texts, k1, c):
    """Append one result row to the notebook-level accumulator lists."""
    dict_index.append(index)
    dict_dataset.append(dataset)
    dict_idx.append(idx)
    dict_text.append(texts)
    dict_k.append(k1)
    dict_c.append(c)


# Questions are paired positionally; zip() stops at the shorter table.
question_pairs = list(zip(df_bench['question'].to_list(), df_cuad['question'].to_list()))

# One progress tick per (index, c, k1, question pair). Derived from the actual
# inputs instead of hard-coded counts so the bar stays correct when the index
# list or the question tables change.
total = len(indexes) * len(C_CANDIDATES) * len(K1_CANDIDATES) * len(question_pairs)

# Not used in this cell; kept in case later cells rely on it.
arr = []

# Rows are appended to the dict_* lists and never reset here, so re-running
# this cell adds to whatever they already hold (useful for resuming a sweep,
# but re-create the lists first when a clean run is wanted).
with tqdm(total=total) as progress_bar:
    for index in indexes:
        bench_file, cuad_file = get_files(index)
        logger.info(index)
        # The embedder family is the last "_"-separated token of the index name.
        query_embedder = embedder[index.split('_')[-1]]

        for c in C_CANDIDATES:
            reranker = ReciprocalRankFusionReranker(c=c)

            for k1 in K1_CANDIDATES:
                # Indexes are reloaded for every (c, k1) pair, as before;
                # presumably so each configuration starts from a pristine
                # index - TODO confirm whether one load per index suffices.
                mirage_index_bench = MirageIndex.load(bench_file)
                mirage_index_cuad = MirageIndex.load(cuad_file)

                mirage_index_bench.chunk_storage.K1 = k1
                mirage_index_cuad.chunk_storage.K1 = k1

                quorum_bench = RusVectoresQuorum(
                    chunk_storage=mirage_index_bench.chunk_storage,
                    path_to_model='model.bin',
                    visualize=False,
                    max_entries=50,
                )
                quorum_cuad = RusVectoresQuorum(
                    chunk_storage=mirage_index_cuad.chunk_storage,
                    path_to_model='model.bin',
                    visualize=False,
                    max_entries=50,
                )

                for idx, (b_q, c_q) in enumerate(question_pairs):
                    bench_texts = _retrieve_texts(
                        mirage_index_bench, quorum_bench, reranker, query_embedder, b_q
                    )
                    _record(index, 'bench', idx, bench_texts, k1, c)

                    cuad_texts = _retrieve_texts(
                        mirage_index_cuad, quorum_cuad, reranker, query_embedder, c_q
                    )
                    _record(index, 'cuad', idx, cuad_texts, k1, c)

                    progress_bar.update()

  0%|          | 0/4000 [00:00<?, ?it/s][32m2025-05-15 09:40:30.572[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mS_16_0.5_BAAI[0m
 10%|█         | 400/4000 [03:11<24:41,  2.43it/s]  [32m2025-05-15 09:43:42.233[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mS_8_0.5_thenlper[0m
 20%|██        | 800/4000 [07:42<33:18,  1.60it/s]  [32m2025-05-15 09:48:13.061[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mS_16_0.5_DeepPavlov[0m
 30%|███       | 1200/4000 [11:54<27:23,  1.70it/s]  [32m2025-05-15 09:52:25.081[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mS_16_0.5_intfloat[0m
 40%|████      | 1600/4000 [14:46<16:01,  2.49it/s]  [32m2025-05-15 09:55:16.995[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mS_32_0.5_BAAI[0m
 50%|█████     | 2000/4000 [17:34<13:34,  2.45it/s][32m2025-05-15 09:58:04.521[0m | [1mINFO    [0m | [

In [9]:
# Assemble the results column-by-column. Compared with building a list of
# rows and transposing, this (a) raises immediately if the accumulator lists
# have drifted to different lengths (e.g. after an interrupted run) instead of
# silently padding with missing values, and (b) lets pandas infer a proper
# dtype per column rather than forcing everything to `object`.
new = pd.DataFrame({
    'file': dict_index,
    'dataset': dict_dataset,
    'question': dict_idx,
    'texts': dict_text,
    'k1': dict_k,
    'c': dict_c,
})

In [10]:
# Persist the results frame to disk as a pickle.
RESULTS_PATH = 'E:/results_extra_4.pkl'
new.to_pickle(RESULTS_PATH)

In [11]:
# Display the results frame; pandas elides the middle rows of long frames in
# the rendered output.
new

Unnamed: 0,file,dataset,question,texts,k1,c
0,S_16_0.5_BAAI,bench,0,"[П. ОБЕСПЕЧЕНИЕ АВИАЦИОННОЙ БЕЗОПАСНОСТИ\r\n""...",1,16
1,S_16_0.5_BAAI,cuad,0,[TERM AND TERMINATION. 15.1 Term. This Agreeme...,1,16
2,S_16_0.5_BAAI,bench,1,[7. При базировании (временной стоянке) воз...,1,16
3,S_16_0.5_BAAI,cuad,1,"[1.3 ""Annual Report"" has the meaning set forth...",1,16
4,S_16_0.5_BAAI,bench,2,"[П. ОБЕСПЕЧЕНИЕ АВИАЦИОННОЙ БЕЗОПАСНОСТИ\r\n""...",1,16
...,...,...,...,...,...,...
7995,W_256_0.1_intfloat,cuad,77,[similar cooperative relationship with any thi...,2,16
7996,W_256_0.1_intfloat,bench,78,[Запрещено: ...,2,16
7997,W_256_0.1_intfloat,cuad,78,[negotiate in good faith the terms pursuant to...,2,16
7998,W_256_0.1_intfloat,bench,79,[что эксплуатант является получателем субсидий...,2,16
