In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json


# Parse the JSON file of parameter combinations into `combinations`;
# later cells index into it by position.
with open('test_combinations.json', encoding='utf-8') as file:
    combinations = json.load(file)

In [3]:
# Half-open window of `combinations` to process: indices START_COMBINATION up to
# END_COMBINATION - 1 (consumed via range(START_COMBINATION, END_COMBINATION)).
START_COMBINATION = 0
END_COMBINATION = 10

In [None]:
import datetime
from os import PathLike
import time
from typing import Dict

from loguru import logger

from mirage.embedders import HuggingFaceEmbedder
from mirage.index import MirageIndex
from mirage.index.chunk_storages import WhooshChunkStorage
from mirage.index.chunking_algorithms import WordCountingChunkingAlgorithm
from mirage.index.chunking_algorithms.NatashaSentenсeChunking import NatashaSentenceChunking
from mirage.index.raw_storages import FolderRawStorage
from mirage.index.vector_index.FaissVectorIndex import FaissIndexFlatIP, FaissIndexFlatL2

# Short labels keyed by chunking-algorithm name.
# NOTE(review): not referenced anywhere in the visible cells — possibly superseded
# by get_name(); confirm before removing.
short_names = {
    'WordCountingChunkingAlgorithm': "WC_128_05_BAAI",
    'SentenceChunkingAlgorithm': 'SC'
}


def get_name(c):
    """Build the output path prefix for the index files of one combination.

    The file stem has the form ``<W|S>_<chunking params>_<model prefix>`` and is
    placed under the ``indexes`` directory.

    Args:
        c: Combination dict. Reads ``c['ChunkingAlgorithm']['method']``,
            ``c['ChunkingAlgorithm']['params']`` (its values are joined with
            ``'_'`` in dict order) and ``c['Embedder']['params']['model']``.

    Returns:
        The path prefix as a string, built with the platform's path separator.
    """
    import os  # local import keeps this block self-contained

    chunking = c['ChunkingAlgorithm']
    # 'W' marks word-counting chunking; every other method is labelled 'S'.
    ch = 'W' if chunking['method'] == 'WordCountingChunkingAlgorithm' else 'S'
    ch_par = '_'.join(str(value) for value in chunking['params'].values())
    # Only the part of the model id before the first '/' is kept.
    # NOTE(review): two models sharing that prefix produce the same name and
    # would overwrite each other's files — confirm this is intended.
    e_params = c['Embedder']['params']['model'].split('/')[0]
    # os.path.join rather than a hard-coded '\\': identical result on Windows,
    # and a real directory path (not a literal backslash in the file name) elsewhere.
    return os.path.join('indexes', '_'.join([ch, ch_par, e_params]))

# NOTE(review): looks like a leftover ad-hoc check of the naming scheme; it
# requires `combinations` to have at least 13 entries — confirm it is still needed.
get_name(combinations[12])



# Module-level raw storage over the 'data_txt' folder; generate_index() reads this
# variable directly instead of taking it as a parameter.
raw_storage = FolderRawStorage('data_txt')
def generate_index(combination: Dict, filepath_prefix: str) -> None:
    """Build and save an L2 and an inner-product Mirage index for one combination.

    Runs the configured chunking algorithm over the module-level ``raw_storage``,
    fills a flat L2 vector index through the embedder, copies every vector/key
    pair into a flat inner-product index, and saves one ``MirageIndex`` per
    vector index.

    Args:
        combination: Dict with a ``ChunkingAlgorithm`` entry (``method`` and
            ``params``) and an ``Embedder`` entry (``params['model']``).
        filepath_prefix: Prefix of the output files; ``_l2.mirage_index`` and
            ``_ip.mirage_index`` are appended to it.

    Raises:
        ValueError: If the chunking method is neither
            ``'WordCountingChunkingAlgorithm'`` nor ``'SentenceChunkingAlgorithm'``.
    """
    logger.info(combination)
    chunk_storage = WhooshChunkStorage(scoring_function='BM25F', normalizer=True)

    # Pick the chunking implementation named by the combination.
    method = combination['ChunkingAlgorithm']['method']
    if method == 'WordCountingChunkingAlgorithm':
        chunker_class = WordCountingChunkingAlgorithm
    elif method == 'SentenceChunkingAlgorithm':
        chunker_class = NatashaSentenceChunking
    else:
        logger.error(combination['ChunkingAlgorithm'])
        raise ValueError('Unknown Chunking algorithm type')
    chunking_algorithm = chunker_class(
        raw_storage=raw_storage,
        chunk_storage=chunk_storage,
        **combination['ChunkingAlgorithm']['params'],
    )

    embedder = HuggingFaceEmbedder(model_name=combination['Embedder']['params']['model'])
    chunking_algorithm.execute()

    l2_vectors = FaissIndexFlatL2(dimensionality=embedder.get_dimensionality())
    ip_vectors = FaissIndexFlatIP(dimensionality=embedder.get_dimensionality())
    embedder.convert_chunks_to_vector_index(
        chunk_storage=chunk_storage,
        vector_index=l2_vectors,
        visualize=True,
    )

    # The inner-product index is filled by copying the pairs already stored in
    # the L2 index (presumably to avoid embedding the chunks a second time).
    start_copy_time = time.time()
    for pair in l2_vectors:
        ip_vectors.add(vector=pair.vector, chunk_storage_key=pair.chunk_storage_key)
    end_copy_time = time.time()
    logger.info(f"Copy of index time: {end_copy_time - start_copy_time}s.")

    # Both Mirage indexes share everything except the vector index.
    shared_parts = dict(
        raw_storage=raw_storage,
        chunk_storage=chunk_storage,
        chunking_algorithm=chunking_algorithm,
    )
    l2_mirage = MirageIndex(vector_index=l2_vectors, **shared_parts)
    ip_mirage = MirageIndex(vector_index=ip_vectors, **shared_parts)

    l2_mirage.save(filename_to_save=filepath_prefix + "_l2.mirage_index")
    ip_mirage.save(filename_to_save=filepath_prefix + "_ip.mirage_index")
    
    

# Generate and save the indexes for every combination in the configured window.
# Positions are indexed explicitly, so an out-of-range END_COMBINATION raises IndexError.
for indx in range(START_COMBINATION, END_COMBINATION):
    combination = combinations[indx]
    generate_index(combination, get_name(combination))

[32m2025-04-27 23:35:54.647[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_index[0m:[36m34[0m - [1m{'ChunkingAlgorithm': {'method': 'WordCountingChunkingAlgorithm', 'params': {'words_amount': 128, 'overlap': 0.1}}, 'Embedder': {'method': 'HuggingFaceEmbedder', 'params': {'model': 'intfloat/e5-small-v2'}}}[0m
[32m2025-04-27 23:35:58.379[0m | [1mINFO    [0m | [36mmirage.index.chunking_algorithms.WordCountingChunkingAlgorithm[0m:[36mchunk_a_document[0m:[36m28[0m - [1mReading a document... Приказ Росавиации от 28.12.2022 N 970-П  Об утверждении Норм.txt[0m
[32m2025-04-27 23:36:04.142[0m | [1mINFO    [0m | [36mmirage.index.chunking_algorithms.WordCountingChunkingAlgorithm[0m:[36mchunk_a_document[0m:[36m28[0m - [1mReading a document... Приказ_Минтранса_РФ_от_21_11_2005_N_139_ред_от_17_09_2010.rtf.txt[0m
[32m2025-04-27 23:36:04.874[0m | [1mINFO    [0m | [36mmirage.index.chunking_algorithms.WordCountingChunkingAlgorithm[0m:[36mchunk_a_document[0m

'W_2048_0.1_BAAI'