In [1]:
from docuverse import SearchEngine
from docuverse.engines.search_engine_config_params import DocUVerseConfig
from docuverse.utils.evaluator import EvaluationEngine

import os
os.environ['DOCUVERSE_CONFIG_PATH'] = os.path.abspath('../config')

Choose one of the following configurations: either dense/clapnq or elser/scifact - they are just for demonstration purposes, you can obviously mix and match in any way you see necessary/useful.

In [2]:
dense_encoder_model = "thenlper/gte-small"
clapnq_configuration={
    "retriever": {
        "db_engine": "es-dense",
        "model_name": dense_encoder_model,
        "top_k": 40,
        "actions": "ire",
        "max_doc_length": 512,
        "stride": 100,
        "index_name": f"clapnq_small_{dense_encoder_model.replace('/','_')}_512_100",
        "server": "resconvai",
        "doc_based": True,
        "verbose": True,
        "title_handling": "all",
        "data_format": "../config/clapnq_data_format.yml",
        "bulk_batch": 128,
        "num_preprocessor_threads": 1
    },
        # max_num_documents: 1000
    "reranker": None,
    "evaluate": {
        "eval_measure": "match,mrr",
        "ranks": "1,3,5,10"
    }
}

In [3]:
elser_model = ".elser_model_2_linux-x86_64"
sci_configuration={
    "retriever": {
        "db_engine": "es-elser",
        "model_name": elser_model,
        "top_k": 40,
        "actions": "ire",
        "max_doc_length": 512,
        "stride": 100,
        "index_name": f"scifact_{elser_model.replace('/','_')}_512_100",
        "server": "resconvai",
        "doc_based": True,
        "verbose": True,
        "title_handling": "all",
        "data_format": "../config/beir_data_format.yml",
        "bulk_batch": 128,
        "num_preprocessor_threads": 1
    },
        # max_num_documents: 1000
    "reranker": None,
    "evaluate": {
        "eval_measure": "match,mrr",
        "ranks": "1,3,5,10"
    }
}

In [5]:
def ingest_and_test(corpus_file, question_file, configuration, force_update=False):
    config = DocUVerseConfig(configuration)
    engine = SearchEngine(config)
    if force_update:
        corpus = engine.read_data(corpus_file, no_cache=force_update)
        engine.ingest(corpus, update=force_update)
    queries = engine.read_questions(question_file)
    
    output = engine.search(queries)
    
    scorer = EvaluationEngine(config.eval_config)
    res = scorer.compute_score(queries, output, model_name=config.index_name)
    print(f"Results:\n {res}")

In [None]:
ingest_and_test("data/clapnq/passages.tsv", "data/clapnq_small/question_dev_answerable.tsv", clapnq_configuration)

In [10]:
ingest_and_test("data/scifact/corpus.jsonl", "data/scifact/question_config.json", sci_configuration)

=== done initializing model
Hidden dimension for model: 384
Cache filename is /home/raduf/.local/share/elastic_ingestion/data__scifact__corpus.jsonl_512_100_all_gte-small.jsonl.bz2


Reading cache file:: 5720it [00:00, 18086.13it/s]
Computing statistics: 100%|██████████| 5720/5720 [00:00<00:00, 115148.49it/s]


Statistics:
Number of documents:5183      
Number of tiles:    5720      
#tiles per document:1.10      
                    Character-based:    Token-based:        
  Minimum length:   221                 70                  
  Maximum length:   2780                512                 
  Average length:   1415.2              320.2               
Char histogram:

# NumSamples = 5720; Min = 221.00; Max = 2780.00
# Mean = 1415.243182; Variance = 171130.316212; SD = 413.679001; Median 1397.000000
# each ∎ represents a count of 17
  221.0000 -   476.9000 [    29]: ∎
  476.9000 -   732.8000 [   203]: ∎∎∎∎∎∎∎∎∎∎∎
  732.8000 -   988.7000 [   648]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
  988.7000 -  1244.6000 [  1306]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
 1244.6000 -  1500.5000 [  1137]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
 1500.5000 -  1756.4000 [  1062]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
 17

SystemExit: 0

In [7]:
sci_dense_configuration = sci_configuration
sci_configuration['retriever'].update(
    {
        "db_engine": "es-dense",
        "model_name": dense_encoder_model,
        "index_name": f"scifact_{dense_encoder_model.replace('/','_')}_512_100",
    }
)
ingest_and_test("data/scifact/corpus.jsonl", "data/scifact/question_config.json", sci_dense_configuration)

=== done initializing model
Hidden dimension for model: 384
Cache filename is /home/raduf/.local/share/elastic_ingestion/data__scifact__corpus.jsonl_512_100_all_gte-small.jsonl.bz2
Reading data/scifact/corpus.jsonl

Reading data/scifact/corpus.jsonl:: 5183it [00:00, 102919.54it/s]


 done: 54 milliseconds


Reading docs:: 100%|██████████| 5183/5183 [00:09<00:00, 573.48it/s]


Processed in 9 seconds, 39 milliseconds


Computing statistics: 100%|██████████| 5720/5720 [00:00<00:00, 551020.19it/s]


Statistics:
Number of documents:5183      
Number of tiles:    5720      
#tiles per document:1.10      
                    Character-based:    Token-based:        
  Minimum length:   221                 70                  
  Maximum length:   2780                512                 
  Average length:   1415.2              320.2               
Char histogram:

# NumSamples = 5720; Min = 221.00; Max = 2780.00
# Mean = 1415.243182; Variance = 171130.316212; SD = 413.679001; Median 1397.000000
# each ∎ represents a count of 17
  221.0000 -   476.9000 [    29]: ∎
  476.9000 -   732.8000 [   203]: ∎∎∎∎∎∎∎∎∎∎∎
  732.8000 -   988.7000 [   648]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
  988.7000 -  1244.6000 [  1306]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
 1244.6000 -  1500.5000 [  1137]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
 1500.5000 -  1756.4000 [  1062]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
 17

Ingesting dense documents: : 5760it [00:26, 216.12it/s]                        
Reading data/scifact/queries.jsonl:: 1109it [00:00, 139915.27it/s]
Processing queries: 100%|██████████| 231/231 [00:14<00:00, 16.09it/s]
Evaluating questions: 100%|██████████| 231/231 [00:00<00:00, 490574.29it/s]
Exception ignored in: <function DenseEmbeddingFunction.__del__ at 0x7f5e4579c280>
Traceback (most recent call last):
  File "/ssd5/raduf/sandbox/docuverse/docuverse/utils/embedding_function.py", line 43, in __del__
    self.stop_pool()
  File "/ssd5/raduf/sandbox/docuverse/docuverse/utils/embedding_function.py", line 53, in stop_pool
    self.model.stop_multi_process_pool()
TypeError: SentenceTransformer.stop_multi_process_pool() missing 1 required positional argument: 'pool'


Results:
 Model                                 M@1       M@3       M@5       M@10      MRR@1     MRR@3     MRR@5     MRR@10    
scifact_thenlper_gte-small_512_100    0.0       0.0       0.0       0.0       0.0       0.0       0.0       0.0       

