In [1]:
from docuverse import SearchEngine
from docuverse.engines.search_engine_config_params import DocUVerseConfig
from docuverse.utils.evaluator import EvaluationEngine
import os
# Export the absolute form of '../config' (resolved against the current working
# directory) as DOCUVERSE_CONFIG_PATH for this process.
# NOTE(review): this runs after the docuverse imports above; if docuverse reads the
# variable at import time the assignment would come too late — confirm.
os.environ.update({'DOCUVERSE_CONFIG_PATH': os.path.abspath('../config')})

In [2]:
# Encoder used by the ClapNQ run; its name is also embedded in the index name below.
dense_encoder_model = "thenlper/gte-small"

# Settings for the ClapNQ experiment: an "es-dense" retriever, no reranker, and
# match/mrr evaluation at ranks 1, 3, 5 and 10.
clapnq_configuration = dict(
    retriever=dict(
        db_engine="es-dense",
        model_name=dense_encoder_model,
        top_k=40,
        actions="ire",
        max_doc_length=512,
        stride=100,
        # '/' in the model name is replaced by '_'; the trailing 512_100 mirrors
        # max_doc_length and stride above and has to be kept in sync by hand.
        index_name=f"clapnq_small_{dense_encoder_model.replace('/','_')}_512_100",
        server="resconvai",
        doc_based=True,
        verbose=True,
        title_handling="all",
        data_format="../config/clapnq_data_format.yml",
        bulk_batch=128,
        num_preprocessor_threads=1,
    ),
    # Disabled option kept for reference:
    # max_num_documents: 1000
    reranker=None,
    evaluate=dict(
        eval_measure="match,mrr",
        ranks="1,3,5,10",
    ),
)

In [None]:
# Model name used by the SciFact run; it is also embedded in the index name below.
elser_model = ".elser_model_2_linux-x86_64"

# Settings for the SciFact experiment: an "es-elser" retriever, no reranker, and
# match/mrr evaluation at ranks 1, 3, 5 and 10.
sci_configuration = dict(
    retriever=dict(
        db_engine="es-elser",
        model_name=elser_model,
        top_k=40,
        actions="ire",
        max_doc_length=512,
        stride=100,
        # The current model name has no '/', so the replace leaves it unchanged; the
        # trailing 512_100 mirrors max_doc_length and stride above.
        index_name=f"scifact_{elser_model.replace('/','_')}_512_100",
        server="resconvai",
        doc_based=True,
        verbose=True,
        title_handling="all",
        data_format="../config/beir_data_format.yml",
        bulk_batch=128,
        num_preprocessor_threads=1,
    ),
    # Disabled option kept for reference:
    # max_num_documents: 1000
    reranker=None,
    evaluate=dict(
        eval_measure="match,mrr",
        ranks="1,3,5,10",
    ),
)

In [3]:
def ingest_and_test(corpus_file, question_file, configuration, update=True):
    """Ingest a corpus, run the questions against it, and score the results.

    Args:
        corpus_file: Path handed to ``engine.read_data``.
        question_file: Path handed to ``engine.read_questions``.
        configuration: Settings passed to ``DocUVerseConfig``.
        update: Forwarded to ``engine.ingest`` as ``update`` and to
            ``engine.read_data`` as ``no_cache``.

    Returns:
        The value produced by ``EvaluationEngine.compute_score``. It is also
        printed, as before; returning it lets callers inspect or compare scores
        instead of only reading them from the cell output.
    """
    config = DocUVerseConfig(configuration)
    engine = SearchEngine(config)

    # Read and ingest the corpus; the same flag controls cache bypass and index update.
    corpus = engine.read_data(corpus_file, no_cache=update)
    engine.ingest(corpus, update=update)

    # Run every question through the retriever.
    queries = engine.read_questions(question_file)
    output = engine.search(queries)

    # Score the search output; the index name is passed as the model label.
    scorer = EvaluationEngine(config.eval_config)
    res = scorer.compute_score(queries, output, model_name=config.index_name)
    print(f"Results:\n {res}")
    return res

In [4]:
# ClapNQ run: ingest the passages, then evaluate the answerable dev questions.
# `update` is left at its default (True).
ingest_and_test(
    corpus_file="data/clapnq/passages.tsv",
    question_file="data/clapnq_small/question_dev_answerable.tsv",
    configuration=clapnq_configuration,
)

TypeError: unbound method dict.keys() needs an argument

In [None]:
# SciFact run with update=False, which ingest_and_test forwards to both the corpus
# read (as no_cache) and the ingest step.
ingest_and_test(
    corpus_file="data/scifact/corpus.jsonl",
    question_file="data/scifact/question_config.json",
    configuration=sci_configuration,
    update=False,
)

In [2]:
# Unrolled, top-level copy of the steps inside ingest_and_test for SciFact. Running it
# at cell level leaves config, engine, corpus, queries, output, scorer and res bound as
# notebook globals. Requires sci_configuration to have been defined by an earlier cell.
# NOTE(review): the corpus path here ends in ".json", while the other SciFact call in
# this notebook uses "data/scifact/corpus.jsonl" — confirm which file name is intended.
corpus_file, question_file, configuration = "data/scifact/corpus.json", "data/scifact/question_config.json", sci_configuration
config = DocUVerseConfig(configuration)
engine = SearchEngine(config)
# Equivalent to calling ingest_and_test with update=True: bypass the read cache and
# update the index.
corpus = engine.read_data(corpus_file, no_cache=True)
engine.ingest(corpus, update=True)
queries = engine.read_questions(question_file)

# Run every question through the retriever.
output = engine.search(queries)

# Score the search output; the index name is passed as the model label.
scorer = EvaluationEngine(config.eval_config)
res = scorer.compute_score(queries, output, model_name=config.index_name)
print(f"Results:\n {res}")