# Create small difficult corpus
 Notebook to create a small, yet difficult corpus, for quick experimentation. In many cases, there are a lot of documents to search through, and indexing these documents with various methods can take a very long time. To avoid lengthy experiments while still making sure that the corpus one searches through is "hard", this notebook creates a small corpus (e.g. 10000 documents) that contains:
  * all goldstandard documents
  * the top-k documents per query, where k is the `top_k` setting below, e.g. 100 (obtained with some method - could be the model itself, another model, or bm25)
  * other random documents to pad to the desired size, if needed.

In [4]:
from docuverse.engines.search_engine_config_params import DocUVerseConfig
from docuverse.utils import open_stream, read_config_file
from docuverse.engines.search_engine import SearchEngine, SearchData
import os
import orjson
from  tqdm.auto import tqdm

In [5]:
# Root of the local docuverse checkout; the experiment file is resolved against it.
docuverse_dir = "/home/raduf/sandbox2/docuverse"

# Benchmark directory that receives the reduced corpus. Alternatives used before:
#   "/home/raduf/sandbox2/docuverse/benchmark/ibm_search"
#   "/home/raduf/sandbox2/docuverse/benchmark/sap"
data_dir = "/home/raduf/sandbox2/docuverse/benchmark/nq_new"

# Minimum number of documents in the reduced corpus; with 0 no random padding is added.
min_size = 0

In [86]:
# Experiment configuration, given relative to docuverse_dir. Other configurations tried:
# config_file = os.path.join("/home/raduf/sandbox2/docuverse/experiments/unified_search/ibmsw_milvus_dense.snowlake-m.test.yaml")
# expt_file = "experiments/unified_search/ibmsw_milvus_bm25.granite.test.yaml"
# expt_file = "experiments/unified_search/ibmsw_milvus_dense.granite-30m.test.yaml"
# expt_file = "experiments/sap/sap_milvus_dense.granite30.dev.flat.file.yaml"
expt_file = "experiments/nq_new/nq_milvus_dense.granite-149m.w512.test.yaml"
config_file = os.path.join(docuverse_dir, expt_file)

# Number of top-ranked results kept per query; it is also embedded in the
# name of the reduced corpus file.
top_k = 5
desired_output_file = f"corpus.dev.top{top_k}.jsonl"

# Load the experiment configuration and build the search engine from it.
config = DocUVerseConfig(config_file)
engine = SearchEngine(config)

Retrieval engine: milvus-dense
Running on the gpus:[1;31m['NVIDIA GeForce RTX 5090'][0m, attention: [33mflash_attention_2[0m
=== done initializing model


In [87]:
# Destination of the reduced corpus inside the benchmark directory.
short_doc_output = os.path.join(data_dir, desired_output_file)

# Resolve the paths named in the experiment configuration against the checkout root.
query_file = os.path.join(docuverse_dir, config.input_queries)
corpus_file = os.path.join(docuverse_dir, config.input_passages)
output_file = os.path.join(docuverse_dir, config.output_file)

if not query_file.endswith(".jsonl"):
    # The query file is a configuration file; the question file is named inside it.
    query_config = read_config_file(query_file)
    query_json = query_config['question_file']
else:
    # The query file is already the JSONL question file.
    query_json = query_file
# goldstandard = query_config['goldstandard_file']

In [33]:
# Display the resolved path of the retrieval output file.
output_file

'/home/raduf/sandbox2/docuverse/output/nq-milvus-dense-granite149m-512-100-20250623.json'

In [24]:
# Peek at the first 11 records of the corpus file to check their structure.
# range() is listed first so that zip stops without reading a 12th line.
for _, raw_line in zip(range(11), open_stream(corpus_file)):
    print(orjson.loads(raw_line))

{'id': '837952315_208-721', 'text': 'Agriculture is a major industry in the United States, which is a net exporter of food. As of the 2007 census of agriculture, there were 2.2 million farms, covering an area of 922 million acres (3,730,000 km), an average of 418 acres (169 hectares) per farm. Although agricultural activity occurs in all states, it is particularly concentrated in the Great Plains, a vast expanse of flat, arable land in the center of the United States and in the region around the Great Lakes known as the Corn Belt.', 'title': 'Agriculture in the United States'}
{'id': '820613422_722-1382', 'text': "The United States was a leader in seed improvement i.e. hybridization and in expanding uses for crops from the work of George Washington Carver to the development of bioplastics and biofuels. The mechanization of farming and intensive farming have been major themes in U.S. history, including John Deere's steel plow, Cyrus McCormick's mechanical reaper, Eli Whitney's cotton gi

In [10]:
# Load the whole corpus into memory, one parsed JSON record per input line.
corpus = []
for raw_line in tqdm(open_stream(corpus_file)):
    corpus.append(orjson.loads(raw_line))

0it [00:00, ?it/s]

In [None]:
# Number of records loaded from the corpus file.
len(corpus)

In [77]:
# Name of the record field that holds the document ID, taken from the data template.
id_header = config.data_template.id_header
# Index the corpus by document ID so a full record can be fetched directly by its ID.
# If an ID occurs more than once, the last record with that ID wins.
documents = {record[id_header]: record for record in corpus}

In [None]:
queries = engine.read_questions(query_file)
# answers = engine.read_cache_file(extension=".retrieve.pkl.bz2")[0]
# Reuse saved retrieval results when the output file exists; otherwise run the
# search now, limited to top_k results per query.
if not os.path.exists(output_file):
    engine.retriever.config.top_k = top_k
    answers = engine.search(queries)
else:
    answers = engine.read_output(output_file)

In [78]:
# Spot-check the index: look up one document by its ID.
documents['319702768_33418-33527']

{'id': '319702768_33418-33527',
 'text': 'Who Wants to Be a Millionaire? Who Wants to Be a Millionaire? (UK) Who Wants to Be a Millionaire? (US)',
 'title': 'Talk:Who Wants to Be a Millionaire?/Archive 1'}

In [80]:
# Name of the query field that lists the IDs of the relevant documents.
relevant_header = config.query_template.relevant_header
# Set-like dict (ID -> 1) of every document listed as relevant by any query.
relevant = {
    relevant_id: 1
    for query in queries
    for relevant_id in query[relevant_header]
}

In [81]:
from docuverse.utils import get_orig_docid

# Start from the relevant documents, then add the first top_k results of every query.
# get_orig_docid presumably maps a retrieved result id back to the id used in the
# corpus file - TODO confirm against docuverse.utils.
ansdocs = relevant.copy()
ansdocs.update(
    (get_orig_docid(hit['id']), 1)
    for query_hits in answers
    for hit in query_hits[:top_k]
)

In [82]:
import random

corpus_size = len(corpus)

# Pad the selection with randomly chosen corpus documents until it holds min_size
# entries, or until no unselected document is left.
#
# The padding IDs are drawn from the keys of `documents`. Corpus IDs are strings
# such as '837952315_208-721', so IDs built as str(random.randint(0, corpus_size))
# would not exist in `documents` and the cell that writes the corpus would fail
# with a KeyError.
padding_pool = [doc_id for doc_id in documents if doc_id not in ansdocs]
# Clamp the request: never negative, and never more than the pool can supply
# (random.sample raises ValueError when asked for more items than are available).
num_padding = min(max(min_size - len(ansdocs), 0), len(padding_pool))
# A local, fixed-seed generator makes the padded corpus reproducible across runs
# without touching the global random state.
padding_rng = random.Random(42)
for doc_id in tqdm(padding_rng.sample(padding_pool, num_padding)):
    ansdocs[doc_id] = 1

0it [00:00, ?it/s]

0

In [88]:
import json

# Write the selected documents as JSON lines, ordered by document ID.
with open_stream(short_doc_output, write=True) as out_stream:
    for selected_id in sorted(ansdocs):
        serialized = json.dumps(documents[selected_id])
        print(serialized, file=out_stream)
print(f"Saved the output in {short_doc_output}")

Saved the output in /home/raduf/sandbox2/docuverse/benchmark/nq_new/corpus.dev.top5.jsonl


In [84]:
# Display the path of the reduced corpus file.
short_doc_output

'/home/raduf/sandbox2/docuverse/output/nq-milvus-dense-granite149m-512-100-20250623.json'