# Create small difficult corpus
 Notebook to create a small, yet difficult, corpus for quick experimentation. In many cases there are a lot of documents to search through, and indexing these documents with various methods can take a very long time. To avoid lengthy experiments while still making sure that the corpus one searches through is "hard", this notebook creates a small corpus (e.g. 10000 documents) that contains:
  * all goldstandard documents
  * top 100 documents per query (obtained with some method - could be the model itself, another model, or bm25)
  * other random documents to pad to the desired size, if needed.

In [None]:
from docuverse.engines.search_engine_config_params import DocUVerseConfig
from docuverse.utils import open_stream, read_config_file
from docuverse.engines.search_engine import SearchEngine, SearchData
import os
import orjson
from  tqdm.notebook import tqdm

In [None]:
# Root of the docuverse checkout; experiment and data paths below are
# resolved against it.
docuverse_dir = "/home/raduf/sandbox2/docuverse"

# Benchmark directory that receives the reduced corpus.
# Alternative benchmark: "/home/raduf/sandbox2/docuverse/benchmark/ibm_search"
data_dir = "/home/raduf/sandbox2/docuverse/benchmark/sap"

# Minimum number of documents wanted in the reduced corpus; 0 disables
# random padding.
min_size = 0

In [None]:
# Number of retrieved documents to keep per query; also embedded in the
# output file name.
top_k = 100
output_file = f"corpus.dev.top{top_k}.jsonl"

# Experiment definition, relative to docuverse_dir. Previously used
# alternatives:
#   experiments/unified_search/ibmsw_milvus_dense.snowlake-m.test.yaml
#   experiments/unified_search/ibmsw_milvus_bm25.granite.test.yaml
#   experiments/unified_search/ibmsw_milvus_dense.granite-30m.test.yaml
expt_file = "experiments/sap/sap_milvus_dense.granite30.dev.flat.file.yaml"
config_file = os.path.join(docuverse_dir, expt_file)

# Build the search engine from the experiment configuration.
config = DocUVerseConfig(config_file)
engine = SearchEngine(config)

In [None]:
# Resolve the query and corpus locations named in the retriever config
# against the docuverse checkout.
query_file = os.path.join(docuverse_dir, config.retriever_config.input_queries)
corpus_file = os.path.join(docuverse_dir, config.retriever_config.input_passages)

# A non-.jsonl query file is read as a config file whose 'question_file'
# entry names the actual queries; a .jsonl file is used directly.
if not query_file.endswith(".jsonl"):
    query_config = read_config_file(query_file)
    query_json = query_config['question_file']
    # goldstandard = query_config['goldstandard_file']
else:
    query_json = query_file

# Destination of the reduced corpus.
short_doc_output = os.path.join(data_dir, output_file)

In [None]:
# Display the resolved output path as a quick sanity check.
short_doc_output

In [None]:
from itertools import islice

# Peek at the first 11 corpus records to check the schema before loading
# the whole file. The stream is opened in a `with` block so the file
# handle is released as soon as the preview is done.
with open_stream(corpus_file) as preview_stream:
    for line in islice(preview_stream, 11):
        print(orjson.loads(line))

In [None]:
# Parse every line of the corpus file into a record. The stream is closed
# once the list is built instead of being left open for the rest of the
# session.
with open_stream(corpus_file) as corpus_stream:
    corpus = [orjson.loads(line) for line in tqdm(corpus_stream)]

In [None]:
# Number of records loaded from the corpus file.
len(corpus)

In [None]:
# Index the corpus by 'document_id' so that a full record can be fetched
# from its id; if an id occurs more than once, the last record wins.
documents = {record['document_id']: record for record in corpus}

In [None]:
# Load the queries through the engine, from the query file named in its config.
# NOTE(review): the paths above are read from config.retriever_config, this
# call reads engine.config.input_queries - confirm both refer to the same file.
queries = engine.read_questions(engine.config.input_queries)
# Alternative: reuse cached retrieval results instead of searching again.
# answers = engine.read_cache_file(extension=".retrieve.pkl.bz2")[0]
# Set the retriever's top_k before running the search.
engine.retriever.config.top_k = top_k
# The result is iterated below as one collection of results per entry,
# each result carrying an 'id'.
answers=engine.search(queries)

In [None]:
# Size of the first result list (presumably top_k - verify against the engine).
len(answers[0])

In [None]:
# Collect every gold-standard document id listed under a query's
# 'relevant' entry; the dict is used as a set (all values are 1).
relevant = {
    gold_id: 1
    for query in queries
    for gold_id in query['relevant']
}

In [None]:
from docuverse.utils import get_orig_docid

# Start from a COPY of the gold-standard ids: binding `ansdocs` directly
# to `relevant` would make both names point at the same dict, so adding
# retrieved ids below would silently change `relevant` as well.
ansdocs = dict(relevant)

# Add the original document id of every retrieved result for every query.
for results in answers:
    for result in results:
        ansdocs[get_orig_docid(result['id'])] = 1

In [None]:
import random

corpus_size = len(corpus)

# Pad the selection with random documents until it holds `min_size` ids.
# Candidates are drawn from the ids actually present in `documents`, so
# every padded id can be written out later. Sampling without replacement
# also guarantees termination when fewer unused documents exist than are
# needed.
n_missing = max(0, min_size - len(ansdocs))
if n_missing:
    candidates = [doc_id for doc_id in documents if doc_id not in ansdocs]
    if len(candidates) < n_missing:
        print(
            f"Only {len(candidates)} unused documents available; "
            f"cannot reach min_size={min_size}."
        )
    for doc_id in random.sample(candidates, min(n_missing, len(candidates))):
        ansdocs[doc_id] = 1

In [None]:
# Display the output path once more before the file is written.
short_doc_output

In [None]:
import json

# Validate before opening the output: a missing id would otherwise raise
# in the middle of the write loop and leave a truncated corpus file.
missing = [docid for docid in ansdocs if docid not in documents]
if missing:
    raise KeyError(
        f"{len(missing)} selected document id(s) not found in the corpus, "
        f"e.g. {missing[:5]}"
    )

# Write the selected documents, one JSON record per line, ordered by id.
with open_stream(short_doc_output, write=True) as g:
    for docid in sorted(ansdocs):
        print(json.dumps(documents[docid]), file=g)

In [None]:
# Total number of document ids selected for the reduced corpus.
len(ansdocs)