Notebook to download dataset and prepare it in format suitable for testing and evaluation.


In [1]:
# If needed:
# !pip install beir tqdm

import os, json, pathlib
from tqdm import tqdm
from beir import util
from beir.datasets.data_loader import GenericDataLoader


  from tqdm.autonotebook import tqdm


In [2]:
DATA_DIR = "datasets"  # where to store raw BEIR data
os.makedirs(DATA_DIR, exist_ok=True)

dataset = "msmarco"  # BEIR's MSMARCO Passage
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"

data_path = util.download_and_unzip(url, DATA_DIR)
print("Downloaded & extracted to:", data_path)


datasets\msmarco.zip:   0%|          | 0.00/1.01G [00:00<?, ?iB/s]

Downloaded & extracted to: datasets\msmarco


In [3]:
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="dev")
print(f"Corpus docs: {len(corpus):,}")
print(f"Queries total: {len(queries):,}")
print(f"Queries with qrels: {len(qrels):,}")


  0%|          | 0/8841823 [00:00<?, ?it/s]

Corpus docs: 8,841,823
Queries total: 6,980
Queries with qrels: 6,980


In [5]:
OUT_DIR = "prepared/msmarco-dev"
os.makedirs(OUT_DIR, exist_ok=True)

# 3a) Save corpus.jsonl
with open(pathlib.Path(OUT_DIR) / "corpus.jsonl", "w", encoding="utf-8") as f:
    for doc_id, fields in tqdm(corpus.items(), desc="Saving corpus.jsonl"):
        rec = {
            "_id": doc_id,
            "title": fields.get("title", ""),
            "text": fields.get("text", "")
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# 3b) Save queries.tsv (qid \t query)  -- fixed
with open(pathlib.Path(OUT_DIR) / "queries.tsv", "w", encoding="utf-8") as f:
    for qid, text in tqdm(queries.items(), desc="Saving queries.tsv"):
        clean_text = text.replace("\n", " ").strip()  # precompute to avoid backslash in f-string
        f.write(f"{qid}\t{clean_text}\n")

# 3c) Save qrels.tsv (qid \t 0 \t docid \t rel)
with open(pathlib.Path(OUT_DIR) / "qrels.tsv", "w", encoding="utf-8") as f:
    for qid, rels in tqdm(qrels.items(), desc="Saving qrels.tsv"):
        for doc_id, rel in rels.items():
            f.write(f"{qid}\t0\t{doc_id}\t{rel}\n")

print("Saved to:", OUT_DIR)


Saving corpus.jsonl: 100%|██████████| 8841823/8841823 [01:22<00:00, 106890.29it/s]
Saving queries.tsv: 100%|██████████| 6980/6980 [00:00<00:00, 774278.44it/s]
Saving qrels.tsv: 100%|██████████| 6980/6980 [00:00<00:00, 631089.50it/s]

Saved to: prepared/msmarco-dev





In [6]:
# Create a subset with the first N queries that have qrels
N = 200  # tweak as needed
subset_qids = [qid for qid in qrels.keys()][:N]

# Collect gold doc IDs from qrels
subset_gold_doc_ids = set()
for qid in subset_qids:
    subset_gold_doc_ids.update(qrels[qid].keys())

SUB_DIR = f"prepared/msmarco-dev-subset-{N}"
os.makedirs(SUB_DIR, exist_ok=True)

# 4a) Save subset corpus.jsonl
with open(pathlib.Path(SUB_DIR) / "corpus.jsonl", "w", encoding="utf-8") as f:
    for doc_id in tqdm(subset_gold_doc_ids, desc="Saving subset corpus.jsonl"):
        fields = corpus[doc_id]
        rec = {
            "_id": doc_id,
            "title": fields.get("title", ""),
            "text": fields.get("text", "")
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# 4b) Save subset queries.tsv  -- fixed
with open(pathlib.Path(SUB_DIR) / "queries.tsv", "w", encoding="utf-8") as f:
    for qid in tqdm(subset_qids, desc="Saving subset queries.tsv"):
        clean_text = queries[qid].replace("\n", " ").strip()
        f.write(f"{qid}\t{clean_text}\n")

# 4c) Save subset qrels.tsv
with open(pathlib.Path(SUB_DIR) / "qrels.tsv", "w", encoding="utf-8") as f:
    for qid in tqdm(subset_qids, desc="Saving subset qrels.tsv"):
        for doc_id, rel in qrels[qid].items():
            f.write(f"{qid}\t0\t{doc_id}\t{rel}\n")

print("Saved subset to:", SUB_DIR)


Saving subset corpus.jsonl: 100%|██████████| 211/211 [00:00<00:00, 70338.43it/s]
Saving subset queries.tsv: 100%|██████████| 200/200 [00:00<00:00, 199823.92it/s]
Saving subset qrels.tsv: 100%|██████████| 200/200 [00:00<00:00, 198265.37it/s]

Saved subset to: prepared/msmarco-dev-subset-200





In [9]:
# Create a subset with the first N queries that have qrels
N = 1000  # tweak as needed
subset_qids = [qid for qid in qrels.keys()][:N]

# Collect gold doc IDs from qrels
subset_gold_doc_ids = set()
for qid in subset_qids:
    subset_gold_doc_ids.update(qrels[qid].keys())

SUB_DIR = f"prepared/msmarco-dev-subset-{N}"
os.makedirs(SUB_DIR, exist_ok=True)

# 4a) Save subset corpus.jsonl
with open(pathlib.Path(SUB_DIR) / "corpus.jsonl", "w", encoding="utf-8") as f:
    for doc_id in tqdm(subset_gold_doc_ids, desc="Saving subset corpus.jsonl"):
        fields = corpus[doc_id]
        rec = {
            "_id": doc_id,
            "title": fields.get("title", ""),
            "text": fields.get("text", "")
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# 4b) Save subset queries.tsv  -- fixed
with open(pathlib.Path(SUB_DIR) / "queries.tsv", "w", encoding="utf-8") as f:
    for qid in tqdm(subset_qids, desc="Saving subset queries.tsv"):
        clean_text = queries[qid].replace("\n", " ").strip()
        f.write(f"{qid}\t{clean_text}\n")

# 4c) Save subset qrels.tsv
with open(pathlib.Path(SUB_DIR) / "qrels.tsv", "w", encoding="utf-8") as f:
    for qid in tqdm(subset_qids, desc="Saving subset qrels.tsv"):
        for doc_id, rel in qrels[qid].items():
            f.write(f"{qid}\t0\t{doc_id}\t{rel}\n")

print("Saved subset to:", SUB_DIR)

Saving subset corpus.jsonl: 100%|██████████| 1053/1053 [00:00<00:00, 54111.76it/s]
Saving subset queries.tsv: 100%|██████████| 1000/1000 [00:00<00:00, 667139.18it/s]
Saving subset qrels.tsv: 100%|██████████| 1000/1000 [00:00<00:00, 396662.00it/s]

Saved subset to: prepared/msmarco-dev-subset-1000





In [20]:
import os, json, pathlib, random
from tqdm import tqdm

# ---- Config ----
N = 1000              # number of queries to keep
M_NEG = 50_000        # number of extra non-gold docs to add
SEED = 42             # for reproducibility
SUB_DIR = f"prepared/msmarco-dev-subset-{N}-plus-{M_NEG}-neg"
os.makedirs(SUB_DIR, exist_ok=True)
random.seed(SEED)

# 1) Keep first N queries that have qrels
subset_qids = list(qrels.keys())[:N]

# 2) Collect all gold doc IDs from qrels for those queries
gold_doc_ids = set()
for qid in subset_qids:
    gold_doc_ids.update(qrels[qid].keys())

print(f"Queries kept: {len(subset_qids)}")
print(f"Gold docs in subset: {len(gold_doc_ids):,}")

# 3) Sample M_NEG extra non-gold docs from the full corpus
all_doc_ids = list(corpus.keys())
candidate_negs = [d for d in all_doc_ids if d not in gold_doc_ids]
print(f"Sampling negatives from pool of {len(candidate_negs):,} docs...")

if M_NEG > len(candidate_negs):
    print(f"Requested {M_NEG} negatives, but only {len(candidate_negs)} available. Using all.")
    sampled_negs = candidate_negs
else:
    sampled_negs = random.sample(candidate_negs, M_NEG)

# Final doc set = gold ∪ sampled negatives
subset_doc_ids = list(gold_doc_ids.union(sampled_negs))
print(f"Final subset corpus size: {len(subset_doc_ids):,} (gold + negatives)")

# 4a) Save subset corpus.jsonl
with open(pathlib.Path(SUB_DIR) / "corpus.jsonl", "w", encoding="utf-8") as f:
    for doc_id in tqdm(subset_doc_ids, desc="Saving subset corpus.jsonl"):
        fields = corpus[doc_id]
        rec = {
            "_id": doc_id,
            "title": fields.get("title", ""),
            "text": fields.get("text", "")
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# 4b) Save subset queries.tsv (N queries)
with open(pathlib.Path(SUB_DIR) / "queries.tsv", "w", encoding="utf-8") as f:
    for qid in tqdm(subset_qids, desc="Saving subset queries.tsv"):
        clean_text = queries[qid].replace("\n", " ").strip()
        f.write(f"{qid}\t{clean_text}\n")

# 4c) Save subset qrels.tsv (unchanged labels, only for kept queries)
with open(pathlib.Path(SUB_DIR) / "qrels.tsv", "w", encoding="utf-8") as f:
    for qid in tqdm(subset_qids, desc="Saving subset qrels.tsv"):
        for doc_id, rel in qrels[qid].items():
            f.write(f"{qid}\t0\t{doc_id}\t{rel}\n")

print("Saved subset to:", SUB_DIR)

Queries kept: 1000
Gold docs in subset: 1,053
Sampling negatives from pool of 8,840,770 docs...
Final subset corpus size: 51,053 (gold + negatives)


Saving subset corpus.jsonl: 100%|██████████| 51053/51053 [00:00<00:00, 91280.06it/s]
Saving subset queries.tsv: 100%|██████████| 1000/1000 [00:00<00:00, 396025.30it/s]
Saving subset qrels.tsv: 100%|██████████| 1000/1000 [00:00<00:00, 500215.15it/s]

Saved subset to: prepared/msmarco-dev-subset-1000-plus-50000-neg





In [19]:
# Read a couple of lines back to verify
import itertools, json

print("\nFull set samples:")
with open(pathlib.Path(SUB_DIR) / "corpus.jsonl", "r", encoding="utf-8") as f:
    for line in itertools.islice(f, 2):
        print(json.loads(line)["_id"], "…", json.loads(line)["text"][:80].replace("\n"," "), "…")

with open(pathlib.Path(SUB_DIR) / "queries.tsv", "r", encoding="utf-8") as f:
    for line in itertools.islice(f, 2):
        print("Query row:", line.strip())

with open(pathlib.Path(SUB_DIR) / "qrels.tsv", "r", encoding="utf-8") as f:
    for line in itertools.islice(f, 2):
        print("Qrels row:", line.strip())



Full set samples:
1825845 … Caledonia Country Club is located at 303 Park Place in the Village of Caledonia, …
4944092 … Seven key provisions of the revised scaffolding standard: A Ã¢-mid rails must be …
Query row: 300674	how many years did william bradford serve as governor of plymouth colony?
Query row: 125705	define preventive
Qrels row: 300674	0	7067032	1
Qrels row: 125705	0	7067056	1
