In [12]:
# If needed:
# !pip install numpy tqdm

import json, pathlib, math
from typing import Dict, List, Tuple
import numpy as np
from tqdm import tqdm


In [23]:
# Folder that holds the prepared dataset files read below.
DATA_DIR = "prepared/msmarco-dev-subset-1000-plus-50000-neg"   # or "prepared/msmarco-dev"

# queries.tsv: one "qid <TAB> text" record per line. Only the first tab is a
# separator, so any later tab stays inside the query text.
queries: Dict[str, str] = {}
with (pathlib.Path(DATA_DIR) / "queries.tsv").open("r", encoding="utf-8") as queries_file:
    for record in queries_file:
        query_id, query_text = record.rstrip("\n").split("\t", 1)
        queries[query_id] = query_text

# qrels.tsv: "qid <TAB> 0 <TAB> docid <TAB> rel" per line; the second column
# is read but ignored. Judgments are grouped per query: qrels[qid][docid] = rel.
qrels: Dict[str, Dict[str, int]] = {}
with (pathlib.Path(DATA_DIR) / "qrels.tsv").open("r", encoding="utf-8") as qrels_file:
    for record in qrels_file:
        query_id, _, doc_id, rel_text = record.strip().split("\t")
        # Parse the grade before touching qrels so a bad value leaves it unchanged.
        grade = int(rel_text)
        if query_id not in qrels:
            qrels[query_id] = {}
        qrels[query_id][doc_id] = grade

print(f"Loaded queries: {len(queries):,} | qrels: {len(qrels):,}")


Loaded queries: 1,000 | qrels: 1,000


In [19]:
def ndcg_at_k(ranked: List[str], truth: Dict[str,int], k=10) -> float:
    """Normalized discounted cumulative gain over the top ``k`` of ``ranked``.

    Uses linear gain (the relevance grade itself) and a ``log2(rank + 1)``
    discount. The ideal DCG is built from the ``k`` largest positive grades
    in ``truth``.

    Args:
        ranked: Doc ids in ranked order, best first.
        truth: Mapping doc id -> relevance grade; ids not present count as 0.
        k: Rank cutoff. A cutoff of zero or less scores 0.0.

    Returns:
        DCG / ideal DCG, or 0.0 when ``truth`` has no positive grade.
    """
    if k <= 0:
        # A negative k would make ranked[:k] drop items from the END of the list.
        return 0.0

    credited = set()
    dcg = 0.0
    for rank, doc_id in enumerate(ranked[:k], start=1):
        if doc_id in credited:
            # A repeated id still occupies its rank but earns no second gain;
            # otherwise DCG could exceed the ideal and nDCG would go above 1.
            continue
        credited.add(doc_id)
        gain = truth.get(doc_id, 0)
        if gain > 0:
            dcg += gain / math.log2(rank + 1)

    # Only positive grades belong in the ideal ranking; a negative grade must
    # not reduce the normalizer.
    ideal_gains = sorted((g for g in truth.values() if g > 0), reverse=True)[:k]
    idcg = sum(g / math.log2(rank + 1) for rank, g in enumerate(ideal_gains, start=1))
    return dcg / idcg if idcg > 0 else 0.0

def mrr_at_k(ranked: List[str], truth: Dict[str,int], k=10) -> float:
    """Reciprocal rank of the first relevant doc within the top ``k``.

    A doc is relevant when its grade in ``truth`` is positive; ids missing
    from ``truth`` count as grade 0. Returns 0.0 when no relevant doc is
    found in ``ranked[:k]``.
    """
    reciprocal_ranks = (
        1.0 / position
        for position, candidate in enumerate(ranked[:k], start=1)
        if truth.get(candidate, 0) > 0
    )
    # The generator is lazy, so only the first relevant position is ever computed.
    return next(reciprocal_ranks, 0.0)

def recall_at_k(ranked: List[str], truth: Dict[str,int], k=10) -> float:
    """Fraction of the relevant docs that appear in the top ``k`` of ``ranked``.

    A doc is relevant when its grade in ``truth`` is positive.

    Args:
        ranked: Doc ids in ranked order, best first.
        truth: Mapping doc id -> relevance grade.
        k: Rank cutoff. A cutoff of zero or less scores 0.0.

    Returns:
        A value in [0, 1]; 0.0 when ``truth`` has no relevant doc.
    """
    relevant = {doc_id for doc_id, grade in truth.items() if grade > 0}
    if not relevant or k <= 0:
        # A negative k would make ranked[:k] drop items from the END of the list.
        return 0.0
    # Set intersection counts each relevant doc once, so a run that repeats
    # the same id cannot push recall above 1.
    found = relevant.intersection(ranked[:k])
    return len(found) / len(relevant)

def precision_at_k(ranked: List[str], truth: Dict[str,int], k=10) -> float:
    """Share of the top ``k`` slots filled by distinct relevant docs.

    The denominator is always ``k``, even when ``ranked`` holds fewer than
    ``k`` ids, so a short list is penalized for its empty slots.

    Args:
        ranked: Doc ids in ranked order, best first.
        truth: Mapping doc id -> relevance grade; ids not present count as 0.
        k: Rank cutoff. A cutoff of zero or less scores 0.0.

    Returns:
        A value in [0, 1].
    """
    if k <= 0:
        # k == 0 would divide by zero; k < 0 would mis-slice and go negative.
        return 0.0
    # A set credits each relevant doc once even if the run repeats its id.
    relevant_found = {doc_id for doc_id in ranked[:k] if truth.get(doc_id, 0) > 0}
    return len(relevant_found) / k

def average_precision_at_k(ranked: List[str], truth: Dict[str,int], k=10) -> float:
    """Average precision over the top ``k`` of ``ranked``.

    Sums precision@rank at every rank that holds a not-yet-credited relevant
    doc, then divides by the total number of relevant docs in ``truth``.
    The divisor is the full relevant count, not ``min(k, relevant count)``,
    so a query with more than ``k`` relevant docs cannot reach 1.0.

    Args:
        ranked: Doc ids in ranked order, best first.
        truth: Mapping doc id -> relevance grade; ids not present count as 0.
        k: Rank cutoff. A cutoff of zero or less scores 0.0.

    Returns:
        A value in [0, 1]; 0.0 when ``truth`` has no relevant doc.
    """
    num_relevant = sum(1 for grade in truth.values() if grade > 0)
    if num_relevant == 0 or k <= 0:
        # A negative k would make ranked[:k] drop items from the END of the list.
        return 0.0

    credited = set()
    precision_sum = 0.0
    for rank, doc_id in enumerate(ranked[:k], start=1):
        if doc_id in credited or truth.get(doc_id, 0) <= 0:
            # Non-relevant, or a repeat of an id already credited: counting
            # the repeat as a new hit would let the score exceed 1.
            continue
        credited.add(doc_id)
        precision_sum += len(credited) / rank
    return precision_sum / num_relevant


In [26]:
# The run file lives in RUNS_DIR and is named after the dataset folder:
#   <RUNS_DIR>/<last component of DATA_DIR>_dense_top10.json
RUNS_DIR = "runs"
json_path = pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top10.json"

# Expected layout per the original note: dict[qid] = [doc_ids].
# NOTE(review): list order is presumably rank order, best first; confirm
# against whatever wrote the file.
with json_path.open("r", encoding="utf-8") as run_file:
    results = json.load(run_file)

print("Loaded results for", len(results), "queries")


Loaded results for 1000 queries


In [27]:
# Single rank cutoff shared by every metric call and every printed label.
EVAL_K = 10

# Only queries that have both a ranked list and judgments can be scored.
qids_eval = [qid for qid in results if qid in qrels]
if not qids_eval:
    # np.mean of an empty list is nan (with a RuntimeWarning); fail loudly
    # instead of printing a table of nan values.
    raise ValueError("No query ids are shared between `results` and `qrels`; nothing to evaluate.")

# Judged queries with no ranked list are left out of the averages below,
# which makes the run look better than it is. Report them so it is visible.
qids_unscored = [qid for qid in qrels if qid not in results]
if qids_unscored:
    print(f"WARNING: {len(qids_unscored)} judged queries have no results and are excluded from the averages")


def _mean_over_queries(metric):
    """Average ``metric(ranked, truth, k=EVAL_K)`` over every query in ``qids_eval``."""
    return np.mean([metric(results[qid], qrels[qid], k=EVAL_K) for qid in qids_eval])


ndcg = _mean_over_queries(ndcg_at_k)
mrr  = _mean_over_queries(mrr_at_k)
rec  = _mean_over_queries(recall_at_k)
prec = _mean_over_queries(precision_at_k)
mapk = _mean_over_queries(average_precision_at_k)

print(f"Evaluated on {len(qids_eval)} queries")
print(f"nDCG@{EVAL_K}     = {ndcg:.4f}")
print(f"MRR@{EVAL_K}      = {mrr:.4f}")
print(f"Recall@{EVAL_K}   = {rec:.4f}")
print(f"Precision@{EVAL_K}= {prec:.4f}")
print(f"MAP@{EVAL_K}      = {mapk:.4f}")


Evaluated on 1000 queries
nDCG@10     = 0.9217
MRR@10      = 0.9037
Recall@10   = 0.9828
Precision@10= 0.1034
MAP@10      = 0.9004
