In [17]:
# If needed:
# !pip install fastembed numpy tqdm

import json, pathlib
from typing import Dict, List
import numpy as np
from tqdm import tqdm

from fastembed import SparseTextEmbedding  # MiniCOIL lives here
import torch
from sentence_transformers import CrossEncoder


In [8]:
# Prepared dataset directory; later cells read queries.tsv, qrels.tsv and
# corpus.jsonl from it. Use your subset or the full prepared set.
DATA_DIR = "prepared/msmarco-dev-subset-1000-plus-50000-neg"   # or "prepared/msmarco-dev"

# Location of the previously saved dense run (top-10 per query) that is reranked below.
RUNS_DIR = "runs"
IN_JSON  = pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top10.json"  # set this to your saved file
# If you only have TSV instead, set IN_TSV and leave IN_JSON=None
IN_TSV   = None  # e.g., pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top10.tsv"

# Number of candidates kept per query before reranking.
TOPK = 10
# Output files for the MiniCOIL-reranked run (JSON mapping and qid/docid/rank TSV).
# NOTE(review): the "lateint" suffix is only a file-name tag; the scorer used is MiniCOIL.
OUT_JSON = pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top{TOPK}_rerank_lateint.json"
OUT_TSV  = pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top{TOPK}_rerank_lateint.tsv"


In [9]:
# queries.tsv: one "<qid>\t<query text>" record per line -> {qid: text}
queries: Dict[str, str] = {}
with (pathlib.Path(DATA_DIR) / "queries.tsv").open("r", encoding="utf-8") as f:
    for line in f:
        # Split on the first tab only, so tabs inside the query text survive.
        fields = line.rstrip("\n").split("\t", 1)
        qid, qtext = fields
        queries[qid] = qtext

# qrels.tsv: four tab-separated columns (qid, unused, docid, relevance)
# -> {qid: {docid: relevance}}; used for evaluation later.
qrels: Dict[str, Dict[str,int]] = {}
with (pathlib.Path(DATA_DIR) / "qrels.tsv").open("r", encoding="utf-8") as f:
    for line in f:
        qid, _, docid, rel = line.strip().split("\t")
        grade = int(rel)
        qrels.setdefault(qid, {})[docid] = grade

# corpus.jsonl: one JSON object per line -> {docid: {"title", "text"}} for text lookup
corpus: Dict[str, Dict[str,str]] = {}
with (pathlib.Path(DATA_DIR) / "corpus.jsonl").open("r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        # Missing title/text fields default to the empty string.
        corpus[rec["_id"]] = dict(title=rec.get("title",""), text=rec.get("text",""))


In [10]:
def load_run_json(path: pathlib.Path) -> Dict[str, List[str]]:
    """Read a retrieval run stored as a JSON file and return the parsed object.

    The file is expected to hold a mapping of query id to an ordered list of
    document ids; the content is returned exactly as ``json.load`` parses it.
    """
    with open(path, "r", encoding="utf-8") as handle:
        run = json.load(handle)
    return run

def load_run_tsv(path: pathlib.Path) -> Dict[str, List[str]]:
    """Read a retrieval run from a tab-separated file.

    Each usable line carries at least two columns: query id and document id.
    Any further columns (e.g. a rank) are ignored, and document ids are
    appended in file order. Lines with fewer than two columns are skipped.
    """
    run: Dict[str, List[str]] = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split("\t")
            if len(fields) >= 2:
                run.setdefault(fields[0], []).append(fields[1])
    return run

# Load the dense baseline run, preferring the JSON file over the TSV fallback.
if IN_JSON and pathlib.Path(IN_JSON).exists():
    base_run = load_run_json(IN_JSON)
elif IN_TSV and pathlib.Path(IN_TSV).exists():
    base_run = load_run_tsv(IN_TSV)
else:
    raise FileNotFoundError("No input run found. Set IN_JSON or IN_TSV correctly.")

# Keep only queries we have text for, and clamp each candidate list to TOPK.
# Filtering here (not just in the rerank loops) means the "Before" metrics are
# computed over exactly the queries that can be reranked.
base_run = {qid: docs[:TOPK] for qid, docs in base_run.items() if qid in queries}
len(base_run)


1000

In [12]:
# Initialize MiniCOIL once at module level; score_minicoil() below uses this
# shared instance for both query and document embeddings.
minicoil = SparseTextEmbedding(model_name="Qdrant/minicoil-v1")  # downloads from HF

def sparse_dot(q_idx: np.ndarray, q_val: np.ndarray, d_idx: np.ndarray, d_val: np.ndarray) -> float:
    """Dot product of two sparse vectors given as parallel (indices, values) arrays.

    Args:
        q_idx, q_val: indices and matching values of the first vector.
        d_idx, d_val: indices and matching values of the second vector.

    Returns:
        Sum of ``q_val[i] * d_val[j]`` over positions whose indices are equal,
        as a Python float. Returns 0.0 when either vector is empty.

    The intersection is found by walking both index arrays in ascending order.
    That walk is only correct on sorted input, so any array that is not already
    ascending is sorted first (stable, values permuted alongside). Inputs that
    are already sorted are scored exactly as before.
    """
    def _ascending(idx, val):
        # Return (idx, val) ordered by ascending index; no copy if already sorted.
        idx = np.asarray(idx)
        val = np.asarray(val)
        if idx.size > 1 and np.any(idx[:-1] > idx[1:]):
            order = np.argsort(idx, kind="stable")
            idx, val = idx[order], val[order]
        return idx, val

    q_idx, q_val = _ascending(q_idx, q_val)
    d_idx, d_val = _ascending(d_idx, d_val)

    i = j = 0
    score = 0.0
    n_q, n_d = len(q_idx), len(d_idx)
    while i < n_q and j < n_d:
        if q_idx[i] == d_idx[j]:
            score += float(q_val[i]) * float(d_val[j])
            i += 1; j += 1
        elif q_idx[i] < d_idx[j]:
            i += 1
        else:
            j += 1
    return score

def score_minicoil(query_text: str, doc_texts: List[str]) -> np.ndarray:
    """Score each document against the query with MiniCOIL sparse embeddings.

    The query is embedded with ``minicoil.query_embed`` and the documents with
    ``minicoil.embed``; each score is the sparse dot product of the two.

    Returns:
        float32 array with one score per entry of ``doc_texts``, in input order.
    """
    # query_embed yields embeddings; a single query text gives one, so take the first.
    query_vec = list(minicoil.query_embed(query_text))[0]
    query_indices, query_values = query_vec.indices, query_vec.values

    doc_vecs = list(minicoil.embed(doc_texts))

    scores = np.zeros(len(doc_texts), dtype=np.float32)
    for pos, doc_vec in enumerate(doc_vecs):
        scores[pos] = sparse_dot(query_indices, query_values, doc_vec.indices, doc_vec.values)
    return scores


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

stopwords.txt:   0%|          | 0.00/743 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


minicoil.triplet.model.vocab: 0.00B [00:00, ?B/s]

minicoil.triplet.model.npy:   0%|          | 0.00/157M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

In [13]:
# Rerank every dense candidate list with MiniCOIL scores (higher = better).
reranked: Dict[str, List[str]] = {}
qids_eval = [qid for qid in base_run if qid in queries]

for qid in tqdm(qids_eval, desc="Reranking with MiniCOIL"):
    cand_ids = base_run[qid]
    cand_texts = [ (corpus[d]["title"] + " " + corpus[d]["text"]).strip() for d in cand_ids ]
    # optional hard cap on very long passages (speed); MiniCOIL is robust
    cand_texts = [t[:4096] for t in cand_texts]
    s = score_minicoil(queries[qid], cand_texts)
    # Stable sort: candidates with equal scores (e.g. several 0.0 sparse scores)
    # keep their original dense order instead of an arbitrary one.
    order = np.argsort(-s, kind="stable")
    reranked[qid] = [cand_ids[i] for i in order]

# Save the reranked run as JSON ({qid: [docid, ...]}) and as TSV (qid, docid, rank).
OUT_JSON.parent.mkdir(parents=True, exist_ok=True)
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(reranked, f, indent=2)

with open(OUT_TSV, "w", encoding="utf-8") as f:
    for qid, docs in reranked.items():
        for rank, docid in enumerate(docs, start=1):
            f.write(f"{qid}\t{docid}\t{rank}\n")

print("Saved:", OUT_JSON)
print("Saved:", OUT_TSV)


Reranking with MiniCOIL: 100%|██████████| 1000/1000 [03:31<00:00,  4.72it/s]

Saved: runs\msmarco-dev-subset-1000-plus-50000-neg_dense_top10_rerank_lateint.json
Saved: runs\msmarco-dev-subset-1000-plus-50000-neg_dense_top10_rerank_lateint.tsv





In [15]:
import math

def ndcg_at_k(ranked, truth, k=10):
    """Normalized DCG of the first ``k`` entries of ``ranked``.

    ``truth`` maps document id to relevance grade. Gain is the grade itself
    (linear), discounted by ``log2(position + 1)``. The ideal DCG is built
    from the ``k`` largest grades in ``truth``. Returns 0.0 when the ideal
    DCG is not positive.
    """
    best_grades = sorted(truth.values(), reverse=True)[:k]
    idcg = sum(grade / math.log2(pos + 1) for pos, grade in enumerate(best_grades, start=1))

    dcg = 0.0
    for pos, doc in enumerate(ranked[:k], start=1):
        grade = truth.get(doc, 0)
        if grade > 0:
            dcg += grade / math.log2(pos + 1)

    if idcg > 0:
        return dcg / idcg
    return 0.0

def mrr_at_k(ranked, truth, k=10):
    """Reciprocal rank of the first relevant document within the top ``k``.

    A document is relevant when its grade in ``truth`` is > 0. Returns 0.0
    when no relevant document appears in the first ``k`` positions.
    """
    first_hit = next(
        (pos for pos, doc in enumerate(ranked[:k], start=1) if truth.get(doc, 0) > 0),
        None,
    )
    if first_hit is None:
        return 0.0
    return 1.0 / first_hit

def recall_at_k(ranked, truth, k=10):
    """Fraction of the relevant documents (grade > 0) found in the top ``k``.

    Returns 0.0 when ``truth`` contains no relevant document.
    """
    n_relevant = sum(1 for grade in truth.values() if grade > 0)
    if n_relevant == 0:
        return 0.0
    hits = len([doc for doc in ranked[:k] if truth.get(doc, 0) > 0])
    return hits / n_relevant

def precision_at_k(ranked, truth, k=10):
    """Number of relevant documents (grade > 0) in the top ``k``, divided by ``k``.

    The denominator is always ``k``, even when ``ranked`` has fewer entries.
    """
    hits = 0
    for doc in ranked[:k]:
        if truth.get(doc, 0) > 0:
            hits += 1
    return hits / k

def ap_at_k(ranked, truth, k=10):
    """Average precision over the top ``k`` positions.

    Precision is accumulated at every position that holds a relevant document
    (grade > 0) and the total is divided by the number of relevant documents
    in ``truth``. Returns 0.0 when ``truth`` has no relevant document.
    """
    n_relevant = sum(1 for grade in truth.values() if grade > 0)
    if n_relevant == 0:
        return 0.0
    hits = 0
    precision_total = 0.0
    for pos, doc in enumerate(ranked[:k], start=1):
        if truth.get(doc, 0) > 0:
            hits += 1
            precision_total += hits / pos
    return precision_total / n_relevant
def eval_run(run, k=10):
    """Average ranking metrics of ``run`` over the queries that have qrels.

    Args:
        run: mapping of query id to ranked list of document ids.
        k: rank cutoff used by every metric (default 10).

    Returns:
        Dict with the number of evaluated queries under "queries" and the mean
        nDCG, MRR, Recall, Precision and MAP at cutoff ``k`` as plain Python
        floats (keys such as "nDCG@10"). If no query in ``run`` has qrels,
        every metric is NaN.
    """
    qids = [qid for qid in run if qid in qrels]
    metrics = (
        ("nDCG", ndcg_at_k),
        ("MRR", mrr_at_k),
        ("Recall", recall_at_k),
        ("Precision", precision_at_k),
        ("MAP", ap_at_k),
    )
    report = {"queries": len(qids)}
    for label, metric_fn in metrics:
        per_query = [metric_fn(run[qid], qrels[qid], k) for qid in qids]
        # np.mean([]) would warn and return NaN; make the empty case explicit,
        # and convert to float so the report prints without np.float64(...) wrappers.
        report[f"{label}@{k}"] = float(np.mean(per_query)) if per_query else float("nan")
    return report

# Compare the dense baseline with the MiniCOIL-reranked run. Reranking only
# permutes each query's candidates, so only order-sensitive metrics can change.
print("Before:", eval_run(base_run))
print("After :", eval_run(reranked))

Before: {'queries': 1000, 'nDCG@10': np.float64(0.921747962526825), 'MRR@10': np.float64(0.9036996031746032), 'Recall@10': np.float64(0.9828333333333332), 'Precision@10': np.float64(0.10340000000000002), 'MAP@10': np.float64(0.9004077380952381)}
After : {'queries': 1000, 'nDCG@10': np.float64(0.8628394472241399), 'MRR@10': np.float64(0.8268853174603176), 'Recall@10': np.float64(0.9828333333333332), 'Precision@10': np.float64(0.10340000000000002), 'MAP@10': np.float64(0.8217142857142857)}


In [18]:

# Configuration for the cross-encoder rerank. This rebinds RUNS_DIR, IN_JSON,
# IN_TSV, TOPK, OUT_JSON and OUT_TSV from the earlier MiniCOIL configuration.

# Previously saved dense retrieval results (top-k per query)
RUNS_DIR = "runs"
IN_JSON  = pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top10.json"  # set to your file
IN_TSV   = None  # e.g., pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top10.tsv"

TOPK = 10

# Output files for the cross-encoder-reranked run
OUT_JSON = pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top{TOPK}_rerank_crossenc.json"
OUT_TSV  = pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top{TOPK}_rerank_crossenc.tsv"

# Cross-Encoder model (fast, MS MARCO-tuned)
CE_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 64     # passed as batch_size to ce.predict
MAX_SEQ_LEN = 256   # passed as max_length to CrossEncoder; tradeoff: 256~384 usually good for passages

In [19]:
# Initialize Cross-Encoder
ce = CrossEncoder(CE_MODEL, device=DEVICE, max_length=MAX_SEQ_LEN)

reranked_ce: Dict[str, List[str]] = {}
qids_eval = [qid for qid in base_run if qid in queries]

for qid in tqdm(qids_eval, desc="Reranking (Cross-Encoder)"):
    cand_ids = base_run[qid]
    # Build (query, passage) pairs
    cand_texts = [ (corpus[d]["title"] + " " + corpus[d]["text"]).strip() for d in cand_ids ]
    # Cheap character-level cap to bound tokenization cost on very long passages.
    # This does not enforce MAX_SEQ_LEN; that limit is given to CrossEncoder
    # via max_length above.
    cand_texts = [t[:4096] for t in cand_texts]

    pairs = [(queries[qid], t) for t in cand_texts]
    scores = ce.predict(pairs, batch_size=BATCH_SIZE)  # higher = more relevant
    # Stable sort: candidates with equal scores keep their original dense order.
    order = np.argsort(-scores, kind="stable")
    reranked_ce[qid] = [cand_ids[i] for i in order]

# Save the reranked run as JSON ({qid: [docid, ...]}) and as TSV (qid, docid, rank).
OUT_JSON.parent.mkdir(parents=True, exist_ok=True)
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(reranked_ce, f, indent=2)

with open(OUT_TSV, "w", encoding="utf-8") as f:
    for qid, docs in reranked_ce.items():
        for rank, docid in enumerate(docs, start=1):
            f.write(f"{qid}\t{docid}\t{rank}\n")

print("Saved:", OUT_JSON)
print("Saved:", OUT_TSV)


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

  try:
Reranking (Cross-Encoder): 100%|██████████| 1000/1000 [02:43<00:00,  6.11it/s]


Saved: runs\msmarco-dev-subset-1000-plus-50000-neg_dense_top10_rerank_crossenc.json
Saved: runs\msmarco-dev-subset-1000-plus-50000-neg_dense_top10_rerank_crossenc.tsv


In [20]:
# Compare the dense baseline with the cross-encoder-reranked run.
print("Before:", eval_run(base_run))
print("After :", eval_run(reranked_ce))


Before: {'queries': 1000, 'nDCG@10': np.float64(0.921747962526825), 'MRR@10': np.float64(0.9036996031746032), 'Recall@10': np.float64(0.9828333333333332), 'Precision@10': np.float64(0.10340000000000002), 'MAP@10': np.float64(0.9004077380952381)}
After : {'queries': 1000, 'nDCG@10': np.float64(0.9537821760077454), 'MRR@10': np.float64(0.9455095238095239), 'Recall@10': np.float64(0.9828333333333332), 'Precision@10': np.float64(0.10340000000000002), 'MAP@10': np.float64(0.9427373015873015)}


In [23]:
def compare_three(qid, run_dense, run_minicoil, run_ce, k=5):
    """Print the top ``k`` results of three runs for one query, side by side.

    Args:
        qid: query id; must be present in ``queries`` and in all three runs.
        run_dense, run_minicoil, run_ce: mappings of query id to ranked doc ids.
        k: maximum number of ranks to show.

    Each cell shows rank, a relevance mark taken from ``qrels`` (✓ when the
    grade is > 0, otherwise ✗), the document id and the first 40 characters of
    the document text. A run with fewer than ``k`` documents gets empty cells
    for the missing ranks instead of raising IndexError.
    """
    # Wide enough for "NN. ✓ <docid> " plus a 40-char snippet and "...", so the
    # three columns line up for typical document ids.
    col_width = 60
    rule_width = 3 * col_width + 6  # three columns plus two " | " separators
    judged = qrels.get(qid, {})
    runs = (run_dense, run_minicoil, run_ce)

    def _cell(run, rank_idx):
        # Format one table cell; empty when this run has no document at the rank.
        docs = run[qid]
        if rank_idx >= len(docs):
            return ""
        docid = docs[rank_idx]
        mark = "✓" if judged.get(docid, 0) > 0 else "✗"
        snippet = (corpus[docid]["text"][:40]).replace("\n", " ")
        return f"{rank_idx+1:2d}. {mark} {docid} {snippet}..."

    print(f"\nQID: {qid}")
    print("Query:", queries[qid])
    print("=" * rule_width)
    print(f"{'Dense (before)':{col_width}} | {'MiniCOIL rerank':{col_width}} | {'Cross-Encoder rerank':{col_width}}")
    print("-" * rule_width)

    n_rows = min(k, max(len(run[qid]) for run in runs))
    for rank_idx in range(n_rows):
        left, mid, right = (_cell(run, rank_idx) for run in runs)
        print(f"{left:{col_width}} | {mid:{col_width}} | {right:{col_width}}")

import random

def sample_three(run_dense, run_minicoil, run_ce, n=3, seed=None):
    """Print side-by-side comparisons for up to ``n`` randomly chosen queries.

    Args:
        run_dense, run_minicoil, run_ce: mappings of query id to ranked doc ids.
        n: maximum number of queries to show; nothing is printed when ``n <= 0``.
        seed: optional seed for a private RNG so the sample is reproducible.
            When None, the module-level ``random`` state is used as before.

    Queries are skipped when they are missing from either reranked run or
    when ``qrels`` holds no document with a grade > 0 for them.
    """
    qids = list(run_dense.keys())
    if seed is None:
        random.shuffle(qids)
    else:
        random.Random(seed).shuffle(qids)

    shown = 0
    for qid in qids:
        if shown >= n:
            break
        if qid not in run_minicoil or qid not in run_ce:
            continue
        # Skip queries with no relevant docs: require at least one positive
        # grade, not merely the presence of a qrels entry.
        if not any(rel > 0 for rel in qrels.get(qid, {}).values()):
            continue
        compare_three(qid, run_dense, run_minicoil, run_ce, k=5)
        shown += 1

# Example: show 3 queries side-by-side.
# The queries are picked by a random shuffle, so the output differs between runs.
sample_three(base_run, reranked, reranked_ce, n=3)


QID: 1090086
Query: technology working group definition
Dense (before)                                | MiniCOIL rerank                               | Cross-Encoder rerank                         
--------------------------------------------------------------------------------------------------------------------------------------------
 1. ✗ 598183 Current Practice and Perceptions of Grou... |  1. ✗ 6756074 Applied information technology (AIT) is ... |  1. ✗ 8090876 What is a person who studies technology ...
 2. ✗ 8090876 What is a person who studies technology ... |  2. ✗ 8090876 What is a person who studies technology ... |  2. ✗ 7784310 Collaborative software is a broad concep...
 3. ✗ 371375 , predicts that within the next ten year... |  3. ✗ 598183 Current Practice and Perceptions of Grou... |  3. ✗ 6756074 Applied information technology (AIT) is ...
 4. ✗ 6756074 Applied information technology (AIT) is ... |  4. ✗ 371375 , predicts that within the next ten year... |  4. ✗ 2569

🔹 QID: 1090086

Query: technology working group definition

Dense (before): Brings in documents broadly about technology or groups, but nothing that clearly defines “technology working group.”

MiniCOIL rerank: Tries to reorder based on token-level matches like technology and group. It still doesn’t capture the intended “definition,” so results shuffle around but remain off-target.

Cross-Encoder rerank: Uses full query–document interactions. The top doc is still not great (mentions “person who studies technology”), but notice by rank 2 it surfaces “Collaborative software…” which is closer to a working group sense.
👉 None of the three stages fully nails the definition, but the cross-encoder nudges the results slightly closer to the intent.

🔹 QID: 1083721

Query: what does hair tint do

Dense (before): Immediately retrieves the relevant passage (✓: Hair tint is a type of hair dye…). Dense vectors are already strong for this easy factual query.

MiniCOIL rerank: Keeps the same relevant doc at rank 1. Lower ranks shuffle (sometimes in irrelevant beauty/vision tint directions), because token-level overlap can be misleading (tint also appears in lens tint).

Cross-Encoder rerank: Keeps the correct answer firmly at rank 1, and also promotes other cosmetically relevant docs higher than e.g. vision tint.
👉 For simple factual questions with a clear keyword, all methods succeed, but cross-encoder preserves and stabilizes relevance.

🔹 QID: 1086354

Query: what can help dogs sleep

Dense (before): Top doc is a Q&A about sleeping pills for dogs (not what the query asked), while the actually relevant doc (For senior dogs with aches and pains… can help dogs sleep better) is at rank 3.

MiniCOIL rerank: Boosts that relevant doc (senior dogs with aches and pains…) to rank 1 — nice example where sparse token-level matching helps (help dogs sleep appears literally).

Cross-Encoder rerank: Drops the relevant doc back to rank 2, because it weights semantic context: the “sleeping pills” doc (rank 1) seems closer to a “direct answer.”
👉 This shows the trade-off: MiniCOIL nailed literal matching and surfaced the correct doc, while cross-encoder misjudged slightly (preferring the pills doc).

🧩 Takeaways

Dense retrieval is strong at topical similarity but not exact intent.

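MiniCOIL reranker helps when queries are literal and the words match (like help dogs sleep), but can be noisy when terms are polysemous (tint). Note that on this subset it lowered the aggregate scores (nDCG@10 0.922 → 0.863, MRR@10 0.904 → 0.827), so its wins are per-query rather than on average.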

Cross-Encoder reranker usually wins on average because it models full sentence interactions, but it can over-prioritize superficially “answer-like” docs (e.g., pills vs natural remedies).

That’s why in practice, many systems use:

Dense retriever (broad coverage)

Token-level reranker (sparse MiniCOIL, or a late-interaction model such as ColBERT) for token-level matches

Cross-Encoder reranker for final precision
