In [16]:
# If needed:
# !pip install sentence-transformers qdrant-client  tqdm numpy

import os, json, pathlib
from typing import List, Dict, Tuple
from tqdm import tqdm
import numpy as np
import torch

from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct


In [18]:
SUB_DIR = "prepared/msmarco-dev-subset-1000-plus-50000-neg"  # <-- change to 'prepared/msmarco-dev' to use full dev set

# Load corpus.jsonl
corpus: Dict[str, Dict[str, str]] = {}
with open(pathlib.Path(SUB_DIR) / "corpus.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        corpus[rec["_id"]] = {"title": rec.get("title", ""), "text": rec.get("text", "")}

# Load queries.tsv
queries: Dict[str, str] = {}
with open(pathlib.Path(SUB_DIR) / "queries.tsv", "r", encoding="utf-8") as f:
    for line in f:
        qid, qtext = line.rstrip("\n").split("\t", 1)
        queries[qid] = qtext

print(f"Loaded subset → docs: {len(corpus):,} | queries: {len(queries):,}")


Loaded subset → docs: 51,053 | queries: 1,000


In [12]:
torch.cuda.is_available()

True

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [19]:
CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)

DENSE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

encoder = SentenceTransformer(DENSE_MODEL, device=device)

doc_ids = list(corpus.keys())
doc_texts = [ (corpus[_id]["title"] + " " + corpus[_id]["text"]).strip() for _id in doc_ids ]

EMB_PATH = pathlib.Path(CACHE_DIR) / f"{pathlib.Path(SUB_DIR).name}_dense.npy"

if EMB_PATH.exists():
    doc_vecs = np.load(EMB_PATH)
    print("Loaded cached doc embeddings:", doc_vecs.shape)
else:
    doc_vecs = encoder.encode(
        doc_texts,
        batch_size=256,
        show_progress_bar=True,
        normalize_embeddings=True
    ).astype(np.float32)
    np.save(EMB_PATH, doc_vecs)
    print("Encoded and cached doc embeddings:", doc_vecs.shape)


Loaded cached doc embeddings: (51053, 384)


In [20]:
client = QdrantClient(url="http://localhost:6333") # no Docker required

COLLECTION = "msmarco_subset_dense"
dim = doc_vecs.shape[1]

# (Re)create collection
if COLLECTION in [c.name for c in client.get_collections().collections]:
    client.delete_collection(COLLECTION)

client.create_collection(
    collection_name=COLLECTION,
    vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
)

def batched(xs, n=512):
    for i in range(0, len(xs), n):
        yield xs[i:i+n]

# IMPORTANT: cast IDs to int for Qdrant
points = []
for i in range(len(doc_ids)):
    pid = int(doc_ids[i])            # <-- cast string "7071642" to int 7071642
    points.append(
        PointStruct(
            id=pid,
            vector=doc_vecs[i].tolist(),
            payload={
                "doc_id": doc_ids[i],                     # keep original string id in payload (optional)
                "title": corpus[doc_ids[i]]["title"],
                "text": corpus[doc_ids[i]]["text"],
            },
        )
    )

for batch in tqdm(list(batched(points, 512)), desc="Upserting"):
    client.upsert(collection_name=COLLECTION, points=batch)

print("Qdrant collection ready.")


Upserting: 100%|██████████| 100/100 [00:58<00:00,  1.72it/s]

Qdrant collection ready.





In [21]:
def search_topk(query_text: str, k: int = 10):
    q_vec = encoder.encode([query_text], normalize_embeddings=True)[0].astype(np.float32)
    hits = client.search(collection_name=COLLECTION, query_vector=q_vec.tolist(), limit=k) #retrieval
    return [(str(h.id), float(h.score)) for h in hits]  # cast back to str


# Try on first 3 queries
some_qids = list(queries.keys())[:3]
for qid in some_qids:
    q = queries[qid]
    hits = search_topk(q, k=10)
    print("\nQID:", qid)
    print("Q  :", q)
    for rank, (doc_id, score) in enumerate(hits, 1):
        title = corpus[doc_id]["title"] or "(no title)"
        print(f"{rank:>2}. {doc_id}  score={score:.4f}  | {title[:80]}")



QID: 300674
Q  : how many years did william bradford serve as governor of plymouth colony?
 1. 7067032  score=0.7953  | (no title)
 2. 5974044  score=0.5225  | (no title)
 3. 1524426  score=0.5023  | (no title)
 4. 3100192  score=0.5021  | (no title)
 5. 3422344  score=0.4922  | (no title)
 6. 5040220  score=0.4154  | (no title)
 7. 5948130  score=0.3955  | (no title)
 8. 2638802  score=0.3927  | (no title)
 9. 4501975  score=0.3787  | (no title)
10. 3159717  score=0.3748  | (no title)

QID: 125705
Q  : define preventive
 1. 7067056  score=0.8361  | (no title)
 2. 952184  score=0.5326  | (no title)
 3. 6724079  score=0.5305  | (no title)
 4. 6464884  score=0.4908  | (no title)
 5. 7301131  score=0.4253  | (no title)
 6. 85696  score=0.4220  | (no title)
 7. 2448091  score=0.4217  | (no title)
 8. 3617038  score=0.4201  | (no title)
 9. 7814229  score=0.3969  | (no title)
10. 6876723  score=0.3897  | (no title)

QID: 94798
Q  : color overlay photoshop
 1. 7067181  score=0.7403  | (no t

  hits = client.search(collection_name=COLLECTION, query_vector=q_vec.tolist(), limit=k) #retrieval


In [22]:
DATA_DIR = SUB_DIR
K = 10  # top-k docs per query
results = {}   # dict[qid] = [doc_ids]

for qid, qtext in tqdm(queries.items(), desc=f"Retrieving top-{K} for all queries"):
    hits = search_topk(qtext, k=K)
    results[qid] = [doc_id for doc_id, _ in hits]

print("Retrieved results for", len(results), "queries.")

# Save to JSON (easy to reload in Python)
RUNS_DIR = "runs"
os.makedirs(RUNS_DIR, exist_ok=True)
json_path = pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top{K}.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)
print("Saved JSON run file ->", json_path)

# Also save to TSV (qid \t docid \t rank)
tsv_path = pathlib.Path(RUNS_DIR) / f"{pathlib.Path(DATA_DIR).name}_dense_top{K}.tsv"
with open(tsv_path, "w", encoding="utf-8") as f:
    for qid, docs in results.items():
        for rank, docid in enumerate(docs, start=1):
            f.write(f"{qid}\t{docid}\t{rank}\n")
print("Saved TSV run file ->", tsv_path)

  hits = client.search(collection_name=COLLECTION, query_vector=q_vec.tolist(), limit=k) #retrieval
Retrieving top-10 for all queries: 100%|██████████| 1000/1000 [00:36<00:00, 27.26it/s]

Retrieved results for 1000 queries.
Saved JSON run file -> runs\msmarco-dev-subset-1000-plus-50000-neg_dense_top10.json
Saved TSV run file -> runs\msmarco-dev-subset-1000-plus-50000-neg_dense_top10.tsv



