In [None]:
#!pip install datasets ir_datasets rank_bm25 sentence_transformers keras langchain

In [1]:
## Import Libraries

In [None]:
from datasets import load_dataset
from ir_datasets import load as load_ir
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import numpy as np

In [None]:
from datasets import load_dataset

# Read just the first 50 rows of the "train" split from the local parquet shard.
parquet_shard = "train-00000-of-00047.parquet"
ds = load_dataset(
    "parquet",
    data_files=parquet_shard,
    split="train[:50]",
)


In [4]:
# Inspect the top-level fields of the first record.
print(ds[0].keys())


dict_keys(['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer'])


In [5]:
# Inspect the sub-fields stored under "entity_pages" for the first record.
print(ds[0]["entity_pages"].keys())



dict_keys(['doc_source', 'filename', 'title', 'wiki_context'])


In [6]:
# Inspect the sub-fields stored under "search_results" for the first record.
print(ds[0]["search_results"].keys())

dict_keys(['description', 'filename', 'rank', 'title', 'url', 'search_context'])


In [7]:
from collections import defaultdict

# Number of questions to process. Defaults to every loaded row;
# lower it (e.g., 50) for a quicker teaching run.
N = len(ds)  # or smaller, e.g., 50

# Corpus containers shared by the cells below.
doc_texts = []        # all unique (stripped) docs for FAISS; list position == doc id
doc_meta  = []        # optional metadata, parallel to doc_texts (recorded on first insertion)
doc_key_to_id = {}    # map stripped text -> doc id, used for deduplication

q_gold = {}       # question idx -> set(doc_id) (gold)
q_candidates = {} # question idx -> list(doc_id) (candidate pool)

def add_doc(text, meta=None):
    """Register a document in the corpus and return its integer doc id.

    The text is stripped of surrounding whitespace before it is stored or
    compared, so two texts differing only in outer whitespace share one id.

    Args:
        text: Raw document text. ``None`` is treated like a blank document.
        meta: Optional metadata stored alongside the document. Only the
            metadata passed with the first insertion of a text is kept;
            later duplicates do not overwrite it.

    Returns:
        The doc id (index into ``doc_texts``), or ``None`` when the text is
        ``None`` or blank after stripping.
    """
    # A missing field would otherwise crash on .strip(); skip it like a blank text.
    if text is None:
        return None
    t = text.strip()
    if not t:
        return None
    # Already known: reuse the existing id instead of storing a duplicate.
    existing_id = doc_key_to_id.get(t)
    if existing_id is not None:
        return existing_id
    doc_id = len(doc_texts)
    doc_texts.append(t)
    doc_meta.append(meta)
    doc_key_to_id[t] = doc_id
    return doc_id

def _register_docs(texts, doc_type):
    """Add every text to the corpus; return the ids of the accepted ones, in input order."""
    registered = (add_doc(t, meta={"type": doc_type}) for t in texts)
    return [doc_id for doc_id in registered if doc_id is not None]


# Walk the first N questions, registering gold pages first and search results second.
for i in range(N):
    item = ds[i]

    # Gold docs: the wiki pages attached to the question (kept as a set of ids).
    gold_ids = set(_register_docs(item["entity_pages"]["wiki_context"], "gold"))

    # Candidate docs: the web search results, in their original order.
    cand_ids = _register_docs(item["search_results"]["search_context"], "candidate")

    # Every gold doc must also be part of the candidate pool.
    missing_gold = [gid for gid in gold_ids if gid not in cand_ids]
    cand_ids.extend(missing_gold)

    q_gold[i] = gold_ids
    q_candidates[i] = cand_ids

# Summary statistics for the corpus built above.
denominator = max(1, N)  # avoids dividing by zero when no questions were processed
avg_cands = sum(map(len, q_candidates.values())) / denominator
avg_golds = sum(map(len, q_gold.values())) / denominator

print("Processed questions:", N)
print("Unique documents in corpus:", len(doc_texts))
print(f"Avg candidates per question: {avg_cands:.2f}")
print(f"Avg gold docs per question: {avg_golds:.2f}")


Processed questions: 50
Unique documents in corpus: 528
Avg candidates per question: 10.76
Avg gold docs per question: 1.66


In [8]:
import faiss
import numpy as np
import time
from sentence_transformers import SentenceTransformer

# Step 2a: Build embeddings for all candidate docs
# NOTE(review): the all-MiniLM-L6-v2 model card states that inputs longer than
# 256 word pieces are truncated by default, so long documents are presumably
# embedded from their opening portion only — confirm whether chunking is needed.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Fail early with a clear message if the corpus cell has not been run (or produced nothing).
assert doc_texts, "doc_texts is empty - run the corpus-building cell first"

# Count approximate tokens (split on whitespace)
num_tokens = sum(len(doc.split()) for doc in doc_texts)
print("Approximate total tokens in candidate docs:", num_tokens)
print("Total candidate docs:", len(doc_texts))

# Measure time for embedding.
# perf_counter() is monotonic, so the interval is not skewed by wall-clock adjustments.
start_time = time.perf_counter()
cand_embeddings = model.encode(doc_texts, convert_to_numpy=True, show_progress_bar=True)
end_time = time.perf_counter()
print(f"Time taken to embed {len(doc_texts)} docs ({num_tokens} tokens approx): {end_time - start_time:.2f} seconds")

# Step 2b: Build FAISS index
start_index_time = time.perf_counter()
embedding_dim = cand_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)  # exact (brute-force) search with L2 distance
# FAISS expects a C-contiguous float32 matrix; this is a no-op when the array already is one.
index.add(np.ascontiguousarray(cand_embeddings, dtype=np.float32))
end_index_time = time.perf_counter()
print(f"Time taken to add embeddings to FAISS index: {end_index_time - start_index_time:.2f} seconds")
print("FAISS index built with", index.ntotal, "documents")


Approximate total tokens in candidate docs: 1532383
Total candidate docs: 528


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Time taken to embed 528 docs (1532383 tokens approx): 51.51 seconds
Time taken to add embeddings to FAISS index: 0.00 seconds
FAISS index built with 528 documents


In [9]:
# Single-question dense retrieval demo against the FAISS index.

# Pick one question
i = 0
question = ds[i]["question"]
gold_ids = q_gold[i]  # set of gold doc IDs

print("Question:", question)
print("Gold doc IDs:", gold_ids)

# Embed the question
q_emb = model.encode(question, convert_to_numpy=True)

# Retrieve top-k docs (FAISS expects a 2-D batch of queries, hence expand_dims)
top_k = 5
D, I = index.search(np.expand_dims(q_emb, axis=0), top_k)
# FAISS pads the result with label -1 when fewer than top_k docs are available;
# drop those so they never index doc_texts[-1], and convert to plain ints.
retrieved_ids = [int(doc_id) for doc_id in I[0] if doc_id >= 0]

print("\nTop-k retrieved doc IDs:", retrieved_ids)

# Show retrieved snippets and indicate gold
for rank, doc_id in enumerate(retrieved_ids, start=1):
    snippet = doc_texts[doc_id][:300].replace("\n", " ")
    is_gold = "✅ GOLD" if doc_id in gold_ids else "❌"
    print(f"\nRank {rank} - Doc ID {doc_id} {is_gold}:\n{snippet} ...")

# Compute metrics for this single question
hits = gold_ids & set(retrieved_ids)
precision = len(hits) / top_k
recall = len(hits) / max(1, len(gold_ids))  # max() guards questions with no gold doc
# Reciprocal rank of the first gold doc (ranks are 1-based); 0 when none was retrieved.
rr = 0
for rank, doc_id in enumerate(retrieved_ids, start=1):
    if doc_id in gold_ids:
        rr = 1 / rank
        break

print(f"\nMetrics for this question:")
print(f"Precision@{top_k}: {precision:.3f}")
print(f"Recall@{top_k}: {recall:.3f}")
print(f"Reciprocal Rank (RR): {rr:.3f}")


Question: Who was President when the first Peanuts cartoon was published?
Gold doc IDs: {0}

Top-k retrieved doc IDs: [5 1 4 0 3]

Rank 1 - Doc ID 5 ❌:
A Brief History of Charles Schulz's 'Peanuts' Comic Strip - TIME Follow @TIME When Alex Davis was 2 years old, he pointed to a drawing his father had done and exclaimed, "Snoopy!" The problem: his father was Jim Davis, the creator of Garfield, and the picture was of the cat he made famous. Charles S ...

Rank 2 - Doc ID 1 ❌:
Peanuts | Peanuts Wiki | Fandom powered by Wikia Charles M. Schulz drawing Snoopy . Peanuts is a syndicated daily and Sunday comic strip written and illustrated by Charles M. Schulz , which ran from October 2, 1950, to February 13, 2000 (the day after Schulz's death). In total 17,897 different Peanu ...

Rank 3 - Doc ID 4 ❌:
Peanuts by Charles Schulz  | Read Comic Strips at GoComics.com Share Link Explore Peanuts ...

Rank 4 - Doc ID 0 ✅ GOLD:
Peanuts is a syndicated daily and Sunday American comic strip written and

In [10]:
from sentence_transformers import CrossEncoder

# Single-question demo: FAISS retrieval followed by cross-encoder reranking.

# Load cross-encoder (fine-tuned for relevance / QA)
cross_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Example: single question
i = 0
question = ds[i]["question"]
gold_ids = q_gold[i]

# Step 1: retrieve top-k using FAISS
top_k = 5
q_emb = model.encode(question, convert_to_numpy=True)  # use your sentence-transformer
D, I = index.search(np.expand_dims(q_emb, axis=0), top_k)
# FAISS pads with label -1 when fewer than top_k docs exist; drop those and use
# plain ints so the id list prints as [1, 0, ...] regardless of the NumPy version.
retrieved_ids = [int(doc_id) for doc_id in I[0] if doc_id >= 0]

# Step 2: prepare (question, document) pairs for cross-encoder scoring
pairs = [[question, doc_texts[doc_id]] for doc_id in retrieved_ids]

# Step 3: compute relevance scores
scores = cross_model.predict(pairs)

# Step 4: sort by score (descending)
sorted_idx = np.argsort(scores)[::-1]
reranked_ids = [retrieved_ids[idx] for idx in sorted_idx]

# Step 5: display reranked docs and compute metrics
print("Reranked top-k doc IDs:", reranked_ids)
for rank, doc_id in enumerate(reranked_ids, start=1):
    snippet = doc_texts[doc_id][:300].replace("\n", " ")
    is_gold = "✅ GOLD" if doc_id in gold_ids else "❌"
    print(f"\nRank {rank} - Doc ID {doc_id} {is_gold}:\n{snippet} ...")

# Step 6: compute metrics after reranking
# Reranking only reorders the same ids, so precision/recall match the
# pre-rerank values; only the reciprocal rank can change.
hits = gold_ids & set(reranked_ids)
precision = len(hits) / top_k
recall = len(hits) / max(1, len(gold_ids))  # max() guards questions with no gold doc
rr = 0
for rank, doc_id in enumerate(reranked_ids, start=1):
    if doc_id in gold_ids:
        rr = 1 / rank
        break

print(f"\nMetrics after reranking:")
print(f"Precision@{top_k}: {precision:.3f}")
print(f"Recall@{top_k}: {recall:.3f}")
print(f"Reciprocal Rank (RR): {rr:.3f}")


Reranked top-k doc IDs: [1, 0, 5, 3, 4]

Rank 1 - Doc ID 1 ❌:
Peanuts | Peanuts Wiki | Fandom powered by Wikia Charles M. Schulz drawing Snoopy . Peanuts is a syndicated daily and Sunday comic strip written and illustrated by Charles M. Schulz , which ran from October 2, 1950, to February 13, 2000 (the day after Schulz's death). In total 17,897 different Peanu ...

Rank 2 - Doc ID 0 ✅ GOLD:
Peanuts is a syndicated daily and Sunday American comic strip written and illustrated by Charles M. Schulz, which ran from October 2, 1950, to February 13, 2000, continuing in reruns afterward. The strip is the most popular and influential in the history of comic strips, with 17,897 strips published ...

Rank 3 - Doc ID 5 ❌:
A Brief History of Charles Schulz's 'Peanuts' Comic Strip - TIME Follow @TIME When Alex Davis was 2 years old, he pointed to a drawing his father had done and exclaimed, "Snoopy!" The problem: his father was Jim Davis, the creator of Garfield, and the picture was of the cat he m

In [13]:
#!pip install chromadb
import chromadb
from chromadb.config import Settings

# Create the Chroma client from a default-constructed Settings object.
# NOTE(review): presumably this yields an in-memory (non-persistent) client —
# confirm against the installed chromadb version.
client = chromadb.Client(Settings())


In [15]:
# Install Chroma if not already
# pip install chromadb



# Create a collection, dropping any previous one of the same name so this cell can be re-run.
collection_name = "candidate_docs"
# list_collections() returns Collection objects in some chromadb releases and
# plain name strings in others (0.6.x); normalise both forms to names.
existing_names = {getattr(c, "name", c) for c in client.list_collections()}
if collection_name in existing_names:
    client.delete_collection(collection_name)
collection = client.create_collection(name=collection_name)

# Guard against stale kernel state: embeddings must line up 1:1 with the corpus.
assert len(cand_embeddings) == len(doc_texts), "cand_embeddings is out of sync with doc_texts - re-run the embedding cell"

# Add documents + embeddings to Chroma
ids = [str(i) for i in range(len(doc_texts))]  # simple string IDs matching the doc_texts positions
metadatas = [{"source": "candidate_doc"} for _ in doc_texts]  # optional metadata

collection.add(
    documents=doc_texts,
    embeddings=cand_embeddings.tolist(),  # Chroma expects list of lists
    ids=ids,
    metadatas=metadatas
)

print(f"Chroma collection '{collection_name}' created with {len(doc_texts)} documents.")


Chroma collection 'candidate_docs' created with 528 documents.


In [18]:
# ----------------- Single Question Retrieval Demo -----------------
sample = ds[0]
question = sample["question"]
# Ground truth docs. add_doc() stored corpus texts stripped and skipped blank
# ones, so gold texts are normalised the same way; otherwise a gold page with
# surrounding whitespace could never equal its stored copy.
gold_docs = {
    txt.strip()
    for txt in sample["entity_pages"]["wiki_context"]
    if txt and txt.strip()
}

print("Question:", question)
# next(iter(...), "") avoids an IndexError when the question has no gold doc.
print("Gold doc snippet:", next(iter(gold_docs), "")[:300], "...")

# Embed question
query_emb = model.encode(question).tolist()

# Retrieve top-k from Chroma
top_k = 5
results = collection.query(
    query_embeddings=[query_emb],
    n_results=top_k
)

# One query was sent, so the results for it sit at position 0.
retrieved_docs = results['documents'][0]
retrieved_ids = results['ids'][0]  # get document IDs from Chroma

# Print ranked docs with gold check
print("\nTop-k retrieved docs:")
for rank, (doc_id, doc) in enumerate(zip(retrieved_ids, retrieved_docs), start=1):
    check = "✅ GOLD" if doc in gold_docs else "❌"
    snippet = doc[:500].replace("\n", " ")  # first 500 chars, remove line breaks
    print(f"Rank {rank} - Doc ID {doc_id} {check}:\n{snippet}\n")

# Compute metrics (gold membership is decided by exact text equality)
hits = gold_docs & set(retrieved_docs)
precision = len(hits) / top_k
recall = len(hits) / max(1, len(gold_docs))  # max() guards questions with no gold doc

# Reciprocal rank of the first gold doc (ranks are 1-based); 0 when none was retrieved.
rr = 0
for rank, doc in enumerate(retrieved_docs, start=1):
    if doc in gold_docs:
        rr = 1 / rank
        break

print("Metrics after retrieval:")
print(f"Precision@{top_k}: {precision:.3f}")
print(f"Recall@{top_k}: {recall:.3f}")
print(f"Reciprocal Rank (RR): {rr:.3f}")


Question: Who was President when the first Peanuts cartoon was published?
Gold doc snippet: Peanuts is a syndicated daily and Sunday American comic strip written and illustrated by Charles M. Schulz, which ran from October 2, 1950, to February 13, 2000, continuing in reruns afterward. The strip is the most popular and influential in the history of comic strips, with 17,897 strips published ...

Top-k retrieved docs:
Rank 1 - Doc ID 5 ❌:
A Brief History of Charles Schulz's 'Peanuts' Comic Strip - TIME Follow @TIME When Alex Davis was 2 years old, he pointed to a drawing his father had done and exclaimed, "Snoopy!" The problem: his father was Jim Davis, the creator of Garfield, and the picture was of the cat he made famous. Charles Schulz's black-and-white dog is so beloved, though, that a lasagna-loving cat can't even compete. Saturday, Oct. 2, marks 60 years since Schulz's first Peanuts strip hit newspapers. Since then, Snoopy,

Rank 2 - Doc ID 1 ❌:
Peanuts | Peanuts Wiki | Fandom powe

In [19]:
from sentence_transformers import CrossEncoder

# ----------------- Single Question Retrieval + Rerank -----------------
sample = ds[0]
question = sample["question"]
# Ground truth docs. add_doc() stored corpus texts stripped and skipped blank
# ones, so gold texts are normalised the same way; otherwise a gold page with
# surrounding whitespace could never equal its stored copy.
gold_docs = {
    txt.strip()
    for txt in sample["entity_pages"]["wiki_context"]
    if txt and txt.strip()
}

print("Question:", question)
# next(iter(...), "") avoids an IndexError when the question has no gold doc.
print("Gold doc snippet:", next(iter(gold_docs), "")[:300], "...")

# Embed question
query_emb = model.encode(question).tolist()

# Retrieve top-k from Chroma
top_k = 5
results = collection.query(
    query_embeddings=[query_emb],
    n_results=top_k
)

# One query was sent, so the results for it sit at position 0.
retrieved_docs = results['documents'][0]
retrieved_ids = results['ids'][0]  # doc IDs

# ----------------- Rerank with Cross-Encoder -----------------
cross_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
pairs = [[question, doc] for doc in retrieved_docs]
scores = cross_model.predict(pairs)

# Sort by scores descending (tuples compare score first; id/doc only break exact ties)
reranked = sorted(zip(scores, retrieved_ids, retrieved_docs), reverse=True)
reranked_scores, reranked_ids, reranked_docs = zip(*reranked)

# Print reranked top-k
print("\nReranked top-k docs:")
for rank, (doc_id, doc) in enumerate(zip(reranked_ids, reranked_docs), start=1):
    check = "✅ GOLD" if doc in gold_docs else "❌"
    snippet = doc[:500].replace("\n", " ")
    print(f"Rank {rank} - Doc ID {doc_id} {check}:\n{snippet}\n")

# ----------------- Compute Metrics -----------------
# Gold membership is decided by exact text equality.
hits = gold_docs & set(reranked_docs)
precision = len(hits) / top_k
recall = len(hits) / max(1, len(gold_docs))  # max() guards questions with no gold doc

# Reciprocal rank of the first gold doc (ranks are 1-based); 0 when none was retrieved.
rr = 0
for rank, doc in enumerate(reranked_docs, start=1):
    if doc in gold_docs:
        rr = 1 / rank
        break

print("Metrics after reranking:")
print(f"Precision@{top_k}: {precision:.3f}")
print(f"Recall@{top_k}: {recall:.3f}")
print(f"Reciprocal Rank (RR): {rr:.3f}")


Question: Who was President when the first Peanuts cartoon was published?
Gold doc snippet: Peanuts is a syndicated daily and Sunday American comic strip written and illustrated by Charles M. Schulz, which ran from October 2, 1950, to February 13, 2000, continuing in reruns afterward. The strip is the most popular and influential in the history of comic strips, with 17,897 strips published ...

Reranked top-k docs:
Rank 1 - Doc ID 1 ❌:
Peanuts | Peanuts Wiki | Fandom powered by Wikia Charles M. Schulz drawing Snoopy . Peanuts is a syndicated daily and Sunday comic strip written and illustrated by Charles M. Schulz , which ran from October 2, 1950, to February 13, 2000 (the day after Schulz's death). In total 17,897 different Peanuts strips were published. The strip was one of the most popular and influential in the history of the medium, and considered the most beloved comic strips of all time. It was "arguably the longest sto

Rank 2 - Doc ID 0 ✅ GOLD:
Peanuts is a syndicated daily an