# RapidFire AI â€” RAG Track Notebook (Resume Retrieval)\n
\n
End-to-end: Load dataset -> Define configurations -> Evaluate retrieval metrics\n
\n
**Dataset layout:**\n
```\n
dataset/\n
  corpus.jsonl\n
  queries.jsonl\n
  qrels.tsv\n
```

## 0) Install Dependencies

In [2]:
import sys
!{sys.executable} -m pip install -q datasets pandas langchain langchain-community langchain-huggingface langchain-text-splitters beautifulsoup4 sentence-transformers scikit-learn tiktoken faiss-cpu lxml

In [4]:
import sys
!{sys.executable} -m pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.20.1-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.20.1-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------------------------- -- 1.6/1.7 MB 16.8 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 11.5 MB/s  0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.20.1


## 1) Imports

In [1]:
import os
import re
import json
import math
import random
from pathlib import Path
from typing import Dict, Any, List as PyList, Optional
import pandas as pd
from datasets import load_dataset

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from bs4 import BeautifulSoup




## 2) Load Dataset

In [3]:
from pathlib import Path

dataset_dir = Path("dataset")

assert (dataset_dir / "corpus.jsonl").exists(), f"Missing: {dataset_dir / 'corpus.jsonl'}"
assert (dataset_dir / "queries.jsonl").exists(), f"Missing: {dataset_dir / 'queries.jsonl'}"
assert (dataset_dir / "qrels.tsv").exists(), f"Missing: {dataset_dir / 'qrels.tsv'}"


In [4]:
queries_ds = load_dataset(
    "json",
    data_files=str(dataset_dir / "queries.jsonl"),
    split="train"
)

# This rename does nothing (same names), so it's safe to remove
# queries_ds = queries_ds.rename_columns({"query": "query", "query_id": "query_id"})

queries_ds = queries_ds.map(lambda x: {"query_id": int(x["query_id"])})


In [5]:
qrels = pd.read_csv(dataset_dir / "qrels.tsv", sep="\t")

qrels = qrels.rename(
    columns={
        "query-id": "query_id",
        "corpus-id": "corpus_id",
        "score": "relevance",
    }
)

qrels["query_id"] = qrels["query_id"].astype(int)
qrels["corpus_id"] = qrels["corpus_id"].astype(int)

print(f"Loaded {len(queries_ds)} queries and {len(qrels)} relevance judgments")

Loaded 15 queries and 594 relevance judgments


In [6]:
import json
from langchain_core.documents import Document

# Load corpus using simple JSON reading (no jq dependency)
corpus_docs = []

with open(dataset_dir / "corpus.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)

        # Extract text and metadata
        text = record.get("text_plain", "")
        metadata = {
            "corpus_id": int(record["doc_id"]),
            "person_id": str(record.get("person_id", record.get("doc_id"))),
            "raw_html": (
                record.get("resume_html")
                or record.get("text_html")
                or record.get("Resume_html")
                or record.get("html")
            ),
        }

        corpus_docs.append(Document(page_content=text, metadata=metadata))

print(f"Loaded {len(corpus_docs)} corpus documents")


Loaded 2483 corpus documents


## 3) HTML-Aware Splitting Utilities

In [7]:
from typing import Optional, List as PyList
from bs4 import BeautifulSoup

try:
    # Newer LangChain
    from langchain_core.documents import Document
except Exception:
    # Older LangChain
    from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter



class HTMLSectionTextSplitter:
    """LangChain-compatible splitter for HTML resume sections"""

    def __init__(
        self,
        inner_splitter: Optional[RecursiveCharacterTextSplitter] = None,
        min_section_chars: int = 200,
    ):
        self.inner_splitter = inner_splitter
        self.min_section_chars = min_section_chars

    def split_documents(self, docs: PyList[Document]) -> PyList[Document]:
        out_docs = []

        for d in docs:
            md = dict(d.metadata or {})
            html = md.get("raw_html") or d.page_content

            # Simple section extraction
            soup = BeautifulSoup(html, "lxml")
            text = soup.get_text(" ", strip=True)

            if len(text) < self.min_section_chars:
                continue

            section_md = {**md, "chunking": "html_section"}
            section_doc = Document(page_content=text, metadata=section_md)

            if self.inner_splitter:
                out_docs.extend(
                    self.inner_splitter.split_documents([section_doc])
                )
            else:
                out_docs.append(section_doc)

        return out_docs


## 4) Retrieval Metrics

In [8]:
import math
from typing import Dict, List as PyList, Set

def _ndcg_at_k(ranked_ids: PyList[int], relevant_set: Set[int], k: int) -> float:
    rels = [1 if doc_id in relevant_set else 0 for doc_id in ranked_ids[:k]]
    dcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(rels))

    ideal_rels = [1] * min(k, len(relevant_set)) + [0] * max(0, k - len(relevant_set))
    idcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ideal_rels))

    return (dcg / idcg) if idcg > 0 else 0.0

def _rr(ranked_ids: PyList[int], relevant_set: Set[int]) -> float:
    for i, doc_id in enumerate(ranked_ids):
        if doc_id in relevant_set:
            return 1.0 / (i + 1)
    return 0.0

def compute_metrics(
    retrieved_docs: PyList[PyList[int]],
    ground_truth_docs: PyList[PyList[int]],
    ks: PyList[int] = [5, 10],
) -> Dict[str, float]:
    total = len(retrieved_docs)
    if total == 0:
        return {f"Precision@{k}": 0.0 for k in ks} | {f"Recall@{k}": 0.0 for k in ks} | {f"NDCG@{k}": 0.0 for k in ks} | {"MRR": 0.0}

    sums: Dict[str, float] = {}
    for k in ks:
        sums[f"Precision@{k}"] = 0.0
        sums[f"Recall@{k}"] = 0.0
        sums[f"NDCG@{k}"] = 0.0
    sums["MRR"] = 0.0

    for retrieved, gt in zip(retrieved_docs, ground_truth_docs):
        rel = set(gt)
        for k in ks:
            topk = retrieved[:k]
            tp = sum(1 for x in topk if x in rel)
            precision = tp / len(topk) if topk else 0.0
            recall = tp / len(rel) if rel else 0.0

            sums[f"Precision@{k}"] += precision
            sums[f"Recall@{k}"] += recall
            sums[f"NDCG@{k}"] += _ndcg_at_k(retrieved, rel, k)

        sums["MRR"] += _rr(retrieved, rel)

    for key in sums:
        sums[key] /= total

    return sums

## 5) Define Configurations

In [9]:
import random

SEED = 42
random.seed(SEED)

BATCH_SIZE = 64

# Splitters
base_splitters = [
    (
        "chunk256_overlap32",
        RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            "gpt2", chunk_size=256, chunk_overlap=32
        ),
    ),
]

html_section_splitters = [
    ("html_sections_only",
     HTMLSectionTextSplitter(inner_splitter=None, min_section_chars=200)),

    ("html_sections_then_chunk128o32",
     HTMLSectionTextSplitter(
         inner_splitter=RecursiveCharacterTextSplitter.from_tiktoken_encoder(
             "gpt2", chunk_size=128, chunk_overlap=32
         ),
         min_section_chars=200,
     )),

    ("html_sections_then_chunk256o32",
     HTMLSectionTextSplitter(
         inner_splitter=RecursiveCharacterTextSplitter.from_tiktoken_encoder(
             "gpt2", chunk_size=256, chunk_overlap=32
         ),
         min_section_chars=200,
     )),
]


SPLITTERS = base_splitters + html_section_splitters

# Embeddings
EMBEDDERS = [
    (
        "all-MiniLM-L6-v2",
        {
            "model_name": "sentence-transformers/all-MiniLM-L6-v2",
            "model_kwargs": {"device": "cpu"},
            "encode_kwargs": {"normalize_embeddings": True, "batch_size": BATCH_SIZE},
        },
    ),
]

SEARCH_TYPES = ["similarity"]
TOPK = [10]
INDEX_SCHEMES = ["faiss_flat", "faiss_ivf"]

print(
    f"Total configurations: {len(SPLITTERS) * len(EMBEDDERS) * len(SEARCH_TYPES) * len(TOPK)}"
)


Total configurations: 4


## 6) Build RAG Pipeline and Evaluate

In [13]:
from langchain_community.vectorstores import FAISS

def build_rag_pipeline(
    corpus_docs,
    splitter_obj,
    embed_kwargs,
    search_type,
    k,
    index_scheme="faiss_flat",   # NEW
    ivf_nlist=256,               # NEW (tweakable)
):
    """
    Build retriever:
      - chunk docs
      - embed
      - index into FAISS with either Flat or IVF
      - return retriever (similarity or MMR)
    """

    # 1) chunk
    chunked_docs = splitter_obj.split_documents(corpus_docs)

    # 2) embedder
    embeddings = HuggingFaceEmbeddings(**embed_kwargs)

    # 3) build index
    if index_scheme == "faiss_flat":
        vs = FAISS.from_documents(chunked_docs, embeddings)

    elif index_scheme == "faiss_ivf":
        # IVF needs training; FAISS wrapper provides a helper
        # NOTE: For small corpora IVF may be worse than flat (expected).
        vs = FAISS.from_documents(chunked_docs, embeddings)
        # Convert underlying index to IVF (best-effort)
        try:
            import faiss
            dim = vs.index.d
            quantizer = faiss.IndexFlatIP(dim)  # because we normalize embeddings => inner product ~ cosine
            ivf = faiss.IndexIVFFlat(quantizer, dim, int(ivf_nlist), faiss.METRIC_INNER_PRODUCT)

            # train + add vectors
            xb = vs.index.reconstruct_n(0, vs.index.ntotal)
            ivf.train(xb)
            ivf.add(xb)

            vs.index = ivf
        except Exception as e:
            raise RuntimeError(f"Failed to build IVF index. Install faiss and ensure CPU/GPU faiss is available. Error: {e}")

    else:
        raise ValueError(f"Unknown index_scheme: {index_scheme}. Use 'faiss_flat' or 'faiss_ivf'.")

    # 4) retriever
    if search_type == "mmr":
        retriever = vs.as_retriever(search_type="mmr", search_kwargs={"k": int(k)})
    else:
        retriever = vs.as_retriever(search_type="similarity", search_kwargs={"k": int(k)})

    return retriever


In [14]:
def evaluate_config(
    corpus_docs,
    splitter_name,
    splitter_obj,
    embed_name,
    embed_kwargs,
    search_type,
    index_scheme,   # NEW
    k,
):

    """Evaluate a single configuration"""

    config_name = f"{splitter_name}__emb={embed_name}__{search_type}__index={index_scheme}__k={k}"
    print(f"\nEvaluating: {config_name}")

    # Build pipeline
    retriever = build_rag_pipeline(
    corpus_docs,
    splitter_obj,
    embed_kwargs,
    search_type,
    k,
    index_scheme=index_scheme,   # NEW
)

    # Retrieve for all queries
    retrieved_documents = []
    for query in queries_ds["query"]:
        retrieved = retriever.invoke(query)  # list[Document]
        ids = []
        for d in retrieved:
            if "corpus_id" in (d.metadata or {}):
                ids.append(int(d.metadata["corpus_id"]))
        retrieved_documents.append(ids)

    # Get ground truth
    ground_truth_documents = [
        qrels.loc[qrels["query_id"] == int(qid), "corpus_id"].astype(int).tolist()
        for qid in queries_ds["query_id"]
    ]

    # Compute metrics
    metrics = compute_metrics(retrieved_documents, ground_truth_documents)

    print(f"Results for {config_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

    return {"config": config_name, "metrics": metrics}


In [15]:
# Run evaluations
results = []

total_runs = (
    len(SPLITTERS) * len(EMBEDDERS) * len(SEARCH_TYPES) * len(INDEX_SCHEMES) * len(TOPK)
)
run_i = 0

for splitter_name, splitter_obj in SPLITTERS:
    for embed_name, embed_kwargs in EMBEDDERS:
        for stype in SEARCH_TYPES:
            for index_scheme in INDEX_SCHEMES:
                for k in TOPK:
                    run_i += 1
                    print(f"\n[{run_i}/{total_runs}] Running: "
                          f"{splitter_name} | {embed_name} | {stype} | {index_scheme} | k={k}")

                    result = evaluate_config(
                        corpus_docs=corpus_docs,
                        splitter_name=splitter_name,
                        splitter_obj=splitter_obj,
                        embed_name=embed_name,
                        embed_kwargs=embed_kwargs,
                        search_type=stype,
                        index_scheme=index_scheme,
                        k=k,
                    )
                    results.append(result)

print("\n" + "=" * 60)
print(f"Completed {len(results)} evaluations")



[1/8] Running: chunk256_overlap32 | all-MiniLM-L6-v2 | similarity | faiss_flat | k=10

Evaluating: chunk256_overlap32__emb=all-MiniLM-L6-v2__similarity__index=faiss_flat__k=10


  embeddings = HuggingFaceEmbeddings(**embed_kwargs)


Results for chunk256_overlap32__emb=all-MiniLM-L6-v2__similarity__index=faiss_flat__k=10:
  Precision@5: 0.3467
  Recall@5: 0.0741
  NDCG@5: 0.3257
  Precision@10: 0.3067
  Recall@10: 0.1169
  NDCG@10: 0.3057
  MRR: 0.4211

[2/8] Running: chunk256_overlap32 | all-MiniLM-L6-v2 | similarity | faiss_ivf | k=10

Evaluating: chunk256_overlap32__emb=all-MiniLM-L6-v2__similarity__index=faiss_ivf__k=10
Results for chunk256_overlap32__emb=all-MiniLM-L6-v2__similarity__index=faiss_ivf__k=10:
  Precision@5: 0.2000
  Recall@5: 0.0268
  NDCG@5: 0.2100
  Precision@10: 0.1733
  Recall@10: 0.0465
  NDCG@10: 0.1874
  MRR: 0.3133

[3/8] Running: html_sections_only | all-MiniLM-L6-v2 | similarity | faiss_flat | k=10

Evaluating: html_sections_only__emb=all-MiniLM-L6-v2__similarity__index=faiss_flat__k=10
Results for html_sections_only__emb=all-MiniLM-L6-v2__similarity__index=faiss_flat__k=10:
  Precision@5: 0.3067
  Recall@5: 0.0502
  NDCG@5: 0.3183
  Precision@10: 0.2600
  Recall@10: 0.0800
  NDCG@10: 0

## 7) Summary of Results

In [16]:
# Display results as DataFrame (sorted + clean)
results_data = []
for r in results:
    row = {"Configuration": r["config"]}
    row.update(r["metrics"])   # metrics are already scalars
    results_data.append(row)

results_df = pd.DataFrame(results_data)

# Choose sort metric (change if you prefer another)
sort_metric = "NDCG@10" if "NDCG@10" in results_df.columns else (
    "NDCG@5" if "NDCG@5" in results_df.columns else None
)
if sort_metric:
    results_df = results_df.sort_values(by=sort_metric, ascending=False)

# Optional: round numeric columns for readability
num_cols = results_df.select_dtypes(include="number").columns
results_df[num_cols] = results_df[num_cols].round(4)

print("\nFinal Results (sorted):")
print(results_df.to_string(index=False))

# Optional: save to CSV so you can reuse it in your notebook / summary doc
results_df.to_csv("retrieval_sweep_results.csv", index=False)
print("\nSaved: retrieval_sweep_results.csv")



Final Results (sorted):
                                                                           Configuration  Precision@5  Recall@5  NDCG@5  Precision@10  Recall@10  NDCG@10    MRR
html_sections_then_chunk128o32__emb=all-MiniLM-L6-v2__similarity__index=faiss_flat__k=10       0.3867    0.0695  0.3812        0.3400     0.1308   0.3491 0.5711
            chunk256_overlap32__emb=all-MiniLM-L6-v2__similarity__index=faiss_flat__k=10       0.3467    0.0741  0.3257        0.3067     0.1169   0.3057 0.4211
html_sections_then_chunk256o32__emb=all-MiniLM-L6-v2__similarity__index=faiss_flat__k=10       0.3467    0.0741  0.3257        0.3067     0.1169   0.3057 0.4211
            html_sections_only__emb=all-MiniLM-L6-v2__similarity__index=faiss_flat__k=10       0.3067    0.0502  0.3183        0.2600     0.0800   0.2817 0.5430
 html_sections_then_chunk256o32__emb=all-MiniLM-L6-v2__similarity__index=faiss_ivf__k=10       0.2000    0.0268  0.2100        0.1733     0.0465   0.1874 0.3133
         