In [1]:
# =============================================================================
# 🧪 Parameter Comparison Experiment - RAG Components
# =============================================================================
# Goal: Compare multiple embedding models, retrieval top-k values, and
# selection strategies (concat vs. MMR) on a subset of the RAG mini-Wikipedia dataset.
# =============================================================================

Setup & Imports

In [2]:
!pip install -q datasets evaluate transformers sentence-transformers faiss-cpu pandas numpy matplotlib


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import time
import numpy as np
import pandas as pd
import faiss
import matplotlib.pyplot as plt
from typing import List
from datasets import load_dataset
import evaluate
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

Configuration Dictionary

In [4]:
EXPERIMENT_CFG = {
    "embedding_models": [
        ("sentence-transformers/all-MiniLM-L6-v2", "miniLM6v2", 384),
        ("sentence-transformers/all-mpnet-base-v2", "mpnetv2", 512),
    ],
    "max_chars_per_chunk": 600,
    "retrieval_k_values": [3, 5, 10],
    "context_selection_modes": ["concat", "mmr"],
    "context_budget_chars": 2000,
    "sample_limit": 50,      # Change to 200 for longer runs
    "batch_size": 64,
}
print(EXPERIMENT_CFG)

{'embedding_models': [('sentence-transformers/all-MiniLM-L6-v2', 'miniLM6v2', 384), ('sentence-transformers/all-mpnet-base-v2', 'mpnetv2', 512)], 'max_chars_per_chunk': 600, 'retrieval_k_values': [3, 5, 10], 'context_selection_modes': ['concat', 'mmr'], 'context_budget_chars': 2000, 'sample_limit': 50, 'batch_size': 64}


Load and Inspect Datasets

In [5]:
wiki_corpus = load_dataset("rag-datasets/rag-mini-wikipedia", name="text-corpus")
qa_pairs = load_dataset("rag-datasets/rag-mini-wikipedia", name="question-answer")

docs_split = wiki_corpus["passages"]
qa_test = qa_pairs["test"]

print("Corpus size:", len(docs_split))
print("QA size:", len(qa_test))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

data/passages.parquet/part.0.parquet:   0%|          | 0.00/797k [00:00<?, ?B/s]

Generating passages split:   0%|          | 0/3200 [00:00<?, ? examples/s]

data/test.parquet/part.0.parquet:   0%|          | 0.00/54.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/918 [00:00<?, ? examples/s]

Corpus size: 3200
QA size: 918


Text Chunking Utilities

In [6]:
def segment_text(text: str, chunk_len: int) -> List[str]:
    """Split text into segments with fixed max length."""
    if not text:
        return []
    return [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)]

def construct_document_segments(corpus_data, chunk_len: int):
    """Convert corpus into list of segmented docs."""
    segments = []
    for idx, row in enumerate(corpus_data):
        passage = row.get("text") or row.get("passage") or ""
        for j, piece in enumerate(segment_text(passage, chunk_len)):
            segments.append({"id": f"{idx}-{j}", "text": piece})
    return segments

documents = construct_document_segments(docs_split, EXPERIMENT_CFG["max_chars_per_chunk"])
print("Total text chunks generated:", len(documents))

Total text chunks generated: 4046


Embedding & Index Construction

In [7]:
def create_vector_index(model_name: str, expected_dim: int):
    """Embed all text chunks and create FAISS index."""
    embedder = SentenceTransformer(model_name)
    all_texts = [doc["text"] for doc in documents]

    # Safely encode (no unsupported args)
    vectors = embedder.encode(
        all_texts,
        batch_size=EXPERIMENT_CFG["batch_size"],
        show_progress_bar=True,
        normalize_embeddings=True,
        convert_to_numpy=True,
    ).astype("float32")

    # Build FAISS index
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)

    print(f"✅ Index built for {model_name}: {vectors.shape[0]} vectors, dim={vectors.shape[1]}")
    return {"encoder": embedder, "index": index, "embeddings": vectors}


index_by_model = {}
for model_id, alias, dim in EXPERIMENT_CFG["embedding_models"]:
    print(f"🧩 Building FAISS index for: {alias}")
    index_by_model[alias] = create_vector_index(model_id, dim)

🧩 Building FAISS index for: miniLM6v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

✅ Index built for sentence-transformers/all-MiniLM-L6-v2: 4046 vectors, dim=384
🧩 Building FAISS index for: mpnetv2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

✅ Index built for sentence-transformers/all-mpnet-base-v2: 4046 vectors, dim=768


Retrieval Function

In [8]:
def retrieve_candidates(bundle, query: str, top_k: int):
    """Return top-k FAISS hits for a query."""
    q_vec = bundle["encoder"].encode([query], normalize_embeddings=True).astype("float32")
    distances, indices = bundle["index"].search(q_vec, top_k)
    return list(indices[0])

def mmr_select(bundle, query: str, pool_indices: List[int], select_k=5, lambda_weight=0.5):
    """Apply Maximal Marginal Relevance reranking."""
    emb = bundle["embeddings"]
    q_vec = bundle["encoder"].encode([query], normalize_embeddings=True).astype("float32")[0]
    selected, q_sims = [], emb[pool_indices] @ q_vec

    while len(selected) < min(select_k, len(pool_indices)):
        if not selected:
            j = int(np.argmax(q_sims))
        else:
            sim_selected = emb[pool_indices] @ emb[selected].T
            redundancy = sim_selected.max(axis=1)
            mmr_score = lambda_weight * q_sims - (1 - lambda_weight) * redundancy
            j = int(np.argmax(mmr_score))
        selected.append(pool_indices[j])
        pool_indices.pop(j)
        q_sims = np.delete(q_sims, j, axis=0)
    return selected

Context Assembly per Query

In [9]:
def assemble_context(bundle, query, top_k, mode, budget=2000):
    """Construct combined context from retrieved docs."""
    recall_k = max(top_k * 3, top_k) if mode == "mmr" else top_k
    raw_hits = retrieve_candidates(bundle, query, recall_k)
    chosen = raw_hits[:top_k] if mode == "concat" else mmr_select(bundle, query, raw_hits, select_k=top_k)

    combined, used_chars = [], 0
    for idx in chosen:
        text = documents[idx]["text"]
        if used_chars + len(text) > budget:
            combined.append(text[:budget - used_chars])
            break
        combined.append(text)
        used_chars += len(text)
    return "\n\n".join(combined)

Prompt Constructor

In [11]:
def concise_editor_prompt(context, question):
    """
    Persona-style prompt for factual, context-limited answers.
    """
    return (
        "You are a concise and factual encyclopedia editor. "
        "Answer ONLY using the provided context. "
        "If the information is missing, reply 'I don't know.'\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\nAnswer:"
    )

Load Generator

In [12]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
gen_model = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def produce_answer(prompt, max_new_tokens=128):
    """Run FLAN-T5 on a prompt."""
    return gen_model(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"].strip()

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


Evlauation Function

In [13]:
squad_metric = evaluate.load("squad")

def evaluate_combination(model_key, top_k, mode, n_samples=None):
    """Evaluate model-index combination for retrieval strategy."""
    bundle = index_by_model[model_key]
    total = len(qa_test) if n_samples is None else min(n_samples, len(qa_test))
    predictions, references = [], []

    for i in range(total):
        q = qa_test[i]["question"]
        gold = qa_test[i]["answer"] if "answer" in qa_test[i] else qa_test[i]["answers"]
        gold_text = gold if isinstance(gold, str) else gold[0]

        ctx = assemble_context(bundle, q, top_k, mode)
        prompt = concise_editor_prompt(ctx, q)
        answer = produce_answer(prompt)

        predictions.append({"id": str(i), "prediction_text": answer})
        references.append({"id": str(i), "answers": {"text": [gold_text], "answer_start": [0]}})
    return squad_metric.compute(predictions=predictions, references=references)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Run Parameter Sweep

In [14]:
experiment_records = []

for model_id, alias, dim in EXPERIMENT_CFG["embedding_models"]:
    for k_val in EXPERIMENT_CFG["retrieval_k_values"]:
        for strategy in EXPERIMENT_CFG["context_selection_modes"]:
            print(f"\n🚀 Evaluating {alias} | top_k={k_val} | mode={strategy}")
            metrics = evaluate_combination(alias, k_val, strategy, n_samples=EXPERIMENT_CFG["sample_limit"])
            record = {"model": alias, "top_k": k_val, "strategy": strategy, **metrics}
            print(" ->", record)
            experiment_records.append(record)

results_df = pd.DataFrame(experiment_records)
print("\n=== Final Results Table ===")
display(results_df)



🚀 Evaluating miniLM6v2 | top_k=3 | mode=concat


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


 -> {'model': 'miniLM6v2', 'top_k': 3, 'strategy': 'concat', 'exact_match': 38.0, 'f1': 45.4290059613589}

🚀 Evaluating miniLM6v2 | top_k=3 | mode=mmr
 -> {'model': 'miniLM6v2', 'top_k': 3, 'strategy': 'mmr', 'exact_match': 32.0, 'f1': 37.87654012654012}

🚀 Evaluating miniLM6v2 | top_k=5 | mode=concat


Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors


 -> {'model': 'miniLM6v2', 'top_k': 5, 'strategy': 'concat', 'exact_match': 40.0, 'f1': 48.34119468124045}

🚀 Evaluating miniLM6v2 | top_k=5 | mode=mmr
 -> {'model': 'miniLM6v2', 'top_k': 5, 'strategy': 'mmr', 'exact_match': 30.0, 'f1': 37.569295816431364}

🚀 Evaluating miniLM6v2 | top_k=10 | mode=concat
 -> {'model': 'miniLM6v2', 'top_k': 10, 'strategy': 'concat', 'exact_match': 42.0, 'f1': 50.178287413070024}

🚀 Evaluating miniLM6v2 | top_k=10 | mode=mmr
 -> {'model': 'miniLM6v2', 'top_k': 10, 'strategy': 'mmr', 'exact_match': 30.0, 'f1': 37.280668365450964}

🚀 Evaluating mpnetv2 | top_k=3 | mode=concat
 -> {'model': 'mpnetv2', 'top_k': 3, 'strategy': 'concat', 'exact_match': 46.0, 'f1': 50.06593406593406}

🚀 Evaluating mpnetv2 | top_k=3 | mode=mmr
 -> {'model': 'mpnetv2', 'top_k': 3, 'strategy': 'mmr', 'exact_match': 42.0, 'f1': 47.3092673992674}

🚀 Evaluating mpnetv2 | top_k=5 | mode=concat
 -> {'model': 'mpnetv2', 'top_k': 5, 'strategy': 'concat', 'exact_match': 46.0, 'f1': 51.843

Unnamed: 0,model,top_k,strategy,exact_match,f1
0,miniLM6v2,3,concat,38.0,45.429006
1,miniLM6v2,3,mmr,32.0,37.87654
2,miniLM6v2,5,concat,40.0,48.341195
3,miniLM6v2,5,mmr,30.0,37.569296
4,miniLM6v2,10,concat,42.0,50.178287
5,miniLM6v2,10,mmr,30.0,37.280668
6,mpnetv2,3,concat,46.0,50.065934
7,mpnetv2,3,mmr,42.0,47.309267
8,mpnetv2,5,concat,46.0,51.843712
9,mpnetv2,5,mmr,36.0,43.704823
