Imports and Setup

In [2]:
import sys, subprocess, pkgutil, os, time

def install_module(mod, pip_name=None):
    """Install a module if not already available."""
    pip_name = pip_name or mod
    if pkgutil.find_loader(mod) is None:
        print(f"Installing {pip_name} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pip_name])
    else:
        print(f"OK: {mod}")

for mod, pip_name in [
    ("datasets","datasets"),
    ("sentence_transformers","sentence-transformers"),
    ("transformers","transformers"),
    ("faiss","faiss-cpu"),
    ("numpy","numpy"),
    ("pandas","pandas"),
    ("evaluate","evaluate")
]:
    install_module(mod, pip_name)

  if pkgutil.find_loader(mod) is None:


OK: datasets
OK: sentence_transformers
OK: transformers
Installing faiss-cpu ...
OK: numpy
OK: pandas
Installing evaluate ...


In [3]:
ADV_RAG_CFG = {
    "encoder_model": "sentence-transformers/all-MiniLM-L6-v2",  # 384-dim embeddings
    "chunk_size_chars": 600,
    "embed_batch": 64,
    "retrieval_candidates": 20,      # per vector query
    "n_query_vectors": 3,            # number of multi-vectors per query
    "rerank_top_k": 5,               # passages kept after reranking
    "context_limit": 2000,           # character budget for the context
    "use_openai_api": False,         # switch True to use OpenAI API
}
print("Active configuration:", ADV_RAG_CFG)

Active configuration: {'encoder_model': 'sentence-transformers/all-MiniLM-L6-v2', 'chunk_size_chars': 600, 'embed_batch': 64, 'retrieval_candidates': 20, 'n_query_vectors': 3, 'rerank_top_k': 5, 'context_limit': 2000, 'use_openai_api': False}


Load Datasets

In [4]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import evaluate

wiki_corpus = load_dataset("rag-datasets/rag-mini-wikipedia", "text-corpus")
qa_dataset = load_dataset("rag-datasets/rag-mini-wikipedia", "question-answer")

print("Corpus loaded:", wiki_corpus)
print("QA loaded:", qa_dataset)

corpus_split = wiki_corpus["passages"]
qa_split = qa_dataset["test"]

LIMIT_DOCS = 1000
corpus_split = corpus_split.select(range(min(LIMIT_DOCS, len(corpus_split))))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

data/passages.parquet/part.0.parquet:   0%|          | 0.00/797k [00:00<?, ?B/s]

Generating passages split:   0%|          | 0/3200 [00:00<?, ? examples/s]

data/test.parquet/part.0.parquet:   0%|          | 0.00/54.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/918 [00:00<?, ? examples/s]

Corpus loaded: DatasetDict({
    passages: Dataset({
        features: ['passage', 'id'],
        num_rows: 3200
    })
})
QA loaded: DatasetDict({
    test: Dataset({
        features: ['question', 'answer', 'id'],
        num_rows: 918
    })
})


Chunk Corpus

In [5]:
def split_text_blocks(text, limit=ADV_RAG_CFG["chunk_size_chars"]):
    """Divide text into equal-sized character chunks."""
    if not text:
        return []
    return [text[i:i+limit] for i in range(0, len(text), limit)]

corpus_chunks = []
for doc_id, item in enumerate(corpus_split):
    raw_text = item.get("text") or item.get("passage") or ""
    for j, seg in enumerate(split_text_blocks(raw_text)):
        corpus_chunks.append({"chunk_id": f"{doc_id}-{j}", "content": seg})

print("Total text chunks:", len(corpus_chunks))
print("Example:", corpus_chunks[0]["chunk_id"], corpus_chunks[0]["content"][:100], "...")

Total text chunks: 1289
Example: 0-0 Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the ...


Embed Corpus and Build FAISS

In [6]:
from sentence_transformers import SentenceTransformer
import faiss

vector_encoder = SentenceTransformer(ADV_RAG_CFG["encoder_model"])
chunk_texts = [c["content"] for c in corpus_chunks]

embeddings = vector_encoder.encode(
    chunk_texts,
    batch_size=ADV_RAG_CFG["embed_batch"],
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True
).astype("float32")

faiss_index = faiss.IndexFlatIP(embeddings.shape[1])
faiss_index.add(embeddings)
chunk_ids = np.array([c["chunk_id"] for c in corpus_chunks])

print(f"✅ Index ready | dimension={embeddings.shape[1]} | vectors={faiss_index.ntotal}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

✅ Index ready | dimension=384 | vectors=1289


Text Generator (FLAN-T5)

In [7]:
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
text_generator = pipeline("text2text-generation", model=t5_model, tokenizer=t5_tokenizer)

def clean_text(text: str) -> str:
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    return text.replace("’","'").replace("“","\"").replace("”","\"").replace("–","-").replace("—","-")

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


Multi-Vector Query Generation

In [8]:
def create_query_vectors(question: str, num_vectors: int = ADV_RAG_CFG["n_query_vectors"]):
    """
    Generate several semantically distinct dense vectors for a question.
    (Simulates multi-vector retrieval.)
    """
    q_clean = clean_text(question)
    # Slightly vary prompts to create diverse embeddings
    prompts = [
        f"{q_clean}",
        f"Detailed: {q_clean}",
        f"Contextual meaning of: {q_clean}"
    ][:num_vectors]
    query_vecs = vector_encoder.encode(prompts, normalize_embeddings=True).astype("float32")
    return query_vecs

Multi-Vector Retrieval

In [9]:
def retrieve_candidates(question, k_per_vec=ADV_RAG_CFG["retrieval_candidates"]):
    """Search FAISS index with multiple dense query vectors."""
    q_vectors = create_query_vectors(question)
    combined = []
    for qv in q_vectors:
        D, I = faiss_index.search(np.expand_dims(qv, axis=0), k_per_vec)
        for idx, score in zip(I[0], D[0]):
            combined.append((idx, score))
    # Deduplicate and keep highest similarity per passage
    best = {}
    for idx, s in combined:
        if idx not in best or s > best[idx]:
            best[idx] = s
    results = [(idx, sc) for idx, sc in best.items()]
    results.sort(key=lambda x: x[1], reverse=True)
    return results

Reranking with Cross-Encoder

In [10]:
from transformers import AutoTokenizer
from sentence_transformers import CrossEncoder

rerank_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
rerank_tokenizer = AutoTokenizer.from_pretrained(rerank_model_name)
rerank_model = CrossEncoder(rerank_model_name, max_length=768)

def truncate_tokens(text, tokenizer, limit):
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) > limit:
        ids = ids[:limit]
    return tokenizer.decode(ids, skip_special_tokens=True)

def rerank_passages(query, candidates, top_k=ADV_RAG_CFG["rerank_top_k"], q_limit=64, p_limit=704):
    """Rerank retrieved chunks using a cross-encoder."""
    q_trimmed = truncate_tokens(query, rerank_tokenizer, q_limit)
    pairs, ids = [], []
    for idx, _ in candidates:
        passage = corpus_chunks[idx]["content"]
        p_trimmed = truncate_tokens(passage, rerank_tokenizer, p_limit)
        pairs.append((q_trimmed, p_trimmed))
        ids.append(idx)

    if not pairs:
        return []

    scores = rerank_model.predict(pairs, convert_to_numpy=True, batch_size=64, show_progress_bar=False)
    order = np.argsort(-scores)[:top_k]
    return [(int(ids[i]), float(scores[i])) for i in order]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Context Assembly and Prompt

In [11]:
def assemble_context(chosen, budget=ADV_RAG_CFG["context_limit"]):
    """Build the grounded context block with inline citation tags."""
    context_parts, citations, used = [], [], 0
    for idx, score in chosen:
        text = corpus_chunks[idx]["content"]
        cid = chunk_ids[idx]
        tag = f"[{cid} | {score:.3f}]"
        snippet = f"{tag}\n{text}"
        if used + len(snippet) > budget:
            context_parts.append(snippet[:budget - used])
            citations.append({"id": cid, "score": float(score)})
            break
        context_parts.append(snippet)
        citations.append({"id": cid, "score": float(score)})
        used += len(snippet)
    return "\n\n".join(context_parts), citations

def factual_prompt(context, question):
    """Prompt enforcing grounded factual answers."""
    return (
        "You are a factual and concise assistant. "
        "Answer ONLY using the provided context. "
        "If missing, reply 'I don't know.' Include short inline citations.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )

Full Answering Pipeline

In [12]:
def advanced_rag_answer(question):
    """End-to-end advanced RAG: multi-vector retrieval + reranking."""
    retrieved = retrieve_candidates(question)
    ranked = rerank_passages(question, retrieved)
    context, refs = assemble_context(ranked, ADV_RAG_CFG["context_limit"])

    if ADV_RAG_CFG["use_openai_api"] and os.getenv("OPENAI_API_KEY"):
        from openai import OpenAI
        client = OpenAI()
        prompt = factual_prompt(context, question)
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=256
        )
        answer = resp.choices[0].message.content.strip()
    else:
        prompt = factual_prompt(context, question)
        answer = text_generator(prompt, max_new_tokens=256)[0]["generated_text"].strip()

    return answer, refs

Example Run

In [14]:
test_question = qa_split[0]["question"]
print("❓ Question:", test_question)

final_ans, ref_list = advanced_rag_answer(test_question)
print("\n💬 Answer:\n", final_ans)
print("\n📚 Citations:", ref_list[:ADV_RAG_CFG["rerank_top_k"]])

❓ Question: Was Abraham Lincoln the sixteenth President of the United States?


Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors



💬 Answer:
 Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination.

📚 Citations: [{'id': np.str_('278-0'), 'score': 10.308937072753906}, {'id': np.str_('319-0'), 'score': 8.121996879577637}, {'id': np.str_('198-0'), 'score': -0.44947102665901184}, {'id': np.str_('383-0'), 'score': -0.734786331653595}, {'id': np.str_('281-0'), 'score': -0.9723389148712158}]
