Imports and Setup

In [10]:
# Core utilities
import sys
import subprocess
import pkgutil
import numpy as np
import pandas as pd
import torch

# HuggingFace datasets & evaluation
from datasets import load_dataset
import evaluate

# Transformers (models, pipelines)
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline
)

# Sentence Transformers (embeddings)
from sentence_transformers import SentenceTransformer

# Milvus vector DB
from pymilvus import (
    MilvusClient,
    FieldSchema,
    CollectionSchema,
    DataType
)

Load and Inspect Dataset

In [11]:
print("Loading Wikipedia mini dataset...")

df_passages = pd.read_parquet(
    "hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet"
)

# Show stats
print(f"\nDataset size: {df_passages.shape}")
print("Null values:", df_passages.isnull().sum().to_dict())

# Drop nulls
df_passages = df_passages.dropna()
print(f"After cleanup: {df_passages.shape}")

# Limit dataset for demo
N_SAMPLES = 1000
df_passages = df_passages.head(N_SAMPLES)

# Show a preview
print("\nSample passage:\n", df_passages.iloc[0]['passage'][:300], "...")

Loading Wikipedia mini dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Dataset size: (3200, 1)
Null values: {'passage': 0}
After cleanup: (3200, 1)

Sample passage:
 Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area. ...


Load QA Pairs

In [12]:
qa_data = load_dataset("rag-datasets/rag-mini-wikipedia", "question-answer")
test_split = qa_data["test"]

print(f"Total Q&A pairs: {len(test_split)}")
print("Example Question:", test_split[0]["question"])
print("Example Answer:", test_split[0]["answer"])

README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

data/test.parquet/part.0.parquet:   0%|          | 0.00/54.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/918 [00:00<?, ? examples/s]

Total Q&A pairs: 918
Example Question: Was Abraham Lincoln the sixteenth President of the United States?
Example Answer: yes


Chunking Function

In [13]:
def split_into_chunks(text, chunk_size=600):
    """Cut text into equal chunks."""
    if not text or pd.isna(text):
        return []
    text = str(text)
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

Create Chunks

In [14]:
chunks = []
for idx, row in df_passages.iterrows():
    pieces = split_into_chunks(row["passage"])
    for j, seg in enumerate(pieces):
        chunks.append({
            "chunk_id": f"{idx}-{j}",
            "content": seg,
            "source": idx
        })

print(f"Total chunks created: {len(chunks)}")
print("Example chunk:\n", chunks[0]["content"][:250], "...")

Total chunks created: 1289
Example chunk:
 Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area. ...


In [15]:
print("Generating embeddings...")

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

texts = [c["content"] for c in chunks]
embeds = encoder.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True
).astype("float32")

print("Embedding shape:", embeds.shape)

Generating embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Embedding shape: (1289, 384)


Setup Milvus Database

In [18]:
print("Setting up Milvus collection...")

field_id = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False)
field_text = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=3000)
field_vec = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=embeds.shape[1])

schema = CollectionSchema([field_id, field_text, field_vec])

milvus = MilvusClient("rag_mini_demo.db")
milvus.create_collection("wiki_chunks", schema=schema)

# Insert data
to_insert = [
    {"id": i, "text": chunks[i]["content"], "vector": embeds[i].tolist()}
    for i in range(len(chunks))
]
milvus.insert("wiki_chunks", to_insert)

print("Inserted entities:", milvus.get_collection_stats("wiki_chunks")["row_count"])

Setting up Milvus collection...
Inserted entities: 1289


Indexing

In [19]:
index_params = milvus.prepare_index_params()
index_params.add_index("vector", metric_type="COSINE")

milvus.create_index("wiki_chunks", index_params)
milvus.load_collection("wiki_chunks")

print("Index ready.")

Index ready.


Load FLAN-T5 Generator

In [20]:
model_name = "google/flan-t5-large"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    generator = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )
    print("FLAN-T5 ready ✅")
except Exception as e:
    print("FLAN-T5 load failed:", e)
    generator = None

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


FLAN-T5 ready ✅


Retrieval & Answer Functions

In [21]:
def retrieve(query, k=5):
    """Search Milvus for similar chunks."""
    vec = encoder.encode([query], normalize_embeddings=True).astype("float32")[0].tolist()
    results = milvus.search("wiki_chunks", data=[vec], limit=k, output_fields=["id", "text"])

    hits = []
    for r in results[0]:
        hits.append((r["id"], r["entity"]["text"], r["distance"]))
    return hits


def answer_query(query, k=5, max_len=1800):
    """Retrieve context and run FLAN-T5 generation."""
    results = retrieve(query, k)
    context = "\n\n".join([t[1] for t in results])[:max_len]

    if not generator:
        return "[FLAN unavailable]", results

    prompt = (
        "Answer the question using only the given context.\n"
        "If you cannot answer, say 'I don't know.'\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\nAnswer:"
    )

    output = generator(prompt, max_length=512, temperature=0.2)[0]["generated_text"]
    return output, results

In [22]:
q1 = "What are the three sections of a beetle?"
print("🔍 Query:", q1)

hits = retrieve(q1, 3)
for hid, txt, score in hits:
    print(f"ID: {hid} | Score: {score:.4f}")
    print("Text:", txt[:150], "...\n")

🔍 Query: What are the three sections of a beetle?
ID: 1281 | Score: 0.3709
Text: s as generally assumed, which would necessitate splitting the traditional Pelecaniformes in three. ...

ID: 1274 | Score: 0.3135
Text: The Megadyptes - Eudyptes clade occurs at similar latitudes (though not as far north as the Galapagos Penguin), has its highest diversity in the New Z ...

ID: 1269 | Score: 0.2790
Text: Pygoscelis contains species with a fairly simple black-and-white head pattern; their distribution is intermediate, centered on Antarctic coasts but ex ...



In [23]:
sample_q = test_split[0]["question"]
print("Q:", sample_q)

ans, ctx = answer_query(sample_q, k=5)

print("\n=== Generated Answer ===")
print(ans)

print("\n=== Retrieved Chunks ===")
for i, (cid, txt, s) in enumerate(ctx, 1):
    print(f"[{i}] ID: {cid} | Score: {s:.4f}")
    print(txt[:120], "...\n")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Q: Was Abraham Lincoln the sixteenth President of the United States?


Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



=== Generated Answer ===
Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination.

=== Retrieved Chunks ===
[1] ID: 339 | Score: 0.7095
Young Abraham Lincoln ...

[2] ID: 320 | Score: 0.6434
Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from Ma ...

[3] ID: 381 | Score: 0.5896
On November 6, 1860, Lincoln was elected as the 16th President of the United States, beating Democrat Stephen A. Douglas ...

[4] ID: 882 | Score: 0.5569
Sixteen months before his death, his son, John Quincy Adams, became the sixth President of the United States (1825 1829) ...

[5] ID: 480 | Score: 0.5484
* American School, Lincoln's economic views. ...



In [24]:
# Load the same QA dataset
ds_eval = load_dataset("rag-datasets/rag-mini-wikipedia", "question-answer")
qa_eval = ds_eval["test"]  # use the test split
squad_metric = evaluate.load("squad")

print("QA evaluation size:", len(qa_eval))
print("Sample QA:", {k: qa_eval[0][k] for k in ("question", "answer")})

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

QA evaluation size: 918
Sample QA: {'question': 'Was Abraham Lincoln the sixteenth President of the United States?', 'answer': 'yes'}


Prompting Strategies

In [25]:
def build_prompt_instruction(context, question):
    return (
        "Respond ONLY with information found in the context. "
        "If the context doesn't provide an answer, respond with 'I don't know.'\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )

def build_prompt_cot(context, question):
    return (
        "You are a methodical thinker. Base your reasoning ONLY on the context. "
        "If there's not enough information, say 'I don't know.' Proceed logically, step by step.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )

def build_prompt_persona(context, question):
    return (
        "You are an encyclopedia-style responder. Use ONLY the given context. "
        "If you cannot find an answer in the context, respond with 'I don't know.' Be clear, accurate, and concise.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )

PROMPTS = {
    "instruction": build_prompt_instruction,
    "cot": build_prompt_cot,
    "persona": build_prompt_persona,
}

Local Generation (FLAN-T5)

In [26]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

try:
    tok_eval = AutoTokenizer.from_pretrained("google/flan-t5-base")
    mdl_eval = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
    pipe_eval = pipeline("text2text-generation", model=mdl_eval, tokenizer=tok_eval)

    def generate_answer(prompt, max_new_tokens=128, temperature=0.0):
        result = pipe_eval(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
        return result.strip()

    print("✅ Using local Transformers: google/flan-t5-base")
except Exception as e:
    print("⚠️ Local Transformers unavailable:", e)
    pipe_eval = None

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


✅ Using local Transformers: google/flan-t5-base


Retrieval Wrapper for Eval

In [27]:
def get_top1_context(query):
    """Retrieve the single most relevant chunk."""
    hits = retrieve(query, k=1)  # reuse your earlier retrieve()
    return hits[0][1] if hits else ""

In [28]:
def evaluate_strategy(strategy_name, n_samples=None):
    """Evaluate one prompting strategy on subset/full QA set."""
    builder = PROMPTS[strategy_name]
    preds, refs = [], []

    total = len(qa_eval) if n_samples is None else min(n_samples, len(qa_eval))

    for i in range(total):
        q = qa_eval[i]["question"]
        gold = qa_eval[i]["answer"] if "answer" in qa_eval[i] else qa_eval[i]["answers"]
        gold_text = gold if isinstance(gold, str) else gold[0]

        ctx = get_top1_context(q)
        prompt = builder(ctx, q)
        pred = generate_answer(prompt)

        preds.append({"id": str(i), "prediction_text": pred})
        refs.append({"id": str(i), "answers": {"text": [gold_text], "answer_start": [0]}})

    return squad_metric.compute(predictions=preds, references=refs), preds

Sanity Check

In [29]:
for name in PROMPTS:
    metrics, _ = evaluate_strategy(name, n_samples=20)
    print(f"{name} (20 samples):", metrics)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


instruction (20 samples): {'exact_match': 30.0, 'f1': 44.55011655011655}
cot (20 samples): {'exact_match': 15.0, 'f1': 27.979597562930895}
persona (20 samples): {'exact_match': 30.0, 'f1': 43.641025641025635}


Batched Eval per strategy (100 samples)

In [30]:
N_SAMPLES = 100
results = {}

for name in PROMPTS:
    print(f"\n→ Evaluating {name} on {N_SAMPLES} samples (GPU)")
    metrics, _ = evaluate_strategy(name, n_samples=N_SAMPLES)
    results[name] = metrics

print("\n=== Results ===")
for k,v in results.items():
    print(k, v)


→ Evaluating instruction on 100 samples (GPU)

→ Evaluating cot on 100 samples (GPU)

→ Evaluating persona on 100 samples (GPU)

=== Results ===
instruction {'exact_match': 17.0, 'f1': 20.7685332211648}
cot {'exact_match': 7.0, 'f1': 13.376291029729087}
persona {'exact_match': 20.0, 'f1': 23.634334086965666}
