## Using RAG (Local: Sentence-Transformers + FAISS + FLAN-T5)

In [None]:
!pip -q install pandas numpy sentence-transformers faiss-cpu "transformers>=4.40.0" accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25h

## Loading the CSV file from Google Drive

In [None]:
import pandas as pd
import numpy as np

# Location of the dataset CSV (a path under the Colab Drive mount point).
csv_path = "/content/drive/MyDrive/coding_challenge/mle_screening_dataset.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
# Peek at the first three rows to confirm the file parsed as expected.
df.head(n=3)

Unnamed: 0,question,answer
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...
1,What is (are) Glaucoma ?,The optic nerve is a bundle of more than 1 mil...
2,What is (are) Glaucoma ?,Open-angle glaucoma is the most common form of...


## Convert each row into a text document

In [None]:
def row_to_text(row, max_len=200):
    """Render one DataFrame row as a compact ``"col=value; col2=value"`` string.

    Column names are read from the row itself (``row.items()``) instead of the
    notebook-level ``df``, so the function works on a row of any frame or on a
    standalone ``pd.Series``.

    Args:
        row: A ``pd.Series`` (or any object exposing ``.items()``) that maps
            column names to cell values.
        max_len: Maximum number of characters kept per value. Longer values are
            cut to ``max_len`` characters and suffixed with ``"…"``.

    Returns:
        One ``name=value`` part per non-missing cell, joined by ``"; "``.
        Cells for which ``pd.isna`` is true are skipped; a row with no usable
        cells yields ``""``.
    """
    parts = []
    for col, value in row.items():
        if pd.isna(value):
            continue
        text = str(value)
        if len(text) > max_len:
            # Keep documents short; the ellipsis marks that the value was cut.
            text = text[:max_len] + "…"
        parts.append(f"{col}={text}")
    return "; ".join(parts)

# Serialize every row of `df`; element i of `docs` is the text for row i.
row_strings = df.apply(row_to_text, axis=1)
docs = row_strings.tolist()
# Show the corpus size and one sample document.
(len(docs), docs[1])

(16406,
 'question=What is (are) Glaucoma ?; answer=The optic nerve is a bundle of more than 1 million nerve fibers. It connects the retina to the brain.')

## Embed the documents and build the FAISS index

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Sentence encoder used for both documents and queries (384-dim, fast & strong).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
batch = 128

# Encode the corpus in slices of `batch` documents and stack the per-slice
# embedding matrices row-wise into a single float32 matrix.
chunk_starts = range(0, len(docs), batch)
embs = np.vstack(
    [model.encode(docs[lo:lo + batch], show_progress_bar=False) for lo in chunk_starts]
).astype("float32")

# Normalize each vector to unit L2 norm (so inner product ≈ cosine).
faiss.normalize_L2(embs)

# Flat inner-product index sized to the embedding dimension.
index = faiss.IndexFlatIP(embs.shape[1])
index.add(embs)
# Number of vectors stored in the index.
index.ntotal


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

16406

## Retriever

In [None]:
def retrieve(question, k=8):
    """Return up to ``k`` indexed documents ranked by similarity to ``question``.

    The question is encoded with ``model``, L2-normalized, and searched against
    ``index`` by inner product.

    Returns:
        A list of dicts, in the order returned by the index search, with keys
        ``row_id`` (position in ``docs``), ``score`` (inner product as a Python
        float) and ``text`` (the document string). Result slots whose id is
        ``-1`` are dropped.
    """
    query_vec = model.encode([question]).astype("float32")
    faiss.normalize_L2(query_vec)
    scores, idxs = index.search(query_vec, k)  # inner product
    return [
        {"row_id": int(doc_id), "score": float(sim), "text": docs[doc_id]}
        for sim, doc_id in zip(scores[0], idxs[0])
        if doc_id != -1
    ]

# Smoke test: display the five best-scoring hits for a sample query.
retrieve(question="Who has the highest accuracy?", k=5)

[{'row_id': 9185,
  'score': 0.23538407683372498,
  'text': 'question=what research (or clinical trials) is being done for Tremor ?; answer=The National Institute of Neurological Disorders and Stroke, a unit of the National Institutes of Health (NIH) within the U.S. Department of Health and Human Services, is the nations leading federal f…'},
 {'row_id': 8733,
  'score': 0.23406271636486053,
  'text': "question=what research (or clinical trials) is being done for Essential Tremor ?; answer=The National Institute of Neurological Disorders and Stroke, a unit of the National Institutes of Health (NIH) within the U.S. Department of Health and Human Services, is the nation's leading federal …"},
 {'row_id': 9213,
  'score': 0.2128297984600067,
  'text': 'question=what research (or clinical trials) is being done for CADASIL ?; answer=The National Institute of Neurological Disorders and Stroke (NINDS) conducts stroke research and clinical trials at its laboratories and clinics at the National

##### Local generator (FLAN-T5). Use “small” on CPU; switch to “base” if you have a GPU.



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Checkpoint used for answer generation; try "google/flan-t5-base" if you have GPU.
GEN_MODEL = "google/flan-t5-small"

# Tokenizer and seq2seq model are loaded from the same checkpoint name.
tok = AutoTokenizer.from_pretrained(GEN_MODEL)
gen = AutoModelForSeq2SeqLM.from_pretrained(
    GEN_MODEL,
    device_map="auto",
    torch_dtype=torch.float32,
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## RAG QA function (retrieve → synthesize)

In [None]:
def rag_answer(question, k=8, max_new_tokens=128):
    """Answer ``question`` from the CSV: retrieve the top-``k`` rows, then generate.

    Args:
        question: Natural-language question.
        k: Number of rows requested from ``retrieve``.
        max_new_tokens: Generation budget passed to ``gen.generate``.

    Returns:
        ``(answer, view)``. ``answer`` is the decoded model output. ``view`` is
        a copy of the retrieved ``df`` rows with ``row_id`` and ``score``
        inserted as the first two columns. When nothing is retrieved,
        ``answer`` is a fixed fallback message and ``view`` is an empty frame
        with those same columns, and the generator is not called.
    """
    hits = retrieve(question, k=k)

    # Build the transparency view up front so both return paths share one schema.
    # (The original did `import pandas as pd` further down in this function, which
    # made `pd` a local name and broke the earlier `pd.DataFrame()` fallback with
    # UnboundLocalError; no pandas name is needed here at all.)
    row_ids = [h["row_id"] for h in hits]
    view = df.iloc[row_ids].copy()
    view.insert(0, "row_id", row_ids)
    view.insert(1, "score", [h["score"] for h in hits])

    if not hits:
        return "I couldn't find anything relevant in the CSV.", view

    # Build context with explicit row ids (citations).
    context = "\n".join(f"[row {h['row_id']}] {h['text']}" for h in hits)

    prompt = (
        "You are an AI assistant specialized in providing information based on the provided documents. "
        "Answer the user's question truthfully and concisely using *only* the following context. "
        "For each statement in your answer, indicate the source row ID(s) like [row 12, row 45]. "
        "If the answer is not present in the context, respond with 'Information not available in the provided documents.'.\n\n"
        f"Question: {question}\n\nContext:\n{context}\n\nAnswer:"
    )

    # NOTE(review): max_length=2048 may exceed the input length this checkpoint was
    # trained with — confirm against the model card before relying on long contexts.
    inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=2048).to(gen.device)
    out = gen.generate(**inputs, max_new_tokens=max_new_tokens)
    answer = tok.decode(out[0], skip_special_tokens=True)

    return answer, view

# Demo: ask one question, print the generated answer, then show the supporting rows.
ans, ctx = rag_answer(question="What are the symptoms of Glaucoma?", k=6)
print(ans)
display(ctx.head(n=5))

[row 0]


Unnamed: 0,row_id,score,question,answer
5,5,0.599668,What are the symptoms of Glaucoma ?,"At first, open-angle glaucoma has no symptoms...."
12221,12221,0.540391,What is (are) early-onset glaucoma ?,Glaucoma is a group of eye disorders in which ...
2270,2270,0.532081,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...
0,0,0.526198,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...
4,4,0.494626,How to prevent Glaucoma ?,"At this time, we do not know how to prevent gl..."


## RAGAS-based evaluation

In [None]:
from datasets import Dataset

# Prepare data for RAGAS evaluation
# We need a dataset with columns: 'question', 'answer', 'contexts', 'ground_truth' (optional)

# Let's create a small sample dataset for evaluation for demonstration purposes
questions = ["What is Glaucoma?", "How to prevent Glaucoma?", "Who is at risk for Glaucoma?"]
ground_truths = [
    "Glaucoma is a group of eye diseases that damage the optic nerve.",
    "Currently, there is no known way to prevent glaucoma.",
    "Anyone can develop glaucoma, but some groups are at higher risk."
    ]

data = {'question': questions, 'ground_truth': ground_truths}
eval_dataset = Dataset.from_dict(data)


def _context_texts(ctx_df):
    """Return one ``"<question> <answer>"`` string per retrieved row.

    Missing cells are replaced with ``""`` before concatenation: adding a string
    to a NaN cell yields NaN, which would put a float into the list of context
    strings. A frame with no rows yields an empty list (this also covers an empty
    frame that lacks the 'question'/'answer' columns).
    """
    if ctx_df.empty:
        return []
    question_col = ctx_df['question'].fillna("").astype(str)
    answer_col = ctx_df['answer'].fillna("").astype(str)
    return (question_col + " " + answer_col).tolist()


# Now, we need to generate answers and retrieve contexts for these questions using our RAG system
answers = []
contexts = []

for question in questions:
    ans, ctx_df = rag_answer(question, k=8) # Using k=8 as in the rag_answer function
    answers.append(ans)
    contexts.append(_context_texts(ctx_df)) # Extract text from the context DataFrame

eval_dataset = eval_dataset.add_column("answer", answers)
eval_dataset = eval_dataset.add_column("contexts", contexts)

print(eval_dataset)

Dataset({
    features: ['question', 'ground_truth', 'answer', 'contexts'],
    num_rows: 3
})


In [None]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
)

# Metrics to compute for every evaluation sample.
metrics = [answer_relevancy, faithfulness, context_recall]

# Score the questions, answers and contexts collected in `eval_dataset`.
result = evaluate(eval_dataset, metrics=metrics)

# Show the resulting score for each metric.
print(result)

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

{'answer_relevancy': 0.0000, 'faithfulness': 1.0000, 'context_recall': 1.0000}
