# MedQuAD RAG (Mini Showcase)
**Goal:** run a tiny, reproducible demo on a small sample corpus.
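No large dataset downloads (only the sentence-embedding model is fetched on first run); the notebook walks through retrieval → extractive answer.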


In [None]:
!pip -q install sentence-transformers faiss-cpu pandas scikit-learn tqdm

## 1) Create a tiny sample dataset (4 QA pairs)

In [None]:
import pandas as pd
# Toy corpus: four hand-written (question, answer) pairs, one row per pair.
data = pd.DataFrame(
    [
        ("Can ibuprofen cause stomach pain?",
         "Ibuprofen is an NSAID. It can cause gastrointestinal irritation including stomach pain, heartburn, and rarely bleeding or ulcers."),
        ("What is a normal A1C level?",
         "For many adults, an A1C below 5.7% is considered normal, 5.7–6.4% indicates prediabetes, and 6.5% or higher may indicate diabetes."),
        ("Do antihistamines cause drowsiness?",
         "Some antihistamines, especially first-generation agents, can cause drowsiness. Newer, second-generation antihistamines are less sedating."),
        ("What is a myocardial infarction?",
         "A myocardial infarction is a heart attack, which occurs when blood flow to part of the heart is blocked."),
    ],
    columns=["question", "answer"],
)
data.head()

## 2) Build passages (chunks of up to 3 sentences, overlapping by 1)

In [None]:
import re
# Sentence boundary: whitespace that follows ., ! or ? and precedes an
# uppercase letter, a digit or "(". Decimals such as "5.7%" are not split
# because no whitespace follows the dot.
SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-Z0-9(])')

def chunk_passages(text, window=3, overlap=1):
    """Split ``text`` into overlapping chunks of consecutive sentences.

    Args:
        text: Input text; it is converted with ``str()`` before splitting.
        window: Maximum number of sentences per chunk.
        overlap: Number of sentences shared by two consecutive chunks. The
            window always advances by at least one sentence.

    Returns:
        A list of chunk strings (sentences joined with a single space). If no
        chunk could be built, a one-element list holding the stripped input
        is returned, so the result is never empty.
    """
    sents = [s.strip() for s in SENT_SPLIT.split(str(text)) if s.strip()]
    step = max(1, window - overlap)
    chunks = []
    for start in range(0, len(sents), step):
        ch = " ".join(sents[start:start + window])
        if ch:
            chunks.append(ch)
        # Once the window has reached the last sentence, any further chunk
        # would only repeat a suffix of this one, so stop here.
        if start + window >= len(sents):
            break
    return chunks or [str(text).strip()]

# Flatten every answer into passages. Passage ids look like 'd<row>_<chunk>',
# and each question id 'q<row>' maps to the set of passage ids cut from its
# own answer (used later as the relevance judgments).
corpus, qid2docids = [], {}
for row_label, record in data.iterrows():
    row_chunks = chunk_passages(record['answer'])
    row_doc_ids = [f'd{row_label}_{pos}' for pos in range(len(row_chunks))]
    corpus.extend(
        {'doc_id': doc_id, 'text': chunk_text}
        for doc_id, chunk_text in zip(row_doc_ids, row_chunks)
    )
    qid2docids[f'q{row_label}'] = set(row_doc_ids)
import pandas as pd
corpus_df = pd.DataFrame(corpus)
corpus_df

## 3) Embed + FAISS index (small model for speed)

In [None]:
from sentence_transformers import SentenceTransformer
import faiss, numpy as np
# Parallel lists: row i of the FAISS index corresponds to passage_ids[i] / passages[i].
passage_ids, passages = corpus_df['doc_id'].tolist(), corpus_df['text'].tolist()
model = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v4')
# Embeddings are L2-normalised, so the inner-product index below ranks by cosine similarity.
emb = model.encode(
    passages,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)
def dense_retrieve(query, topn=10):
    """Return up to ``topn`` passages from the FAISS index for ``query``.

    Args:
        query: Query string; it is embedded with the same model and
            normalisation as the indexed passages.
        topn: Maximum number of hits to return.

    Returns:
        A list of ``(doc_id, score)`` tuples in the order FAISS returns them.
        The list is shorter than ``topn`` when the index holds fewer vectors,
        and empty when nothing can be retrieved.
    """
    # FAISS pads missing neighbours with label -1; indexing passage_ids with
    # -1 would silently return the *last* passage. Never ask for more hits
    # than the index contains.
    k = min(int(topn), index.ntotal)
    if k <= 0:
        return []
    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    D, I = index.search(q, k)
    # Keep the explicit >= 0 filter as a second line of defence against padding.
    return [(passage_ids[i], float(score)) for i, score in zip(I[0], D[0]) if i >= 0]


## 4) ask() → top-1 extractive answer

In [None]:
def ask(query, topn=10):
    """Answer ``query`` with the text of the best-ranked passage.

    Returns a ``(answer_text, candidates)`` pair, where ``candidates`` is the
    list produced by ``dense_retrieve``. When retrieval yields nothing the
    pair is ``('(no result)', [])``.
    """
    hits = dense_retrieve(query, topn=topn)
    if hits:
        best_id = hits[0][0]
        text_by_id = corpus_df.set_index('doc_id')
        return text_by_id.loc[best_id, 'text'], hits
    return '(no result)', []
# Smoke test: ask one of the sample questions and display the returned passage text.
ans, cand = ask(query='Can ibuprofen cause stomach pain?')
ans

## 5) Tiny IR metrics (MRR@10 / Recall@10)

In [None]:
import numpy as np
def mrr_at_k(rel, ranked, k=10):
    """Reciprocal rank of the first relevant id within the top ``k`` of ``ranked``.

    ``rel`` is a container of relevant ids and ``ranked`` an ordered sequence
    of retrieved ids. Ranks are 1-based; the result is 0.0 when no relevant
    id appears in the top ``k``.
    """
    hit_ranks = (rank for rank, pid in enumerate(ranked[:k], start=1) if pid in rel)
    first_hit = next(hit_ranks, None)
    return 0.0 if first_hit is None else 1.0 / first_hit
def recall_at_k(rel, ranked, k=10):
    """Fraction of the relevant ids that appear in the top ``k`` of ``ranked``.

    Args:
        rel: Iterable of relevant ids (set, list, tuple, ...); duplicates are
            ignored.
        ranked: Ordered sequence of retrieved ids.
        k: Cut-off rank.

    Returns:
        A float in [0, 1]; 0.0 when ``rel`` is empty.
    """
    # Coerce to a set so any iterable is accepted, matching mrr_at_k, which
    # only needs membership tests on ``rel``.
    relevant = set(rel)
    if not relevant:
        return 0.0
    return len(relevant & set(ranked[:k])) / len(relevant)
# Score every sample question against the passages cut from its own answer.
mrr = rec = 0.0
for qid, rel in qid2docids.items():
    # qid is 'q<index label>' (built from data.iterrows()), so look the row up
    # by label rather than by position; the two differ once `data` no longer
    # has a default 0..n-1 index.
    q = data.loc[int(qid[1:]), 'question']
    _, cand = ask(q, topn=10)
    ranked = [p for p, _ in cand]
    mrr += mrr_at_k(rel, ranked, 10)
    rec += recall_at_k(rel, ranked, 10)
N = len(qid2docids)
# max(N, 1) keeps the averages defined (0.0) when there are no queries.
{'MRR@10': round(mrr / max(N, 1), 3), 'Recall@10': round(rec / max(N, 1), 3)}