# Mini RAG Eval (Local, Toy Data)
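Quick hit-rate@k check on a tiny corpus (a query counts as a hit if any expected doc is in the top k; this equals recall@k when each query has exactly one expected doc). Replace the toy docs with your chunks.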

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy corpus, stored as (doc_id, text) pairs.
docs = [
    ("doc1", "The sky is blue because molecules scatter sunlight."),
    ("doc2", "Plants make food through photosynthesis using sunlight."),
    ("doc3", "HTTP GET retrieves data; POST sends data to the server."),
]
# Transpose the pairs into two parallel lists: ids[i] labels corpus[i].
ids, corpus = (list(column) for column in zip(*docs))

# Fit the TF-IDF vocabulary on the corpus and vectorize it in the same call.
vec = TfidfVectorizer()
doc_vecs = vec.fit_transform(corpus)

def top_k(query, k=2):
    """Return the ``k`` documents most similar to ``query``.

    The query is transformed with the fitted vectorizer ``vec`` and compared
    against every row of ``doc_vecs`` by cosine similarity.

    Args:
        query: Free-text query string.
        k: Maximum number of results. Values <= 0 yield an empty list.

    Returns:
        A list of ``(doc_id, score)`` tuples ordered best first, with
        ``score`` as a plain ``float``. Documents with equal scores are
        returned in corpus order (earlier document first).
    """
    if k <= 0:
        # A negative k would otherwise slice from the end of the ranking
        # (e.g. [:-1] returns everything except the last result).
        return []
    qv = vec.transform([query])
    scores = cosine_similarity(qv, doc_vecs).ravel()
    # Sort the negated scores with a stable sort so ties keep corpus order;
    # the default argsort followed by [::-1] leaves tie order unspecified.
    order = (-scores).argsort(kind="stable")[:k]
    # float() unwraps numpy scalars so results print and serialize cleanly.
    return [(ids[i], float(scores[i])) for i in order]

# Evaluation cases as (query, expected_doc_ids) pairs. hit_rate counts a case
# as a hit when any one of its expected ids appears in the retrieved results.
test_set = [
    ("why is the sky blue", ["doc1"]),
    ("how do plants make food", ["doc2"]),
    ("what is http get", ["doc3"]),
]

def hit_rate(k=2, *, cases=None, retriever=None):
    """Print and return the fraction of queries with a hit in the top ``k``.

    A query counts as a hit when at least one of its expected doc ids is
    among the ids returned by the retriever.

    Args:
        k: Number of results requested per query.
        cases: Iterable of ``(query, expected_ids)`` pairs. Defaults to the
            module-level ``test_set``.
        retriever: Callable taking ``(query, k)`` and returning
            ``(doc_id, score)`` pairs. Defaults to the module-level ``top_k``.

    Returns:
        The hit rate as a float between 0 and 1; ``0.0`` when there are no
        cases to evaluate.
    """
    # Resolve defaults at call time so later reassignments of the globals
    # (e.g. re-running a notebook cell) are picked up.
    cases = list(test_set if cases is None else cases)
    if retriever is None:
        retriever = top_k

    hits = sum(
        any(doc_id in expected for doc_id, _ in retriever(query, k))
        for query, expected in cases
    )
    total = len(cases)
    # An empty evaluation set reports 0.00 (0/0) instead of dividing by zero.
    rate = hits / total if total else 0.0
    print(f"Hit rate @{k}: {rate:.2f} ({hits}/{total})")
    return rate

# Report the hit rate at each cutoff from 1 to 3.
for k in range(1, 4):
    hit_rate(k)

# Show the ranked (doc_id, score) pairs for one sample query.
sample_results = top_k("why is the sky blue", k=2)
print(sample_results)


## How to adapt
- Swap `docs` with your chunk texts and IDs.
- Replace TF-IDF with your embedding store for realism.
- Expand `test_set` with real (query, expected_ids) pairs.
- Add citation/faithfulness checks for a full RAG eval.