<a href="https://colab.research.google.com/github/reychely/food-rag-web/blob/main/IR_Project_RAG_APP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# ============================================================
# BLOCK 0: Settings + Paths (Drive or Local)
# What: define project folders for data + storage
# Why: keeps your indexed data + outputs persistent
# ============================================================

import os, json, re, hashlib, math, time
from pathlib import Path

IN_COLAB = False
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except:
    IN_COLAB = False

# If you are in Colab, keep using your Drive path
BASE = "/content/drive/MyDrive/IR_RAG_App" if IN_COLAB else str(Path.cwd() / "IR_RAG_App")
DATA_DIR = Path(BASE) / "data" / "raw"
STORAGE_DIR = Path(BASE) / "storage"

DATA_DIR.mkdir(parents=True, exist_ok=True)
STORAGE_DIR.mkdir(parents=True, exist_ok=True)

print("‚úÖ IN_COLAB:", IN_COLAB)
print("‚úÖ DATA_DIR:", DATA_DIR)
print("‚úÖ STORAGE_DIR:", STORAGE_DIR)

# List files
files = sorted([p for p in DATA_DIR.rglob("*") if p.is_file()])
print(f"üìÑ Files found: {len(files)}")
for p in files[:15]:
    print(" -", p.name)


‚úÖ IN_COLAB: True
‚úÖ DATA_DIR: /content/drive/MyDrive/IR_RAG_App/data/raw
‚úÖ STORAGE_DIR: /content/drive/MyDrive/IR_RAG_App/storage
üìÑ Files found: 59
 - Animal_nutrition.txt
 - CRON-diet.txt
 - Calorie_restriction.txt
 - Child_Nutrition_Act.txt
 - Cottage_cheese.txt
 - Dal.txt
 - Diet_(nutrition).txt
 - Dietary_Guidelines_for_Americans.txt
 - Dietary_Guidelines_for_Americans_2020-2025.pdf
 - Eating a balanced diet - NHS.html
 - Eating.txt
 - Empty_calories.txt
 - Equine_nutrition.txt
 - Essential_amino_acid.txt
 - Flavonoid.txt


In [30]:
# ============================================================
# BLOCK 1: Install dependencies (fast, only if missing)
# What: install required libs for RAG pipeline
# Why: avoids slow re-install each run
# ============================================================

import importlib, sys, subprocess

def pip_install_if_missing(pkgs):
    missing = []
    for mod, pip_name in pkgs:
        try:
            importlib.import_module(mod)
        except:
            missing.append(pip_name)
    if missing:
        print("üì¶ Installing:", missing)
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + missing)
        print("‚úÖ Installed missing packages")
    else:
        print("‚úÖ All dependencies already installed")

pip_install_if_missing([
    ("numpy", "numpy"),
    ("tqdm", "tqdm"),
    ("pypdf", "pypdf"),
    ("docx", "python-docx"),
    ("bs4", "beautifulsoup4"),
    ("requests", "requests"),
    ("sentence_transformers", "sentence-transformers"),
    ("faiss", "faiss-cpu"),
    ("rank_bm25", "rank_bm25"),
    ("symspellpy", "symspellpy"),
    ("nltk", "nltk"),
    ("transformers", "transformers"),
    ("torch", "torch"),
])


‚úÖ All dependencies already installed


In [31]:
# ============================================================
# BLOCK 2: Imports + NLTK resources
# What: init NLP helpers (lemmatizer, wordnet)
# Why: better lexical retrieval + safer query expansion
# ============================================================

import numpy as np
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

lemmatizer = WordNetLemmatizer()
print("‚úÖ NLTK ready")


‚úÖ NLTK ready


In [32]:
# ============================================================
# BLOCK 3: Text cleaning + normalization
# What: consistent cleaning for docs + queries
# Why: improves BM25 + reduces noise in embeddings
# ============================================================

_token_re = re.compile(r"[a-zA-Z]{2,}")

def normalize_for_bm25(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)  # keep english letters only
    text = re.sub(r"\s+", " ", text).strip()
    return text

def clean_text_general(text: str) -> str:
    # remove obvious boilerplate/noise + normalize spaces
    text = text.replace("\x00", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [33]:
# ============================================================
# BLOCK 4: Load documents (txt/pdf/docx/html)
# What: read multiple formats into a unified list
# Why: supports expanding your corpus beyond .txt
# ============================================================

from typing import List, Dict
from pypdf import PdfReader
import docx
from bs4 import BeautifulSoup

def load_txt(path: Path) -> str:
    return path.read_text(encoding="utf-8", errors="ignore")

def load_pdf(path: Path) -> str:
    reader = PdfReader(str(path))
    pages = []
    for pg in reader.pages:
        pages.append(pg.extract_text() or "")
    return "\n".join(pages)

def load_docx(path: Path) -> str:
    d = docx.Document(str(path))
    return "\n".join([p.text for p in d.paragraphs])

def load_html(path: Path) -> str:
    raw = path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(raw, "html.parser")
    # drop scripts/styles
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return soup.get_text(" ", strip=True)

LOADERS = {
    ".txt": load_txt,
    ".pdf": load_pdf,
    ".docx": load_docx,
    ".html": load_html,
    ".htm": load_html,
}

def load_one_file(p: Path) -> str:
    ext = p.suffix.lower()
    if ext not in LOADERS:
        return ""
    try:
        t = LOADERS[ext](p)
        return clean_text_general(t)
    except Exception as e:
        print("‚ö†Ô∏è Failed:", p.name, "|", e)
        return ""

def load_corpus(data_dir: Path, min_chars=200) -> List[Dict]:
    docs = []
    for p in sorted([x for x in data_dir.rglob("*") if x.is_file()]):
        text = load_one_file(p)
        # filter tiny/noisy docs
        if len(text) < min_chars:
            continue
        docs.append({
            "source": p.name,
            "path": str(p),
            "ext": p.suffix.lower(),
            "text": text
        })
    return docs

docs = load_corpus(DATA_DIR, min_chars=200)
print(f"‚úÖ Loaded {len(docs)} docs (min_chars=200)")
if docs:
    print("Example:", docs[0]["source"], "| chars:", len(docs[0]["text"]))


‚úÖ Loaded 58 docs (min_chars=200)
Example: Animal_nutrition.txt | chars: 6142


In [38]:
# ============================================================
# BLOCK 5: Chunking + Dedup (exact duplicates)
# What: split docs into overlapping chunks; remove duplicates
# Why: chunking improves retrieval, dedup avoids repetitive answers
# ============================================================

CHUNK_SIZE = 900
CHUNK_OVERLAP = 150

def chunk_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    chunks = []
    n = len(text)
    step = max(1, chunk_size - overlap)
    start = 0
    while start < n:
        end = min(start + chunk_size, n)
        c = text[start:end].strip()
        if c:
            chunks.append(c)
        start += step
    return chunks

def sha1(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8", errors="ignore")).hexdigest()

chunks = []
seen_hash = set()

for d in docs:
    parts = chunk_text(d["text"])
    for i, c in enumerate(parts):
        h = sha1(c)
        if h in seen_hash:
            continue
        seen_hash.add(h)
        chunks.append({
            "source": d["source"],
            "path": d["path"],
            "ext": d["ext"],
            "chunk_id": f"{d['source']}::chunk{i}",
            "text": c
        })

print(f"‚úÖ Created {len(chunks)} unique chunks from {len(docs)} docs")
print("Preview:", chunks[0]["chunk_id"], "|", chunks[0]["text"][:160], "...")


‚úÖ Created 1637 unique chunks from 58 docs
Preview: Animal_nutrition.txt::chunk0 | Animal nutrition focuses on the dietary nutrients needs of animals, primarily those in agriculture and food production, but also in zoos, aquariums, and wildlif ...


In [39]:
# ============================================================
# BLOCK 6: Embeddings + FAISS index (semantic search)
# What: embed chunks + build FAISS vector index
# Why: handles paraphrases + semantic similarity
# ============================================================

import faiss
from sentence_transformers import SentenceTransformer

EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMB_MODEL)

texts = [c["text"] for c in chunks]
emb = embedder.encode(texts, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
emb = np.asarray(emb, dtype="float32")

dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)  # cosine similarity because normalized
index.add(emb)

faiss_path = STORAGE_DIR / "faiss.index"
meta_path = STORAGE_DIR / "chunks_meta.json"

faiss.write_index(index, str(faiss_path))
meta_path.write_text(json.dumps(chunks, ensure_ascii=False), encoding="utf-8")

print("‚úÖ Embeddings shape:", emb.shape)
print("‚úÖ FAISS index size:", index.ntotal)
print("‚úÖ Saved:", faiss_path)
print("‚úÖ Saved:", meta_path)


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

‚úÖ Embeddings shape: (1637, 384)
‚úÖ FAISS index size: 1637
‚úÖ Saved: /content/drive/MyDrive/IR_RAG_App/storage/faiss.index
‚úÖ Saved: /content/drive/MyDrive/IR_RAG_App/storage/chunks_meta.json


In [40]:
# ============================================================
# BLOCK 7: BM25 index (lexical search) + vocab frequencies
# What: tokenize (lemmatize) + build BM25; build vocab freq
# Why: BM25 helps exact matches (ingredients, terms, acronyms)
# ============================================================

from rank_bm25 import BM25Okapi
from collections import Counter

def tokenize_lemma(text: str) -> List[str]:
    t = normalize_for_bm25(text)
    toks = _token_re.findall(t)
    toks = [lemmatizer.lemmatize(x) for x in toks]
    return toks

bm25_tokens = [tokenize_lemma(c["text"]) for c in tqdm(chunks, desc="BM25 tokenize")]
bm25 = BM25Okapi(bm25_tokens)

bm25_vocab = Counter()
for toks in bm25_tokens:
    bm25_vocab.update(toks)

print("‚úÖ BM25 ready | vocab size:", len(bm25_vocab))


BM25 tokenize: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1637/1637 [00:02<00:00, 668.88it/s]


‚úÖ BM25 ready | vocab size: 10727


In [43]:
# ============================================================
# BLOCK 8: Spell correction (SymSpell) + safe overrides
# What: correct typos without semantic drift (is->his, to->two)
# Why: user queries are messy; safer correction improves retrieval
# ============================================================

# download common English words (50k)
import urllib.request
vocab_file = STORAGE_DIR / "en_vocab.txt"
if not vocab_file.exists():
    urllib.request.urlretrieve(
        "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2018/en/en_50k.txt",
        str(vocab_file)
    )
print("‚úÖ en_vocab.txt:", vocab_file)

from symspellpy import SymSpell, Verbosity

# stopwords we never correct
STOPWORDS = {
    "what","why","how","which","when","where","who",
    "is","are","was","were","to","of","in","on","for","with",
    "and","or","not","a","an","the","this","that","it"
}

# common typo overrides (nutrition-friendly)
COMMON_TYPO_OVERRIDES = {
    "wht": "what",
    "wats": "what",
    "whats": "what",
    "healty": "healthy",
    "helthy": "healthy",
    "unhealty": "unhealthy",
    "calory": "calorie",
}

def edit_distance(a: str, b: str) -> int:
    m, n = len(a), len(b)
    dp = list(range(n + 1))
    for i in range(1, m + 1):
        prev = dp[0]
        dp[0] = i
        for j in range(1, n + 1):
            cur = dp[j]
            cost = 0 if a[i-1] == b[j-1] else 1
            dp[j] = min(dp[j] + 1, dp[j-1] + 1, prev + cost)
            prev = cur
    return dp[n]

# init symspell
symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# load vocab sets
EN_VOCAB_SET = set()
with open(vocab_file, "r", encoding="utf-8") as f:
    for line in f:
        w = line.strip().lower()
        if len(w) >= 2:
            EN_VOCAB_SET.add(w)
            symspell.create_dictionary_entry(w, 1000)

DOMAIN_VOCAB_SET = set(bm25_vocab.keys())
for term, freq in bm25_vocab.items():
    if len(term) >= 2:
        symspell.create_dictionary_entry(term, int(freq))

KNOWN_WORDS = EN_VOCAB_SET | DOMAIN_VOCAB_SET

def symspell_fix_query(q: str) -> str:
    qn = normalize_for_bm25(q)
    out = []

    for tok in qn.split():
        if tok in COMMON_TYPO_OVERRIDES:
            out.append(COMMON_TYPO_OVERRIDES[tok])
            continue
        if tok in STOPWORDS or len(tok) <= 2:
            out.append(tok)
            continue
        if tok in KNOWN_WORDS:
            out.append(tok)
            continue

        sugg = symspell.lookup(tok, Verbosity.TOP, max_edit_distance=2)
        if not sugg:
            out.append(tok); continue

        best = sugg[0].term
        if best not in KNOWN_WORDS:
            out.append(tok); continue
        if edit_distance(tok, best) > 2:
            out.append(tok); continue

        # block known bad swaps
        BAD_SWAPS = {("wht","who"), ("to","two"), ("is","his")}
        if (tok, best) in BAD_SWAPS:
            out.append(tok); continue

        out.append(best)

    return " ".join(out)

# sanity
print("TEST1:", symspell_fix_query("Why is junk food considered unhealhy?"))
print("TEST2:", symspell_fix_query("Wht is healty to eat?"))


‚úÖ en_vocab.txt: /content/drive/MyDrive/IR_RAG_App/storage/en_vocab.txt
TEST1: why is junk food considered unhealthy
TEST2: what is healthy to eat


In [44]:
# ============================================================
# BLOCK 9: Query expansion (lightweight + domain hints)
# What: add a few related terms (synonyms + nutrition hints)
# Why: helps when docs use different wording than query
# ============================================================

DOMAIN_HINTS = {
    "healthy": ["balanced", "nutrient-dense", "whole foods", "vegetables", "fruits", "fiber"],
    "unhealthy": ["junk", "processed", "ultra-processed", "high sugar", "high salt", "high fat"],
    "junk": ["fast food", "ultra-processed", "snacks", "sugary drinks"],
    "calories": ["kcal", "kilocalories", "food energy"],
    "nutrients": ["vitamins", "minerals", "macronutrients", "micronutrients"],
}

def _wn_synonyms(word: str, max_syn=1):
    out = []
    for s in wn.synsets(word):
        for l in s.lemmas():
            w = l.name().replace("_"," ").lower()
            if w != word and w.isalpha() and w not in out:
                out.append(w)
            if len(out) >= max_syn:
                return out
    return out

def expand_query_domain(q: str) -> str:
    base = normalize_for_bm25(q)
    toks = base.split()

    extra = []
    for t in toks:
        if t not in STOPWORDS and len(t) >= 4:
            extra.extend(_wn_synonyms(t, max_syn=1))

    for key, adds in DOMAIN_HINTS.items():
        if key in toks:
            extra.extend(adds)

    seen = set(toks)
    final_extra = []
    for e in extra:
        e = normalize_for_bm25(e)
        if not e or e in seen:
            continue
        seen.add(e)
        final_extra.append(e)
        if len(final_extra) >= 12:
            break

    return base + (" " + " ".join(final_extra) if final_extra else "")


In [45]:
# ============================================================
# BLOCK 10: Query preparation (typo fix -> rewrite -> expand)
# What: builds retrieval query + debug info
# Why: prevents NameError and ensures consistent pipeline
# ============================================================

def prepare_query(q: str) -> dict:
    original = q
    typo_fixed = symspell_fix_query(q)

    rewritten = typo_fixed.strip()
    if not rewritten.endswith("?"):
        rewritten += "?"

    expanded = expand_query_domain(typo_fixed)

    return {
        "original": original,
        "typo_fixed": typo_fixed,
        "rewritten": rewritten,
        "expanded": expanded
    }


In [46]:
# ============================================================
# BLOCK 11: Hybrid retrieval (FAISS + BM25) + dedupe candidates
# What: merge semantic and lexical scores
# Why: best of both worlds + fewer missed matches
# ============================================================

def faiss_search(query: str, k=60):
    qemb = embedder.encode([query], normalize_embeddings=True).astype("float32")
    D, I = index.search(qemb, k)
    out = []
    for score, idx_ in zip(D[0], I[0]):
        out.append((int(idx_), float(score)))
    return out

def bm25_search(query: str, k=60):
    toks = tokenize_lemma(query)
    scores = bm25.get_scores(toks)
    top_idx = np.argsort(scores)[::-1][:k]
    return [(int(i), float(scores[i])) for i in top_idx]

def retrieve_hybrid_candidates(query: str, faiss_k=60, bm25_k=60, alpha=0.6):
    fa = faiss_search(query, k=faiss_k)
    bm = bm25_search(query, k=bm25_k)

    # normalize bm25 to [0,1] roughly
    bm_scores = np.array([s for _, s in bm], dtype="float32")
    bm_max = float(bm_scores.max()) if len(bm_scores) else 1.0

    merged = {}
    for idx_, s in fa:
        merged[idx_] = merged.get(idx_, 0.0) + alpha * s
    for idx_, s in bm:
        merged[idx_] = merged.get(idx_, 0.0) + (1 - alpha) * (s / bm_max if bm_max > 0 else 0.0)

    # build candidate list
    cands = []
    for idx_, score in merged.items():
        d = dict(chunks[idx_])
        d["hybrid_score"] = float(score)
        cands.append(d)

    cands.sort(key=lambda x: x["hybrid_score"], reverse=True)
    return cands


In [47]:
# ============================================================
# BLOCK 12: Reranker (cross-encoder) for better precision
# What: rerank candidates using query+chunk pair scoring
# Why: improves relevance, reduces noisy contexts
# ============================================================

from sentence_transformers import CrossEncoder

RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker = CrossEncoder(RERANK_MODEL)

def rerank_top(query: str, cands: List[Dict], limit=120):
    cands = cands[:limit]
    pairs = [(query, c["text"]) for c in cands]
    scores = reranker.predict(pairs, batch_size=32, show_progress_bar=False)
    for c, s in zip(cands, scores):
        c["rerank_score"] = float(s)
    cands.sort(key=lambda x: x["rerank_score"], reverse=True)
    return cands

def pick_diverse(ranked: List[Dict], k=6, max_per_source=1):
    picked = []
    per_src = {}
    for d in ranked:
        src = d["source"]
        per_src[src] = per_src.get(src, 0) + 1
        if per_src[src] > max_per_source:
            continue
        picked.append(d)
        if len(picked) >= k:
            break
    return picked


In [51]:
# ============================================================
# BLOCK 13: Answer generation (FLAN-T5) + clean output + citations
# What: generate short grounded answer using retrieved sources
# Why: better UX, avoids hallucination, shows citations in video
# ============================================================

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

LLM_NAME = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_NAME)

device = "cuda" if torch.cuda.is_available() else "cpu"
llm = llm.to(device)

def clean_answer(text: str, max_sentences: int = 4) -> str:
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    sents = re.split(r"(?<=[.!?])\s+", text)
    sents = [s.strip() for s in sents if s.strip()]
    sents = sents[:max_sentences]
    fixed = []
    for s in sents:
        fixed.append(s[:1].upper() + s[1:] if len(s) > 1 else s.upper())
    out = " ".join(fixed).strip()
    if out and out[-1] not in ".!?":
        out += "."
    return out

def build_prompt(question: str, contexts: List[Dict]) -> str:
    srcs = "\n\n".join([f"[{i+1}] {c['text']}" for i, c in enumerate(contexts)])
    return (
        "You answer questions about food, diet, nutrition.\n"
        "Use ONLY the provided sources.\n"
        "Write 3-5 short sentences in simple language.\n"
        "If the sources are not enough, say: I don't have enough information in the indexed data.\n"
        "End with: Sources: [1], [2], ...\n\n"
        f"Question: {question}\n\n"
        f"Sources:\n{srcs}\n\n"
        "Answer:"
    )

def retrieve_dynamic(question: str, max_k=6, rerank_min=0.2, max_per_source=1,
                     faiss_k=60, bm25_k=60, alpha=0.6, fused_limit=120):
    dbg = prepare_query(question)
    query_for_retrieval = dbg["expanded"]

    cands = retrieve_hybrid_candidates(query_for_retrieval, faiss_k=faiss_k, bm25_k=bm25_k, alpha=alpha)
    ranked = rerank_top(dbg["rewritten"], cands, limit=fused_limit)

    # filter weak rerank
    ranked = [d for d in ranked if d.get("rerank_score", -1e9) >= rerank_min]

    # diversify sources
    ctx = pick_diverse(ranked, k=max_k, max_per_source=max_per_source)
    return ctx, dbg

def rag_answer(question: str, max_k=6, rerank_min=0.2, max_new_tokens=140, max_per_source=1):
    ctxs, dbg = retrieve_dynamic(question, max_k=max_k, rerank_min=rerank_min, max_per_source=max_per_source)

    print("\n" + "="*70)
    print("QUESTION:", question)
    print("üõ†Ô∏è Query debug:")
    for k, v in dbg.items():
        print(f" - {k:9s}: {v}")

    if not ctxs:
        print("\n‚úÖ ANSWER:\nI don't have enough information in the indexed data.")
        return "I don't have enough information in the indexed data.", []

    print(f"\nüìå SOURCES USED (K={len(ctxs)}):")
    for i, c in enumerate(ctxs, 1):
        print(f"{i}) {c['source']} | {c['chunk_id']} | rerank={c['rerank_score']:.3f}")

    prompt = build_prompt(question, ctxs)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(device)

    with torch.no_grad():
        out = llm.generate(**inputs, max_new_tokens=max_new_tokens, num_beams=4)

    ans = tokenizer.decode(out[0], skip_special_tokens=True).strip()
    ans = clean_answer(ans, max_sentences=4)

    print("\n‚úÖ ANSWER:\n", ans)
    return ans, ctxs


In [49]:
# ============================================================
# BLOCK 14: Demo (your problematic typo query included)
# What: quick test to confirm pipeline is stable
# Why: validates spelling + retrieval + generation end-to-end
# ============================================================

_ = rag_answer("Why is junk food considered unhealthy?")
_ = rag_answer("What are calories and how are they measured?")
_ = rag_answer("Wht is healty to eat?")
_ = rag_answer("difference between macronutrients and micronutrients")



QUESTION: Why is junk food considered unhealthy?
üõ†Ô∏è Query debug:
 - original : Why is junk food considered unhealthy?
 - typo_fixed: why is junk food considered unhealthy
 - rewritten: why is junk food considered unhealthy?
 - expanded : why is junk food considered unhealthy debris nutrient see insalubrious processed ultra processed high sugar high salt high fat fast food snacks sugary drinks

üìå SOURCES USED (K=2):
1) JunkFood.pdf | JunkFood.pdf::chunk10 | rerank=5.656
2) Junk food and your health _ healthdirect.html | Junk food and your health _ healthdirect.html::chunk3 | rerank=3.807

‚úÖ ANSWER:
 Junk foods are mainly made up by using a lot of saturated fats which are unhealthy after digestion and release a lot of toxins into the body. Moreover, it lacks vitamins and minerals which are necessary to have good health and immunity to fight diseases. The practice of high consumption of junk foods like magi noodles, burgers, sandwiches, hot dogs, patties, pastries, popcorn, pot

In [50]:
# ============================================================
# BLOCK 15: Tiny Evaluation (Recall@K on a mini set)
# What: simple offline check of retrieval quality
# ============================================================

# Define a small gold set manually (for demo)
# Each query maps to keywords that should appear in at least one retrieved chunk.
EVAL_QUERIES = [
    ("What are calories?", ["kilocalorie", "kcal", "food energy", "calorie"]),
    ("Why is junk food unhealthy?", ["high sugar", "high salt", "high fat", "ultra-processed", "low nutrients"]),
    ("What is a balanced diet?", ["balanced", "nutrient", "vegetables", "fruits", "whole"]),
]

def recall_at_k(question: str, must_contain: List[str], k=5):
    ctxs, dbg = retrieve_dynamic(question, max_k=k, rerank_min=0.0)
    joined = " ".join([c["text"].lower() for c in ctxs])
    hits = sum(1 for kw in must_contain if kw.lower() in joined)
    return hits / max(1, len(must_contain)), ctxs

for q, kws in EVAL_QUERIES:
    r, ctxs = recall_at_k(q, kws, k=5)
    print("\nQ:", q)
    print("Recall@5:", round(r, 3), "| hits:", int(r*len(kws)), "/", len(kws))



Q: What are calories?
Recall@5: 1.0 | hits: 4 / 4

Q: Why is junk food unhealthy?
Recall@5: 0.2 | hits: 1 / 5

Q: What is a balanced diet?
Recall@5: 0.4 | hits: 2 / 5
