Multi-Metric RAG Evaluation System (MM-RAG-Eval)

Evaluates a RAG pipeline on several metrics at once:

Accuracy vs GT

Hallucination

Robustness

Similarity metrics




In [None]:
# Optimized RAG Hyperparameter Evaluation Pipeline (memory-friendly)
# ----------------------------------------------------------------
# - Loads LLM once (with max context window)
# - Builds VectorStoreIndex once per chunk size (caches indices)
# - Evaluates each (temp, chunk) config at the fixed max context window, reusing the cached index & model
# - Uses prompt-based 'temperature emulation' to avoid reloading LLM for each temp
#
# Notes:
# - If you want *strict numeric* temperature changes, uncomment the LLM re-init block
#   (but that reloads the heavy model per temperature and will increase RAM/time).
# - For production evaluation you may prefer to load multiple LLMs on GPU nodes or use
#   an API (OpenAI, Vertex) to vary temperature cheaply.
#
# Requirements (install in Colab/Jupyter before running):
# !pip install -q llama-index llama-index-embeddings-huggingface ragas datasets rouge-score nltk scikit-learn sentencepiece transformers
# !pip install -q llama-cpp-python llama-index-llms-llama-cpp  # required by the local LlamaCPP model this cell loads
# nltk data:
# import nltk; nltk.download("punkt")
# ----------------------------------------------------------------

import os
import logging
import time
import pandas as pd
import numpy as np

# NLP / metrics
import nltk
# word_tokenize loads its sentence tokenizer from "punkt" on older NLTK releases
# and from "punkt_tab" on recent ones; fetch both so BLEU tokenization works
# regardless of the installed NLTK version.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity

# LlamaIndex imports
from llama_index.core import Settings, VectorStoreIndex, Document, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt

# Route INFO-level (and above) messages to the root handler and use one named
# logger for every progress/warning message emitted by this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("optimized_rag_eval")

# -------------------------
# 1. User Data & Ground Truth
# -------------------------
# Corpus to index. Each entry is either a filesystem path (string) or a dict
# with a "text" key holding inline content; build_documents() skips anything else,
# including paths that do not exist.
DOCUMENT_SOURCES = [
    "/content/book (1).pdf",        # local PDF (ensure exists)
    {"text": """Cancer is a disease in which abnormal cells divide uncontrollably and can invade nearby tissues.
These abnormal cells may also spread to other parts of the body through the blood and lymph systems.
Cancer can start almost anywhere in the human body. There are many types of cancer including breast cancer,
lung cancer, prostate cancer, and blood cancers like leukemia.

Common symptoms of cancer include unexplained weight loss, fatigue, lumps, prolonged cough, and changes in bowel habits.
Treatment options include chemotherapy, radiation, surgery, immunotherapy, and targeted therapy.
Early detection significantly improves survival rates.
"""},
]

# Questions asked against every hyperparameter configuration.
TEST_QUESTIONS = [
    "What is cancer?",
    "What are common symptoms of cancer?",
    "How can cancer spread in the body?"
]

# Reference answers, looked up by the exact question string from TEST_QUESTIONS.
# A question without an entry here is scored against an empty reference.
GROUND_TRUTH = {
    "What is cancer?":
        "Cancer is a disease where abnormal cells divide uncontrollably and invade nearby tissues.",
    "What are common symptoms of cancer?":
        "Common symptoms include weight loss, fatigue, lumps, cough, and changes in bowel habits.",
    "How can cancer spread in the body?":
        "Cancer can spread through the blood and lymphatic systems to other parts of the body.",
}

# -------------------------
# 2. Hyperparameter grid (edit)
# -------------------------
# Previous grid values, kept for reference:
# temperatures = [0.1, 0.7, 1.0]
# chunk_sizes = [500, 300, 200]
# Context windows of interest. Only max(context_windows) is used: the LLM is
# loaded once with the largest window and that value is reported for every run.
context_windows = [8000, 4000, 2000]   # the model is loaded once with the max (8000)

# Temperatures are emulated through prompt instructions (see temperature_prompt_hint),
# not through the sampler.
temperatures = [0.1, 0.5, 0.9]
# NOTE(review): max_new_tokens_list is not read anywhere in the visible code; the
# LLM is created with a fixed max_new_tokens=256. Confirm whether it should be swept.
max_new_tokens_list = [128, 256, 512]
# One VectorStoreIndex is built and cached per distinct chunk size.
chunk_sizes = [128, 256, 512]
# -------------------------
# 3. Embedding & Metric utils (pre-initialize to avoid repeated overhead)
# -------------------------
# The same embedding model is used for indexing (via Settings) and for the
# cosine-similarity metric.
embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large")
Settings.embed_model = embed_model

# Shared scorer / smoothing objects reused by compute_rouge and compute_bleu.
rouge = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
smooth = SmoothingFunction().method1

def cosine_sim(a, b):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are embedded with the module-level ``embed_model``. Any failure
    while embedding or comparing is logged as a warning and reported as 0.0.
    """
    try:
        # Embed in argument order: `a` first, then `b`.
        vec_a, vec_b = (embed_model.get_text_embedding(text) for text in (a, b))
        similarity_matrix = cosine_similarity([vec_a], [vec_b])
        score = float(similarity_matrix[0][0])
    except Exception as err:
        logger.warning(f"Cosine embedding error: {err}")
        score = 0.0
    return score

def compute_rouge(pred, ref):
    """Return ``(rouge1_f, rougeL_f)`` F-measures of ``pred`` against ``ref``.

    Uses the shared module-level ``rouge`` scorer. On any scoring error a
    warning is logged and ``(0.0, 0.0)`` is returned.
    """
    try:
        scores = rouge.score(ref, pred)  # scorer takes (target, prediction)
        rouge1_f, rouge_l_f = (
            float(scores[metric].fmeasure) for metric in ("rouge1", "rougeL")
        )
    except Exception as err:
        logger.warning(f"ROUGE error: {err}")
        return 0.0, 0.0
    return rouge1_f, rouge_l_f

def compute_bleu(pred, ref):
    """Return the smoothed sentence-level BLEU score of ``pred`` against ``ref``.

    Both strings are lower-cased and tokenized with NLTK's ``word_tokenize``;
    ``ref`` is used as the single reference. Scoring is best-effort: on any
    error (for example a missing NLTK tokenizer resource) a warning is logged
    and 0.0 is returned, matching the behavior of the other metric helpers.
    """
    try:
        hypothesis = word_tokenize(pred.lower())
        references = [word_tokenize(ref.lower())]
        return float(sentence_bleu(references, hypothesis, smoothing_function=smooth))
    except Exception as e:
        # Log instead of failing silently so a broken tokenizer setup does not
        # masquerade as a genuine BLEU of zero.
        logger.warning(f"BLEU error: {e}")
        return 0.0

# Heuristic classification of accuracy
def classify_accuracy(cosine_score, rouge1, bleu):
    """Map similarity metrics to an accuracy label: "Yes", "Partial" or "No".

    "Yes" needs strong semantic similarity (cosine >= 0.75) or strong lexical
    overlap on both ROUGE-1 (>= 0.5) and BLEU (>= 0.4). "Partial" needs any one
    weaker signal (cosine >= 0.4, ROUGE-1 >= 0.25 or BLEU >= 0.15).
    """
    is_match = cosine_score >= 0.75 or (rouge1 >= 0.5 and bleu >= 0.4)
    if is_match:
        return "Yes"
    is_partial = cosine_score >= 0.4 or rouge1 >= 0.25 or bleu >= 0.15
    return "Partial" if is_partial else "No"

def hallucination_from_accuracy(acc_label):
    """Translate an accuracy label into a hallucination level.

    "Yes" -> "Low", "Partial" -> "Medium", "No" -> "High". Any other label
    raises ``KeyError``.
    """
    accuracy_labels = ("Yes", "Partial", "No")
    hallucination_levels = ("Low", "Medium", "High")
    level_for = dict(zip(accuracy_labels, hallucination_levels))
    return level_for[acc_label]

def robustness_score(acc_label, hall_label):
    """Combine accuracy and hallucination labels into a score in [0, 1].

    The score is 70% accuracy value plus 30% hallucination value, rounded to
    two decimals. Unknown labels raise ``KeyError``.
    """
    accuracy_weight, hallucination_weight = 0.7, 0.3
    accuracy_value = {"Yes": 1.0, "Partial": 0.6, "No": 0.0}[acc_label]
    hallucination_value = {"Low": 1.0, "Medium": 0.5, "High": 0.0}[hall_label]
    weighted = (accuracy_weight * accuracy_value) + (hallucination_weight * hallucination_value)
    return round(weighted, 2)

# -------------------------
# 4. Build documents once (load PDF + text)
# -------------------------
def build_documents(sources):
    """Turn a list of source descriptors into a flat list of documents.

    A string naming an existing path is read with ``SimpleDirectoryReader``
    (load failures are logged as warnings and skipped). A dict with a "text"
    key becomes a single ``Document``. Every other entry, including a path
    that does not exist, is logged and skipped.
    """
    collected = []
    for source in sources:
        is_existing_path = isinstance(source, str) and os.path.exists(source)
        if is_existing_path:
            try:
                file_docs = SimpleDirectoryReader(input_files=[source]).load_data()
                collected.extend(file_docs)
                logger.info(f"Loaded {len(file_docs)} docs from {source}")
            except Exception as err:
                logger.warning(f"Failed to load {source}: {err}")
            continue

        if isinstance(source, dict) and "text" in source:
            collected.append(Document(text=source["text"]))
            continue

        logger.info(f"Skipping source: {source}")
    return collected

# Load the corpus once; every index below is built from this same list.
docs = build_documents(DOCUMENT_SOURCES)
# Fail fast: with no documents there is nothing to index or evaluate.
if not docs:
    raise RuntimeError("No documents found. Check DOCUMENT_SOURCES.")

# -------------------------
# 5. Build & cache indices for each chunk size (only once)
# -------------------------
# Maps chunk size -> VectorStoreIndex built from `docs` with that chunk size.
index_cache = {}
# dict.fromkeys de-duplicates while keeping the configured order, so the build
# order (and the log output) is deterministic, unlike iterating a set.
for csize in dict.fromkeys(chunk_sizes):
    Settings.chunk_size = csize
    # LlamaIndex's default chunk_overlap is 200 tokens, which is larger than the
    # smallest chunk sizes swept here. Scale the overlap with the chunk size
    # (about 20%, the ratio of the library defaults 200/1024) so every
    # configuration is chunked under comparable conditions.
    Settings.chunk_overlap = min(200, csize // 5)
    logger.info(f"Building index for chunk_size={csize} (this may take a moment)")
    idx = VectorStoreIndex.from_documents(docs)
    index_cache[csize] = idx
logger.info("Index cache built for chunk sizes: " + ", ".join(map(str, index_cache.keys())))

# -------------------------
# 6. Load the LLM once with the MAX context window (memory efficient)
#     - we set context_window to max(context_windows) to support all runs
# -------------------------
# The single LLM instance is sized for the largest context window in the grid.
max_context = max(context_windows)
# Set LLM_MODEL_URL to a local GGUF file path (recommended for speed) or to a
# download URL. Leave it as None to fetch the default remote model (slower).
LLM_MODEL_URL = None  # e.g. "/mnt/models/mistral-7b-instruct.gguf"

llm_kwargs = dict(
    temperature = 0.1,                 # default; other temperatures are emulated via prompt hints
    max_new_tokens = 256,
    context_window = max_context,
    messages_to_prompt = messages_to_prompt,
    completion_to_prompt = completion_to_prompt,
    verbose = False,
)
# LlamaCPP treats `model_url` as something to download and `model_path` as a
# file to open, so a local GGUF file must be passed as model_path.
if LLM_MODEL_URL and os.path.exists(LLM_MODEL_URL):
    llm_kwargs["model_path"] = LLM_MODEL_URL
else:
    llm_kwargs["model_url"] = LLM_MODEL_URL or 'https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf'

logger.info("Loading LLM once (this is the heaviest step).")
try:
    llm = LlamaCPP(**llm_kwargs)
    Settings.llm = llm
    logger.info("LLM loaded.")
except Exception as e:
    # Chain the original exception so the real cause stays visible in the traceback.
    raise RuntimeError("Failed to initialize LlamaCPP. If you don't have llama-cpp installed or model accessible, switch to an API-based LLM. Error: " + str(e)) from e

# -------------------------
# 7. Query helper: uses index (cached) and the single LLM
#    We use a small prompt-prefix to hint at temperature style instead of reloading LLM for each temperature.
#    If you want exact numeric temperature behavior, see the commented 'strict-temp' option below.
# -------------------------
def temperature_prompt_hint(temp):
    """Return a prompt instruction that imitates a sampling temperature.

    Temperatures up to 0.2 get a strict, factual instruction; up to 0.6 a
    balanced one; anything higher a creative/speculative one.
    """
    # (inclusive upper bound, instruction) pairs, checked from coolest to warmest.
    tiers = (
        (0.2, "Answer concisely and factually. Do not add extra speculation or make up facts."),
        (0.6, "Answer accurately. You may paraphrase and summarize but stay factual."),
    )
    for upper_bound, instruction in tiers:
        if temp <= upper_bound:
            return instruction
    return "Answer creatively; if unsure, offer plausible hypotheses (may be speculative)."

def query_with_index(index, question, temp_hint):
    """
    Ask `question` through a query engine created from `index`.

    The temperature hint is placed before the question in the prompt text,
    and the engine's response is returned converted to a plain string.
    """
    prompt = "".join((temp_hint, "\n\nQuestion: ", question, "\nAnswer:"))
    response = index.as_query_engine().query(prompt)
    return str(response)

# ---------- Optional: strict numeric temp (uncomment to enable)
# WARNING: enabling this will reinitialize LlamaCPP per temperature (heavy)
# def real_query_strict_temp(index, question, temp, context_window):
#     Settings.llm = None
#     llm_local = LlamaCPP(model_url=LLM_MODEL_URL or '...', temperature=temp, context_window=context_window, max_new_tokens=256, messages_to_prompt=messages_to_prompt, completion_to_prompt=completion_to_prompt)
#     Settings.llm = llm_local
#     qe = index.as_query_engine()
#     resp = qe.query(question)
#     return str(resp)
# ----------

# -------------------------
# 8. Main sweep: iterate combos, evaluate per-question, aggregate
# -------------------------
# One row per (temp, chunk) configuration, aggregated over all questions.
results = []
# One row per (temp, chunk, question) with the raw output and every metric.
per_question_logs = []


def majority(lbls):
    """Return the most frequent label in ``lbls`` as a plain ``str``.

    Ties resolve to the alphabetically first label, because ``np.unique``
    returns sorted values and ``np.argmax`` picks the first maximum.
    """
    unique, counts = np.unique(lbls, return_counts=True)
    return str(unique[np.argmax(counts)])


# The LLM was loaded once with max_context, so every run sees that window.
# Restricting it per run would require re-initializing the model (heavy).
context_window_used = max_context

start_all = time.time()
for temp in temperatures:
    temp_hint = temperature_prompt_hint(temp)   # emulate temperature via instruction
    for csize in chunk_sizes:
        index = index_cache[csize]  # reuse cached index for this chunk size

        # Per-question results for this (temp, chunk) configuration
        acc_labels = []
        hall_labels = []
        robust_scores = []
        outputs_summary = []

        for q in TEST_QUESTIONS:
            try:
                out = query_with_index(index, q, temp_hint)
            except Exception as e:
                # Best effort: a failed query is scored as an empty answer.
                logger.error(f"Query failed for q='{q}' (t={temp}, c={csize}): {e}")
                out = ""

            # compute metrics vs GT; an empty answer or a missing reference scores 0
            gt = GROUND_TRUTH.get(q, "")
            if not gt:
                logger.warning(f"No ground truth for q='{q}'; metrics set to 0.")
            scorable = bool(out and gt)
            cos = cosine_sim(out, gt) if scorable else 0.0
            r1, rL = compute_rouge(out, gt) if scorable else (0.0, 0.0)
            bleu = compute_bleu(out, gt) if scorable else 0.0

            # classify
            acc_label = classify_accuracy(cosine_score=cos, rouge1=r1, bleu=bleu)
            hall_label = hallucination_from_accuracy(acc_label)
            robust = robustness_score(acc_label, hall_label)

            # gather
            acc_labels.append(acc_label)
            hall_labels.append(hall_label)
            robust_scores.append(robust)
            outputs_summary.append(out[:300].replace("\n", " "))  # short preview

            per_question_logs.append({
                "temp": temp, "chunk": csize, "context_window": context_window_used,
                "question": q, "output": out, "cosine": cos, "rouge1": r1, "rougeL": rL, "bleu": bleu,
                "accuracy_label": acc_label, "hallucination_label": hall_label, "robustness": robust
            })

        # aggregate across questions (majority) and avg robustness
        agg_acc = majority(acc_labels)
        # Derive the hallucination level from the aggregated accuracy instead of
        # voting on it separately: the two label sets sort differently, so
        # independent votes could disagree on a tie (e.g. "Partial" with "Low").
        agg_hall = hallucination_from_accuracy(agg_acc)
        agg_rob = round(float(np.mean(robust_scores)), 2)

        output_overall = "Correct" if agg_acc == "Yes" else ("Partial" if agg_acc == "Partial" else "Wrong")

        results.append({
            "Temp": temp,
            "Chunk": csize,
            "Context": context_window_used,
            "Output": output_overall,
            "Accuracy vs GT": agg_acc,
            "Hallucination": agg_hall,
            "Robustness": agg_rob,
            "sample_outputs": outputs_summary
        })

end_all = time.time()
logger.info(f"Full sweep finished in {round(end_all - start_all, 1)}s")

# -------------------------
# 9. Save / show results
# -------------------------
# Aggregated matrix: one row per (Temp, Chunk) configuration.
df = pd.DataFrame(results)
# Persist both the per-question details and the aggregated matrix as CSV files
# in the current working directory.
pd.DataFrame(per_question_logs).to_csv("per_question_details.csv", index=False)
df.to_csv("hyperparam_matrix_optimized.csv", index=False)

print("\nFinal Hyperparameter Matrix (optimized):")
# `display` is not imported in this file; presumably the IPython/Jupyter builtin — confirm when running outside a notebook.
display(df)

# Optional: visual heatmap (Temp x Chunk -> Robustness)
# Plotting is best-effort: any failure (including matplotlib not being
# installed) is logged as a warning and does not affect the saved CSVs.
try:
    # Rows are temperatures, columns are chunk sizes, cells are mean robustness.
    pivot = df.pivot(index="Temp", columns="Chunk", values="Robustness")
    import matplotlib.pyplot as plt
    plt.figure(figsize=(6,4))
    plt.title("Robustness Heatmap (Temp vs Chunk)")
    plt.imshow(pivot.values, aspect="auto")
    plt.colorbar(label="Robustness")
    # Label the ticks with the actual chunk sizes / temperatures instead of positions.
    plt.xticks(range(len(pivot.columns)), pivot.columns)
    plt.yticks(range(len(pivot.index)), pivot.index)
    plt.xlabel("Chunk Size")
    plt.ylabel("Temperature")
    plt.tight_layout()
    plt.show()
except Exception as e:
    logger.warning(f"Could not plot heatmap: {e}")

# -------------------------
# 10. Quick tips to further reduce memory/time if needed:
# -------------------------
#  - Use a smaller embedding model (e.g., all-MiniLM-L6-v2) for large PDFs.
#  - Use a smaller LLM for evaluation (distilled / 1-2B) if you only need ranking.
#  - Cache embedding vectors to disk (so reboots don't recompute).
#  - Run model on a GPU-enabled instance if available.
# -------------------------


In [None]:
# Run this cell before the evaluation cell: it installs LlamaIndex core plus the
# llama.cpp LLM and HuggingFace embedding integrations that the evaluation imports.
!pip install -q llama-index
%pip install llama-index-llms-llama-cpp
# !pip install -U langchain-community
%pip install llama-index-embeddings-huggingface

In [None]:
# !pip install rouge-score nltk
# import nltk
# nltk.download('punkt')
# !pip install RAGAS


# import nltk
# nltk.download('punkt_tab')