In [None]:
import sys
from pathlib import Path
from dotenv import load_dotenv

# Set up directory paths
notebook_path = Path().resolve()  # Current working directory (expected to be the notebook's directory)
experiments_path = notebook_path.parent  # Experiments directory where the utils module is located
backend_path = experiments_path.parent / "backend"

# sys.path holds strings, so the membership test must use the string form of the
# path; comparing the Path object itself never matches and would prepend a
# duplicate entry every time this cell is re-run.
if str(experiments_path) not in sys.path:
    sys.path.insert(0, str(experiments_path))

# Load environment variables from backend/.env if it exists;
# otherwise, call load_dotenv with its default lookup behavior
env_path = backend_path / ".env"
if env_path.exists():
    load_dotenv(env_path, override=True)
    print(f"[OK] Loaded environment variables from {env_path}")
else:
    load_dotenv(override=True)
    print("[WARNING] Backend .env not found, using default environment")


In [None]:
# --- Local Utilities ---
from utils.handbook_loader import load_handbook_documents

# Read every handbook document stored under <backend>/data/handbook
all_documents = load_handbook_documents(backend_path.joinpath("data", "handbook"))

In [None]:
from litellm import completion
from typing import List
from utils.prompts import QA_GENERATION_SYSTEM_PROMPT
from utils.models import QAPairList, HandbookDoc, HandbookDocMetadata, QAPairWithTS

# ---------------------------------------------------------------------------
# Core QA generation utility function
# ---------------------------------------------------------------------------

def generate_qa_pairs_from_single_document(
    document: HandbookDoc,
    num_questions: int = 3,
    model: str = 'groq/openai/gpt-oss-20b'
) -> List[QAPairWithTS]:
    """
    Ask the LLM for factoid question-answer pairs grounded in one handbook document.

    Args:
        document: Handbook document whose ``content`` is embedded in the prompt.
        num_questions: How many QA pairs the prompt requests.
        model: Model name forwarded to the LLM ``completion`` call.

    Returns:
        One QAPairWithTS per pair in the model's reply, each tagged as
        'single-source' and carrying the document's fields (minus ``content``)
        as its only doc_metadata entry.
    """
    prompt_sections = [
        "The user has provided the following document:\n\n",
        f"[DOCUMENT BEGINS]\n\n{document.content}\n\n[DOCUMENT ENDS]\n\n\n",
        f"Generate {num_questions} question-answer pairs from the document.\n\n",
        "Reply only with the question-answer pairs, nothing else.",
    ]
    chat = [
        {"role": "system", "content": QA_GENERATION_SYSTEM_PROMPT},
        {"role": "user", "content": "".join(prompt_sections)},
    ]

    # QAPairList is both the requested response format and the parser for the reply.
    llm_response = completion(model=model, messages=chat, response_format=QAPairList)
    parsed_reply = QAPairList.model_validate_json(llm_response.choices[0].message.content)

    enriched_pairs = []
    for generated in parsed_reply.pairs:
        pair_fields = generated.model_dump()
        # Each pair gets its own metadata object built from the document minus its text.
        source_metadata = HandbookDocMetadata(**document.model_dump(exclude={'content'}))
        enriched_pairs.append(
            QAPairWithTS(
                **pair_fields,
                question_type='single-source',
                doc_metadata=[source_metadata],
            )
        )
    return enriched_pairs

# Smoke-test the generator on the first loaded handbook document (default num_questions and model)
qa_pairs = generate_qa_pairs_from_single_document(all_documents[0])


In [None]:
# Show the QA pairs generated in the previous cell as this cell's output
qa_pairs

In [None]:
from utils.prompts import GROUNDEDNESS_CRITIQUE_SYSTEM_PROMPT, RELEVANCE_CRITIQUE_SYSTEM_PROMPT, STANDALONE_CRITIQUE_SYSTEM_PROMPT
from utils.models import QuestionCritique, QuestionCritiqueWithType, CritiqueType

# ---------------------------------------------------------------------------
# Unified critique model and prompt mapping
# ---------------------------------------------------------------------------
 
# System prompt for each critique dimension, keyed by critique type;
# critique_question selects its system prompt from this mapping.
CRITIQUE_PROMPTS: dict[CritiqueType, str] = {
    "groundedness": GROUNDEDNESS_CRITIQUE_SYSTEM_PROMPT,
    "relevance": RELEVANCE_CRITIQUE_SYSTEM_PROMPT,
    "standalone": STANDALONE_CRITIQUE_SYSTEM_PROMPT,
}

def critique_question(
    context: str,
    question: str,
    critique_type: CritiqueType,
    model: str = 'groq/openai/gpt-oss-20b',
) -> QuestionCritiqueWithType:
    """
    Score one question against its source context along a single critique dimension.

    Args:
        context: Source document text the question is judged against.
        question: Question under evaluation.
        critique_type: Key into CRITIQUE_PROMPTS selecting the system prompt
            ("groundedness", "relevance", or "standalone").
        model: Model name forwarded to the LLM call.

    Returns:
        The critique fields parsed from the model's reply, tagged with ``critique_type``.
    """
    # Look the prompt up first so an unknown critique type fails before any LLM call.
    system_prompt = CRITIQUE_PROMPTS[critique_type]
    prompt_sections = [
        "The user has provided the following question:\n\n",
        f"[QUESTION BEGINS]\n\n{question}\n\n[QUESTION ENDS]\n\n",
        "The user has provided the following context:\n\n",
        f"[CONTEXT BEGINS]\n\n{context}\n\n[CONTEXT ENDS]\n\n\n",
        "Reply only with your rationale for the rating and your score (1-5), nothing else.",
    ]
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "".join(prompt_sections)},
    ]

    llm_response = completion(model=model, messages=chat, response_format=QuestionCritique)
    base_critique = QuestionCritique.model_validate_json(llm_response.choices[0].message.content)
    return QuestionCritiqueWithType(**base_critique.model_dump(), critique_type=critique_type)


In [None]:
def critique_all_dimensions(context: str, question: str) -> List[QuestionCritiqueWithType]:
    """
    Run every supported critique dimension for one question/context pair.

    Args:
        context: Source document text the question is judged against.
        question: Question under evaluation.

    Returns:
        One QuestionCritiqueWithType per dimension, in the fixed order
        relevance, standalone, groundedness.
    """
    dimension_order = ("relevance", "standalone", "groundedness")
    results = []
    for dimension in dimension_order:
        results.append(critique_question(context, question, dimension))
    return results

# Test the unified critique function across all three dimensions,
# using the first handbook document and the first QA pair generated from it
context = all_documents[0].content
question = qa_pairs[0].question
critiques = critique_all_dimensions(context, question)

# Print the question followed by one line per critique dimension
print(question)
for critique in critiques:
    print(f"{critique.critique_type}: {critique.rationale} (score: {critique.score})")

In [None]:
# Show the structured critique objects from the previous cell as this cell's output
critiques

In [None]:
from utils.models import QAPairEvalRecord

# Map each document id to its text so critique contexts can be rebuilt from metadata
doc_content_lookup = dict((doc.id, doc.content) for doc in all_documents)

# Critique every generated QA pair; the context joins the text of all source
# documents referenced by the pair's metadata
eval_records = [
    QAPairEvalRecord(
        **qa_pair.model_dump(),
        critiques=critique_all_dimensions(
            "\n\n---\n\n".join(doc_content_lookup[meta.id] for meta in qa_pair.doc_metadata),
            qa_pair.question,
        ),
    )
    for qa_pair in qa_pairs
]

eval_records

In [None]:
import random

def generate_eval_dataset(
    documents: List[HandbookDoc],
    n_docs: int = 10,
    seed: int = 42,
) -> List[QAPairEvalRecord]:
    """
    Sample n_docs documents and run the full QA generation + critique pipeline.

    Args:
        documents: Full list of handbook documents to sample from.
        n_docs: Number of documents to sample (default 10); capped at len(documents).
        seed: Random seed for reproducible document sampling (default 42).

    Returns:
        List of QAPairEvalRecord instances with critiques across all dimensions.
        Empty when ``documents`` is empty.
    """
    # A dedicated Random instance yields the same sample as seeding the module-level
    # generator, but leaves the global random state untouched for other code.
    rng = random.Random(seed)
    sampled_docs = rng.sample(documents, min(n_docs, len(documents)))
    # Only sampled documents can appear in the generated pairs' metadata.
    doc_content_lookup = {doc.id: doc.content for doc in sampled_docs}

    all_qa_pairs: List[QAPairWithTS] = []
    for doc in sampled_docs:
        all_qa_pairs.extend(generate_qa_pairs_from_single_document(doc))

    return [
        QAPairEvalRecord(
            **qa_pair.model_dump(),
            # Context is the text of every source document, separated by a rule.
            critiques=critique_all_dimensions(
                "\n\n---\n\n".join(doc_content_lookup[m.id] for m in qa_pair.doc_metadata), qa_pair.question
            ),
        )
        for qa_pair in all_qa_pairs
    ]

# Keep the requested sample size in one place so the call and the summary agree;
# the number actually sampled is capped by the size of the corpus.
N_EVAL_DOCS = 5
eval_records = generate_eval_dataset(all_documents, n_docs=N_EVAL_DOCS, seed=42)
print(f"Generated {len(eval_records)} eval records from {min(N_EVAL_DOCS, len(all_documents))} sampled documents.")


In [None]:
OUTPUT_DIR = Path(".")  # current working directory; the notebook's directory when the kernel runs from there

def save_eval_jsonl(records: List[QAPairEvalRecord], path: Path) -> None:
    """Write each eval record as one JSON line to ``path``, replacing any existing file."""
    target = Path(path)
    with target.open("w", encoding="utf-8") as handle:
        handle.writelines(record.model_dump_json() + "\n" for record in records)
    print(f"Saved {len(records)} records → {target}")


def save_filtered_eval_jsonl(
    records: List[QAPairEvalRecord],
    path: Path,
    min_score: int = 4,
) -> List[QAPairEvalRecord]:
    """
    Write only the records whose critiques all reach ``min_score`` to a JSONL file.

    Args:
        records: Eval records to filter.
        path: Destination JSONL file, written via save_eval_jsonl.
        min_score: Lowest score a critique may have for its record to be kept.

    Returns:
        The records that passed the filter, in their original order.
    """
    passing = []
    for record in records:
        if all(critique.score >= min_score for critique in record.critiques):
            passing.append(record)
    save_eval_jsonl(passing, path)
    print(
        f"Filtered to {len(passing)} / {len(records)} records "
        f"(all dimensions >= {min_score})"
    )
    return passing


# Write the full dataset, then the subset whose critiques all meet the default min_score
save_eval_jsonl(eval_records, OUTPUT_DIR / "eval_questions.jsonl")
filtered_records = save_filtered_eval_jsonl(
    eval_records, OUTPUT_DIR / "eval_questions_filtered.jsonl"
)

In [None]:
import pandas as pd

# Critique dimensions whose scores appear as DataFrame columns and are selected for the plots
SCORE_DIMS = ["relevance", "standalone", "groundedness"]


def build_eval_dataframe(records: List[QAPairEvalRecord]) -> pd.DataFrame:
    """
    Build a tidy DataFrame from eval records — one row per question.

    Columns: id, question, answer, question_type, doc_title, doc_category,
             relevance, standalone, groundedness.

    The integer 'id' (0-indexed) is the shared key that links rows here
    to entries in the markdown export and JSONL files.

    Args:
        records: Eval records to tabulate; may be empty.

    Returns:
        A DataFrame with the columns listed above. The columns are present even
        when ``records`` is empty, so column selection downstream keeps working
        when no record survives filtering. A dimension with no critique on a
        record is left as a missing value.
    """
    columns = [
        "id", "question", "answer", "question_type", "doc_title", "doc_category",
        "relevance", "standalone", "groundedness",
    ]
    rows = []
    for i, record in enumerate(records):
        # Index scores by dimension so absent dimensions come back as None.
        score_map = {c.critique_type: c.score for c in record.critiques}
        rows.append({
            "id": i,
            "question": record.question,
            "answer": record.answer,
            "question_type": record.question_type,
            # Multi-source records list every title/category, pipe-separated.
            "doc_title": " | ".join(m.title for m in record.doc_metadata),
            "doc_category": " | ".join(m.category for m in record.doc_metadata),
            "relevance": score_map.get("relevance"),
            "standalone": score_map.get("standalone"),
            "groundedness": score_map.get("groundedness"),
        })
    # Passing columns explicitly fixes the schema even for an empty row list.
    return pd.DataFrame(rows, columns=columns)


# Tabulate every eval record and save the table as CSV under OUTPUT_DIR
df = build_eval_dataframe(eval_records)
df.to_csv(OUTPUT_DIR / "eval_questions.csv", index=False)
print(f"Saved DataFrame ({df.shape[0]} rows × {df.shape[1]} cols) → eval_questions.csv")
# Preview the first rows as the cell output
df.head()

In [None]:
def format_eval_record_md(record: QAPairEvalRecord, id: int) -> str:
    """
    Render a single eval record as a Markdown block.

    Uses native Markdown (headings, tables, bold) so the file renders cleanly
    in any Markdown viewer. The 'ID: <n>' heading is Ctrl+F friendly and maps
    directly to the 'id' column in the DataFrame.

    Args:
        record: Eval record providing question, answer, question_type,
            doc_metadata (title/category) and critiques (type/score/rationale).
        id: Integer shown in the section heading.

    Returns:
        The Markdown text for the record, ending with a '---' rule and a newline.
    """
    SCORE_EMOJI = {1: "★☆☆☆☆", 2: "★★☆☆☆", 3: "★★★☆☆", 4: "★★★★☆", 5: "★★★★★"}

    # Built outside the f-string below: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError on Python versions before 3.12.
    sources = ", ".join(f"{m.title} [{m.category}]" for m in record.doc_metadata)

    lines = []

    # Header — easy to Ctrl+F
    lines.append(f"## ID: {id}")
    lines.append("")

    # Q / A / metadata
    lines.append(f"**Q:** {record.question}")
    lines.append("")
    lines.append(f"**A:** {record.answer}")
    lines.append("")
    lines.append(
        f"**Type:** {record.question_type} &nbsp;|&nbsp; "
        f"**Source:** {sources}"
    )
    lines.append("")

    # Score summary table
    lines.append("| Dimension | Score | Rating |")
    lines.append("|-----------|:-----:|--------|")
    for c in record.critiques:
        lines.append(
            f"| {c.critique_type.capitalize()} "
            f"| {c.score} / 5 "
            # Scores outside 1-5 get an empty rating cell instead of failing.
            f"| {SCORE_EMOJI.get(c.score, '')} |"
        )
    lines.append("")

    # Per-dimension rationale
    for c in record.critiques:
        lines.append(f"**{c.critique_type.capitalize()}** — {c.score}/5")
        lines.append("")
        lines.append(f"> {c.rationale}")
        lines.append("")

    lines.append("---")
    lines.append("")

    return "\n".join(lines)


def export_eval_markdown(records: List[QAPairEvalRecord], path: Path) -> None:
    """
    Write all eval records to one Markdown file.

    Every record becomes a section rendered by format_eval_record_md, numbered
    by its 0-based position in ``records`` (the same value as the DataFrame's
    'id' column), so '## ID: <n>' can be found with Ctrl+F.
    """
    target = Path(path)
    sections = []
    for index, record in enumerate(records):
        sections.append(format_eval_record_md(record, index))
    target.write_text("\n".join(sections), encoding="utf-8")
    print(f"Exported {len(records)} records → {target}")


# Export every eval record (unfiltered) to a Markdown file under OUTPUT_DIR
export_eval_markdown(eval_records, OUTPUT_DIR / "eval_questions.md")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Score level drawn as the "Quality cutoff" reference line on the plots.
# NOTE(review): not linked to save_filtered_eval_jsonl's min_score default (also 4) — keep in sync manually.
QUALITY_CUTOFF = 4
# Bar colors keyed by the 'split' label used in the before/after charts
PALETTE = {"Before filter": "#4472C4", "After filter": "#ED7D31"}


def _build_before_after_df(
    df_all: pd.DataFrame, df_filtered: pd.DataFrame
) -> pd.DataFrame:
    """Stack the unfiltered and filtered score frames, tagging each row with its 'split'."""
    wanted_columns = ["doc_category"] + SCORE_DIMS
    labelled_frames = [
        frame[wanted_columns].assign(split=label)
        for label, frame in (("Before filter", df_all), ("After filter", df_filtered))
    ]
    return pd.concat(labelled_frames, ignore_index=True)


def _polish_bars(ax) -> None:
    """Give every bar patch a thin white outline and put a faint dashed y-grid behind the bars."""
    edge_color, edge_width = "white", 0.8
    for bar in ax.patches:
        bar.set_edgecolor(edge_color)
        bar.set_linewidth(edge_width)
    grid_style = {"linestyle": "--", "linewidth": 0.6, "alpha": 0.6, "color": "#CCCCCC"}
    ax.yaxis.grid(True, **grid_style)
    ax.set_axisbelow(True)


def plot_before_after_bar(
    df_all: pd.DataFrame, df_filtered: pd.DataFrame, output_path: Path
) -> None:
    """
    Grouped bar chart: mean score per dimension, before vs after the quality filter.
    Error bars show ±1 SD.

    Args:
        df_all: Eval DataFrame for all records; must contain 'doc_category'
            and the SCORE_DIMS columns.
        df_filtered: Eval DataFrame for the records kept by the quality filter;
            same columns as ``df_all``.
        output_path: File the figure is saved to before it is shown.
    """
    combined = _build_before_after_df(df_all, df_filtered)
    # Long format: one row per (record, dimension) so bars can be grouped by dimension and split
    melted = combined.melt(
        id_vars=["doc_category", "split"], var_name="dimension", value_name="score"
    )

    fig, ax = plt.subplots(figsize=(9, 5))
    sns.barplot(
        data=melted, x="dimension", y="score", hue="split",
        palette=PALETTE, capsize=0.04, errorbar="sd", ax=ax,
        err_kws={"linewidth": 1.2},
    )
    _polish_bars(ax)
    # Horizontal reference line at the quality cutoff; its label is picked up by the legend below
    ax.axhline(y=QUALITY_CUTOFF, color="crimson", linestyle="--", linewidth=1.2,
               label=f"Quality cutoff ({QUALITY_CUTOFF})", zorder=3)
    ax.set_ylim(0, 5.5)
    ax.set_title("Mean Score per Dimension — Before vs After Quality Filter",
                 fontsize=13, fontweight="bold")
    ax.set_xlabel("Dimension")
    ax.set_ylabel("Mean Score")
    # Rebuild the legend from the current handles, without a title or frame
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels, title="", frameon=False)

    sns.despine()
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.show()
    print(f"Saved → {output_path}")


def plot_category_before_after_bar(
    df_all: pd.DataFrame, df_filtered: pd.DataFrame, output_path: Path
) -> None:
    """
    One subplot per critique dimension; each shows mean score by document category,
    before vs after the quality filter. Error bars show ±1 SD.

    Args:
        df_all: Eval DataFrame for all records; must contain 'doc_category'
            and the SCORE_DIMS columns.
        df_filtered: Eval DataFrame for the records kept by the quality filter;
            same columns as ``df_all``.
        output_path: File the figure is saved to before it is shown.
    """
    combined = _build_before_after_df(df_all, df_filtered)
    # Long format: one row per (record, dimension) so each subplot can select its dimension
    melted = combined.melt(
        id_vars=["doc_category", "split"], var_name="dimension", value_name="score"
    )

    # One column of axes per entry in SCORE_DIMS, sharing the y-axis
    fig, axes = plt.subplots(1, len(SCORE_DIMS), figsize=(15, 5), sharey=True)
    fig.suptitle("Mean Score by Category — Before vs After Quality Filter",
                 fontsize=13, fontweight="bold")

    for ax, dim in zip(axes, SCORE_DIMS):
        subset = melted[melted["dimension"] == dim]
        sns.barplot(
            data=subset, x="doc_category", y="score", hue="split",
            palette=PALETTE, capsize=0.04, errorbar="sd", ax=ax,
            err_kws={"linewidth": 1.2},
        )
        _polish_bars(ax)
        ax.axhline(y=QUALITY_CUTOFF, color="crimson", linestyle="--", linewidth=1.2, zorder=3)
        ax.set_title(dim.capitalize(), fontsize=11)
        ax.set_xlabel("Category")
        # Only the first subplot carries the y-label because the y-axis is shared
        ax.set_ylabel("Mean Score" if ax is axes[0] else "")
        ax.set_ylim(0, 5.5)
        ax.tick_params(axis="x", rotation=20)
        # Drop the per-axes legend; a single figure-level legend is added after the loop.
        # NOTE(review): assumes a legend was drawn on every subplot — get_legend()
        # presumably returns None otherwise; confirm for empty subsets.
        ax.get_legend().remove()

    # Figure-level legend built by hand from the palette colors
    legend_elements = [
        mpatches.Patch(facecolor=PALETTE["Before filter"], edgecolor="white", linewidth=0.8, label="Before filter"),
        mpatches.Patch(facecolor=PALETTE["After filter"], edgecolor="white", linewidth=0.8, label="After filter"),
    ]
    fig.legend(handles=legend_elements, loc="upper right", frameon=False)

    sns.despine()
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.show()
    print(f"Saved → {output_path}")


# Build the filtered DataFrame and run both plots
df_filtered = build_eval_dataframe(filtered_records)

# Each call saves a PNG under OUTPUT_DIR and then shows the figure
plot_before_after_bar(df, df_filtered, OUTPUT_DIR / "scores_before_after_bar.png")
plot_category_before_after_bar(df, df_filtered, OUTPUT_DIR / "scores_by_category_before_after.png")