# 🧠 DSPy Checklist: Minimalist Demonstration

This notebook demonstrates the core building blocks of DSPy: **Signatures** and **Modules** (like Chain of Thought).

In [None]:
import sys
import os
# Make the project root importable so that the `src.*` imports below resolve.
# '..' assumes the kernel's working directory is notebooks/ and that src/
# lives in the parent directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Re-running this cell must not keep growing sys.path with duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import dspy
from dspy import settings, Evaluate
from dotenv import load_dotenv
from src.retriever import HDBRetriever, get_hdb_index
from src.model import HDBRAG

# 1. Setup - load environment variables from the .env file.
#    Both active LMs below point at a local Ollama server; OPENAI_API_KEY is
#    presumably only needed if the gpt-4o-mini judge alternative is enabled.
load_dotenv()

OLLAMA_API_BASE = 'http://localhost:11434'
QWEN_MODEL = 'ollama/qwen3:0.6b'

# Student LM: the default model for every DSPy call in this notebook.
# Alternative: dspy.LM('ollama_chat/smollm2:360m', api_base=OLLAMA_API_BASE)
student = dspy.LM(QWEN_MODEL, api_base=OLLAMA_API_BASE)

# Judge LM: used only inside the evaluation metric, with caching enabled,
# temperature 0 and a 512-token cap.
# Alternative: dspy.LM("gpt-4o-mini")
judge_lm = dspy.LM(
    QWEN_MODEL,
    api_base=OLLAMA_API_BASE,
    cache=True,
    max_tokens=512,
    temperature=0,
)

settings.configure(lm=student)
# settings.configure(teacher=teacher, lm=student)

# Last expression of the cell: show the configured student LM.
student

In [None]:
import json

# Load the labelled question/answer pairs from disk.
with open("../data/qa_pairs.json", "r", encoding="utf-8") as f:
    qa_pairs = json.load(f)

print(f"Loaded {len(qa_pairs)} QA pairs")

# Wrap every pair in a dspy.Example; only `question` is marked as an input,
# so `answer` acts as the label.
examples = [
    dspy.Example(
        question=qa["question"],
        answer=qa["answer"]
    ).with_inputs("question")
    for qa in qa_pairs
]
print(examples[1])
print("Inputs:", examples[0].inputs())
print("Label:", examples[0].answer)

# Disjoint split: the first TRAIN_SIZE examples are for optimization, the
# remainder for evaluation. The previous slices ([:20] and [10:]) shared
# examples 10-19, so evaluation scores were contaminated by training data.
TRAIN_SIZE = 20
assert len(examples) > TRAIN_SIZE, (
    f"Need more than {TRAIN_SIZE} examples to keep a held-out test set, "
    f"got {len(examples)}"
)
train_examples = examples[:TRAIN_SIZE]
test_examples = examples[TRAIN_SIZE:]

In [None]:
# LLM-as-judge signature: takes a question, the gold answer and a predicted
# answer, and produces a single boolean verdict (`is_accurate`).
# NOTE(review): per the DSPy docs a Signature's docstring is used as the task
# instructions sent to the LM, so the docstring and the `desc` text below are
# runtime prompt content, not just documentation - edit them deliberately.
class JudgeQA(dspy.Signature):
    """
    Decide whether the predicted answer is correct.

    Mark as correct if:
    - It states the same fact as the gold answer, OR
    - Both answers indicate that the information is missing, unknown,
      not mentioned, or cannot be determined from the context.

    Mark as incorrect if:
    - The prediction contradicts the gold answer
    - The prediction invents information
    """
    # Inputs handed to the judge.
    question: str = dspy.InputField()
    gold_answer: str = dspy.InputField()
    predicted_answer: str = dspy.InputField()

    # The judge's verdict; metric() converts it to a float score.
    is_accurate: bool = dspy.OutputField(
        desc="True if the predicted answer is semantically equivalent to the gold answer"
    )

# Predictor that runs the JudgeQA signature; called from metric().
judge = dspy.Predict(JudgeQA)
def metric(example, pred, trace=None):
    """Score a prediction with the LLM judge.

    Args:
        example: Gold example; its ``question`` and ``answer`` are sent to
            the judge.
        pred: Prediction object; its ``answer`` attribute is the text being
            judged.
        trace: Optional object supplied by the caller. When it is a dict, a
            judge failure is recorded under the ``"judge_error"`` key. Any
            other type (DSPy optimizers pass a list, per the DSPy metric
            docs) is left untouched.

    Returns:
        float: ``float(result.is_accurate)`` from the judge, or ``0.0`` when
        the prediction has no usable answer or the judge call fails.
    """
    # Guard against bad predictions: missing, None, or blank answers.
    # str() keeps the blank check from raising on non-string answers.
    answer = getattr(pred, "answer", None)
    if answer is None or str(answer).strip() == "":
        return 0.0

    try:
        # Route only the judge call through the judge LM.
        with dspy.settings.context(lm=judge_lm):
            result = judge(
                question=example.question,
                gold_answer=example.answer,
                predicted_answer=answer
            )
        # Coerce to float for Evaluate
        return float(result.is_accurate)

    except Exception as e:
        # Any judge failure must NOT crash Evaluate. Item assignment with a
        # string key only works on a dict; doing it on a list trace would
        # raise TypeError from inside this handler.
        if isinstance(trace, dict):
            trace["judge_error"] = str(e)
        return 0.0

In [None]:
# 2. Build the retrieval index and hand it to the simple RAG module (k=3)
index = get_hdb_index()
rag = HDBRAG(index=index, k=3)

# NOTE(review): `trace` is not read by any cell visible in this notebook.
trace = {}

# 3. Smoke test: run one dataset question through the pipeline
query = examples[1].question
print(f"\nQuery: {query}")

response = rag(question=query)
answer = response.answer
print("\n--- Answer ---")
print(answer)
# Also show the answer's type to confirm what the module returns.
print(type(answer))

In [None]:
# Baseline run: score the un-optimized `rag` program on `test_examples`,
# grading each answer with `metric` across 4 threads and showing the
# per-example table.
baseline_eval_kwargs = {
    "devset": test_examples,
    "metric": metric,
    "num_threads": 4,
    # "display_progress": True,
    "display_table": True,
}
evaluator = dspy.Evaluate(**baseline_eval_kwargs)

results = evaluator(rag)

In [None]:
# Display whatever the most recent evaluator call stored in `results`.
results

## Optimization

In [None]:
import warnings
# Silence UserWarning output for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# File the optimized program is saved to. It is named after the student
# model (qwen3:0.6b) and matches the path loaded by the evaluation cell that
# follows; the previous "smollm" name pointed at a file that is never loaded.
rag_save_path = "../data/optimized_rag_qwen3:0.6b.json"

# Optimization run, currently disabled. Uncomment to re-compile the RAG
# program on `train_examples` and persist it to `rag_save_path`.
# teleprompter = dspy.MIPROv2(
#     metric=metric,
#     max_bootstrapped_demos=4,
#     max_labeled_demos=5,
#     num_threads=4
# )

# optimized_rag = teleprompter.compile(
#     rag,
#     trainset=train_examples,
# )

# optimized_rag.save(rag_save_path)

In [None]:
# Both runs use identical evaluation settings so their scores are comparable.
eval_kwargs = dict(
    devset=test_examples,
    metric=metric,
    num_threads=4,
    # display_progress=True,
    # display_table=True
)

# Baseline: the un-optimized RAG program.
evaluator = dspy.Evaluate(**eval_kwargs)
baseline_results = evaluator(rag)

# Optimized: a fresh HDBRAG with the previously saved optimized state loaded.
optimized_rag = HDBRAG(index=index, k=3)
optimized_rag.load("../data/optimized_rag_qwen3:0.6b.json")

evaluator = dspy.Evaluate(**eval_kwargs)
optimized_results = evaluator(optimized_rag)

# `results` still ends up bound to the optimized run, as before. The baseline
# is now kept under its own name instead of being overwritten.
results = optimized_results

print("Baseline: ", baseline_results)
print("Optimized:", optimized_results)