From 9c753b7611339fa7ea22d8b6fcfb2776021913c9 Mon Sep 17 00:00:00 2001
From: Michael Jones
Date: Sat, 5 Oct 2024 12:24:21 +0000
Subject: [PATCH 01/17] feat(dspy): add dummyLM based upon dspy.clients.lm.LM

---
 dspy/utils/dummies.py | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/dspy/utils/dummies.py b/dspy/utils/dummies.py
index d97d59b5a8..ecac8a54d3 100644
--- a/dspy/utils/dummies.py
+++ b/dspy/utils/dummies.py
@@ -4,12 +4,13 @@
 
 import numpy as np
 
-from dsp.modules import LM
+from dsp.modules import LM as DSPLM
 from dsp.utils.utils import dotdict
+from dspy.clients.lm import LM
 
 
-class DummyLM(LM):
-    """Dummy language model for unit testing purposes."""
+class DSPDummyLM(DSPLM):
+    """Dummy language model for unit testing purposes subclassing DSP LM class."""
 
     def __init__(self, answers: Union[list[str], dict[str, str]], follow_examples: bool = False):
         """Initializes the dummy language model.
@@ -64,7 +65,7 @@ def basic_request(self, prompt, n=1, **kwargs) -> dict[str, list[dict[str, str]]
                 },
             )
 
-        RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m"
+        RED, _, RESET = "\033[91m", "\033[92m", "\033[0m"
         print("=== DummyLM ===")
        print(prompt, end="")
         print(f"{RED}{answer}{RESET}")
@@ -94,6 +95,24 @@ def get_convo(self, index) -> str:
         return self.history[index]["prompt"] + " " + self.history[index]["response"]["choices"][0]["text"]
 
 
+class DummyLM(LM):
+    def __init__(self, answers: Union[list[str], dict[str, str]], follow_examples: bool = False):
+        super().__init__("dummy", "chat", 0.0, 1000, True)
+        self.answers = iter([[ans] for ans in answers])
+
+    def __call__(self, **kwargs):
+        fallback = "No more responses"
+        if isinstance(self.answers, dict):
+            answer = next((v for k, v in self.answers.items() if k in kwargs["prompt"]), fallback)
+        else:
+            answer = next(self.answers, fallback)
+        return answer
+
+    def get_convo(self, index) -> str:
+        """Get the prompt + answer from the ith message."""
+        return self.history[index]["prompt"] + " " + self.history[index]["response"]["choices"][0]["text"]
+
+
 def dummy_rm(passages=()) -> callable:
     if not passages:
 

From b0d4f2c0cdd839db97dac1295f14eacb1c6380f5 Mon Sep 17 00:00:00 2001
From: Michael Jones
Date: Sat, 5 Oct 2024 12:26:37 +0000
Subject: [PATCH 02/17] feat(dspy): move tests using DSPDummyLM to own folder
 for deprecation

---
 tests/dsp_LM/__init__.py                      |   0
 tests/dsp_LM/evaluate/test_evaluate.py        | 100 ++
 tests/dsp_LM/evaluate/test_metrics.py         |  36 +
 tests/dsp_LM/examples/test_baleen.py          | 122 +++
 tests/dsp_LM/functional/test_functional.py    | 906 ++++++++++++++++++
 .../functional/test_signature_opt_typed.py    | 187 ++++
 .../dsp_LM/functional/test_signature_typed.py | 155 +++
 tests/dsp_LM/modules/test_aws_models.py       |  70 ++
 .../dsp_LM/modules/test_cloudflare_models.py  |  61 ++
 tests/dsp_LM/modules/test_hf_model.py         |  31 +
 .../modules/vectorizer/test_fastembed.py      |  43 +
 tests/dsp_LM/predict/test_aggregation.py      |  43 +
 tests/dsp_LM/predict/test_chain_of_thought.py |  36 +
 .../test_chain_of_thought_with_hint.py        |  43 +
 tests/dsp_LM/predict/test_knn.py              |  51 +
 tests/dsp_LM/predict/test_langchain.py        |  55 ++
 .../predict/test_multi_chain_comparison.py    |  40 +
 tests/dsp_LM/predict/test_predict.py          | 101 ++
 .../dsp_LM/predict/test_program_of_thought.py | 135 +++
 tests/dsp_LM/predict/test_react.py            | 154 +++
 tests/dsp_LM/predict/test_retry.py            | 110 +++
 tests/dsp_LM/primitives/test_example.py       | 110 +++
 tests/dsp_LM/primitives/test_module.py        |  49 +
 tests/dsp_LM/primitives/test_program.py       |  21 +
 .../primitives/test_python_interpreter.py     |
  53 +
 .../retrieve/integration_test_pgvectorrm.py  |  94 ++
 tests/dsp_LM/retrieve/test_llama_index_rm.py |  61 ++
 tests/dsp_LM/signatures/test_signature.py    |  41 +
 tests/dsp_LM/teleprompt/test_bootstrap.py    | 156 +++
 .../dsp_LM/teleprompt/test_copro_optimizer.py | 149 +++
 tests/dsp_LM/teleprompt/test_ensemble.py     |  59 ++
 tests/dsp_LM/teleprompt/test_knn_fewshot.py  |  65 ++
 .../dsp_LM/teleprompt/test_mipro_optimizer.py | 254 +++++
 tests/dsp_LM/teleprompt/test_random_search.py |  39 +
 34 files changed, 3630 insertions(+)
 create mode 100644 tests/dsp_LM/__init__.py
 create mode 100644 tests/dsp_LM/evaluate/test_evaluate.py
 create mode 100644 tests/dsp_LM/evaluate/test_metrics.py
 create mode 100644 tests/dsp_LM/examples/test_baleen.py
 create mode 100644 tests/dsp_LM/functional/test_functional.py
 create mode 100644 tests/dsp_LM/functional/test_signature_opt_typed.py
 create mode 100644 tests/dsp_LM/functional/test_signature_typed.py
 create mode 100644 tests/dsp_LM/modules/test_aws_models.py
 create mode 100644 tests/dsp_LM/modules/test_cloudflare_models.py
 create mode 100644 tests/dsp_LM/modules/test_hf_model.py
 create mode 100644 tests/dsp_LM/modules/vectorizer/test_fastembed.py
 create mode 100644 tests/dsp_LM/predict/test_aggregation.py
 create mode 100644 tests/dsp_LM/predict/test_chain_of_thought.py
 create mode 100644 tests/dsp_LM/predict/test_chain_of_thought_with_hint.py
 create mode 100644 tests/dsp_LM/predict/test_knn.py
 create mode 100644 tests/dsp_LM/predict/test_langchain.py
 create mode 100644 tests/dsp_LM/predict/test_multi_chain_comparison.py
 create mode 100644 tests/dsp_LM/predict/test_predict.py
 create mode 100644 tests/dsp_LM/predict/test_program_of_thought.py
 create mode 100644 tests/dsp_LM/predict/test_react.py
 create mode 100644 tests/dsp_LM/predict/test_retry.py
 create mode 100644 tests/dsp_LM/primitives/test_example.py
 create mode 100644 tests/dsp_LM/primitives/test_module.py
 create mode 100644 tests/dsp_LM/primitives/test_program.py
 create mode 100644 tests/dsp_LM/primitives/test_python_interpreter.py
 create mode 100644 tests/dsp_LM/retrieve/integration_test_pgvectorrm.py
 create mode 100644 tests/dsp_LM/retrieve/test_llama_index_rm.py
 create mode 100644 tests/dsp_LM/signatures/test_signature.py
 create mode 100644 tests/dsp_LM/teleprompt/test_bootstrap.py
 create mode 100644 tests/dsp_LM/teleprompt/test_copro_optimizer.py
 create mode 100644 tests/dsp_LM/teleprompt/test_ensemble.py
 create mode 100644 tests/dsp_LM/teleprompt/test_knn_fewshot.py
 create mode 100644 tests/dsp_LM/teleprompt/test_mipro_optimizer.py
 create mode 100644 tests/dsp_LM/teleprompt/test_random_search.py

diff --git a/tests/dsp_LM/__init__.py b/tests/dsp_LM/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/dsp_LM/evaluate/test_evaluate.py b/tests/dsp_LM/evaluate/test_evaluate.py
new file mode 100644
index 0000000000..5c6c1f82ee
--- /dev/null
+++ b/tests/dsp_LM/evaluate/test_evaluate.py
@@ -0,0 +1,100 @@
+import signal
+import threading
+from unittest.mock import patch
+
+import pytest
+
+import dsp
+import dspy
+from dspy.evaluate.evaluate import Evaluate
+from dspy.evaluate.metrics import answer_exact_match
+from dspy.predict import Predict
+from dspy.utils.dummies import DSPDummyLM
+
+
+def new_example(question, answer):
+    """Helper function to create a new example."""
+    return dspy.Example(
+        question=question,
+        answer=answer,
+    ).with_inputs("question")
+
+
+def test_evaluate_call():
+    dspy.settings.configure(lm=DSPDummyLM({"What is 1+1?": "2", "What is 2+2?": "4"}))
+    devset =
[new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + assert program(question="What is 1+1?").answer == "2" + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + ) + score = ev(program) + assert score == 100.0 + + +def test_multithread_evaluate_call(): + dspy.settings.configure(lm=DSPDummyLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + assert program(question="What is 1+1?").answer == "2" + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + num_threads=2, + ) + score = ev(program) + assert score == 100.0 + + +def test_multi_thread_evaluate_call_cancelled(monkeypatch): + # slow LM that sleeps for 1 second before returning the answer + class SlowLM(DSPDummyLM): + def __call__(self, prompt, **kwargs): + import time + + time.sleep(1) + return super().__call__(prompt, **kwargs) + + dspy.settings.configure(lm=SlowLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + + devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + assert program(question="What is 1+1?").answer == "2" + + # spawn a thread that will sleep for .1 seconds then send a KeyboardInterrupt + def sleep_then_interrupt(): + import time + + time.sleep(0.1) + import os + + os.kill(os.getpid(), signal.SIGINT) + + input_thread = threading.Thread(target=sleep_then_interrupt) + input_thread.start() + + with pytest.raises(KeyboardInterrupt): + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + num_threads=2, + ) + score = ev(program) + assert score == 100.0 + + +def test_evaluate_call_bad(): + dspy.settings.configure(lm=DSPDummyLM({"What is 1+1?": "0", "What is 2+2?": "0"})) + devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + ) + score = ev(program) + assert score == 0.0 diff --git a/tests/dsp_LM/evaluate/test_metrics.py b/tests/dsp_LM/evaluate/test_metrics.py new file mode 100644 index 0000000000..04e32e68ca --- /dev/null +++ b/tests/dsp_LM/evaluate/test_metrics.py @@ -0,0 +1,36 @@ +# FILEPATH: /Users/ahle/repos/dspy/tests/evaluate/test_metrics.py + +import dsp +import dspy +from dspy.evaluate.metrics import answer_exact_match +from dspy.predict import Predict + + +def test_answer_exact_match_string(): + example = dspy.Example( + question="What is 1+1?", + answer="2", + ).with_inputs("question") + pred = Predict("question -> answer") + pred.answer = "2" + assert answer_exact_match(example, pred) + + +def test_answer_exact_match_list(): + example = dspy.Example( + question="What is 1+1?", + answer=["2", "two"], + ).with_inputs("question") + pred = Predict("question -> answer") + pred.answer = "2" + assert answer_exact_match(example, pred) + + +def test_answer_exact_match_no_match(): + example = dspy.Example( + question="What is 1+1?", + answer="2", + ).with_inputs("question") + pred = Predict("question -> answer") + pred.answer = "3" + assert not answer_exact_match(example, pred) diff --git a/tests/dsp_LM/examples/test_baleen.py b/tests/dsp_LM/examples/test_baleen.py new file mode 100644 index 0000000000..f0b8042699 --- /dev/null +++ b/tests/dsp_LM/examples/test_baleen.py @@ -0,0 +1,122 @@ +import pytest + +import dspy +import 
dspy.evaluate +from dsp.utils import deduplicate +from dspy.datasets import HotPotQA +from dspy.evaluate.evaluate import Evaluate +from dspy.teleprompt.bootstrap import BootstrapFewShot + + +class GenerateAnswer(dspy.Signature): + """Answer questions with short factoid answers.""" + + context = dspy.InputField(desc="may contain relevant facts") + question = dspy.InputField() + answer = dspy.OutputField(desc="often between 1 and 5 words") + + +class GenerateSearchQuery(dspy.Signature): + """Write a simple search query that will help answer a complex question.""" + + context = dspy.InputField(desc="may contain relevant facts") + question = dspy.InputField() + query = dspy.OutputField() + + +class SimplifiedBaleen(dspy.Module): + def __init__(self, passages_per_hop=3, max_hops=2): + super().__init__() + + self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)] + self.retrieve = dspy.Retrieve(k=passages_per_hop) + self.generate_answer = dspy.ChainOfThought(GenerateAnswer) + self.max_hops = max_hops + + def forward(self, question): + context = [] + + for hop in range(self.max_hops): + query = self.generate_query[hop](context=context, question=question).query + passages = self.retrieve(query).passages + context = deduplicate(context + passages) + + pred = self.generate_answer(context=context, question=question) + return dspy.Prediction(context=context, answer=pred.answer) + + +def load_hotpotqa(): + # Load the dataset. + dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0) + # Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata. + trainset = [x.with_inputs("question") for x in dataset.train] + devset = [x.with_inputs("question") for x in dataset.dev] + return trainset, devset + + +# @pytest.mark.slow_test +# TODO: Find a way to make this test run without openai +def _test_baleen(): + lm = dspy.OpenAI(model="gpt-3.5-turbo") + rm = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts") + dspy.settings.configure(lm=lm, rm=rm) + + # Ask any question you like to this simple RAG program. + my_question = "How many storeys are in the castle that David Gregory inherited?" + + # Get the prediction. This contains `pred.context` and `pred.answer`. 
+ uncompiled_baleen = SimplifiedBaleen() # uncompiled (i.e., zero-shot) program + pred = uncompiled_baleen(my_question) + + assert pred.answer == "five" + + +def validate_context_and_answer_and_hops(example, pred, trace=None): + if not dspy.evaluate.answer_exact_match(example, pred): + return False + if not dspy.evaluate.answer_passage_match(example, pred): + return False + + hops = [example.question] + [outputs.query for *_, outputs in trace if "query" in outputs] + + if max([len(h) for h in hops]) > 100: + return False + if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))): + return False + + return True + + +def gold_passages_retrieved(example, pred, trace=None): + gold_titles = set(map(dspy.evaluate.normalize_text, example["gold_titles"])) + found_titles = set(map(dspy.evaluate.normalize_text, [c.split(" | ")[0] for c in pred.context])) + + return gold_titles.issubset(found_titles) + + +# @pytest.mark.slow_test +# TODO: Find a way to make this test run without the slow hotpotqa dataset +def _test_compiled_baleen(): + trainset, devset = load_hotpotqa() + lm = dspy.OpenAI(model="gpt-3.5-turbo") + rm = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts") + dspy.settings.configure(lm=lm, rm=rm) + + uncompiled_baleen = SimplifiedBaleen() # uncompiled (i.e., zero-shot) program + + teleprompter = BootstrapFewShot(metric=validate_context_and_answer_and_hops) + compiled_baleen = teleprompter.compile( + SimplifiedBaleen(), + teacher=SimplifiedBaleen(passages_per_hop=2), + trainset=trainset, + ) + + evaluate_on_hotpotqa = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5) + uncompiled_baleen_retrieval_score = evaluate_on_hotpotqa( + uncompiled_baleen, metric=gold_passages_retrieved, display=False + ) + # assert uncompiled_baleen_retrieval_score / 100 == 18 / 50 + + compiled_baleen_retrieval_score = evaluate_on_hotpotqa(compiled_baleen, metric=gold_passages_retrieved) + # assert compiled_baleen_retrieval_score / 100 == 27 / 50 + assert uncompiled_baleen_retrieval_score < compiled_baleen_retrieval_score diff --git a/tests/dsp_LM/functional/test_functional.py b/tests/dsp_LM/functional/test_functional.py new file mode 100644 index 0000000000..bc844a82c2 --- /dev/null +++ b/tests/dsp_LM/functional/test_functional.py @@ -0,0 +1,906 @@ +import datetime +import textwrap +from typing import Annotated, Any, Generic, List, Literal, Optional, TypeVar + +import pydantic +import pytest +from pydantic import AfterValidator, BaseModel, Field, ValidationError, field_validator, model_validator + +import dspy +from dspy.functional import FunctionalModule, TypedChainOfThought, TypedPredictor, cot, predictor +from dspy.predict.predict import Predict +from dspy.primitives.example import Example +from dspy.teleprompt.bootstrap import BootstrapFewShot +from dspy.teleprompt.vanilla import LabeledFewShot +from dspy.utils.dummies import DSPDummyLM + + +def test_simple(): + @predictor + def hard_question(topic: str) -> str: + """Think of a hard factual question about a topic.""" + + expected = "What is the speed of light?" 
+ lm = DSPDummyLM([expected]) + dspy.settings.configure(lm=lm) + + question = hard_question(topic="Physics") + lm.inspect_history(n=2) + + assert question == expected + + +def test_list_output(): + @predictor + def hard_questions(topics: List[str]) -> List[str]: + pass + + expected = ["What is the speed of light?", "What is the speed of sound?"] + lm = DSPDummyLM(['["What is the speed of light?", "What is the speed of sound?"]']) + dspy.settings.configure(lm=lm) + + question = hard_questions(topics=["Physics", "Music"]) + lm.inspect_history(n=2) + + assert question == expected + + +def test_simple_type(): + class Question(pydantic.BaseModel): + value: str + + @predictor + def hard_question(topic: str) -> Question: + """Think of a hard factual question about a topic.""" + + expected = "What is the speed of light?" + lm = DSPDummyLM([f'{{"value": "{expected}"}}']) + dspy.settings.configure(lm=lm) + + question = hard_question(topic="Physics") + + assert isinstance(question, Question) + assert question.value == expected + + +def test_simple_type_input(): + class Question(pydantic.BaseModel): + value: str + + class Answer(pydantic.BaseModel): + value: str + + @predictor + def answer(question: Question) -> Answer: + pass + + question = Question(value="What is the speed of light?") + lm = DSPDummyLM([f'{{"value": "3e8"}}']) + dspy.settings.configure(lm=lm) + + result = answer(question=question) + + assert result == Answer(value="3e8") + + +def test_simple_class(): + class Answer(pydantic.BaseModel): + value: float + certainty: float + comments: List[str] = pydantic.Field(description="At least two comments about the answer") + + class QA(FunctionalModule): + @predictor + def hard_question(self, topic: str) -> str: + """Think of a hard factual question about a topic. It should be answerable with a number.""" + + @cot + def answer(self, question: Annotated[str, "Question to answer"]) -> Answer: + pass + + def forward(self, **kwargs): + question = self.hard_question(**kwargs) + return (question, self.answer(question=question)) + + expected = Answer( + value=3e8, + certainty=0.9, + comments=["It is the speed of light", "It is a constant"], + ) + + lm = DSPDummyLM( + [ + "What is the speed of light?", + "Some bad reasoning, 3e8 m/s.", + "3e8", # Bad answer 1 + "{...}", # Model is asked to create an example + "Some good reasoning...", + expected.model_dump_json(), # Good answer + ] + ) + dspy.settings.configure(lm=lm) + + qa = QA() + assert isinstance(qa, FunctionalModule) + assert isinstance(qa.answer, dspy.Module) + + question, answer = qa(topic="Physics") + + print(qa.answer) + + assert question == "What is the speed of light?" + assert answer == expected + + +def test_simple_oop(): + class Question(pydantic.BaseModel): + value: str + + class MySignature(dspy.Signature): + topic: str = dspy.InputField() + output: Question = dspy.OutputField() + + # Run the signature + program = TypedPredictor(MySignature) + expected = "What is the speed of light?" 
+ lm = DSPDummyLM( + [ + Question(value=expected).model_dump_json(), + ] + ) + dspy.settings.configure(lm=lm) + + question = program(topic="Physics").output + + assert isinstance(question, Question) + assert question.value == expected + + +def test_bootstrap_effectiveness(): + class SimpleModule(FunctionalModule): + @predictor + def output(self, input: str) -> str: + pass + + def forward(self, **kwargs): + return self.output(**kwargs) + + def simple_metric(example, prediction, trace=None): + return example.output == prediction.output + + examples = [ + ex.with_inputs("input") + for ex in ( + Example(input="What is the color of the sky?", output="blue"), + Example( + input="What does the fox say?", + output="Ring-ding-ding-ding-dingeringeding!", + ), + ) + ] + trainset = [examples[0]] + valset = [examples[1]] + + # This test verifies if the bootstrapping process improves the student's predictions + student = SimpleModule() + teacher = SimpleModule() + assert student.output.predictor.signature.equals(teacher.output.predictor.signature) + + lm = DSPDummyLM(["blue", "Ring-ding-ding-ding-dingeringeding!"], follow_examples=True) + dspy.settings.configure(lm=lm, trace=[]) + + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) + + lm.inspect_history(n=2) + + # Check that the compiled student has the correct demos + _, predict = next(compiled_student.named_sub_modules(Predict, skip_compiled=False)) + demos = predict.demos + assert len(demos) == 1 + assert demos[0].input == trainset[0].input + assert demos[0].output == trainset[0].output + + # Test the compiled student's prediction. + # We are using a DspDummyLM with follow_examples=True, which means that + # even though it would normally reply with "Ring-ding-ding-ding-dingeringeding!" + # on the second output, if it seems an example that perfectly matches the + # prompt, it will use that instead. That is why we expect "blue" here. + prediction = compiled_student(input=trainset[0].input) + assert prediction == trainset[0].output + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Follow the following format. + + Input: ${input} + Output: ${output} + + --- + + Input: What is the color of the sky? + Output: blue + + --- + + Input: What is the color of the sky? + Output: blue""" + ) + + +def test_regex(): + class TravelInformation(BaseModel): + origin: str = Field(pattern=r"^[A-Z]{3}$") + destination: str = Field(pattern=r"^[A-Z]{3}$") + date: datetime.date + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + email = textwrap.dedent( + """\ + We're excited to welcome you aboard your upcoming flight from + John F. Kennedy International Airport (JFK) to Los Angeles International Airport (LAX) + on December 25, 2022. Here's everything you need to know before you take off: ... + """ + ) + lm = DSPDummyLM( + [ + # Example with a bad origin code. 
+ '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + # Example to help the model understand + "{...}", + # Fixed + '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + ] + ) + dspy.settings.configure(lm=lm) + + assert flight_information(email=email) == TravelInformation( + origin="JFK", destination="LAX", date=datetime.date(2022, 12, 25) + ) + + +def test_custom_model_validate_json(): + class Airport(BaseModel): + code: str = Field(pattern=r"^[A-Z]{3}$") + lat: float + lon: float + + class TravelInformation(BaseModel): + origin: Airport + destination: Airport + date: datetime.date + + @classmethod + def model_validate_json( + cls, json_data: str, *, strict: Optional[bool] = None, context: Optional[dict[str, Any]] = None + ) -> "TravelInformation": + try: + __tracebackhide__ = True + return cls.__pydantic_validator__.validate_json(json_data, strict=strict, context=context) + except ValidationError: + for substring_length in range(len(json_data), 1, -1): + for start in range(len(json_data) - substring_length + 1): + substring = json_data[start : start + substring_length] + try: + __tracebackhide__ = True + res = cls.__pydantic_validator__.validate_json(substring, strict=strict, context=context) + return res + except ValidationError as exc: + last_exc = exc + pass + raise ValueError("Could not find valid json") from last_exc + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + email = textwrap.dedent( + """\ + We're excited to welcome you aboard your upcoming flight from + John F. Kennedy International Airport (JFK) to Los Angeles International Airport (LAX) + on December 25, 2022. Here's everything you need to know before you take off: ... + """ + ) + lm = DSPDummyLM( + [ + # Example with a bad origin code. 
+ ( + "Here is your json: " + "{" + '"origin": {"code":"JFK", "lat":40.6446, "lon":-73.7797}, ' + '"destination": {"code":"LAX", "lat":33.942791, "lon":-118.410042}, ' + '"date": "2022-12-25"}' + ), + ] + ) + dspy.settings.configure(lm=lm) + + assert flight_information(email=email) == TravelInformation( + origin={"code": "JFK", "lat": 40.6446, "lon": -73.7797}, + destination={"code": "LAX", "lat": 33.942791, "lon": -118.410042}, + date=datetime.date(2022, 12, 25), + ) + + +def test_raises(): + class TravelInformation(BaseModel): + origin: str = Field(pattern=r"^[A-Z]{3}$") + destination: str = Field(pattern=r"^[A-Z]{3}$") + date: datetime.date + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + lm = DSPDummyLM( + [ + "A list of bad inputs", + '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + '{"origin": "JFK", "destination": "LAX", "date": "bad date"}', + ] + ) + dspy.settings.configure(lm=lm) + + with pytest.raises(ValueError): + flight_information(email="Some email") + + +def test_multi_errors(): + class TravelInformation(BaseModel): + origin: str = Field(pattern=r"^[A-Z]{3}$") + destination: str = Field(pattern=r"^[A-Z]{3}$") + date: datetime.date + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + lm = DSPDummyLM( + [ + # First origin is wrong, then destination, then all is good + '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + "{...}", # Example to help the model understand + '{"origin": "JFK", "destination": "LA0", "date": "2022-12-25"}', + "{...}", # Example to help the model understand + '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + ] + ) + dspy.settings.configure(lm=lm) + + assert flight_information(email="Some email") == TravelInformation( + origin="JFK", destination="LAX", date=datetime.date(2022, 12, 25) + ) + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `email`, produce the fields `flight_information`. + + --- + + Follow the following format. + + Email: ${email} + + Past Error in Flight Information: An error to avoid in the future + + Past Error (2) in Flight Information: An error to avoid in the future + + Flight Information: ${flight_information}. Respond with a single JSON object. JSON Schema: {"properties": {"origin": {"pattern": "^[A-Z]{3}$", "title": "Origin", "type": "string"}, "destination": {"pattern": "^[A-Z]{3}$", "title": "Destination", "type": "string"}, "date": {"format": "date", "title": "Date", "type": "string"}}, "required": ["origin", "destination", "date"], "title": "TravelInformation", "type": "object"} + + --- + + Email: Some email + + Past Error in Flight Information: String should match pattern '^[A-Z]{3}$': origin (error type: string_pattern_mismatch) + + Past Error (2) in Flight Information: String should match pattern '^[A-Z]{3}$': destination (error type: string_pattern_mismatch) + + Flight Information: {"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}""" + ) + + +def test_field_validator(): + class UserDetails(BaseModel): + name: str + age: int + + @field_validator("name") + @classmethod + def validate_name(cls, v): + if v.upper() != v: + raise ValueError("Name must be in uppercase.") + return v + + @predictor + def get_user_details() -> UserDetails: + pass + + # Keep making the mistake (lower case name) until we run + # out of retries. 
+ lm = DSPDummyLM( + [ + '{"name": "lower case name", "age": 25}', + ] + * 10 + ) + dspy.settings.configure(lm=lm) + + with pytest.raises(ValueError): + get_user_details() + + print(lm.get_convo(-1)) + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields , produce the fields `get_user_details`. + + --- + + Follow the following format. + + Past Error in Get User Details: An error to avoid in the future + Past Error (2) in Get User Details: An error to avoid in the future + Get User Details: ${get_user_details}. Respond with a single JSON object. JSON Schema: {"properties": {"name": {"title": "Name", "type": "string"}, "age": {"title": "Age", "type": "integer"}}, "required": ["name", "age"], "title": "UserDetails", "type": "object"} + + --- + + Past Error in Get User Details: Value error, Name must be in uppercase.: name (error type: value_error) + Past Error (2) in Get User Details: Value error, Name must be in uppercase.: name (error type: value_error) + Get User Details: {"name": "lower case name", "age": 25}""" + ) + + +def test_annotated_field(): + @predictor + def test(input: Annotated[str, Field(description="description")]) -> Annotated[float, Field(gt=0, lt=1)]: + pass + + # First try 0, which fails, then try 0.5, which passes + lm = DSPDummyLM(["0", "0.5"]) + dspy.settings.configure(lm=lm) + + output = test(input="input") + + assert output == 0.5 + + +def test_multiple_outputs(): + lm = DSPDummyLM([str(i) for i in range(100)]) + dspy.settings.configure(lm=lm) + + test = TypedPredictor("input -> output") + output = test(input="input", config=dict(n=3)).completions.output + assert output == ["0", "1", "2"] + + +def test_multiple_outputs_int(): + lm = DSPDummyLM([str(i) for i in range(100)]) + dspy.settings.configure(lm=lm) + + class TestSignature(dspy.Signature): + input: int = dspy.InputField() + output: int = dspy.OutputField() + + test = TypedPredictor(TestSignature) + + output = test(input=8, config=dict(n=3)).completions.output + assert output == [0, 1, 2] + + +def test_multiple_outputs_int_cot(): + # Note: Multiple outputs only work when the language model "speculatively" generates all the outputs in one go. 
+ lm = DSPDummyLM( + [ + "thoughts 0\nOutput: 0\n", + "thoughts 1\nOutput: 1\n", + "thoughts 2\nOutput: 2\n", + ] + ) + dspy.settings.configure(lm=lm) + + test = TypedChainOfThought("input:str -> output:int") + + output = test(input="8", config=dict(n=3)).completions.output + assert output == [0, 1, 2] + + +def test_parse_type_string(): + lm = DSPDummyLM([str(i) for i in range(100)]) + dspy.settings.configure(lm=lm) + + test = TypedPredictor("input:int -> output:int") + + output = test(input=8, config=dict(n=3)).completions.output + assert output == [0, 1, 2] + + +def test_literal(): + lm = DSPDummyLM(['"2"', '"3"']) + dspy.settings.configure(lm=lm) + + @predictor + def f() -> Literal["2", "3"]: + pass + + assert f() == "2" + + +def test_literal_missmatch(): + lm = DSPDummyLM([f'"{i}"' for i in range(5, 100)]) + dspy.settings.configure(lm=lm) + + @predictor(max_retries=1) + def f() -> Literal["2", "3"]: + pass + + with pytest.raises(Exception) as e_info: + f() + + assert e_info.value.args[1]["f"] == "Input should be '2' or '3': (error type: literal_error)" + + +def test_literal_int(): + lm = DSPDummyLM(["2", "3"]) + dspy.settings.configure(lm=lm) + + @predictor + def f() -> Literal[2, 3]: + pass + + assert f() == 2 + + +def test_literal_int_missmatch(): + lm = DSPDummyLM([f"{i}" for i in range(5, 100)]) + dspy.settings.configure(lm=lm) + + @predictor(max_retries=1) + def f() -> Literal[2, 3]: + pass + + with pytest.raises(Exception) as e_info: + f() + + assert e_info.value.args[1]["f"] == "Input should be 2 or 3: (error type: literal_error)" + + +def test_fields_on_base_signature(): + class SimpleOutput(dspy.Signature): + output: float = dspy.OutputField(gt=0, lt=1) + + lm = DSPDummyLM( + [ + "2.1", # Bad output + "0.5", # Good output + ] + ) + dspy.settings.configure(lm=lm) + + predictor = TypedPredictor(SimpleOutput) + + assert predictor().output == 0.5 + + +def test_synthetic_data_gen(): + class SyntheticFact(BaseModel): + fact: str = Field(..., description="a statement") + varacity: bool = Field(..., description="is the statement true or false") + + class ExampleSignature(dspy.Signature): + """Generate an example of a synthetic fact.""" + + fact: SyntheticFact = dspy.OutputField() + + lm = DSPDummyLM( + [ + '{"fact": "The sky is blue", "varacity": true}', + '{"fact": "The sky is green", "varacity": false}', + '{"fact": "The sky is red", "varacity": true}', + '{"fact": "The earth is flat", "varacity": false}', + '{"fact": "The earth is round", "varacity": true}', + '{"fact": "The earth is a cube", "varacity": false}', + ] + ) + dspy.settings.configure(lm=lm) + + generator = TypedPredictor(ExampleSignature) + examples = generator(config=dict(n=3)) + for ex in examples.completions.fact: + assert isinstance(ex, SyntheticFact) + assert examples.completions.fact[0] == SyntheticFact(fact="The sky is blue", varacity=True) + + # If you have examples and want more + existing_examples = [ + dspy.Example(fact="The sky is blue", varacity=True), + dspy.Example(fact="The sky is green", varacity=False), + ] + trained = LabeledFewShot().compile(student=generator, trainset=existing_examples) + + augmented_examples = trained(config=dict(n=3)) + for ex in augmented_examples.completions.fact: + assert isinstance(ex, SyntheticFact) + + +def test_list_input2(): + # Inspired by the Signature Optimizer + + class ScoredString(pydantic.BaseModel): + string: str + score: float + + class ScoredSignature(dspy.Signature): + attempted_signatures: list[ScoredString] = dspy.InputField() + proposed_signature: str = 
dspy.OutputField() + + program = TypedChainOfThought(ScoredSignature) + + lm = DSPDummyLM(["Thoughts", "Output"]) + dspy.settings.configure(lm=lm) + + output = program( + attempted_signatures=[ + ScoredString(string="string 1", score=0.5), + ScoredString(string="string 2", score=0.4), + ScoredString(string="string 3", score=0.3), + ] + ).proposed_signature + + print(lm.get_convo(-1)) + + assert output == "Output" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `attempted_signatures`, produce the fields `proposed_signature`. + + --- + + Follow the following format. + + Attempted Signatures: ${attempted_signatures} + Reasoning: Let's think step by step in order to ${produce the proposed_signature}. We ... + Proposed Signature: ${proposed_signature} + + --- + + Attempted Signatures: [{"string":"string 1","score":0.5},{"string":"string 2","score":0.4},{"string":"string 3","score":0.3}] + Reasoning: Let's think step by step in order to Thoughts + Proposed Signature: Output""" + ) + + +def test_custom_reasoning_field(): + class Question(pydantic.BaseModel): + value: str + + class QuestionSignature(dspy.Signature): + topic: str = dspy.InputField() + question: Question = dspy.OutputField() + + reasoning = dspy.OutputField( + prefix="Custom Reasoning: Let's break this down. To generate a question about", + desc="${topic}, we should ...", + ) + + program = TypedChainOfThought(QuestionSignature, reasoning=reasoning) + + expected = "What is the speed of light?" + lm = DSPDummyLM(["Thoughts", f'{{"value": "{expected}"}}']) + dspy.settings.configure(lm=lm) + + output = program(topic="Physics") + + assert isinstance(output.question, Question) + assert output.question.value == expected + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `topic`, produce the fields `question`. + + --- + + Follow the following format. + + Topic: ${topic} + Custom Reasoning: Let's break this down. To generate a question about ${topic}, we should ... + Question: ${question}. Respond with a single JSON object. JSON Schema: {"properties": {"value": {"title": "Value", "type": "string"}}, "required": ["value"], "title": "Question", "type": "object"} + + --- + + Topic: Physics + Custom Reasoning: Let's break this down. 
To generate a question about Thoughts + Question: {"value": "What is the speed of light?"}""" + ) + + +def test_generic_signature(): + T = TypeVar("T") + + class GenericSignature(dspy.Signature, Generic[T]): + """My signature""" + + output: T = dspy.OutputField() + + predictor = TypedPredictor(GenericSignature[int]) + assert predictor.signature.instructions == "My signature" + + lm = DSPDummyLM(["23"]) + dspy.settings.configure(lm=lm) + + assert predictor().output == 23 + + +def test_lm_as_validator(): + @predictor + def is_square(n: int) -> bool: + """Is n a square number?""" + + def check_square(n): + assert is_square(n=n) + return n + + @predictor + def next_square(n: int) -> Annotated[int, AfterValidator(check_square)]: + """What is the next square number after n?""" + + lm = DSPDummyLM(["3", "False", "4", "True"]) + dspy.settings.configure(lm=lm) + + m = next_square(n=2) + lm.inspect_history(n=2) + + assert m == 4 + + +def test_annotated_validator(): + def is_square(n: int) -> int: + root = n**0.5 + if not root.is_integer(): + raise ValueError(f"{n} is not a square") + return n + + class MySignature(dspy.Signature): + """What is the next square number after n?""" + + n: int = dspy.InputField() + next_square: Annotated[int, AfterValidator(is_square)] = dspy.OutputField() + + lm = DSPDummyLM(["3", "4"]) + dspy.settings.configure(lm=lm) + + m = TypedPredictor(MySignature)(n=2).next_square + lm.inspect_history(n=2) + + assert m == 4 + + +def test_annotated_validator_functional(): + def is_square(n: int) -> int: + if not (n**0.5).is_integer(): + raise ValueError(f"{n} is not a square") + return n + + @predictor + def next_square(n: int) -> Annotated[int, AfterValidator(is_square)]: + """What is the next square number after n?""" + + lm = DSPDummyLM(["3", "4"]) + dspy.settings.configure(lm=lm) + + m = next_square(n=2) + lm.inspect_history(n=2) + + assert m == 4 + + +def test_demos(): + demos = [ + dspy.Example(input="What is the speed of light?", output="3e8"), + ] + program = LabeledFewShot(k=len(demos)).compile( + student=dspy.TypedPredictor("input -> output"), + trainset=[ex.with_inputs("input") for ex in demos], + ) + + lm = DSPDummyLM(["Paris"]) + dspy.settings.configure(lm=lm) + + assert program(input="What is the capital of France?").output == "Paris" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Follow the following format. + + Input: ${input} + Output: ${output} + + --- + + Input: What is the speed of light? + Output: 3e8 + + --- + + Input: What is the capital of France? + Output: Paris""" + ) + + +def _test_demos_missing_input(): + demos = [dspy.Example(input="What is the speed of light?", output="3e8")] + program = LabeledFewShot(k=len(demos)).compile( + student=dspy.TypedPredictor("input -> output, thoughts"), + trainset=[ex.with_inputs("input") for ex in demos], + ) + dspy.settings.configure(lm=DSPDummyLM(["My thoughts", "Paris"])) + assert program(input="What is the capital of France?").output == "Paris" + + assert dspy.settings.lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Follow the following format. + + Input: ${input} + Thoughts: ${thoughts} + Output: ${output} + + --- + + Input: What is the speed of light? + Output: 3e8 + + --- + + Input: What is the capital of France? 
+ Thoughts: My thoughts + Output: Paris""" + ) + + +def test_conlist(): + dspy.settings.configure(lm=DSPDummyLM(["[]", "[1]", "[1, 2]", "[1, 2, 3]"])) + + @predictor + def make_numbers(input: str) -> Annotated[list[int], Field(min_items=2)]: + pass + + assert make_numbers(input="What are the first two numbers?") == [1, 2] + + +def test_conlist2(): + dspy.settings.configure(lm=DSPDummyLM(["[]", "[1]", "[1, 2]", "[1, 2, 3]"])) + + make_numbers = TypedPredictor("input:str -> output:Annotated[List[int], Field(min_items=2)]") + assert make_numbers(input="What are the first two numbers?").output == [1, 2] + + +def test_model_validator(): + class MySignature(dspy.Signature): + input_data: str = dspy.InputField() + allowed_categories: list[str] = dspy.InputField() + category: str = dspy.OutputField() + + @model_validator(mode="after") + def check_cateogry(self): + if self.category not in self.allowed_categories: + raise ValueError(f"category not in {self.allowed_categories}") + return self + + lm = DSPDummyLM(["horse", "dog"]) + dspy.settings.configure(lm=lm) + predictor = TypedPredictor(MySignature) + + pred = predictor(input_data="What is the best animal?", allowed_categories=["cat", "dog"]) + assert pred.category == "dog" diff --git a/tests/dsp_LM/functional/test_signature_opt_typed.py b/tests/dsp_LM/functional/test_signature_opt_typed.py new file mode 100644 index 0000000000..3533926b62 --- /dev/null +++ b/tests/dsp_LM/functional/test_signature_opt_typed.py @@ -0,0 +1,187 @@ +import json +from typing import Generic, TypeVar + +import pydantic +from pydantic_core import to_jsonable_python + +import dspy +from dspy.evaluate import Evaluate +from dspy.evaluate.metrics import answer_exact_match +from dspy.functional import TypedPredictor +from dspy.teleprompt.signature_opt_typed import make_info, optimize_signature +from dspy.utils import DSPDummyLM + +hotpotqa = [ + ex.with_inputs("question") + for ex in [ + dspy.Example( + question="At My Window was released by which American singer-songwriter?", + answer="John Townes Van Zandt", + ), + dspy.Example( + question="which American actor was Candace Kita guest starred with ", + answer="Bill Murray", + ), + dspy.Example( + question="Which of these publications was most recently published, Who Put the Bomp or Self?", + answer="Self", + ), + dspy.Example( + question="The Victorians - Their Story In Pictures is a documentary series written by an author born in what year?", + answer="1950", + ), + dspy.Example( + question="Which magazine has published articles by Scott Shaw, Tae Kwon Do Times or Southwest Art?", + answer="Tae Kwon Do Times", + ), + dspy.Example( + question="In what year was the club founded that played Manchester City in the 1972 FA Charity Shield", + answer="1874", + ), + dspy.Example( + question="Which is taller, the Empire State Building or the Bank of America Tower?", + answer="The Empire State Building", + ), + dspy.Example( + question='Which American actress who made their film debut in the 1995 teen drama "Kids" was the co-founder of Voto Latino?', + answer="Rosario Dawson", + ), + dspy.Example( + question="Tombstone stared an actor born May 17, 1955 known as who?", + answer="Bill Paxton", + ), + dspy.Example( + question="What is the code name for the German offensive that started this Second World War engagement on the Eastern Front (a few hundred kilometers from Moscow) between Soviet and German forces, which included 102nd Infantry Division?", + answer="Operation Citadel", + ), + dspy.Example( + question='Who acted in the shot 
film The Shore and is also the youngest actress ever to play Ophelia in a Royal Shakespeare Company production of "Hamlet." ?', + answer="Kerry Condon", + ), + dspy.Example( + question="Which company distributed this 1977 American animated film produced by Walt Disney Productions for which Sherman Brothers wrote songs?", + answer="Buena Vista Distribution", + ), + dspy.Example( + question="Samantha Cristoforetti and Mark Shuttleworth are both best known for being first in their field to go where? ", + answer="space", + ), + dspy.Example( + question="Having the combination of excellent foot speed and bat speed helped Eric Davis, create what kind of outfield for the Los Angeles Dodgers? ", + answer="Outfield of Dreams", + ), + dspy.Example( + question="Which Pakistani cricket umpire who won 3 consecutive ICC umpire of the year awards in 2009, 2010, and 2011 will be in the ICC World Twenty20?", + answer="Aleem Sarwar Dar", + ), + dspy.Example( + question="The Organisation that allows a community to influence their operation or use and to enjoy the benefits arisingwas founded in what year?", + answer="2010", + ), + dspy.Example( + question='"Everything Has Changed" is a song from an album released under which record label ?', + answer="Big Machine Records", + ), + dspy.Example( + question="Who is older, Aleksandr Danilovich Aleksandrov or Anatoly Fomenko?", + answer="Aleksandr Danilovich Aleksandrov", + ), + dspy.Example( + question="On the coast of what ocean is the birthplace of Diogal Sakho?", + answer="Atlantic", + ), + dspy.Example( + question="This American guitarist best known for her work with the Iron Maidens is an ancestor of a composer who was known as what?", + answer="The Waltz King", + ), + ] +] + + +def test_opt(): + class BasicQA(dspy.Signature): + question: str = dspy.InputField() + answer: str = dspy.OutputField() + + qa_model = DSPDummyLM([]) + prompt_model = DSPDummyLM( + [ + # Seed prompts + "some thoughts", + '[{"instructions": "I", "question_desc": "$q", "question_prefix": "Q:", "answer_desc": "$a", "answer_prefix": "A:"}]', + ] + ) + dspy.settings.configure(lm=qa_model) + + result = optimize_signature( + student=TypedPredictor(BasicQA), + evaluator=Evaluate(devset=hotpotqa, metric=answer_exact_match, num_threads=1), + initial_prompts=1, + n_iterations=2, + verbose=True, + prompt_model=prompt_model, + strategy="last", + ) + + # Since we are requesting the last signature, it doesn't matter that our qa_model is + # bad, and gets 0 score. We should still get the last signature. 
+ class ExpectedSignature(dspy.Signature): + "I" + + question: str = dspy.InputField(desc="$q", prefix="Q:") + answer: str = dspy.OutputField(desc="$a", prefix="A:") + + assert result.program.signature.equals(ExpectedSignature) + + assert result.scores == [0, 0] + + +def test_opt_composed(): + class MyModule(dspy.Module): + def __init__(self): + self.p1 = TypedPredictor("question:str -> considerations:list[str]", max_retries=1) + self.p2 = TypedPredictor("considerations:list[str] -> answer:str", max_retries=1) + + def forward(self, question): + considerations = self.p1(question=question).considerations + return self.p2(considerations=considerations) + + class ExpectedSignature1(dspy.Signature): + "I1" + + question: str = dspy.InputField(desc="$q", prefix="Q:") + considerations: list[str] = dspy.OutputField(desc="$c", prefix="C:") + + info1 = make_info(ExpectedSignature1) + + class ExpectedSignature2(dspy.Signature): + "I2" + + considerations: list[str] = dspy.InputField(desc="$c", prefix="C:") + answer: str = dspy.OutputField(desc="$a", prefix="A:") + + info2 = make_info(ExpectedSignature2) + + qa_model = DSPDummyLM([]) + prompt_model = DSPDummyLM( + [ + "some thoughts", + json.dumps([to_jsonable_python(info1)]), + "some thoughts", + json.dumps([to_jsonable_python(info2)]), + ] + ) + dspy.settings.configure(lm=qa_model) + + result = optimize_signature( + student=MyModule(), + evaluator=lambda x: 0, # We don't care about the evaluator here + initial_prompts=1, + n_iterations=2, + verbose=True, + prompt_model=prompt_model, + strategy="last", + ) + + assert result.program.p1.signature.equals(ExpectedSignature1) + assert result.program.p2.signature.equals(ExpectedSignature2) diff --git a/tests/dsp_LM/functional/test_signature_typed.py b/tests/dsp_LM/functional/test_signature_typed.py new file mode 100644 index 0000000000..6e875d851d --- /dev/null +++ b/tests/dsp_LM/functional/test_signature_typed.py @@ -0,0 +1,155 @@ +from typing import Any, Optional, Union + +import pydantic +import pytest + +import dspy +from dspy.functional import TypedPredictor +from dspy.signatures.signature import signature_to_template + + +def get_field_and_parser(signature: dspy.Signature) -> tuple[Any, Any]: + module = TypedPredictor(signature) + signature = module._prepare_signature() + assert "answer" in signature.fields, "'answer' not in signature.fields" + field = signature.fields.get("answer") + parser = field.json_schema_extra.get("parser") + return field, parser + + +class Mysubmodel(pydantic.BaseModel): + sub_floating: float + + +class MyModel(pydantic.BaseModel): + floating: float + string: str + boolean: bool + integer: int + optional: Optional[str] + sequence_of_strings: list[str] + union: Union[str, float] + submodel: Mysubmodel + optional_submodel: Optional[Mysubmodel] + optional_existing_submodule: Optional[Mysubmodel] + + +def build_model_instance() -> MyModel: + return MyModel( + floating=3.14, + string="foobar", + boolean=True, + integer=42, + optional=None, + sequence_of_strings=["foo", "bar"], + union=3.14, + submodel=Mysubmodel(sub_floating=42.42), + optional_submodel=None, + optional_existing_submodule=Mysubmodel(sub_floating=42.42), + ) + + +@pytest.mark.parametrize( + "test_type,serialized, expected", [(str, "foo", "foo"), (int, "42", 42), (float, "42.42", 42.42)] +) +def test_basic_types(test_type: type, serialized: str, expected: Any): + class MySignature(dspy.Signature): + question: str = dspy.InputField() + answer: test_type = dspy.OutputField() + + _, parser = 
get_field_and_parser(MySignature) + assert parser is test_type, "Parser is not correct for 'answer'" + assert parser(serialized) == expected, f"{test_type}({serialized})!= {expected}" + + +def test_boolean(): + class MySignature(dspy.Signature): + question: str = dspy.InputField() + answer: bool = dspy.OutputField() + + _, parser = get_field_and_parser(MySignature) + assert parser("true"), f"Parsing 'true' failed" + assert not parser("false"), f"Parsing 'false' failed" + + +@pytest.mark.parametrize( + "test_type,serialized, expected", + [(list[str], '["foo", "bar"]', ["foo", "bar"]), (tuple[int, float], "[42, 3.14]", (42, 3.14))], +) +def test_sequences(test_type: type, serialized: str, expected: Any): + class MySignature(dspy.Signature): + question: str = dspy.InputField() + answer: test_type = dspy.OutputField() + + _, parser = get_field_and_parser(MySignature) + + assert parser(serialized) == expected, f"Parsing {expected} failed" + + +@pytest.mark.parametrize( + "test_type,serialized, expected", + [ + (Optional[str], '"foobar"', "foobar"), + (Optional[str], "null", None), + (Union[str, float], "3.14", 3.14), + (Union[str, bool], "true", True), + ], +) +def test_unions(test_type: type, serialized: str, expected: Any): + class MySignature(dspy.Signature): + question: str = dspy.InputField() + answer: test_type = dspy.OutputField() + + _, parser = get_field_and_parser(MySignature) + + assert parser(serialized) == expected, f"Parsing {expected} failed" + + +def test_pydantic(): + class MySignature(dspy.Signature): + question: str = dspy.InputField() + answer: MyModel = dspy.OutputField() + + _, parser = get_field_and_parser(MySignature) + + instance = build_model_instance() + parsed_instance = parser(instance.model_dump_json()) + + assert parsed_instance == instance, f"{instance} != {parsed_instance}" + + +def test_optional_pydantic(): + class MySignature(dspy.Signature): + question: str = dspy.InputField() + answer: Optional[MyModel] = dspy.OutputField() + + _, parser = get_field_and_parser(MySignature) + + instance = build_model_instance() + parsed_instance = parser(instance.model_dump_json()) + assert parsed_instance == instance, f"{instance} != {parsed_instance}" + + # Check null case + parsed_instance = parser("null") + assert parsed_instance == None, "Optional[MyModel] should be None" + + +def test_dataclass(): + from dataclasses import dataclass + + @dataclass(frozen=True) + class MyDataclass: + string: str + number: int + floating: float + boolean: bool + + class MySignature(dspy.Signature): + question: str = dspy.InputField() + answer: MyDataclass = dspy.OutputField() + + _, parser = get_field_and_parser(MySignature) + + instance = MyDataclass("foobar", 42, 3.14, True) + parsed_instance = parser('{"string": "foobar", "number": 42, "floating": 3.14, "boolean": true}') + assert parsed_instance == instance, f"{instance} != {parsed_instance}" diff --git a/tests/dsp_LM/modules/test_aws_models.py b/tests/dsp_LM/modules/test_aws_models.py new file mode 100644 index 0000000000..b6e018b337 --- /dev/null +++ b/tests/dsp_LM/modules/test_aws_models.py @@ -0,0 +1,70 @@ +"""Tests for AWS models. +Note: Requires configuration of your AWS credentials with the AWS CLI and creating sagemaker endpoints. +TODO: Create mock fixtures for pytest to remove the need for AWS credentials and endpoints. 
+""" + +import dsp +import dspy + + +def get_lm(lm_provider: str, model_path: str, **kwargs) -> dsp.modules.lm.LM: + """get the language model""" + # extract model vendor and name from model name + # Model path format is / + model_vendor = model_path.split("/")[0] + model_name = model_path.split("/")[1] + + if lm_provider == "Bedrock": + bedrock = dspy.Bedrock(region_name="us-west-2") + if model_vendor == "mistral": + return dspy.AWSMistral(bedrock, model_name, **kwargs) + elif model_vendor == "anthropic": + return dspy.AWSAnthropic(bedrock, model_name, **kwargs) + elif model_vendor == "meta": + return dspy.AWSMeta(bedrock, model_name, **kwargs) + else: + raise ValueError( + "Model vendor missing or unsupported: Model path format is /" + ) + elif lm_provider == "Sagemaker": + sagemaker = dspy.Sagemaker(region_name="us-west-2") + if model_vendor == "mistral": + return dspy.AWSMistral(sagemaker, model_name, **kwargs) + elif model_vendor == "meta": + return dspy.AWSMeta(sagemaker, model_name, **kwargs) + else: + raise ValueError( + "Model vendor missing or unsupported: Model path format is /" + ) + else: + raise ValueError(f"Unsupported model: {model_name}") + + +def run_tests(): + """Test the providers and models""" + # Configure your AWS credentials with the AWS CLI before running this script + provider_model_tuples = [ + ("Bedrock", "mistral/mistral.mixtral-8x7b-instruct-v0:1"), + ("Bedrock", "anthropic/anthropic.claude-3-haiku-20240307-v1:0"), + ("Bedrock", "anthropic/anthropic.claude-3-sonnet-20240229-v1:0"), + ("Bedrock", "meta/meta.llama2-70b-chat-v1"), + ("Bedrock", "meta/meta.llama3-8b-instruct-v1:0"), + ("Bedrock", "meta/meta.llama3-70b-instruct-v1:0"), + # ('Sagemaker', 'mistral/'), # REPLACE YOUR_ENDPOINT_NAME with your sagemaker endpoint + ] + + predict_func = dspy.Predict("question -> answer") + for provider, model_path in provider_model_tuples: + print(f"Provider: {provider}, Model: {model_path}") + lm = get_lm(provider, model_path) + with dspy.context(lm=lm): + question = "What is the capital of France?" + answer = predict_func(question=question).answer + print(f"Question: {question}\nAnswer: {answer}") + print("---------------------------------") + lm.inspect_history() + print("---------------------------------\n") + + +if __name__ == "__main__": + run_tests() diff --git a/tests/dsp_LM/modules/test_cloudflare_models.py b/tests/dsp_LM/modules/test_cloudflare_models.py new file mode 100644 index 0000000000..188d2424e4 --- /dev/null +++ b/tests/dsp_LM/modules/test_cloudflare_models.py @@ -0,0 +1,61 @@ +"""Tests for Cloudflare models. +Note: Requires configuration of your Cloudflare account_id and api_key. 
+""" + +import dspy + +models = { + "@cf/qwen/qwen1.5-0.5b-chat": "https://huggingface.co/qwen/qwen1.5-0.5b-chat", + "@hf/meta-llama/meta-llama-3-8b-instruct": "https://llama.meta.com", + "@hf/nexusflow/starling-lm-7b-beta": "https://huggingface.co/Nexusflow/Starling-LM-7B-beta", + "@cf/meta/llama-3-8b-instruct": "https://llama.meta.com", + "@hf/thebloke/neural-chat-7b-v3-1-awq": "", + "@cf/meta/llama-2-7b-chat-fp16": "https://ai.meta.com/llama/", + "@cf/mistral/mistral-7b-instruct-v0.1": "https://mistral.ai/news/announcing-mistral-7b/", + "@cf/tinyllama/tinyllama-1.1b-chat-v1.0": "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "@hf/mistral/mistral-7b-instruct-v0.2": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", + "@cf/fblgit/una-cybertron-7b-v2-bf16": "", + "@hf/thebloke/codellama-7b-instruct-awq": "https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-AWQ", + "@cf/thebloke/discolm-german-7b-v1-awq": "https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-AWQ", + "@cf/meta/llama-2-7b-chat-int8": "https://ai.meta.com/llama/", + "@hf/thebloke/mistral-7b-instruct-v0.1-awq": "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-AWQ", + "@hf/thebloke/openchat_3.5-awq": "", + "@cf/qwen/qwen1.5-7b-chat-awq": "https://huggingface.co/qwen/qwen1.5-7b-chat-awq", + "@hf/thebloke/llama-2-13b-chat-awq": "https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ", + "@hf/thebloke/deepseek-coder-6.7b-base-awq": "", + "@hf/thebloke/openhermes-2.5-mistral-7b-awq": "", + "@hf/thebloke/deepseek-coder-6.7b-instruct-awq": "", + "@cf/deepseek-ai/deepseek-math-7b-instruct": "https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct", + "@cf/tiiuae/falcon-7b-instruct": "https://huggingface.co/tiiuae/falcon-7b-instruct", + "@hf/nousresearch/hermes-2-pro-mistral-7b": "https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B", + "@hf/thebloke/zephyr-7b-beta-awq": "https://huggingface.co/TheBloke/zephyr-7B-beta-AWQ", + "@cf/qwen/qwen1.5-1.8b-chat": "https://huggingface.co/qwen/qwen1.5-1.8b-chat", + "@cf/defog/sqlcoder-7b-2": "https://huggingface.co/defog/sqlcoder-7b-2", + "@cf/microsoft/phi-2": "https://huggingface.co/microsoft/phi-2", + "@hf/google/gemma-7b-it": "https://ai.google.dev/gemma/docs", +} + + +def get_lm(name: str): # -> dspy.LM: + return dspy.CloudflareAI(model=name) + + +def run_tests(): + """Test the providers and models""" + # Configure your AWS credentials with the AWS CLI before running this script + models + + predict_func = dspy.Predict("question -> answer") + for model_name in models.keys(): + lm = get_lm(model_name) + with dspy.context(lm=lm): + question = "What is the capital of France?" 
+ answer = predict_func(question=question).answer + print(f"Question: {question}\nAnswer: {answer}") + print("---------------------------------") + lm.inspect_history() + print("---------------------------------\n") + + +if __name__ == "__main__": + run_tests() diff --git a/tests/dsp_LM/modules/test_hf_model.py b/tests/dsp_LM/modules/test_hf_model.py new file mode 100644 index 0000000000..0b06c80429 --- /dev/null +++ b/tests/dsp_LM/modules/test_hf_model.py @@ -0,0 +1,31 @@ +from pytest_mock.plugin import MockerFixture +from transformers import AutoModelForSeq2SeqLM + +import dspy + + +class MockConfig: + def __init__(self, architectures: list[str]): + self.architectures = architectures + + +def test_load_gated_model(mocker: MockerFixture): + conf = MockConfig(architectures=["ConditionalGeneration"]) + mocker.patch("transformers.AutoModelForSeq2SeqLM.from_pretrained") + mocker.patch("transformers.AutoConfig.from_pretrained", return_value=conf) + mocker.patch("transformers.AutoTokenizer.from_pretrained") + + some_token = "asdfasdfasdf" + model = "google/gemma-7b" + _ = dspy.HFModel(model, token=some_token) + AutoModelForSeq2SeqLM.from_pretrained.assert_called_with(model, device_map="auto", token=some_token) + + +def test_load_ungated_model(mocker: MockerFixture): + conf = MockConfig(architectures=["ConditionalGeneration"]) + mocker.patch("transformers.AutoModelForSeq2SeqLM.from_pretrained") + mocker.patch("transformers.AutoConfig.from_pretrained", return_value=conf) + mocker.patch("transformers.AutoTokenizer.from_pretrained") + _ = dspy.HFModel("openai-community/gpt2") + # no token used in automodel + AutoModelForSeq2SeqLM.from_pretrained.assert_called_with("openai-community/gpt2", device_map="auto", token=None) diff --git a/tests/dsp_LM/modules/vectorizer/test_fastembed.py b/tests/dsp_LM/modules/vectorizer/test_fastembed.py new file mode 100644 index 0000000000..e9a7335a4f --- /dev/null +++ b/tests/dsp_LM/modules/vectorizer/test_fastembed.py @@ -0,0 +1,43 @@ +import pytest + +from dsp.modules.sentence_vectorizer import FastEmbedVectorizer +from dspy.primitives.example import Example + +# Skip the test if the 'fastembed' package is not installed +pytest.importorskip("fastembed", reason="'fastembed' is not installed. Use `pip install fastembed` to install it.") + + +@pytest.mark.parametrize( + "n_dims,model_name", [(384, "BAAI/bge-small-en-v1.5"), (512, "jinaai/jina-embeddings-v2-small-en")] +) +def test_fastembed_with_examples(n_dims, model_name): + vectorizer = FastEmbedVectorizer(model_name) + + examples = [ + Example(query="What's the price today?", response="The price is $10.00").with_inputs("query", "response"), + Example(query="What's the weather today?", response="The weather is sunny").with_inputs("query", "response"), + Example(query="Who was leading the team?", response="It was Jim. 
Rather enthusiastic guy.").with_inputs( + "query", "response" + ), + ] + + embeddings = vectorizer(examples) + + assert embeddings.shape == (len(examples), n_dims) + + +@pytest.mark.parametrize( + "n_dims,model_name", [(384, "BAAI/bge-small-en-v1.5"), (512, "jinaai/jina-embeddings-v2-small-en")] +) +def test_fastembed_with_strings(n_dims, model_name): + vectorizer = FastEmbedVectorizer(model_name) + + inputs = [ + "Jonathan Kent is a fictional character appearing in American comic books published by DC Comics.", + "Clark Kent is a fictional character appearing in American comic books published by DC Comics.", + "Martha Kent is a fictional character appearing in American comic books published by DC Comics.", + ] + + embeddings = vectorizer(inputs) + + assert embeddings.shape == (len(inputs), n_dims) diff --git a/tests/dsp_LM/predict/test_aggregation.py b/tests/dsp_LM/predict/test_aggregation.py new file mode 100644 index 0000000000..eb1d975368 --- /dev/null +++ b/tests/dsp_LM/predict/test_aggregation.py @@ -0,0 +1,43 @@ +from dsp.utils import normalize_text +from dspy.predict.aggregation import majority +from dspy.primitives.prediction import Completions, Prediction + + +def test_majority_with_prediction(): + prediction = Prediction.from_completions([{"answer": "2"}, {"answer": "2"}, {"answer": "3"}]) + result = majority(prediction) + assert result.completions[0]["answer"] == "2" + + +def test_majority_with_completions(): + completions = Completions([{"answer": "2"}, {"answer": "2"}, {"answer": "3"}]) + result = majority(completions) + assert result.completions[0]["answer"] == "2" + + +def test_majority_with_list(): + completions = [{"answer": "2"}, {"answer": "2"}, {"answer": "3"}] + result = majority(completions) + assert result.completions[0]["answer"] == "2" + + +def test_majority_with_normalize(): + completions = [{"answer": "2"}, {"answer": " 2"}, {"answer": "3"}] + result = majority(completions, normalize=normalize_text) + assert result.completions[0]["answer"] == "2" + + +def test_majority_with_field(): + completions = [ + {"answer": "2", "other": "1"}, + {"answer": "2", "other": "1"}, + {"answer": "3", "other": "2"}, + ] + result = majority(completions, field="other") + assert result.completions[0]["other"] == "1" + + +def test_majority_with_no_majority(): + completions = [{"answer": "2"}, {"answer": "3"}, {"answer": "4"}] + result = majority(completions) + assert result.completions[0]["answer"] == "2" # The first completion is returned in case of a tie diff --git a/tests/dsp_LM/predict/test_chain_of_thought.py b/tests/dsp_LM/predict/test_chain_of_thought.py new file mode 100644 index 0000000000..2567ae032f --- /dev/null +++ b/tests/dsp_LM/predict/test_chain_of_thought.py @@ -0,0 +1,36 @@ +import textwrap + +import dspy +from dspy import ChainOfThought +from dspy.utils import DSPDummyLM + + +def test_initialization_with_string_signature(): + lm = DSPDummyLM(["find the number after 1", "2"]) + dspy.settings.configure(lm=lm) + predict = ChainOfThought("question -> answer") + assert list(predict.extended_signature.output_fields.keys()) == [ + "rationale", + "answer", + ] + assert predict(question="What is 1+1?").answer == "2" + + print(lm.get_convo(-1)) + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `question`, produce the fields `answer`. + + --- + + Follow the following format. + + Question: ${question} + Reasoning: Let's think step by step in order to ${produce the answer}. We ... + Answer: ${answer} + + --- + + Question: What is 1+1? 
+ Reasoning: Let's think step by step in order to find the number after 1 + Answer: 2""" + ) diff --git a/tests/dsp_LM/predict/test_chain_of_thought_with_hint.py b/tests/dsp_LM/predict/test_chain_of_thought_with_hint.py new file mode 100644 index 0000000000..0afa06b1c0 --- /dev/null +++ b/tests/dsp_LM/predict/test_chain_of_thought_with_hint.py @@ -0,0 +1,43 @@ +import dspy +from dspy import ChainOfThoughtWithHint +from dspy.utils import DSPDummyLM + + +def test_cot_with_no_hint(): + lm = DSPDummyLM(["find the number after 1", "2"]) + dspy.settings.configure(lm=lm) + predict = ChainOfThoughtWithHint("question -> answer") + # Check output fields have the right order + assert list(predict.extended_signature2.output_fields.keys()) == [ + "rationale", + "hint", + "answer", + ] + assert predict(question="What is 1+1?").answer == "2" + + final_convo = lm.get_convo(-1) + assert final_convo.endswith( + "Question: What is 1+1?\n" + "Reasoning: Let's think step by step in order to find the number after 1\n" + "Answer: 2" + ) + + +def test_cot_with_hint(): + lm = DSPDummyLM(["find the number after 1", "2"]) + dspy.settings.configure(lm=lm) + predict = ChainOfThoughtWithHint("question -> answer") + assert list(predict.extended_signature2.output_fields.keys()) == [ + "rationale", + "hint", + "answer", + ] + assert predict(question="What is 1+1?", hint="think small").answer == "2" + + final_convo = lm.get_convo(-1) + assert final_convo.endswith( + "Question: What is 1+1?\n\n" + "Reasoning: Let's think step by step in order to find the number after 1\n\n" + "Hint: think small\n\n" + "Answer: 2" + ) diff --git a/tests/dsp_LM/predict/test_knn.py b/tests/dsp_LM/predict/test_knn.py new file mode 100644 index 0000000000..7f35b42e4f --- /dev/null +++ b/tests/dsp_LM/predict/test_knn.py @@ -0,0 +1,51 @@ +import numpy as np +import pytest + +import dsp +import dspy +from dspy.predict import KNN +from dspy.utils import DummyVectorizer + + +def mock_example(question: str, answer: str) -> dsp.Example: + """Creates a mock DSP example with specified question and answer.""" + return dspy.Example(question=question, answer=answer).with_inputs("question") + + +@pytest.fixture +def setup_knn(): + """Sets up a KNN instance with a mocked vectorizer for testing.""" + dsp.SentenceTransformersVectorizer = DummyVectorizer + trainset = [ + mock_example("What is the capital of France?", "Paris"), + mock_example("What is the largest ocean?", "Pacific"), + mock_example("What is 2+2?", "4"), + ] + knn = KNN(k=2, trainset=trainset) + return knn + + +def test_knn_initialization(setup_knn): + """Tests the KNN initialization and checks if the trainset vectors are correctly created.""" + knn = setup_knn + assert knn.k == 2, "Incorrect k value" + assert len(knn.trainset_vectors) == 3, "Incorrect size of trainset vectors" + assert isinstance(knn.trainset_vectors, np.ndarray), "Trainset vectors should be a NumPy array" + + +def test_knn_query(setup_knn): + """Tests the KNN query functionality for retrieving the nearest neighbors.""" + knn = setup_knn + query = {"question": "What is 3+3?"} # A query close to "What is 2+2?" 
+ nearest_samples = knn(**query) + assert len(nearest_samples) == 2, "Incorrect number of nearest samples returned" + assert nearest_samples[0].answer == "4", "Incorrect nearest sample returned" + + +def test_knn_query_specificity(setup_knn): + """Tests the KNN query functionality for specificity of returned examples.""" + knn = setup_knn + query = {"question": "What is the capital of Germany?"} # A query close to "What is the capital of France?" + nearest_samples = knn(**query) + assert len(nearest_samples) == 2, "Incorrect number of nearest samples returned" + assert "Paris" in [sample.answer for sample in nearest_samples], "Expected Paris to be a nearest sample answer" diff --git a/tests/dsp_LM/predict/test_langchain.py b/tests/dsp_LM/predict/test_langchain.py new file mode 100644 index 0000000000..89fa1d75d9 --- /dev/null +++ b/tests/dsp_LM/predict/test_langchain.py @@ -0,0 +1,55 @@ +import pytest + +pytest.importorskip("langchain") + +import os + +from langchain import hub +from langchain_chroma import Chroma +from langchain_community.embeddings import FakeEmbeddings +from langchain_core.documents import Document +from langchain_core.output_parsers import StrOutputParser +from langchain_core.runnables import RunnablePassthrough +from langchain_openai import ChatOpenAI +from langchain_text_splitters import RecursiveCharacterTextSplitter + +from dspy.predict.langchain import LangChainModule, LangChainPredict + + +def test_copying_module(): + os.environ["OPENAI_API_KEY"] = "fake-key" + llm = ChatOpenAI(model="gpt-4o-mini") + docs = [Document(page_content="Hello, world!", metadata={"source": "https://example.com"})] + + text_splitter = RecursiveCharacterTextSplitter(chunk_size=10, chunk_overlap=10) + splits = text_splitter.split_documents(docs) + vectorstore = Chroma.from_documents(documents=splits, embedding=FakeEmbeddings(size=5)) + + # Retrieve and generate using the relevant snippets of the blog. + retriever = vectorstore.as_retriever() + + prompt = hub.pull("rlm/rag-prompt") + + def format_docs(docs): + return "\n\n".join(doc.page_content for doc in docs) + + rag_chain = ( + {"context": retriever | format_docs, "question": RunnablePassthrough()} + | LangChainPredict(prompt, llm) + | StrOutputParser() + ) + # Now we wrap it in LangChainModule. + rag_dspy_module = LangChainModule(rag_chain) + + copied_module = rag_dspy_module.reset_copy() + assert len(copied_module.chain.steps) == len(rag_dspy_module.chain.steps) + for module, copied_module in zip(rag_dspy_module.chain.steps, copied_module.chain.steps): + if isinstance(module, LangChainPredict): + # The LangChainPredict modules are deep copied. + assert module != copied_module + assert module.langchain_llm.model_name == copied_module.langchain_llm.model_name + else: + # The rest of the modules are just copied by reference. + assert module == copied_module + # Clean up. 
+ os.environ["OPENAI_API_KEY"] = None diff --git a/tests/dsp_LM/predict/test_multi_chain_comparison.py b/tests/dsp_LM/predict/test_multi_chain_comparison.py new file mode 100644 index 0000000000..e97c3dfbd0 --- /dev/null +++ b/tests/dsp_LM/predict/test_multi_chain_comparison.py @@ -0,0 +1,40 @@ +import pytest + +import dspy +from dspy.utils.dummies import DSPDummyLM + + +def test_basic_example(): + class BasicQA(dspy.Signature): + """Answer questions with short factoid answers.""" + + question = dspy.InputField() + answer = dspy.OutputField(desc="often between 1 and 5 words") + + # Example completions generated by a model for reference + completions = [ + dspy.Prediction( + rationale="I recall that during clear days, the sky often appears this color.", + answer="blue", + ), + dspy.Prediction( + rationale="Based on common knowledge, I believe the sky is typically seen as this color.", + answer="green", + ), + dspy.Prediction( + rationale="From images and depictions in media, the sky is frequently represented with this hue.", + answer="blue", + ), + ] + + # Pass signature to MultiChainComparison module + compare_answers = dspy.MultiChainComparison(BasicQA) + + # Call the MultiChainComparison on the completions + question = "What is the color of the sky?" + lm = DSPDummyLM(["my rationale", "blue"]) + dspy.settings.configure(lm=lm) + final_pred = compare_answers(completions, question=question) + + assert final_pred.rationale == "my rationale" + assert final_pred.answer == "blue" diff --git a/tests/dsp_LM/predict/test_predict.py b/tests/dsp_LM/predict/test_predict.py new file mode 100644 index 0000000000..9158987e3a --- /dev/null +++ b/tests/dsp_LM/predict/test_predict.py @@ -0,0 +1,101 @@ +import copy +import textwrap + +import pydantic +import pytest +import ujson + +import dspy +from dspy import Predict, Signature, TypedPredictor +from dspy.utils.dummies import DSPDummyLM + + +def test_call_method(): + predict_instance = Predict("input -> output") + lm = DSPDummyLM(["test output"]) + dspy.settings.configure(lm=lm) + result = predict_instance(input="test input") + assert result.output == "test output" + assert lm.get_convo(-1) == ( + "Given the fields `input`, produce the fields `output`.\n" + "\n---\n\n" + "Follow the following format.\n\n" + "Input: ${input}\n" + "Output: ${output}\n" + "\n---\n\n" + "Input: test input\n" + "Output: test output" + ) + + +def test_forward_method(): + program = Predict("question -> answer") + dspy.settings.configure(lm=DSPDummyLM([])) + result = program(question="What is 1+1?").answer + assert result == "No more responses" + + +def test_forward_method2(): + program = Predict("question -> answer1, answer2") + dspy.settings.configure(lm=DSPDummyLM(["my first answer", "my second answer"])) + result = program(question="What is 1+1?") + assert result.answer1 == "my first answer" + assert result.answer2 == "my second answer" + + +def test_config_management(): + predict_instance = Predict("input -> output") + predict_instance.update_config(new_key="value") + config = predict_instance.get_config() + assert "new_key" in config and config["new_key"] == "value" + + +def test_multi_output(): + program = Predict("question -> answer", n=2) + dspy.settings.configure(lm=DSPDummyLM(["my first answer", "my second answer"])) + results = program(question="What is 1+1?") + assert results.completions.answer[0] == "my first answer" + assert results.completions.answer[1] == "my second answer" + + +def test_multi_output2(): + program = Predict("question -> answer1, answer2", n=2) + 
dspy.settings.configure( + lm=DSPDummyLM( + [ + "my 0 answer\nAnswer 2: my 2 answer", + "my 1 answer\nAnswer 2: my 3 answer", + ], + ) + ) + results = program(question="What is 1+1?") + assert results.completions.answer1[0] == "my 0 answer" + assert results.completions.answer1[1] == "my 1 answer" + assert results.completions.answer2[0] == "my 2 answer" + assert results.completions.answer2[1] == "my 3 answer" + + +def test_output_only(): + class OutputOnlySignature(dspy.Signature): + output = dspy.OutputField() + + predictor = Predict(OutputOnlySignature) + + lm = DSPDummyLM(["short answer"]) + dspy.settings.configure(lm=lm) + assert predictor().output == "short answer" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields , produce the fields `output`. + + --- + + Follow the following format. + + Output: ${output} + + --- + + Output: short answer""" + ) diff --git a/tests/dsp_LM/predict/test_program_of_thought.py b/tests/dsp_LM/predict/test_program_of_thought.py new file mode 100644 index 0000000000..e5522a847c --- /dev/null +++ b/tests/dsp_LM/predict/test_program_of_thought.py @@ -0,0 +1,135 @@ +import textwrap + +import dspy +from dspy import ProgramOfThought, Signature +from dspy.utils import DSPDummyLM + + +class BasicQA(Signature): + question = dspy.InputField() + answer = dspy.OutputField(desc="often between 1 and 5 words") + + +def test_pot_code_generation(): + pot = ProgramOfThought(BasicQA) + lm = DSPDummyLM( + [ + "Reason_A", + "```python\nresult = 1+1\n```", + "Reason_B", + "2", + ] + ) + dspy.settings.configure(lm=lm) + res = pot(question="What is 1+1?") + assert res.answer == "2" + assert lm.get_convo(index=-1) == textwrap.dedent( + """\ + Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`. + + --- + + Follow the following format. + + Question: ${question} + + Code: python code that answers the question + + Code Output: output of previously-generated python code + + Reasoning: Let's think step by step in order to ${produce the answer}. We ... + + Answer: often between 1 and 5 words + + --- + + Question: What is 1+1? + + Code: result = 1+1 + + Code Output: 2 + + Reasoning: Let's think step by step in order to Reason_B + + Answer: 2""" + ) + + +def test_pot_code_generation_with_error(): + pot = ProgramOfThought(BasicQA) + lm = DSPDummyLM( + [ + "Reason_A", + "```python\nresult = 1+0/0\n```", + "Reason_B", # Error: division by zero + "```python\nresult = 1+1\n```", + "Reason_C", + "2", + ] + ) + dspy.settings.configure(lm=lm) + res = pot(question="What is 1+1?") + assert res.answer == "2" + + # The first code example failed + assert lm.get_convo(index=2) == textwrap.dedent( + """\ + You are given `question`, `previous_code`, `error` due to an error in previous code. + Your task is to correct the error and provide the new `generated_code`. + + --- + + Follow the following format. + + Question: ${question} + + Previous Code: previously-generated python code that errored + + Error: error message from previously-generated python code + + Reasoning: Let's think step by step in order to ${produce the generated_code}. We ... + + Code: python code that answers the question + + --- + + Question: What is 1+1? 
+ + Previous Code: result = 1+0/0 + + Error: division by zero + + Reasoning: Let's think step by step in order to Reason_B""" + ) + + # The second code example succeeded + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`. + + --- + + Follow the following format. + + Question: ${question} + + Code: python code that answers the question + + Code Output: output of previously-generated python code + + Reasoning: Let's think step by step in order to ${produce the answer}. We ... + + Answer: often between 1 and 5 words + + --- + + Question: What is 1+1? + + Code: result = 1+1 + + Code Output: 2 + + Reasoning: Let's think step by step in order to Reason_C + + Answer: 2""" + ) diff --git a/tests/dsp_LM/predict/test_react.py b/tests/dsp_LM/predict/test_react.py new file mode 100644 index 0000000000..e3f11d438c --- /dev/null +++ b/tests/dsp_LM/predict/test_react.py @@ -0,0 +1,154 @@ +from dataclasses import dataclass + +import dspy +from dspy.utils.dummies import dummy_rm + + +def test_example_no_tools(): + # Createa a simple dataset which the model will use with the Retrieve tool. + lm = dspy.utils.DspDummyLM( + [ + "Initial thoughts", # Thought_1 + "Finish[blue]", # Action_1 + ] + ) + dspy.settings.configure(lm=lm, rm=dummy_rm()) + + program = dspy.ReAct("question -> answer") + + # Check default tools + assert isinstance(program.tools["Finish"], dspy.Example) + + # Call the ReAct module on a particular input + question = "What is the color of the sky?" + result = program(question=question) + assert result.answer == "blue" + + # For debugging + print("---") + for row in lm.history: + print(row["prompt"]) + print("Response:", row["response"]["choices"][0]["text"]) + print("---") + + assert lm.get_convo(-1).endswith( + "Question: What is the color of the sky?\n" "Thought 1: Initial thoughts\n" "Action 1: Finish[blue]" + ) + + +def test_example_search(): + # Createa a simple dataset which the model will use with the Retrieve tool. + lm = dspy.utils.DspDummyLM( + [ + "Initial thoughts", # Thought_1 + "Search[the color of the sky]", # Thought_1 + "More thoughts", # Thought_2 + "Finish[blue]", # Action_2 + ] + ) + rm = dummy_rm( + [ + "We all know the color of the sky is blue.", + "Somethng about the sky colors", + "This sentence is completely irellevant to answer the question.", + "Let's add some more sentences to act as summy passages.", + "Let's add some more sentences to act as summy passages.", + "Let's add some more sentences to act as summy passages.", + ] + ) + dspy.settings.configure(lm=lm, rm=rm) + + program = dspy.ReAct("question -> answer") + + # Check default tools + assert len(program.tools) == 2 + assert isinstance(program.tools["Search"], dspy.Retrieve) + assert isinstance(program.tools["Finish"], dspy.Example) + + # Call the ReAct module on a particular input + question = "What is the color of the sky?" 
+ result = program(question=question) + assert result.answer == "blue" + + # For debugging + print(lm.get_convo(-1)) + + assert lm.get_convo(-1).endswith( + "Question: What is the color of the sky?\n\n" + "Thought 1: Initial thoughts\n\n" + "Action 1: Search[the color of the sky]\n\n" + "Observation 1:\n" + "[1] «We all know the color of the sky is blue.»\n" + "[2] «Somethng about the sky colors»\n" + "[3] «This sentence is completely irellevant to answer the question.»\n\n" + "Thought 2: More thoughts\n\n" + "Action 2: Finish[blue]" + ) + + +class DummyTool1: + name = "Tool1" + input_variable = "query" + desc = "" + num_calls = 0 + + def __call__(self, *args, **kwargs): + # test case with no passages attribute + assert args[0] == "foo" + self.num_calls += 1 + return "tool 1 output" + + +@dataclass +class DummyOutput: + passages: str + + +class DummyTool2: + name = "Tool2" + input_variable = "query" + desc = "" + num_calls = 0 + + def __call__(self, *args, **kwargs): + # test case with passages attribute + assert args[0] == "bar" + self.num_calls += 1 + return DummyOutput(passages="tool 2 output") + + +def test_custom_tools(): + lm = dspy.utils.DspDummyLM( + [ + "Initial thoughts", + "Tool1[foo]", + "More thoughts", + "Tool2[bar]", + "Even more thoughts", + "Finish[baz]", + ] + ) + dspy.settings.configure(lm=lm) + + tool1 = DummyTool1() + tool2 = DummyTool2() + program = dspy.ReAct("question -> answer", tools=[tool1, tool2]) + + question = "What is the color of the sky?" + result = program(question=question) + assert result.answer == "baz" + + # each tool should be called only once + assert tool1.num_calls == 1 + assert tool2.num_calls == 1 + assert lm.get_convo(-1).endswith( + "Question: What is the color of the sky?\n\n" + "Thought 1: Initial thoughts\n\n" + "Action 1: Tool1[foo]\n\n" + "Observation 1: tool 1 output\n\n" + "Thought 2: More thoughts\n\n" + "Action 2: Tool2[bar]\n\n" + "Observation 2: tool 2 output\n\n" + "Thought 3: Even more thoughts\n\n" + "Action 3: Finish[baz]" + ) diff --git a/tests/dsp_LM/predict/test_retry.py b/tests/dsp_LM/predict/test_retry.py new file mode 100644 index 0000000000..bd22984d48 --- /dev/null +++ b/tests/dsp_LM/predict/test_retry.py @@ -0,0 +1,110 @@ +import functools + +import pydantic + +import dspy +from dspy.primitives.assertions import assert_transform_module, backtrack_handler +from dspy.utils import DSPDummyLM + + +def test_retry_simple(): + predict = dspy.Predict("question -> answer") + retry_module = dspy.Retry(predict) + + # Test Retry has created the correct new signature + for field in predict.signature.output_fields: + assert f"past_{field}" in retry_module.new_signature.input_fields + assert "feedback" in retry_module.new_signature.input_fields + + lm = DSPDummyLM(["blue"]) + dspy.settings.configure(lm=lm) + result = retry_module.forward( + question="What color is the sky?", + past_outputs={"answer": "red"}, + feedback="Try harder", + ) + assert result.answer == "blue" + + print(lm.get_convo(-1)) + assert lm.get_convo(-1).endswith( + "Question: What color is the sky?\n\n" "Previous Answer: red\n\n" "Instructions: Try harder\n\n" "Answer: blue" + ) + + +def test_retry_forward_with_feedback(): + # First we make a mistake, then we fix it + lm = DSPDummyLM(["red", "blue"]) + dspy.settings.configure(lm=lm, trace=[]) + + class SimpleModule(dspy.Module): + def __init__(self): + super().__init__() + self.predictor = dspy.Predict("question -> answer") + + def forward(self, **kwargs): + result = self.predictor(**kwargs) + print(f"SimpleModule 
got {result.answer=}") + dspy.Suggest(result.answer == "blue", "Please think harder") + return result + + program = SimpleModule() + program = assert_transform_module( + program.map_named_predictors(dspy.Retry), + functools.partial(backtrack_handler, max_backtracks=1), + ) + + result = program(question="What color is the sky?") + + assert result.answer == "blue" + + print(lm.get_convo(-1)) + assert lm.get_convo(-1).endswith( + "Question: What color is the sky?\n\n" + "Previous Answer: red\n\n" + "Instructions: Please think harder\n\n" + "Answer: blue" + ) + + +def test_retry_forward_with_typed_predictor(): + # First we make a mistake, then we fix it + lm = DSPDummyLM(['{"answer":"red"}', '{"answer":"blue"}']) + dspy.settings.configure(lm=lm, trace=[]) + + class AnswerQuestion(dspy.Signature): + """Answer questions with succint responses.""" + + class Input(pydantic.BaseModel): + question: str + + class Output(pydantic.BaseModel): + answer: str + + input: Input = dspy.InputField() + output: Output = dspy.OutputField() + + class QuestionAnswerer(dspy.Module): + def __init__(self): + super().__init__() + self.answer_question = dspy.TypedPredictor(AnswerQuestion) + + def forward(self, **kwargs): + result = self.answer_question(input=AnswerQuestion.Input(**kwargs)).output + dspy.Suggest(result.answer == "blue", "Please think harder") + return result + + program = QuestionAnswerer() + program = assert_transform_module( + program.map_named_predictors(dspy.Retry), + functools.partial(backtrack_handler, max_backtracks=1), + ) + + result = program(question="What color is the sky?") + + assert result.answer == "blue" + assert lm.get_convo(-1).endswith( + 'Input: {"question":"What color is the sky?"}\n\n' + 'Previous Output: {"answer":"red"}\n\n' + "Instructions: Please think harder\n\n" + 'Output: {"answer":"blue"}' + ) diff --git a/tests/dsp_LM/primitives/test_example.py b/tests/dsp_LM/primitives/test_example.py new file mode 100644 index 0000000000..39ad0b749f --- /dev/null +++ b/tests/dsp_LM/primitives/test_example.py @@ -0,0 +1,110 @@ +import pytest + +from dspy import Example + + +def test_example_initialization(): + example = Example(a=1, b=2) + assert example.a == 1 + assert example.b == 2 + + +def test_example_initialization_from_base(): + base = Example(a=1, b=2) + example = Example(base=base, c=3) + assert example.a == 1 + assert example.b == 2 + assert example.c == 3 + + +def test_example_initialization_from_dict(): + base_dict = {"a": 1, "b": 2} + example = Example(base=base_dict, c=3) + assert example.a == 1 + assert example.b == 2 + assert example.c == 3 + + +def test_example_set_get_item(): + example = Example() + example["a"] = 1 + assert example["a"] == 1 + + +def test_example_attribute_access(): + example = Example(a=1) + assert example.a == 1 + example.a = 2 + assert example.a == 2 + + +def test_example_deletion(): + example = Example(a=1, b=2) + del example["a"] + with pytest.raises(AttributeError): + _ = example.a + + +def test_example_len(): + example = Example(a=1, b=2, dspy_hidden=3) + assert len(example) == 2 + + +def test_example_repr_str(): + example = Example(a=1) + assert repr(example) == "Example({'a': 1}) (input_keys=None)" + assert str(example) == "Example({'a': 1}) (input_keys=None)" + + +def test_example_eq(): + example1 = Example(a=1, b=2) + example2 = Example(a=1, b=2) + assert example1 == example2 + assert example1 != "" + + +def test_example_hash(): + example1 = Example(a=1, b=2) + example2 = Example(a=1, b=2) + assert hash(example1) == hash(example2) + + +def 
test_example_keys_values_items(): + example = Example(a=1, b=2, dspy_hidden=3) + assert set(example.keys()) == {"a", "b"} + assert 1 in example.values() + assert ("b", 2) in example.items() + + +def test_example_get(): + example = Example(a=1, b=2) + assert example.get("a") == 1 + assert example.get("c", "default") == "default" + + +def test_example_with_inputs(): + example = Example(a=1, b=2).with_inputs("a") + assert example._input_keys == {"a"} + + +def test_example_inputs_labels(): + example = Example(a=1, b=2).with_inputs("a") + inputs = example.inputs() + assert inputs.toDict() == {"a": 1} + labels = example.labels() + assert labels.toDict() == {"b": 2} + + +def test_example_copy_without(): + example = Example(a=1, b=2) + copied = example.copy(c=3) + assert copied.a == 1 + assert copied.c == 3 + without_a = copied.without("a") + with pytest.raises(AttributeError): + _ = without_a.a + + +def test_example_to_dict(): + example = Example(a=1, b=2) + assert example.toDict() == {"a": 1, "b": 2} diff --git a/tests/dsp_LM/primitives/test_module.py b/tests/dsp_LM/primitives/test_module.py new file mode 100644 index 0000000000..4ba6df220b --- /dev/null +++ b/tests/dsp_LM/primitives/test_module.py @@ -0,0 +1,49 @@ +import threading + +import dspy + + +def test_deepcopy_basic(): + signature = dspy.Signature("q -> a") + cot = dspy.ChainOfThought(signature) + cot_copy = cot.deepcopy() + assert len(cot.parameters()) == len(cot_copy.parameters()) + # Parameters should be different objects with the same values. + assert id(cot.parameters()[0]) != id(cot_copy.parameters()[0]) + assert cot.parameters()[0].__dict__ == cot_copy.parameters()[0].__dict__ + + +def test_deepcopy_with_uncopyable_modules(): + class CustomClass(dspy.Module): + def __init__(self): + self.lock = threading.Lock() # Non-copyable object. + self.cot = dspy.ChainOfThought(dspy.Signature("q -> a")) + + model = CustomClass() + model_copy = model.deepcopy() + assert len(model.parameters()) == len(model_copy.parameters()) + # The lock should be refer to the same object (shallow copy). + assert id(model.lock) == id(model_copy.lock) + # Parameters should be different objects with the same values. + assert id(model.parameters()[0]) != id(model_copy.parameters()[0]) + assert model.parameters()[0].__dict__ == model_copy.parameters()[0].__dict__ + + +def test_deepcopy_with_nested_modules(): + class CustomClass1(dspy.Module): + def __init__(self): + self.lock = threading.Lock() # Non-copyable object. + self.cot = dspy.ChainOfThought(dspy.Signature("q -> a")) + + class CustomClass2(dspy.Module): + def __init__(self): + self.submodel = CustomClass1() + + model = CustomClass2() + model_copy = model.deepcopy() + assert len(model.parameters()) == len(model_copy.parameters()) + # The lock should be refer to the same object (shallow copy). + assert id(model.submodel.lock) == id(model_copy.submodel.lock) + # Parameters should be different objects with the same values. 
+ assert id(model.parameters()[0]) != id(model_copy.parameters()[0]) + assert model.parameters()[0].__dict__ == model_copy.parameters()[0].__dict__ diff --git a/tests/dsp_LM/primitives/test_program.py b/tests/dsp_LM/primitives/test_program.py new file mode 100644 index 0000000000..dc817882a1 --- /dev/null +++ b/tests/dsp_LM/primitives/test_program.py @@ -0,0 +1,21 @@ +import dspy +from dspy.primitives.program import Module, set_attribute_by_name # Adjust the import based on your file structure +from dspy.utils import DSPDummyLM + + +class HopModule(dspy.Module): + def __init__(self): + super().__init__() + self.predict1 = dspy.Predict("question -> query") + self.predict2 = dspy.Predict("query -> answer") + + def forward(self, question): + query = self.predict1(question=question).query + return self.predict2(query=query) + + +def test_forward(): + program = HopModule() + dspy.settings.configure(lm=DSPDummyLM({"What is 1+1?": "let me check", "let me check": "2"})) + result = program(question="What is 1+1?").answer + assert result == "2" diff --git a/tests/dsp_LM/primitives/test_python_interpreter.py b/tests/dsp_LM/primitives/test_python_interpreter.py new file mode 100644 index 0000000000..a1c0713887 --- /dev/null +++ b/tests/dsp_LM/primitives/test_python_interpreter.py @@ -0,0 +1,53 @@ +import pytest + +from dspy.primitives.python_interpreter import CodePrompt, PythonInterpreter, TextPrompt + + +def test_execute_simple_code(): + interpreter = PythonInterpreter(action_space={"print": print}) + code = "print('Hello, World!')" + result = interpreter.execute(code) + assert result is None, "Simple print statement should return None" + + +def test_action_space_limitation(): + def func(string): + pass + + interpreter = PythonInterpreter(action_space={}) + code = "func('This should not execute')" + with pytest.raises(Exception): + interpreter.execute(code) + + +def test_import_whitelist(): + interpreter = PythonInterpreter(action_space={}, import_white_list=["math"]) + code = "import math\nresult = math.sqrt(4)" + result = interpreter.execute(code) + assert result == 2, "Should be able to import and use math.sqrt" + + +def test_fuzzy_variable_matching(): + interpreter = PythonInterpreter(action_space={}) + code = "result = number + 1" + result = interpreter.execute(code, fuzz_state={"number": 4}) + assert result == 5, "Fuzzy variable matching should work" + + +def test_text_prompt_keyword_extraction(): + prompt = TextPrompt("Hello {name}, how are you?") + assert "name" in prompt.key_words, "Keyword 'name' should be extracted" + + +def test_text_prompt_formatting(): + prompt = TextPrompt("Hello {name}, how are you?") + formatted = prompt.format(name="Alice") + assert formatted == "Hello Alice, how are you?", "Should format with provided value" + + +def test_code_prompt_execution(): + action_space = {"len": len} + interpreter = PythonInterpreter(action_space=action_space) + code_prompt = CodePrompt("result = len('hello')") + result, _ = code_prompt.execute(interpreter) + assert result == 5, "Code execution should return the length of 'hello'" diff --git a/tests/dsp_LM/retrieve/integration_test_pgvectorrm.py b/tests/dsp_LM/retrieve/integration_test_pgvectorrm.py new file mode 100644 index 0000000000..524209bea5 --- /dev/null +++ b/tests/dsp_LM/retrieve/integration_test_pgvectorrm.py @@ -0,0 +1,94 @@ +"""Instructions: +Add to dev container features: + "ghcr.io/itsmechlark/features/postgresql:1": {}, + "ghcr.io/robbert229/devcontainer-features/postgresql-client:1": {} +Add to .personalization.sh: + sudo 
apt install -y postgresql-16-pgvector + + sudo /etc/init.d/postgresql restart + + psql -v ON_ERROR_STOP=1 --user ${PGUSER} < dict: + """Builds the necessary fixtures to test LI""" + pytest.importorskip("llamaindex") + dataset = HotPotQA(train_seed=1, train_size=8, eval_seed=2023, dev_size=4, test_size=0) + trainset = [x.with_inputs("question") for x in dataset.train] + devset = [x.with_inputs("question") for x in dataset.dev] + ragset = [f"Question: {x.question} Answer: {x.answer}" for x in dataset.train] + dummyset = {x.question: x.answer for x in dataset.train} + + Settings.embed_model = MockEmbedding(8) + docs = StringIterableReader().load_data(texts=ragset) + index = VectorStoreIndex.from_documents(documents=docs) + retriever = index.as_retriever() + rm = LlamaIndexRM(retriever) + + return { + "index": index, + "retriever": retriever, + "rm": rm, + "lm": DspDummyLM(answers=dummyset), + "trainset": trainset, + "devset": devset, + } + + +def test_lirm_as_rm(rag_setup): + """Test the retriever as retriever method""" + pytest.importorskip("llamaindex") + retriever = rag_setup.get("retriever") + test_res_li = retriever.retrieve("At My Window was released by which American singer-songwriter?") + rm = rag_setup.get("rm") + test_res_dspy = rm.forward("At My Window was released by which American singer-songwriter?") + + assert isinstance(retriever, BaseRetriever), "Ensuring that the retriever is a LI Retriever object" + assert isinstance(test_res_li, list), "Ensuring results are a list from LI Retriever" + + assert isinstance(rm, dspy.Retrieve), "Ensuring the RM is a retriever object from dspy" + assert isinstance(test_res_dspy, list), "Ensuring the results are a list from the DSPy retriever" + + assert len(test_res_li) == len(test_res_dspy), "Rough equality check of the results" diff --git a/tests/dsp_LM/signatures/test_signature.py b/tests/dsp_LM/signatures/test_signature.py new file mode 100644 index 0000000000..fffa58ab2c --- /dev/null +++ b/tests/dsp_LM/signatures/test_signature.py @@ -0,0 +1,41 @@ +import textwrap +from typing import List + +import pydantic +import pytest + +import dspy +from dspy import InputField, OutputField, Signature, infer_prefix +from dspy.utils.dummies import DSPDummyLM + + +def test_multiline_instructions(): + class MySignature(Signature): + """First line + Second line + Third line""" + + output = OutputField() + + predictor = dspy.Predict(MySignature) + + lm = DSPDummyLM(["short answer"]) + dspy.settings.configure(lm=lm) + assert predictor().output == "short answer" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + First line + Second line + Third line + + --- + + Follow the following format. 
+ + Output: ${output} + + --- + + Output: short answer""" + ) diff --git a/tests/dsp_LM/teleprompt/test_bootstrap.py b/tests/dsp_LM/teleprompt/test_bootstrap.py new file mode 100644 index 0000000000..6bc41ec610 --- /dev/null +++ b/tests/dsp_LM/teleprompt/test_bootstrap.py @@ -0,0 +1,156 @@ +import textwrap + +import pytest + +import dspy +from dspy import Example +from dspy.predict import Predict +from dspy.teleprompt import BootstrapFewShot +from dspy.utils.dummies import DSPDummyLM + + +# Define a simple metric function for testing +def simple_metric(example, prediction, trace=None): + # Simplified metric for testing: true if prediction matches expected output + return example.output == prediction.output + + +examples = [ + Example(input="What is the color of the sky?", output="blue").with_inputs("input"), + Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!"), +] +trainset = [examples[0]] +valset = [examples[1]] + + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + self.predictor = Predict(signature) + + def forward(self, **kwargs): + return self.predictor(**kwargs) + + +def test_compile_with_predict_instances(): + # Create Predict instances for student and teacher + # Note that dspy.Predict is not itself a module, so we can't use it directly here + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") + + lm = DSPDummyLM(["Initial thoughts", "Finish[blue]"]) + dspy.settings.configure(lm=lm) + + # Initialize BootstrapFewShot and compile the student + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) + + assert compiled_student is not None, "Failed to compile student" + assert hasattr(compiled_student, "_compiled") and compiled_student._compiled, "Student compilation flag not set" + + +def test_bootstrap_effectiveness(): + # This test verifies if the bootstrapping process improves the student's predictions + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") + lm = DSPDummyLM(["blue", "Ring-ding-ding-ding-dingeringeding!"], follow_examples=True) + dspy.settings.configure(lm=lm, trace=[]) + + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) + + # Check that the compiled student has the correct demos + assert len(compiled_student.predictor.demos) == 1 + assert compiled_student.predictor.demos[0].input == trainset[0].input + assert compiled_student.predictor.demos[0].output == trainset[0].output + + # Test the compiled student's prediction. + # We are using a DspDummyLM with follow_examples=True, which means that + # even though it would normally reply with "Ring-ding-ding-ding-dingeringeding!" + # on the second output, if it seems an example that perfectly matches the + # prompt, it will use that instead. That is why we expect "blue" here. + prediction = compiled_student(input=trainset[0].input) + assert prediction.output == trainset[0].output + + # For debugging + print("Convo") + print(lm.get_convo(-1)) + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Follow the following format. + + Input: ${input} + Output: ${output} + + --- + + Input: What is the color of the sky? 
+ Output: blue + + --- + + Input: What is the color of the sky? + Output: blue""" + ) + + +def test_error_handling_during_bootstrap(): + """ + Test to verify error handling during the bootstrapping process + """ + + class BuggyModule(dspy.Module): + def __init__(self, signature): + super().__init__() + self.predictor = Predict(signature) + + def forward(self, **kwargs): + raise RuntimeError("Simulated error") + + student = SimpleModule("input -> output") + teacher = BuggyModule("input -> output") + + # Setup DspDummyLM to simulate an error scenario + lm = DSPDummyLM( + [ + "Initial thoughts", # Simulate initial teacher's prediction + ] + ) + dspy.settings.configure(lm=lm) + + bootstrap = BootstrapFewShot( + metric=simple_metric, + max_bootstrapped_demos=1, + max_labeled_demos=1, + max_errors=1, + ) + + with pytest.raises(RuntimeError, match="Simulated error"): + bootstrap.compile(student, teacher=teacher, trainset=trainset) + + +def test_validation_set_usage(): + """ + Test to ensure the validation set is correctly used during bootstrapping + """ + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") + + lm = DSPDummyLM( + [ + "Initial thoughts", + "Finish[blue]", # Expected output for both training and validation + ] + ) + dspy.settings.configure(lm=lm) + + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) + + # Check that validation examples are part of student's demos after compilation + assert len(compiled_student.predictor.demos) >= len(valset), "Validation set not used in compiled student demos" diff --git a/tests/dsp_LM/teleprompt/test_copro_optimizer.py b/tests/dsp_LM/teleprompt/test_copro_optimizer.py new file mode 100644 index 0000000000..50011eecc4 --- /dev/null +++ b/tests/dsp_LM/teleprompt/test_copro_optimizer.py @@ -0,0 +1,149 @@ +import textwrap + +import pytest + +import dspy +from dspy import Example +from dspy.teleprompt.signature_opt import COPRO +from dspy.utils.dummies import DSPDummyLM + + +# Define a simple metric function for testing +def simple_metric(example, prediction): + # Simplified metric for testing: true if prediction matches expected output + return example.output == prediction.output + + +# Example training and validation sets +trainset = [ + Example(input="Question: What is the color of the sky?", output="blue").with_inputs("input"), + Example(input="Question: What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs( + "input" + ), +] + + +def test_signature_optimizer_initialization(): + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + assert optimizer.metric == simple_metric, "Metric not correctly initialized" + assert optimizer.breadth == 2, "Breadth not correctly initialized" + assert optimizer.depth == 1, "Depth not correctly initialized" + assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" + + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + # COPRO doesn't work with dspy.Predict + self.predictor = dspy.ChainOfThought(signature) + + def forward(self, **kwargs): + return self.predictor(**kwargs) + + +def test_signature_optimizer_optimization_process(): + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + dspy.settings.configure(lm=DSPDummyLM(["Optimized instruction 1", "Optimized instruction 2"])) + + student = 
SimpleModule("input -> output") + + # Assuming the compile method of COPRO requires a student module, a development set, and evaluation kwargs + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + + # Check that the optimized student has been modified from the original + # This check can be more specific based on how the optimization modifies the student + assert optimized_student is not student, "Optimization did not modify the student" + + # Further tests can be added to verify the specifics of the optimization process, + # such as checking the instructions of the optimized student's predictors. + + +def test_signature_optimizer_statistics_tracking(): + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + optimizer.track_stats = True # Enable statistics tracking + + dspy.settings.configure(lm=DSPDummyLM(["Optimized instruction"])) + student = SimpleModule("input -> output") + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + + # Verify that statistics have been tracked and attached to the optimized student + assert hasattr(optimized_student, "total_calls"), "Total calls statistic not tracked" + assert hasattr(optimized_student, "results_best"), "Best results statistics not tracked" + + +# Assuming the setup_signature_optimizer fixture and simple_metric function are defined as before + + +def test_optimization_and_output_verification(): + lm = DSPDummyLM( + [ + "Optimized Prompt", + "Optimized Prefix", + ] + ) + dspy.settings.configure(lm=lm) + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + + student = SimpleModule("input -> output") + + # Compile the student with the optimizer + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + + # Simulate calling the optimized student with a new input + test_input = "What is the capital of France?" + prediction = optimized_student(input=test_input) + + print(lm.get_convo(-1)) + + assert prediction.output == "No more responses" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Optimized Prompt + + --- + + Follow the following format. + + Input: ${input} + Reasoning: Let's think step by step in order to ${produce the output}. We ... + Optimized Prefix ${output} + + --- + + Input: What is the capital of France? 
+ Reasoning: Let's think step by step in order to No more responses + Optimized Prefix No more responses""" + ) + + +def test_statistics_tracking_during_optimization(): + dspy.settings.configure(lm=DSPDummyLM(["Optimized instruction for stats tracking"])) + + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + optimizer.track_stats = True # Enable statistics tracking + + student = SimpleModule("input -> output") + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + + # Verify that statistics have been tracked + assert hasattr(optimized_student, "total_calls"), "Optimizer did not track total metric calls" + assert optimized_student.total_calls > 0, "Optimizer reported no metric calls" + + # Check if the results_best and results_latest contain valid statistics + assert "results_best" in optimized_student.__dict__, "Optimizer did not track the best results" + assert "results_latest" in optimized_student.__dict__, "Optimizer did not track the latest results" + assert len(optimized_student.results_best) > 0, "Optimizer did not properly populate the best results statistics" + assert ( + len(optimized_student.results_latest) > 0 + ), "Optimizer did not properly populate the latest results statistics" + + # Additional detailed checks can be added here to verify the contents of the tracked statistics diff --git a/tests/dsp_LM/teleprompt/test_ensemble.py b/tests/dsp_LM/teleprompt/test_ensemble.py new file mode 100644 index 0000000000..c779e87823 --- /dev/null +++ b/tests/dsp_LM/teleprompt/test_ensemble.py @@ -0,0 +1,59 @@ +import pytest + +import dspy +from dspy.teleprompt import Ensemble + + +class MockProgram(dspy.Module): + def __init__(self, output): + super().__init__() + self.output = output + + def forward(self, *args, **kwargs): + return self.output + + +# Simple reduction function to test with +def mock_reduce_fn(outputs): + return sum(outputs) / len(outputs) + + +def test_ensemble_without_reduction(): + """Test that Ensemble correctly combines outputs without applying a reduce_fn.""" + programs = [MockProgram(i) for i in range(5)] + ensemble = Ensemble() + ensembled_program = ensemble.compile(programs) + + outputs = ensembled_program() + assert len(outputs) == 5, "Ensemble did not combine the correct number of outputs" + + +def test_ensemble_with_reduction(): + """Test that Ensemble correctly applies a reduce_fn to combine outputs.""" + programs = [MockProgram(i) for i in range(5)] + ensemble = Ensemble(reduce_fn=mock_reduce_fn) + ensembled_program = ensemble.compile(programs) + + output = ensembled_program() + expected_output = sum(range(5)) / 5 + assert output == expected_output, "Ensemble did not correctly apply the reduce_fn" + + +def test_ensemble_with_size_limitation(): + """Test that specifying a size limits the number of programs used in the ensemble.""" + programs = [MockProgram(i) for i in range(10)] + ensemble_size = 3 + ensemble = Ensemble(size=ensemble_size) + ensembled_program = ensemble.compile(programs) + + outputs = ensembled_program() + assert len(outputs) == ensemble_size, "Ensemble did not respect the specified size limitation" + + +def test_ensemble_deterministic_behavior(): + """Verify that the Ensemble class raises an assertion for deterministic behavior.""" + with pytest.raises( + AssertionError, + match="TODO: Implement example hashing for deterministic ensemble.", + ): + Ensemble(deterministic=True) diff --git 
a/tests/dsp_LM/teleprompt/test_knn_fewshot.py b/tests/dsp_LM/teleprompt/test_knn_fewshot.py
new file mode 100644
index 0000000000..771e01d557
--- /dev/null
+++ b/tests/dsp_LM/teleprompt/test_knn_fewshot.py
@@ -0,0 +1,65 @@
+import pytest
+
+import dsp
+import dspy
+from dspy.teleprompt.knn_fewshot import KNNFewShot
+from dspy.utils.dummies import DSPDummyLM, DummyVectorizer
+
+
+def mock_example(question: str, answer: str) -> dspy.Example:
+    """Creates a mock dspy example with the specified question and answer."""
+    return dspy.Example(question=question, answer=answer).with_inputs("question")
+
+
+@pytest.fixture
+def setup_knn_few_shot():
+    """Sets up a KNNFewShot instance for testing."""
+    trainset = [
+        mock_example("What is the capital of France?", "Paris"),
+        mock_example("What is the largest ocean?", "Pacific"),
+        mock_example("What is 2+2?", "4"),
+    ]
+    dsp.SentenceTransformersVectorizer = DummyVectorizer
+    knn_few_shot = KNNFewShot(k=2, trainset=trainset)
+    return knn_few_shot
+
+
+class SimpleModule(dspy.Module):
+    def __init__(self, signature):
+        super().__init__()
+        self.predictor = dspy.Predict(signature)
+
+    def forward(self, *args, **kwargs):
+        return self.predictor(**kwargs)
+
+    def reset_copy(self):
+        # Creates a new instance of SimpleModule with the same predictor
+        return SimpleModule(self.predictor.signature)
+
+
+# TODO: Test not working yet
+def _test_knn_few_shot_compile(setup_knn_few_shot):
+    """Tests the compile method of KNNFewShot with SimpleModule as student."""
+    student = SimpleModule("input -> output")
+    teacher = SimpleModule("input -> output")  # Assuming teacher uses the same module type
+
+    # Set up DSPDummyLM with responses for queries similar to the training examples
+    lm = DSPDummyLM(["Madrid", "10"])
+    dspy.settings.configure(lm=lm)  # Responses for the capital of Spain and the result of 5+5
+
+    knn_few_shot = setup_knn_few_shot
+    trainset = knn_few_shot.KNN.trainset
+    compiled_student = knn_few_shot.compile(student, teacher=teacher, trainset=trainset, valset=None)
+
+    assert len(compiled_student.predictor.demos) == 1
+    assert compiled_student.predictor.demos[0].input == trainset[0].input
+    assert compiled_student.predictor.demos[0].output == trainset[0].output
+    # Simulate a query that is similar to one of the training examples
+    output = compiled_student.forward(input="What is the capital of Spain?").output
+
+    print("CONVO")
+    print(lm.get_convo(-1))
+
+    # Validate that the output corresponds to one of the expected DSPDummyLM responses
+    # This assumes the compiled_student's forward method will execute the predictor with the given query
+    assert output in ["Madrid", "10"], "The compiled student did not return the correct output based on the query"
diff --git a/tests/dsp_LM/teleprompt/test_mipro_optimizer.py b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py
new file mode 100644
index 0000000000..99f4ede211
--- /dev/null
+++ b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py
@@ -0,0 +1,254 @@
+import re
+import textwrap
+
+import pytest
+
+import dspy
+from dsp.modules import LM
+from dspy import Example
+from dspy.teleprompt.signature_opt_bayesian import MIPRO
+from dspy.utils.dummies import DSPDummyLM
+
+
+# Define a simple metric function for testing
+def simple_metric(example, prediction, trace=None):
+    # Simplified metric for testing: true if prediction matches expected output
+    return example.output == prediction.output
+
+
+# Some example data
+capitals = {
+    "Germany": "Berlin",
+    "France": "Paris",
+    "Denmark": "Copenhagen",
+    "Sweden": "Stockholm",
+    "Norway": "Oslo",
+}
+# Not used for training data
+extra_capitals = {
+    "Spain": "Madrid",
+    "Portugal": "Lisbon",
+    "Italy": "Rome",
+}
+
+# Example training and validation sets
+trainset = [
+    Example(input="What is the color of the sky?", output="blue").with_inputs("input"),
+    Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"),
+] + [
+    Example(input=f"What is the capital of {country}?", output=capital).with_inputs("input")
+    for country, capital in capitals.items()
+]
+
+
+class ConditionalLM(LM):
+    def __init__(self):
+        super().__init__("conditional-lm")
+
+    def basic_request(self, prompt, num_candidates=1, **kwargs):
+        # If we are in the "optimization" stage, we don't say much.
+        if prompt.endswith("Observations:"):
+            answer = " (*silence*)"
+        elif prompt.endswith("Proposed Instruction:"):
+            answer = " Input: "
+        elif prompt.endswith("Proposed Prefix For Output Field:"):
+            answer = " Output: "
+        elif prompt.endswith("Summary:"):
+            answer = " summarizing..."
+        else:
+            pairs = re.findall(r"Input: (.*?)\n(?:Reasoning:.*?\n)?Output: (.*?)\n", prompt, re.DOTALL)
+
+            # breakpoint()
+            print("PROMPT:", prompt)
+            print("PAIRS:", pairs)
+
+            last = re.search(r"Input: (.*)\nReasoning: (.*)$", prompt)
+            current_question = last.group(1)
+
+            if match := re.match(r"What is the capital of (.*?)\?", current_question):
+                country = match.group(1)
+                # If we had a previous example of a question about a capital, the model
+                # has learned the format and will answer the question correctly.
+                if any("capital" in question for question, _ in pairs):
+                    answer = (capitals | extra_capitals)[country]
+                # Otherwise, it is confused and will answer with the country's name.
+                else:
+                    answer = country
+            # For other questions, the model will answer with the last word of the question.
+            else:
+                answer = current_question.split()[-1]
+
+            answer = "think deeply.\nOutput: " + answer
+
+        RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m"
+        print("=== DspDummyLM ===")
+        print(prompt, end="")
+        print(f"{RED}{answer}{RESET}")
+        print("===")
+
+        dummy_response = {"choices": []}
+        for _ in range(num_candidates):
+            dummy_response["choices"].append(
+                {
+                    "text": answer,
+                    "finish_reason": "done",
+                }
+            )
+
+        # Simulate processing and storing the request and response.
+        history_entry = {
+            "prompt": prompt,
+            "response": dummy_response,
+            "kwargs": kwargs,
+            "raw_kwargs": kwargs,
+        }
+        self.history.append(history_entry)
+
+        return dummy_response
+
+    def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs):
+        response = self.basic_request(prompt, **kwargs)
+        return [choice["text"] for choice in response["choices"]]
+
+    def get_convo(self, index):
+        """Get the prompt + answer from the ith message."""
+        return self.history[index]["prompt"] + " " + self.history[index]["response"]["choices"][0]["text"]
+
+
+class SimpleModule(dspy.Module):
+    def __init__(self, signature):
+        super().__init__()
+        # SignatureOptimizer doesn't work with dspy.Predict
+        self.predictor = dspy.ChainOfThought(signature)
+
+    def forward(self, **kwargs):
+        return self.predictor(**kwargs)
+
+
+def test_signature_optimizer_optimization_process():
+    lm = ConditionalLM()
+    dspy.settings.configure(lm=lm)
+
+    student = SimpleModule(signature="input -> output")
+
+    optimizer = MIPRO(
+        metric=simple_metric,
+        num_candidates=10,
+        init_temperature=1.4,
+        verbose=False,
+        track_stats=False,
+    )
+
+    # Adjustments: Include required parameters for the compile method
+    optimized_student = optimizer.compile(
+        student=student,
+        trainset=trainset,
+        num_trials=10,
+        max_bootstrapped_demos=3,
+        max_labeled_demos=5,
+        eval_kwargs={"num_threads": 1, "display_progress": False},
+        requires_permission_to_run=False,
+    )
+
+    assert len(optimized_student.predictor.demos) == 5
+
+
+def test_signature_optimizer_bad_lm():
+    dspy.settings.configure(lm=DSPDummyLM([f"Optimized instruction {i}" for i in range(30)]))
+    student = SimpleModule(signature="input -> output")
+    optimizer = MIPRO(
+        metric=simple_metric,
+        num_candidates=10,
+        init_temperature=1.4,
+        verbose=False,
+        track_stats=False,
+    )
+
+    # Krista: when the code tries to generate bootstrapped examples, the examples are generated using DSPDummyLM,
+    # which only outputs "Optimized instruction i". This means that none of the bootstrapped examples are successful,
+    # and therefore the set of examples that we're using to generate new prompts is empty.
+    with pytest.raises(ValueError):
+        _optimized_student = optimizer.compile(
+            student=student,
+            trainset=trainset,
+            num_trials=10,
+            max_bootstrapped_demos=3,
+            max_labeled_demos=5,
+            eval_kwargs={"num_threads": 1, "display_progress": False},
+            requires_permission_to_run=False,
+        )
+
+
+def test_optimization_and_output_verification():
+    # Make a language model that is always right, except on the last
+    # example in the train set.
+    lm = ConditionalLM()
+    dspy.settings.configure(lm=lm)
+
+    optimizer = MIPRO(
+        metric=simple_metric,
+        num_candidates=10,
+        init_temperature=1.4,
+        verbose=False,
+        track_stats=True,
+    )
+
+    student = SimpleModule("input -> output")
+
+    # Compile the student with the optimizer
+    optimized_student = optimizer.compile(
+        student=student,
+        trainset=trainset,
+        num_trials=4,
+        max_bootstrapped_demos=2,
+        max_labeled_demos=3,
+        eval_kwargs={"num_threads": 1, "display_progress": False},
+        requires_permission_to_run=False,
+    )
+
+    # Simulate calling the optimized student with a new input
+    test_input = "What is the capital of Spain?"
+    prediction = optimized_student(input=test_input)
+
+    print("CORRECT ANSWER")
+    print(lm.get_convo(-1))
+
+    assert prediction.output == "Madrid"
+
+    expected_lm_output = textwrap.dedent(
+        """\
+        Input:
+
+        ---
+
+        Follow the following format.
+
+        Input: ${input}
+        Reasoning: Let's think step by step in order to ${produce the output}. 
We ... + Output: ${output} + + --- + + Input: What is the capital of France? + Reasoning: Let's think step by step in order to think deeply. + Output: Paris + + --- + + Input: What is the capital of Norway? + Reasoning: Let's think step by step in order to think deeply. + Output: Oslo + + --- + + Input: What does the fox say? + Output: Ring-ding-ding-ding-dingeringeding! + + --- + + Input: What is the capital of Spain? + Reasoning: Let's think step by step in order to think deeply. + Output: Madrid""" + ) + + assert lm.get_convo(-1) == expected_lm_output diff --git a/tests/dsp_LM/teleprompt/test_random_search.py b/tests/dsp_LM/teleprompt/test_random_search.py new file mode 100644 index 0000000000..9d8e63d236 --- /dev/null +++ b/tests/dsp_LM/teleprompt/test_random_search.py @@ -0,0 +1,39 @@ +import dspy +from dspy import Example +from dspy.predict import Predict +from dspy.teleprompt import BootstrapFewShotWithRandomSearch +from dspy.utils.dummies import DSPDummyLM + + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + self.predictor = Predict(signature) + + def forward(self, **kwargs): + return self.predictor(**kwargs) + + +def simple_metric(example, prediction, trace=None): + return example.output == prediction.output + + +def test_basic_workflow(): + """Test to ensure the basic compile flow runs without errors.""" + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") + + lm = DSPDummyLM( + [ + "Initial thoughts", + "Finish[blue]", # Expected output for both training and validation + ] + ) + dspy.settings.configure(lm=lm) + + optimizer = BootstrapFewShotWithRandomSearch(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + trainset = [ + Example(input="What is the color of the sky?", output="blue").with_inputs("input"), + Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), + ] + optimizer.compile(student, teacher=teacher, trainset=trainset) From 9753f3ed44ddf14297562964ba87fb0df016a818 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sat, 5 Oct 2024 12:41:37 +0000 Subject: [PATCH 03/17] feat(dspy): add unqiue base names for all tests by adding __init__.py files --- dsp/modules/dummy_lm.py | 4 ++-- tests/dsp_LM/evaluate/__init__.py | 0 tests/dsp_LM/examples/__init__.py | 0 tests/dsp_LM/functional/__init__.py | 0 tests/dsp_LM/modules/__init__.py | 0 tests/dsp_LM/predict/__init__.py | 0 tests/dsp_LM/primitives/__init__.py | 0 tests/dsp_LM/retrieve/__init__.py | 0 tests/dsp_LM/signatures/__init__.py | 0 tests/dsp_LM/teleprompt/__init__.py | 0 10 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 tests/dsp_LM/evaluate/__init__.py create mode 100644 tests/dsp_LM/examples/__init__.py create mode 100644 tests/dsp_LM/functional/__init__.py create mode 100644 tests/dsp_LM/modules/__init__.py create mode 100644 tests/dsp_LM/predict/__init__.py create mode 100644 tests/dsp_LM/primitives/__init__.py create mode 100644 tests/dsp_LM/retrieve/__init__.py create mode 100644 tests/dsp_LM/signatures/__init__.py create mode 100644 tests/dsp_LM/teleprompt/__init__.py diff --git a/dsp/modules/dummy_lm.py b/dsp/modules/dummy_lm.py index 1bd6a04a12..0509004c72 100644 --- a/dsp/modules/dummy_lm.py +++ b/dsp/modules/dummy_lm.py @@ -5,7 +5,7 @@ # This testing module was moved in PR #735 to patch Arize Phoenix logging -class DummyLM(LM): +class DspDummyLM(LM): """Dummy language model for unit testing purposes.""" def __init__(self, answers: 
Union[list[str], dict[str, str]], follow_examples: bool = False): @@ -61,7 +61,7 @@ def basic_request(self, prompt, n=1, **kwargs) -> dict[str, list[dict[str, str]] }, ) - RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" + RED, _, RESET = "\033[91m", "\033[92m", "\033[0m" print("=== DummyLM ===") print(prompt, end="") print(f"{RED}{answer}{RESET}") diff --git a/tests/dsp_LM/evaluate/__init__.py b/tests/dsp_LM/evaluate/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dsp_LM/examples/__init__.py b/tests/dsp_LM/examples/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dsp_LM/functional/__init__.py b/tests/dsp_LM/functional/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dsp_LM/modules/__init__.py b/tests/dsp_LM/modules/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dsp_LM/predict/__init__.py b/tests/dsp_LM/predict/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dsp_LM/primitives/__init__.py b/tests/dsp_LM/primitives/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dsp_LM/retrieve/__init__.py b/tests/dsp_LM/retrieve/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dsp_LM/signatures/__init__.py b/tests/dsp_LM/signatures/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/dsp_LM/teleprompt/__init__.py b/tests/dsp_LM/teleprompt/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 11006587a48bf7854b31c6e5158a93febd2181c2 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sat, 5 Oct 2024 12:44:40 +0000 Subject: [PATCH 04/17] feat(dspy): follow same pattern as other uses of DSPDummyLM --- tests/dsp_LM/predict/test_react.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/dsp_LM/predict/test_react.py b/tests/dsp_LM/predict/test_react.py index e3f11d438c..37979ddbc0 100644 --- a/tests/dsp_LM/predict/test_react.py +++ b/tests/dsp_LM/predict/test_react.py @@ -1,12 +1,12 @@ from dataclasses import dataclass import dspy -from dspy.utils.dummies import dummy_rm +from dspy.utils.dummies import DSPDummyLM, dummy_rm def test_example_no_tools(): - # Createa a simple dataset which the model will use with the Retrieve tool. - lm = dspy.utils.DspDummyLM( + # Create a simple dataset which the model will use with the Retrieve tool. + lm = DSPDummyLM( [ "Initial thoughts", # Thought_1 "Finish[blue]", # Action_1 @@ -38,7 +38,7 @@ def test_example_no_tools(): def test_example_search(): # Createa a simple dataset which the model will use with the Retrieve tool. 
- lm = dspy.utils.DspDummyLM( + lm = DSPDummyLM( [ "Initial thoughts", # Thought_1 "Search[the color of the sky]", # Thought_1 @@ -118,7 +118,7 @@ def __call__(self, *args, **kwargs): def test_custom_tools(): - lm = dspy.utils.DspDummyLM( + lm = DSPDummyLM( [ "Initial thoughts", "Tool1[foo]", From 7de20e6e9713a6b6752567bef0fbeea178e1b875 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sat, 5 Oct 2024 16:07:00 +0000 Subject: [PATCH 05/17] feat(dspy): update dummyLM completions for ChatAdapter instructions --- dspy/utils/dummies.py | 59 ++- .../dsp_LM/teleprompt/test_mipro_optimizer.py | 57 +-- tests/evaluate/test_evaluate.py | 25 +- tests/functional/test_functional.py | 462 +++++++++++------- tests/functional/test_signature_opt_typed.py | 24 +- tests/predict/test_chain_of_thought.py | 40 +- .../test_chain_of_thought_with_hint.py | 59 ++- tests/predict/test_predict.py | 73 ++- tests/predict/test_program_of_thought.py | 181 ++++--- tests/primitives/test_program.py | 23 +- tests/retrieve/test_llama_index_rm.py | 2 +- tests/signatures/test_signature.py | 43 +- tests/teleprompt/test_bootstrap.py | 143 +++--- tests/teleprompt/test_copro_optimizer.py | 161 ++++-- tests/teleprompt/test_mipro_optimizer.py | 93 ++-- tests/teleprompt/test_random_search.py | 8 +- 16 files changed, 870 insertions(+), 583 deletions(-) diff --git a/dspy/utils/dummies.py b/dspy/utils/dummies.py index ecac8a54d3..3288facf5f 100644 --- a/dspy/utils/dummies.py +++ b/dspy/utils/dummies.py @@ -1,11 +1,13 @@ import random import re +from collections import defaultdict from typing import Union import numpy as np from dsp.modules import LM as DSPLM from dsp.utils.utils import dotdict +from dspy.adapters.chat_adapter import field_header_pattern from dspy.clients.lm import LM @@ -98,19 +100,56 @@ def get_convo(self, index) -> str: class DummyLM(LM): def __init__(self, answers: Union[list[str], dict[str, str]], follow_examples: bool = False): super().__init__("dummy", "chat", 0.0, 1000, True) - self.answers = iter([[ans] for ans in answers]) + self.answers = answers + if isinstance(answers, list): + self.answers = iter(answers) + self.follow_examples = follow_examples - def __call__(self, **kwargs): - fallback = "No more responses" - if isinstance(self.answers, dict): - answer = next((v for k, v in self.answers.items() if k in kwargs["prompt"]), fallback) - else: - answer = next(self.answers, fallback) - return answer + def _use_example(self, messages): + # find all field names + fields = defaultdict(int) + for message in messages: + if "content" in message: + if ma := field_header_pattern.match(message["content"]): + fields[message["content"][ma.start() : ma.end()]] += 1 + # find the fields which are missing from the final turns + max_count = max(fields.values()) + output_fields = [field for field, count in fields.items() if count != max_count] + + # get the output from the last turn that has the output fields as headers + final_input = messages[-1]["content"].split("\n\n")[0] + for input, output in zip(reversed(messages[:-1]), reversed(messages)): + if any(field in output["content"] for field in output_fields) and final_input in input["content"]: + return output["content"] + + def __call__(self, prompt=None, messages=None, **kwargs): + # Build the request. 
+ outputs = [] + for _ in range(kwargs.get("n", 1)): + messages = messages or [{"role": "user", "content": prompt}] + kwargs = {**self.kwargs, **kwargs} - def get_convo(self, index) -> str: + if self.follow_examples: + outputs.append(self._use_example(messages)) + elif isinstance(self.answers, dict): + outputs.append( + next((v for k, v in self.answers.items() if k in messages[-1]["content"]), "No more responses") + ) + else: + outputs.append(next(self.answers, "No more responses")) + + # Logging, with removed api key & where `cost` is None on cache hit. + kwargs = {k: v for k, v in kwargs.items() if not k.startswith("api_")} + entry = dict(prompt=prompt, messages=messages, kwargs=kwargs) + entry = dict(**entry, outputs=outputs, usage=0) + entry = dict(**entry, cost=0) + self.history.append(entry) + + return outputs + + def get_convo(self, index): """Get the prompt + anwer from the ith message.""" - return self.history[index]["prompt"] + " " + self.history[index]["response"]["choices"][0]["text"] + return self.history[index]["messages"], self.history[index]["outputs"] def dummy_rm(passages=()) -> callable: diff --git a/tests/dsp_LM/teleprompt/test_mipro_optimizer.py b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py index 99f4ede211..7536470b98 100644 --- a/tests/dsp_LM/teleprompt/test_mipro_optimizer.py +++ b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py @@ -215,40 +215,23 @@ def test_optimization_and_output_verification(): assert prediction.output == "Madrid" - expected_lm_output = textwrap.dedent( - """\ - Input: - - --- - - Follow the following format. - - Input: ${input} - Reasoning: Let's think step by step in order to ${produce the output}. We ... - Output: ${output} - - --- - - Input: What is the capital of France? - Reasoning: Let's think step by step in order to think deeply. - Output: Paris - - --- - - Input: What is the capital of Norway? - Reasoning: Let's think step by step in order to think deeply. - Output: Oslo - - --- - - Input: What does the fox say? - Output: Ring-ding-ding-ding-dingeringeding! - - --- - - Input: What is the capital of Spain? - Reasoning: Let's think step by step in order to think deeply. 
- Output: Madrid""" - ) - - assert lm.get_convo(-1) == expected_lm_output + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py index c8034f53a0..01c3e5739b 100644 --- a/tests/evaluate/test_evaluate.py +++ b/tests/evaluate/test_evaluate.py @@ -34,7 +34,14 @@ def test_evaluate_initialization(): def test_evaluate_call(): - dspy.settings.configure(lm=DummyLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + dspy.settings.configure( + lm=DummyLM( + { + "What is 1+1?": "[[ ## answer ## ]]\n2", + "What is 2+2?": "[[ ## answer ## ]]\n4", + } + ) + ) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") assert program(question="What is 1+1?").answer == "2" @@ -48,7 +55,9 @@ def test_evaluate_call(): def test_multithread_evaluate_call(): - dspy.settings.configure(lm=DummyLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + dspy.settings.configure( + lm=DummyLM({"What is 1+1?": "[[ ## answer ## ]]\n2", "What is 2+2?": "[[ ## answer ## ]]\n4"}) + ) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") assert program(question="What is 1+1?").answer == "2" @@ -65,13 +74,15 @@ def test_multithread_evaluate_call(): def test_multi_thread_evaluate_call_cancelled(monkeypatch): # slow LM that sleeps for 1 second before returning the answer class SlowLM(DummyLM): - def __call__(self, prompt, **kwargs): + def __call__(self, *args, **kwargs): import time time.sleep(1) - return super().__call__(prompt, **kwargs) + return super().__call__(*args, **kwargs) - dspy.settings.configure(lm=SlowLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + dspy.settings.configure( + lm=SlowLM({"What is 1+1?": "[[ ## answer ## ]]\n2", "What is 2+2?": "[[ ## answer ## ]]\n4"}) + ) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") @@ -101,7 +112,9 @@ def sleep_then_interrupt(): def test_evaluate_call_bad(): - dspy.settings.configure(lm=DummyLM({"What is 1+1?": "0", "What is 2+2?": "0"})) + dspy.settings.configure( + lm=DummyLM({"What is 1+1?": "[[ ## answer ## ]]\n0", "What is 2+2?": "[[ ## answer ## ]]\n0"}) + ) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") ev = Evaluate( diff --git a/tests/functional/test_functional.py b/tests/functional/test_functional.py index 9e84d83933..b7d3a14872 100644 --- a/tests/functional/test_functional.py +++ b/tests/functional/test_functional.py @@ -21,7 +21,7 @@ def hard_question(topic: str) -> str: """Think of a hard factual question about a topic.""" expected = "What is the speed of light?" 
- lm = DummyLM([expected]) + lm = DummyLM([f"[[ ## hard_question ## ]]\n{expected}"]) dspy.settings.configure(lm=lm) question = hard_question(topic="Physics") @@ -36,7 +36,7 @@ def hard_questions(topics: List[str]) -> List[str]: pass expected = ["What is the speed of light?", "What is the speed of sound?"] - lm = DummyLM(['["What is the speed of light?", "What is the speed of sound?"]']) + lm = DummyLM(['[[ ## hard_questions ## ]]\n["What is the speed of light?", "What is the speed of sound?"]']) dspy.settings.configure(lm=lm) question = hard_questions(topics=["Physics", "Music"]) @@ -54,7 +54,7 @@ def hard_question(topic: str) -> Question: """Think of a hard factual question about a topic.""" expected = "What is the speed of light?" - lm = DummyLM([f'{{"value": "{expected}"}}']) + lm = DummyLM([f'[[ ## hard_question ## ]]\n{{"value": "{expected}"}}']) dspy.settings.configure(lm=lm) question = hard_question(topic="Physics") @@ -75,7 +75,7 @@ def answer(question: Question) -> Answer: pass question = Question(value="What is the speed of light?") - lm = DummyLM([f'{{"value": "3e8"}}']) + lm = DummyLM([f'[[ ## answer ## ]]\n{{"value": "3e8"}}']) dspy.settings.configure(lm=lm) result = answer(question=question) @@ -110,12 +110,10 @@ def forward(self, **kwargs): lm = DummyLM( [ - "What is the speed of light?", - "Some bad reasoning, 3e8 m/s.", - "3e8", # Bad answer 1 - "{...}", # Model is asked to create an example - "Some good reasoning...", - expected.model_dump_json(), # Good answer + "[[ ## hard_question ## ]]\nWhat is the speed of light?", + "[[ ## reasoning ## ]]\nSome bad reasoning, 3e8 m/s.\n\n[[ ## answer ## ]]\n3e8", # Bad answer 1 + "[[ ## json_object ## ]]\n{...}", # Model is asked to create an example + f"[[ ## reasoning ## ]]\nSome good reasoning, 3e8 m/s.\n\n[[ ## answer ## ]]\n{expected.model_dump_json()}", # Good answer ] ) dspy.settings.configure(lm=lm) @@ -145,7 +143,7 @@ class MySignature(dspy.Signature): expected = "What is the speed of light?" lm = DummyLM( [ - Question(value=expected).model_dump_json(), + f"[[ ## output ## ]]\n{Question(value=expected).model_dump_json()}", ] ) dspy.settings.configure(lm=lm) @@ -229,8 +227,6 @@ def simple_metric(example, prediction, trace=None): bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) - lm.inspect_history(n=2) - # Check that the compiled student has the correct demos _, predict = next(compiled_student.named_sub_modules(Predict, skip_compiled=False)) demos = predict.demos @@ -246,27 +242,23 @@ def simple_metric(example, prediction, trace=None): prediction = compiled_student(input=trainset[0].input) assert prediction == trainset[0].output - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `input`, produce the fields `output`. - - --- - - Follow the following format. - - Input: ${input} - Output: ${output} - - --- - - Input: What is the color of the sky? - Output: blue - - --- + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": "Your input fields are:\n1. `input` (str)\n\nYour output fields are:\n1. 
`output` (str)\n\nAll interactions will be structured in the following way, with the appropriate values filled in.\n\n[[ ## input ## ]]\n{input}\n\n[[ ## output ## ]]\n{output}\n\n[[ ## completed ## ]]\n\nIn adhering to this structure, your objective is: \n Given the fields `input`, produce the fields `output`.", + }, + { + "role": "user", + "content": "[[ ## input ## ]]\nWhat is the color of the sky?\n\nRespond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.", + }, + {"role": "assistant", "content": "[[ ## output ## ]]\nblue\n\n[[ ## completed ## ]]"}, + { + "role": "user", + "content": "[[ ## input ## ]]\nWhat is the color of the sky?\n\nRespond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.", + }, + ] - Input: What is the color of the sky? - Output: blue""" - ) + assert lm.get_convo(-1)[1] == ["[[ ## output ## ]]\nblue\n\n[[ ## completed ## ]]"] def test_regex(): @@ -289,11 +281,11 @@ def flight_information(email: str) -> TravelInformation: lm = DummyLM( [ # Example with a bad origin code. - '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + '[[ ## flight_information ## ]]\n{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', # Example to help the model understand - "{...}", + "[[ ## json_object ## ]]\n{...}", # Fixed - '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + '[[ ## flight_information ## ]]\n{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', ] ) dspy.settings.configure(lm=lm) @@ -349,7 +341,7 @@ def flight_information(email: str) -> TravelInformation: [ # Example with a bad origin code. ( - "Here is your json: " + "[[ ## flight_information ## ]]\nHere is your json: " "{" '"origin": {"code":"JFK", "lat":40.6446, "lon":-73.7797}, ' '"destination": {"code":"LAX", "lat":33.942791, "lon":-118.410042}, ' @@ -378,9 +370,8 @@ def flight_information(email: str) -> TravelInformation: lm = DummyLM( [ - "A list of bad inputs", - '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', - '{"origin": "JFK", "destination": "LAX", "date": "bad date"}', + '[[ ## flight_information ## ]]\n{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + '[[ ## flight_information ## ]]\n{"origin": "JFK", "destination": "LAX", "date": "bad date"}', ] ) dspy.settings.configure(lm=lm) @@ -402,11 +393,11 @@ def flight_information(email: str) -> TravelInformation: lm = DummyLM( [ # First origin is wrong, then destination, then all is good - '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', - "{...}", # Example to help the model understand - '{"origin": "JFK", "destination": "LA0", "date": "2022-12-25"}', - "{...}", # Example to help the model understand - '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + '[[ ## flight_information ## ]]\n{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + "[[ ## json_object ## ]]\n{...}", # Example to help the model understand + '[[ ## flight_information ## ]]\n{"origin": "JFK", "destination": "LA0", "date": "2022-12-25"}', + "[[ ## json_object ## ]]\n{...}", # Example to help the model understand + '[[ ## flight_information ## ]]\n{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', ] ) dspy.settings.configure(lm=lm) @@ -414,32 +405,56 @@ def flight_information(email: str) -> TravelInformation: assert flight_information(email="Some email") == TravelInformation( origin="JFK", destination="LAX", 
date=datetime.date(2022, 12, 25) ) - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `email`, produce the fields `flight_information`. + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + Your input fields are: + 1. `email` (str) + 2. `error_flight_information_0` (str): An error to avoid in the future + 3. `error_flight_information_1` (str): An error to avoid in the future - --- + Your output fields are: + 1. `flight_information` (TravelInformation): ${flight_information}. Respond with a single JSON object. JSON Schema: {"properties": {"origin": {"pattern": "^[A-Z]{3}$", "title": "Origin", "type": "string"}, "destination": {"pattern": "^[A-Z]{3}$", "title": "Destination", "type": "string"}, "date": {"format": "date", "title": "Date", "type": "string"}}, "required": ["origin", "destination", "date"], "title": "TravelInformation", "type": "object"} - Follow the following format. + All interactions will be structured in the following way, with the appropriate values filled in. - Email: ${email} + [[ ## email ## ]] + {email} - Past Error in Flight Information: An error to avoid in the future + [[ ## error_flight_information_0 ## ]] + {error_flight_information_0} - Past Error (2) in Flight Information: An error to avoid in the future + [[ ## error_flight_information_1 ## ]] + {error_flight_information_1} - Flight Information: ${flight_information}. Respond with a single JSON object. JSON Schema: {"properties": {"origin": {"pattern": "^[A-Z]{3}$", "title": "Origin", "type": "string"}, "destination": {"pattern": "^[A-Z]{3}$", "title": "Destination", "type": "string"}, "date": {"format": "date", "title": "Date", "type": "string"}}, "required": ["origin", "destination", "date"], "title": "TravelInformation", "type": "object"} + [[ ## flight_information ## ]] + {flight_information} - --- + [[ ## completed ## ]] - Email: Some email + In adhering to this structure, your objective is: + Given the fields `email`, produce the fields `flight_information`.""" + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + [[ ## email ## ]] + Some email - Past Error in Flight Information: String should match pattern '^[A-Z]{3}$': origin (error type: string_pattern_mismatch) + [[ ## error_flight_information_0 ## ]] + String should match pattern '^[A-Z]{3}$': origin (error type: string_pattern_mismatch) - Past Error (2) in Flight Information: String should match pattern '^[A-Z]{3}$': destination (error type: string_pattern_mismatch) + [[ ## error_flight_information_1 ## ]] + String should match pattern '^[A-Z]{3}$': destination (error type: string_pattern_mismatch) - Flight Information: {"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}""" - ) + Respond with the corresponding output fields, starting with the field `flight_information`, and then ending with the marker for `completed`.""" + ), + }, + ] def test_field_validator(): @@ -462,7 +477,7 @@ def get_user_details() -> UserDetails: # out of retries. lm = DummyLM( [ - '{"name": "lower case name", "age": 25}', + '[[ ## get_user_details ## ]]\n{"name": "lower case name", "age": 25}', ] * 10 ) @@ -471,25 +486,42 @@ def get_user_details() -> UserDetails: with pytest.raises(ValueError): get_user_details() - print(lm.get_convo(-1)) - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields , produce the fields `get_user_details`. + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + Your input fields are: + 1. 
`json_schema` (str) - --- + Your output fields are: + 1. `json_object` (str) - Follow the following format. + All interactions will be structured in the following way, with the appropriate values filled in. - Past Error in Get User Details: An error to avoid in the future - Past Error (2) in Get User Details: An error to avoid in the future - Get User Details: ${get_user_details}. Respond with a single JSON object. JSON Schema: {"properties": {"name": {"title": "Name", "type": "string"}, "age": {"title": "Age", "type": "integer"}}, "required": ["name", "age"], "title": "UserDetails", "type": "object"} + [[ ## json_schema ## ]] + {json_schema} - --- + [[ ## json_object ## ]] + {json_object} - Past Error in Get User Details: Value error, Name must be in uppercase.: name (error type: value_error) - Past Error (2) in Get User Details: Value error, Name must be in uppercase.: name (error type: value_error) - Get User Details: {"name": "lower case name", "age": 25}""" - ) + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Make a very succinct json object that validates with the following schema""" + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + [[ ## json_schema ## ]] + {"properties": {"name": {"title": "Name", "type": "string"}, "age": {"title": "Age", "type": "integer"}}, "required": ["name", "age"], "title": "UserDetails", "type": "object"} + + Respond with the corresponding output fields, starting with the field `json_object`, and then ending with the marker for `completed`.""" + ), + }, + ] def test_annotated_field(): @@ -498,7 +530,7 @@ def test(input: Annotated[str, Field(description="description")]) -> Annotated[f pass # First try 0, which fails, then try 0.5, which passes - lm = DummyLM(["0", "0.5"]) + lm = DummyLM(["[[ ## test ## ]]\n0", "[[ ## test ## ]]\n0.5"]) dspy.settings.configure(lm=lm) output = test(input="input") @@ -507,7 +539,7 @@ def test(input: Annotated[str, Field(description="description")]) -> Annotated[f def test_multiple_outputs(): - lm = DummyLM([str(i) for i in range(100)]) + lm = DummyLM([f"[[ ## output ## ]]\n{i}" for i in range(100)]) dspy.settings.configure(lm=lm) test = TypedPredictor("input -> output") @@ -516,7 +548,7 @@ def test_multiple_outputs(): def test_multiple_outputs_int(): - lm = DummyLM([str(i) for i in range(100)]) + lm = DummyLM([f"[[ ## output ## ]]\n{i}" for i in range(100)]) dspy.settings.configure(lm=lm) class TestSignature(dspy.Signature): @@ -533,9 +565,9 @@ def test_multiple_outputs_int_cot(): # Note: Multiple outputs only work when the language model "speculatively" generates all the outputs in one go. 
lm = DummyLM( [ - "thoughts 0\nOutput: 0\n", - "thoughts 1\nOutput: 1\n", - "thoughts 2\nOutput: 2\n", + "[[ ## reasoning ## ]]\nthoughts 0\n\n[[ ## output ## ]]\n0\n", + "[[ ## reasoning ## ]]\nthoughts 1\n\n[[ ## output ## ]]\n1\n", + "[[ ## reasoning ## ]]\nthoughts 2\n\n[[ ## output ## ]]\n2\n", ] ) dspy.settings.configure(lm=lm) @@ -547,7 +579,7 @@ def test_multiple_outputs_int_cot(): def test_parse_type_string(): - lm = DummyLM([str(i) for i in range(100)]) + lm = DummyLM([f"[[ ## output ## ]]\n{i}" for i in range(100)]) dspy.settings.configure(lm=lm) test = TypedPredictor("input:int -> output:int") @@ -557,7 +589,8 @@ def test_parse_type_string(): def test_literal(): - lm = DummyLM(['"2"', '"3"']) + pytest.skip("This test is not working as Literal type does not have a __name__ attribute") + lm = DummyLM(['[[ ## output ## ]]\n"2"', '[[ ## output ## ]]\n"3"']) dspy.settings.configure(lm=lm) @predictor @@ -568,7 +601,8 @@ def f() -> Literal["2", "3"]: def test_literal_missmatch(): - lm = DummyLM([f'"{i}"' for i in range(5, 100)]) + pytest.skip("This test is not working as Literal type does not have a __name__ attribute") + lm = DummyLM([f'[[ ## output ## ]]\n"{i}"' for i in range(5, 100)]) dspy.settings.configure(lm=lm) @predictor(max_retries=1) @@ -582,7 +616,8 @@ def f() -> Literal["2", "3"]: def test_literal_int(): - lm = DummyLM(["2", "3"]) + pytest.skip("This test is not working as Literal type does not have a __name__ attribute") + lm = DummyLM(["[[ ## output ## ]]\n2", "[[ ## output ## ]]\n3"]) dspy.settings.configure(lm=lm) @predictor @@ -593,7 +628,8 @@ def f() -> Literal[2, 3]: def test_literal_int_missmatch(): - lm = DummyLM([f"{i}" for i in range(5, 100)]) + pytest.skip("This test is not working as Literal type does not have a __name__ attribute") + lm = DummyLM([f"[[ ## output ## ]]\n{i}" for i in range(5, 100)]) dspy.settings.configure(lm=lm) @predictor(max_retries=1) @@ -612,8 +648,8 @@ class SimpleOutput(dspy.Signature): lm = DummyLM( [ - "2.1", # Bad output - "0.5", # Good output + "[[ ## output ## ]]\n2.1", # Bad output + "[[ ## output ## ]]\n0.5", # Good output ] ) dspy.settings.configure(lm=lm) @@ -635,12 +671,12 @@ class ExampleSignature(dspy.Signature): lm = DummyLM( [ - '{"fact": "The sky is blue", "varacity": true}', - '{"fact": "The sky is green", "varacity": false}', - '{"fact": "The sky is red", "varacity": true}', - '{"fact": "The earth is flat", "varacity": false}', - '{"fact": "The earth is round", "varacity": true}', - '{"fact": "The earth is a cube", "varacity": false}', + '[[ ## fact ## ]]\n{"fact": "The sky is blue", "varacity": true}', + '[[ ## fact ## ]]\n{"fact": "The sky is green", "varacity": false}', + '[[ ## fact ## ]]\n{"fact": "The sky is red", "varacity": true}', + '[[ ## fact ## ]]\n{"fact": "The earth is flat", "varacity": false}', + '[[ ## fact ## ]]\n{"fact": "The earth is round", "varacity": true}', + '[[ ## fact ## ]]\n{"fact": "The earth is a cube", "varacity": false}', ] ) dspy.settings.configure(lm=lm) @@ -676,7 +712,7 @@ class ScoredSignature(dspy.Signature): program = TypedChainOfThought(ScoredSignature) - lm = DummyLM(["Thoughts", "Output"]) + lm = DummyLM(["[[ ## reasoning ## ]]\nThoughts\n\n[[ ## proposed_signature ## ]]\nOutput"]) dspy.settings.configure(lm=lm) output = program( @@ -691,24 +727,48 @@ class ScoredSignature(dspy.Signature): assert output == "Output" - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `attempted_signatures`, produce the fields `proposed_signature`. 
+ assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + Your input fields are: + 1. `attempted_signatures` (list[ScoredString]) - --- + Your output fields are: + 1. `reasoning` (str): ${produce the proposed_signature}. We ... + 2. `proposed_signature` (str) - Follow the following format. + All interactions will be structured in the following way, with the appropriate values filled in. - Attempted Signatures: ${attempted_signatures} - Reasoning: Let's think step by step in order to ${produce the proposed_signature}. We ... - Proposed Signature: ${proposed_signature} + [[ ## attempted_signatures ## ]] + {attempted_signatures} - --- + [[ ## reasoning ## ]] + {reasoning} - Attempted Signatures: [{"string":"string 1","score":0.5},{"string":"string 2","score":0.4},{"string":"string 3","score":0.3}] - Reasoning: Let's think step by step in order to Thoughts - Proposed Signature: Output""" - ) + [[ ## proposed_signature ## ]] + {proposed_signature} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the fields `attempted_signatures`, produce the fields `proposed_signature`.""" + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + [[ ## attempted_signatures ## ]] + [1] «string='string 1' score=0.5» + [2] «string='string 2' score=0.4» + [3] «string='string 3' score=0.3» + + Respond with the corresponding output fields, starting with the field `reasoning`, then `proposed_signature`, and then ending with the marker for `completed`.""" + ), + }, + ] def test_custom_reasoning_field(): @@ -727,7 +787,7 @@ class QuestionSignature(dspy.Signature): program = TypedChainOfThought(QuestionSignature, reasoning=reasoning) expected = "What is the speed of light?" - lm = DummyLM(["Thoughts", f'{{"value": "{expected}"}}']) + lm = DummyLM([f'[[ ## reasoning ## ]]\nThoughts\n\n[[ ## question ## ]]\n{{"value": "{expected}"}}']) dspy.settings.configure(lm=lm) output = program(topic="Physics") @@ -735,24 +795,46 @@ class QuestionSignature(dspy.Signature): assert isinstance(output.question, Question) assert output.question.value == expected - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `topic`, produce the fields `question`. + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + Your input fields are: + 1. `topic` (str) - --- + Your output fields are: + 1. `reasoning` (str): ${topic}, we should ... + 2. `question` (Question): ${question}. Respond with a single JSON object. JSON Schema: {"properties": {"value": {"title": "Value", "type": "string"}}, "required": ["value"], "title": "Question", "type": "object"} - Follow the following format. + All interactions will be structured in the following way, with the appropriate values filled in. - Topic: ${topic} - Custom Reasoning: Let's break this down. To generate a question about ${topic}, we should ... - Question: ${question}. Respond with a single JSON object. JSON Schema: {"properties": {"value": {"title": "Value", "type": "string"}}, "required": ["value"], "title": "Question", "type": "object"} + [[ ## topic ## ]] + {topic} - --- + [[ ## reasoning ## ]] + {reasoning} - Topic: Physics - Custom Reasoning: Let's break this down. 
To generate a question about Thoughts - Question: {"value": "What is the speed of light?"}""" - ) + [[ ## question ## ]] + {question} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the fields `topic`, produce the fields `question`.""" + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + [[ ## topic ## ]] + Physics + + Respond with the corresponding output fields, starting with the field `reasoning`, then `question`, and then ending with the marker for `completed`.""" + ), + }, + ] def test_generic_signature(): @@ -766,7 +848,7 @@ class GenericSignature(dspy.Signature, Generic[T]): predictor = TypedPredictor(GenericSignature[int]) assert predictor.signature.instructions == "My signature" - lm = DummyLM(["23"]) + lm = DummyLM(["[[ ## output ## ]]\n23"]) dspy.settings.configure(lm=lm) assert predictor().output == 23 @@ -802,11 +884,17 @@ def check_square(n): def next_square(n: int) -> Annotated[int, AfterValidator(check_square)]: """What is the next square number after n?""" - lm = DummyLM(["3", "False", "4", "True"]) + lm = DummyLM( + [ + "[[ ## next_square ## ]]\n3", + "[[ ## is_square ## ]]\nFalse", + "[[ ## next_square ## ]]\n4", + "[[ ## is_square ## ]]\nTrue", + ] + ) dspy.settings.configure(lm=lm) m = next_square(n=2) - lm.inspect_history(n=2) assert m == 4 @@ -824,7 +912,7 @@ class MySignature(dspy.Signature): n: int = dspy.InputField() next_square: Annotated[int, AfterValidator(is_square)] = dspy.OutputField() - lm = DummyLM(["3", "4"]) + lm = DummyLM(["[[ ## next_square ## ]]\n3", "[[ ## next_square ## ]]\n4"]) dspy.settings.configure(lm=lm) m = TypedPredictor(MySignature)(n=2).next_square @@ -843,7 +931,7 @@ def is_square(n: int) -> int: def next_square(n: int) -> Annotated[int, AfterValidator(is_square)]: """What is the next square number after n?""" - lm = DummyLM(["3", "4"]) + lm = DummyLM(["[[ ## next_square ## ]]\n3", "[[ ## next_square ## ]]\n4"]) dspy.settings.configure(lm=lm) m = next_square(n=2) @@ -861,70 +949,91 @@ def test_demos(): trainset=[ex.with_inputs("input") for ex in demos], ) - lm = DummyLM(["Paris"]) + lm = DummyLM(["[[ ## output ## ]]\nParis"]) dspy.settings.configure(lm=lm) assert program(input="What is the capital of France?").output == "Paris" - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `input`, produce the fields `output`. - - --- + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + Your input fields are: + 1. `input` (str) - Follow the following format. + Your output fields are: + 1. `output` (str) - Input: ${input} - Output: ${output} + All interactions will be structured in the following way, with the appropriate values filled in. - --- + [[ ## input ## ]] + {input} - Input: What is the speed of light? - Output: 3e8 + [[ ## output ## ]] + {output} - --- + [[ ## completed ## ]] - Input: What is the capital of France? - Output: Paris""" - ) + In adhering to this structure, your objective is: + Given the fields `input`, produce the fields `output`.""" + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + [[ ## input ## ]] + What is the speed of light? 
+ + Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" + ), + }, + { + "role": "assistant", + "content": textwrap.dedent( + """\ + [[ ## output ## ]] + 3e8 + + [[ ## completed ## ]]""" + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + [[ ## input ## ]] + What is the capital of France? + + Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" + ), + }, + ] -def _test_demos_missing_input(): +def test_demos_missing_input_in_demo(): demos = [dspy.Example(input="What is the speed of light?", output="3e8")] program = LabeledFewShot(k=len(demos)).compile( student=dspy.TypedPredictor("input -> output, thoughts"), trainset=[ex.with_inputs("input") for ex in demos], ) - dspy.settings.configure(lm=DummyLM(["My thoughts", "Paris"])) + lm = DummyLM(["[[ ## thoughts ## ]]\nMy thoughts\n\n[[ ## output ## ]]\nParis"]) + dspy.settings.configure(lm=lm) assert program(input="What is the capital of France?").output == "Paris" - assert dspy.settings.lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `input`, produce the fields `output`. - - --- - - Follow the following format. - - Input: ${input} - Thoughts: ${thoughts} - Output: ${output} - - --- - - Input: What is the speed of light? - Output: 3e8 - - --- - - Input: What is the capital of France? - Thoughts: My thoughts - Output: Paris""" - ) - def test_conlist(): - dspy.settings.configure(lm=DummyLM(["[]", "[1]", "[1, 2]", "[1, 2, 3]"])) + dspy.settings.configure( + lm=DummyLM( + [ + "[[ ## make_numbers ## ]]\n[]", + "[[ ## make_numbers ## ]]\n[1]", + "[[ ## make_numbers ## ]]\n[1, 2]", + "[[ ## make_numbers ## ]]\n[1, 2, 3]", + ] + ) + ) @predictor def make_numbers(input: str) -> Annotated[list[int], Field(min_items=2)]: @@ -934,7 +1043,16 @@ def make_numbers(input: str) -> Annotated[list[int], Field(min_items=2)]: def test_conlist2(): - dspy.settings.configure(lm=DummyLM(["[]", "[1]", "[1, 2]", "[1, 2, 3]"])) + dspy.settings.configure( + lm=DummyLM( + [ + "[[ ## output ## ]]\n[]", + "[[ ## output ## ]]\n[1]", + "[[ ## output ## ]]\n[1, 2]", + "[[ ## output ## ]]\n[1, 2, 3]", + ] + ) + ) make_numbers = TypedPredictor("input:str -> output:Annotated[List[int], Field(min_items=2)]") assert make_numbers(input="What are the first two numbers?").output == [1, 2] @@ -952,7 +1070,7 @@ def check_cateogry(self): raise ValueError(f"category not in {self.allowed_categories}") return self - lm = DummyLM(["horse", "dog"]) + lm = DummyLM(["[[ ## category ## ]]\nhorse", "[[ ## category ## ]]\ndog"]) dspy.settings.configure(lm=lm) predictor = TypedPredictor(MySignature) diff --git a/tests/functional/test_signature_opt_typed.py b/tests/functional/test_signature_opt_typed.py index 59a610e6b6..125cec30b7 100644 --- a/tests/functional/test_signature_opt_typed.py +++ b/tests/functional/test_signature_opt_typed.py @@ -1,17 +1,15 @@ +import json from typing import Generic, TypeVar import pydantic -import dspy -from dspy.evaluate import Evaluate -from dspy.functional import TypedPredictor -from dspy.teleprompt.signature_opt_typed import optimize_signature, make_info -from dspy.utils import DummyLM +from pydantic_core import to_jsonable_python +import dspy from dspy.evaluate import Evaluate from dspy.evaluate.metrics import answer_exact_match from dspy.functional import TypedPredictor -import json -from pydantic_core import to_jsonable_python +from dspy.teleprompt.signature_opt_typed import 
make_info, optimize_signature +from dspy.utils import DummyLM hotpotqa = [ ex.with_inputs("question") @@ -105,12 +103,12 @@ class BasicQA(dspy.Signature): question: str = dspy.InputField() answer: str = dspy.OutputField() - qa_model = DummyLM([]) + qa_model = DummyLM(["[[ ## answer ## ]]\nfoo"] * 100) prompt_model = DummyLM( [ # Seed prompts - "some thoughts", - '[{"instructions": "I", "question_desc": "$q", "question_prefix": "Q:", "answer_desc": "$a", "answer_prefix": "A:"}]', + "[[ ## reasoning ## ]]\nsome thoughts\n\n" + '[[ ## proposed_signatures ## ]]\n[{"instructions": "I", "question_desc": "$q", "question_prefix": "Q:", "answer_desc": "$a", "answer_prefix": "A:"}]', ] ) dspy.settings.configure(lm=qa_model) @@ -167,10 +165,8 @@ class ExpectedSignature2(dspy.Signature): qa_model = DummyLM([]) prompt_model = DummyLM( [ - "some thoughts", - json.dumps([to_jsonable_python(info1)]), - "some thoughts", - json.dumps([to_jsonable_python(info2)]), + f"[[ ## reasoning ## ]]\nsome thoughts\n\n[[ ## proposed_signatures ## ]]\n{json.dumps([to_jsonable_python(info1)])}", + f"[[ ## reasoning ## ]]\nsome thoughts\n\n[[ ## proposed_signatures ## ]]\n{json.dumps([to_jsonable_python(info2)])}", ] ) dspy.settings.configure(lm=qa_model) diff --git a/tests/predict/test_chain_of_thought.py b/tests/predict/test_chain_of_thought.py index c1d08e729c..78cf692649 100644 --- a/tests/predict/test_chain_of_thought.py +++ b/tests/predict/test_chain_of_thought.py @@ -1,4 +1,5 @@ import textwrap + import dspy from dspy import ChainOfThought from dspy.utils import DummyLM @@ -14,22 +15,23 @@ def test_initialization_with_string_signature(): ] assert predict(question="What is 1+1?").answer == "2" - print(lm.get_convo(-1)) - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `question`, produce the fields `answer`. - - --- - - Follow the following format. - - Question: ${question} - Reasoning: Let's think step by step in order to ${produce the answer}. We ... - Answer: ${answer} - - --- - - Question: What is 1+1? 
- Reasoning: Let's think step by step in order to find the number after 1 - Answer: 2""" - ) + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] diff --git a/tests/predict/test_chain_of_thought_with_hint.py b/tests/predict/test_chain_of_thought_with_hint.py index c28fb375ff..99c7f8e3b5 100644 --- a/tests/predict/test_chain_of_thought_with_hint.py +++ b/tests/predict/test_chain_of_thought_with_hint.py @@ -1,10 +1,12 @@ +import textwrap + import dspy from dspy import ChainOfThoughtWithHint from dspy.utils import DummyLM def test_cot_with_no_hint(): - lm = DummyLM(["find the number after 1", "2"]) + lm = DummyLM(["[[ ## rationale ## ]]\nfind the number after 1\n\n[[ ## answer ## ]]\n2"]) dspy.settings.configure(lm=lm) predict = ChainOfThoughtWithHint("question -> answer") # Check output fields have the right order @@ -15,16 +17,30 @@ def test_cot_with_no_hint(): ] assert predict(question="What is 1+1?").answer == "2" - final_convo = lm.get_convo(-1) - assert final_convo.endswith( - "Question: What is 1+1?\n" - "Reasoning: Let's think step by step in order to find the number after 1\n" - "Answer: 2" - ) + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] def test_cot_with_hint(): - lm = DummyLM(["find the number after 1", "2"]) + lm = DummyLM(["[[ ## rationale ## ]]\nfind the number after 1\n\n[[ ## answer ## ]]\n2"]) dspy.settings.configure(lm=lm) predict = ChainOfThoughtWithHint("question -> answer") assert list(predict.extended_signature2.output_fields.keys()) == [ @@ -34,10 +50,23 @@ def test_cot_with_hint(): ] assert predict(question="What is 1+1?", hint="think small").answer == "2" - final_convo = lm.get_convo(-1) - assert final_convo.endswith( - "Question: What is 1+1?\n\n" - "Reasoning: Let's think step by step in order to find the number after 1\n\n" - "Hint: think small\n\n" - "Answer: 2" - ) + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] diff --git a/tests/predict/test_predict.py b/tests/predict/test_predict.py index c701b380a8..55a37e4b3e 100644 --- a/tests/predict/test_predict.py +++ b/tests/predict/test_predict.py @@ -1,11 +1,13 @@ -import dspy -from dspy import Predict, Signature, TypedPredictor -from dspy.utils.dummies import DummyLM import copy import textwrap + import pydantic import ujson +import dspy +from dspy import Predict, Signature, TypedPredictor +from dspy.utils.dummies import DummyLM + def test_initialization_with_string_signature(): signature_string = "input1, input2 -> output" @@ -43,16 +45,27 @@ def test_call_method(): dspy.settings.configure(lm=lm) result = predict_instance(input="test input") assert result.output == "test output" - assert lm.get_convo(-1) == ( - "Given the fields `input`, produce the fields `output`.\n" - "\n---\n\n" - "Follow the following format.\n\n" - "Input: ${input}\n" - "Output: ${output}\n" - "\n---\n\n" - "Input: test input\n" - 
"Output: test output" - ) + + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] def test_instructions_after_dump_and_load_state(): @@ -206,17 +219,23 @@ class OutputOnlySignature(dspy.Signature): dspy.settings.configure(lm=lm) assert predictor().output == "short answer" - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields , produce the fields `output`. - - --- - - Follow the following format. - - Output: ${output} - - --- - - Output: short answer""" - ) + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] diff --git a/tests/predict/test_program_of_thought.py b/tests/predict/test_program_of_thought.py index 2aa153a1d6..2980272f9a 100644 --- a/tests/predict/test_program_of_thought.py +++ b/tests/predict/test_program_of_thought.py @@ -1,121 +1,106 @@ -from dspy import Signature, ProgramOfThought +import textwrap + import dspy +from dspy import ProgramOfThought, Signature from dspy.utils import DummyLM -import textwrap + class BasicQA(Signature): question = dspy.InputField() answer = dspy.OutputField(desc="often between 1 and 5 words") + def test_pot_code_generation(): pot = ProgramOfThought(BasicQA) - lm = DummyLM([ - "Reason_A", - "```python\nresult = 1+1\n```", - "Reason_B", - "2", - ]) + lm = DummyLM( + [ + "Reason_A", + "```python\nresult = 1+1\n```", + "Reason_B", + "2", + ] + ) dspy.settings.configure(lm=lm) res = pot(question="What is 1+1?") assert res.answer == "2" - assert lm.get_convo(index=-1) == textwrap.dedent("""\ - Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`. - - --- - - Follow the following format. - - Question: ${question} - - Code: python code that answers the question - - Code Output: output of previously-generated python code - - Reasoning: Let's think step by step in order to ${produce the answer}. We ... + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] - Answer: often between 1 and 5 words - - --- - - Question: What is 1+1? - - Code: result = 1+1 - - Code Output: 2 - - Reasoning: Let's think step by step in order to Reason_B - - Answer: 2""") def test_pot_code_generation_with_error(): pot = ProgramOfThought(BasicQA) - lm = DummyLM([ - "Reason_A", - "```python\nresult = 1+0/0\n```", - "Reason_B", # Error: division by zero - "```python\nresult = 1+1\n```", - "Reason_C", - "2", - ]) + lm = DummyLM( + [ + "Reason_A", + "```python\nresult = 1+0/0\n```", + "Reason_B", # Error: division by zero + "```python\nresult = 1+1\n```", + "Reason_C", + "2", + ] + ) dspy.settings.configure(lm=lm) res = pot(question="What is 1+1?") assert res.answer == "2" # The first code example failed - assert lm.get_convo(index=2) == textwrap.dedent("""\ - You are given `question`, `previous_code`, `error` due to an error in previous code. - Your task is to correct the error and provide the new `generated_code`. 
- - --- - - Follow the following format. - - Question: ${question} - - Previous Code: previously-generated python code that errored - - Error: error message from previously-generated python code - - Reasoning: Let's think step by step in order to ${produce the generated_code}. We ... - - Code: python code that answers the question - - --- - - Question: What is 1+1? - - Previous Code: result = 1+0/0 - - Error: division by zero - - Reasoning: Let's think step by step in order to Reason_B""") + for message in lm.get_convo(2)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(2)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] # The second code example succeeded - assert lm.get_convo(-1) == textwrap.dedent("""\ - Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`. - - --- - - Follow the following format. - - Question: ${question} - - Code: python code that answers the question - - Code Output: output of previously-generated python code - - Reasoning: Let's think step by step in order to ${produce the answer}. We ... - - Answer: often between 1 and 5 words - - --- - - Question: What is 1+1? - - Code: result = 1+1 - - Code Output: 2 - - Reasoning: Let's think step by step in order to Reason_C - - Answer: 2""") + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] diff --git a/tests/primitives/test_program.py b/tests/primitives/test_program.py index ecbbf06ecb..9f271e992b 100644 --- a/tests/primitives/test_program.py +++ b/tests/primitives/test_program.py @@ -1,8 +1,5 @@ import dspy -from dspy.primitives.program import ( - Module, - set_attribute_by_name, -) # Adjust the import based on your file structure +from dspy.primitives.program import Module, set_attribute_by_name # Adjust the import based on your file structure from dspy.utils import DummyLM @@ -39,7 +36,14 @@ def test_predictors(): def test_forward(): program = HopModule() - dspy.settings.configure(lm=DummyLM({"What is 1+1?": "let me check", "let me check": "2"})) + dspy.settings.configure( + lm=DummyLM( + { + "What is 1+1?": "[[ ## query ## ]]\nlet me check", + "let me check": "[[ ## answer ## ]]\n2", + } + ) + ) result = program(question="What is 1+1?").answer assert result == "2" @@ -166,8 +170,9 @@ def __init__(self): self.p0 = dspy.Predict("question -> answer") self.p1 = self.p0 + def test_named_parameters_duplicate_references(): - module = DuplicateModule() - # Only testing for whether exceptions are thrown or not - # As Module.named_parameters() is recursive, this is mainly for catching infinite recursion - module.named_parameters() + module = DuplicateModule() + # Only testing for whether exceptions are thrown or not + # As Module.named_parameters() is recursive, this is mainly for catching infinite recursion + module.named_parameters() diff --git a/tests/retrieve/test_llama_index_rm.py b/tests/retrieve/test_llama_index_rm.py index 35711087e9..f06f96388b 100644 --- a/tests/retrieve/test_llama_index_rm.py +++ b/tests/retrieve/test_llama_index_rm.py @@ -3,8 +3,8 @@ import pytest import dspy -from dsp.modules.dummy_lm import DummyLM from dspy.datasets import HotPotQA +from dspy.utils.dummies import DummyLM 
try: from llama_index.core import Settings, VectorStoreIndex diff --git a/tests/signatures/test_signature.py b/tests/signatures/test_signature.py index c99d1bdd03..52f6f74777 100644 --- a/tests/signatures/test_signature.py +++ b/tests/signatures/test_signature.py @@ -191,26 +191,45 @@ class MySignature(Signature): predictor = dspy.Predict(MySignature) - lm = DummyLM(["short answer"]) + lm = DummyLM(["[[ ## output ## ]]\nshort answer"]) dspy.settings.configure(lm=lm) assert predictor().output == "short answer" - assert lm.get_convo(-1) == textwrap.dedent( - """\ - First line - Second line - Third line + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + Your input fields are: + + + Your output fields are: + 1. `output` (str) + + All interactions will be structured in the following way, with the appropriate values filled in. - --- - Follow the following format. - Output: ${output} + [[ ## output ## ]] + {output} - --- + [[ ## completed ## ]] - Output: short answer""" - ) + In adhering to this structure, your objective is: + First line + Second line + Third line""" + ), + }, + { + "role": "user", + "content": "Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.", + }, + ] def test_replaced_by_replace_context_manager(): diff --git a/tests/teleprompt/test_bootstrap.py b/tests/teleprompt/test_bootstrap.py index ba2acd5563..05431a632e 100644 --- a/tests/teleprompt/test_bootstrap.py +++ b/tests/teleprompt/test_bootstrap.py @@ -1,10 +1,12 @@ +import textwrap + import pytest + import dspy -from dspy.predict import Predict -from dspy.utils.dummies import DummyLM from dspy import Example +from dspy.predict import Predict from dspy.teleprompt import BootstrapFewShot -import textwrap +from dspy.utils.dummies import DummyLM # Define a simple metric function for testing @@ -15,9 +17,7 @@ def simple_metric(example, prediction, trace=None): examples = [ Example(input="What is the color of the sky?", output="blue").with_inputs("input"), - Example( - input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!" 
- ), + Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!"), ] trainset = [examples[0]] valset = [examples[1]] @@ -25,9 +25,7 @@ def simple_metric(example, prediction, trace=None): def test_bootstrap_initialization(): # Initialize BootstrapFewShot with a dummy metric and minimal setup - bootstrap = BootstrapFewShot( - metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 - ) + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) assert bootstrap.metric == simple_metric, "Metric not correctly initialized" @@ -50,32 +48,24 @@ def test_compile_with_predict_instances(): dspy.settings.configure(lm=lm) # Initialize BootstrapFewShot and compile the student - bootstrap = BootstrapFewShot( - metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 - ) - compiled_student = bootstrap.compile( - student, teacher=teacher, trainset=trainset - ) + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) assert compiled_student is not None, "Failed to compile student" - assert ( - hasattr(compiled_student, "_compiled") and compiled_student._compiled - ), "Student compilation flag not set" + assert hasattr(compiled_student, "_compiled") and compiled_student._compiled, "Student compilation flag not set" def test_bootstrap_effectiveness(): # This test verifies if the bootstrapping process improves the student's predictions student = SimpleModule("input -> output") teacher = SimpleModule("input -> output") - lm = DummyLM(["blue", "Ring-ding-ding-ding-dingeringeding!"], follow_examples=True) + lm = DummyLM( + ["[[ ## output ## ]]\nblue", "[[ ## output ## ]]\nRing-ding-ding-ding-dingeringeding!"], follow_examples=True + ) dspy.settings.configure(lm=lm, trace=[]) - bootstrap = BootstrapFewShot( - metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 - ) - compiled_student = bootstrap.compile( - student, teacher=teacher, trainset=trainset - ) + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) # Check that the compiled student has the correct demos assert len(compiled_student.predictor.demos) == 1 @@ -90,31 +80,66 @@ def test_bootstrap_effectiveness(): prediction = compiled_student(input=trainset[0].input) assert prediction.output == trainset[0].output - # For debugging - print("Convo") - print(lm.get_convo(-1)) - - assert lm.get_convo(-1) == textwrap.dedent( - """\ - Given the fields `input`, produce the fields `output`. - - --- - - Follow the following format. - - Input: ${input} - Output: ${output} - - --- - - Input: What is the color of the sky? - Output: blue - - --- - - Input: What is the color of the sky? - Output: blue""" - ) + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + Your input fields are: + 1. `input` (str) + + Your output fields are: + 1. `output` (str) + + All interactions will be structured in the following way, with the appropriate values filled in. 
+ + [[ ## input ## ]] + {input} + + [[ ## output ## ]] + {output} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the fields `input`, produce the fields `output`.""" + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + [[ ## input ## ]] + What is the color of the sky? + + Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" + ), + }, + { + "role": "assistant", + "content": textwrap.dedent( + """\ + [[ ## output ## ]] + blue + + [[ ## completed ## ]]""" + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + [[ ## input ## ]] + What is the color of the sky? + + Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" + ), + }, + ] def test_error_handling_during_bootstrap(): @@ -136,7 +161,7 @@ def forward(self, **kwargs): # Setup DummyLM to simulate an error scenario lm = DummyLM( [ - "Initial thoughts", # Simulate initial teacher's prediction + "[[ ## output ## ]]\nInitial thoughts", # Simulate initial teacher's prediction ] ) dspy.settings.configure(lm=lm) @@ -161,20 +186,14 @@ def test_validation_set_usage(): lm = DummyLM( [ - "Initial thoughts", - "Finish[blue]", # Expected output for both training and validation + "[[ ## output ## ]]\nInitial thoughts", + "[[ ## output ## ]]\nFinish[blue]", # Expected output for both training and validation ] ) dspy.settings.configure(lm=lm) - bootstrap = BootstrapFewShot( - metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 - ) - compiled_student = bootstrap.compile( - student, teacher=teacher, trainset=trainset - ) + bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) # Check that validation examples are part of student's demos after compilation - assert len(compiled_student.predictor.demos) >= len( - valset - ), "Validation set not used in compiled student demos" + assert len(compiled_student.predictor.demos) >= len(valset), "Validation set not used in compiled student demos" diff --git a/tests/teleprompt/test_copro_optimizer.py b/tests/teleprompt/test_copro_optimizer.py index c0fe712bcf..2618367cac 100644 --- a/tests/teleprompt/test_copro_optimizer.py +++ b/tests/teleprompt/test_copro_optimizer.py @@ -1,20 +1,26 @@ import textwrap + import dspy +from dspy import Example from dspy.teleprompt.signature_opt import COPRO from dspy.utils.dummies import DummyLM -from dspy import Example + # Define a simple metric function for testing def simple_metric(example, prediction): # Simplified metric for testing: true if prediction matches expected output return example.output == prediction.output + # Example training and validation sets trainset = [ Example(input="Question: What is the color of the sky?", output="blue").with_inputs("input"), - Example(input="Question: What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), + Example(input="Question: What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs( + "input" + ), ] + def test_signature_optimizer_initialization(): optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) assert optimizer.metric == simple_metric, "Metric not correctly initialized" @@ -22,6 +28,7 @@ def test_signature_optimizer_initialization(): assert optimizer.depth == 1, "Depth not 
correctly initialized" assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" + class SimpleModule(dspy.Module): def __init__(self, signature): super().__init__() @@ -31,15 +38,25 @@ def __init__(self, signature): def forward(self, **kwargs): return self.predictor(**kwargs) + def test_signature_optimizer_optimization_process(): optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) - dspy.settings.configure(lm=DummyLM(["Optimized instruction 1", "Optimized instruction 2"])) - + dspy.settings.configure( + lm=DummyLM( + [ + "[[ ## proposed_instruction ## ]]\nOptimized instruction 1\n\n" + "[[ ## proposed_prefix_for_output_field ## ]]\nOptimized instruction 2", + ] + ) + ) + student = SimpleModule("input -> output") - + # Assuming the compile method of COPRO requires a student module, a development set, and evaluation kwargs - optimized_student = optimizer.compile(student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) - + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + # Check that the optimized student has been modified from the original # This check can be more specific based on how the optimization modifies the student assert optimized_student is not student, "Optimization did not modify the student" @@ -47,75 +64,137 @@ def test_signature_optimizer_optimization_process(): # Further tests can be added to verify the specifics of the optimization process, # such as checking the instructions of the optimized student's predictors. + def test_signature_optimizer_statistics_tracking(): optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) optimizer.track_stats = True # Enable statistics tracking - dspy.settings.configure(lm=DummyLM(["Optimized instruction"])) + dspy.settings.configure( + lm=DummyLM( + [ + "[[ ## proposed_instruction ## ]]\nOptimized instruction 1\n\n" + "[[ ## proposed_prefix_for_output_field ## ]]\nOptimized instruction 2", + ] + ) + ) student = SimpleModule("input -> output") - optimized_student = optimizer.compile(student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) # Verify that statistics have been tracked and attached to the optimized student - assert hasattr(optimized_student, 'total_calls'), "Total calls statistic not tracked" - assert hasattr(optimized_student, 'results_best'), "Best results statistics not tracked" + assert hasattr(optimized_student, "total_calls"), "Total calls statistic not tracked" + assert hasattr(optimized_student, "results_best"), "Best results statistics not tracked" + # Assuming the setup_signature_optimizer fixture and simple_metric function are defined as before + def test_optimization_and_output_verification(): - lm = DummyLM([ - "Optimized Prompt", - "Optimized Prefix", - ]) + lm = DummyLM( + [ + "[[ ## proposed_instruction ## ]]\nOptimized Prompt\n\n[[ ## proposed_prefix_for_output_field ## ]]\nOptimized Prefix", + "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", + "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", + "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", + "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", + "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", + "[[ ## reasoning ## ]]\nfrance\n\n[[ ## 
output ## ]]\nParis", + "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", + ] + ) dspy.settings.configure(lm=lm) optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) - + student = SimpleModule("input -> output") - + # Compile the student with the optimizer - optimized_student = optimizer.compile(student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) - + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) + # Simulate calling the optimized student with a new input test_input = "What is the capital of France?" prediction = optimized_student(input=test_input) print(lm.get_convo(-1)) - - assert prediction.output == "No more responses" - assert lm.get_convo(-1) == textwrap.dedent("""\ - Optimized Prompt + assert prediction.output == "Paris" + + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + Your input fields are: + 1. `input` (str) + + Your output fields are: + 1. `reasoning` (str) + 2. `output` (str) - --- + All interactions will be structured in the following way, with the appropriate values filled in. - Follow the following format. + [[ ## input ## ]] + {input} - Input: ${input} - Reasoning: Let's think step by step in order to ${produce the output}. We ... - Optimized Prefix ${output} + [[ ## reasoning ## ]] + {reasoning} - --- + [[ ## output ## ]] + {output} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Optimized Prompt""" + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + [[ ## input ## ]] + What is the capital of France? + + Respond with the corresponding output fields, starting with the field `reasoning`, then `output`, and then ending with the marker for `completed`.""" + ), + }, + ] - Input: What is the capital of France? 
- Reasoning: Let's think step by step in order to No more responses - Optimized Prefix No more responses""") def test_statistics_tracking_during_optimization(): - dspy.settings.configure(lm=DummyLM(["Optimized instruction for stats tracking"])) - + dspy.settings.configure( + lm=DummyLM( + [ + "[[ ## proposed_instruction ## ]]\nOptimized Prompt\n\n" + "[[ ## proposed_prefix_for_output_field ## ]]\nOptimized Prefix", + ] + ) + ) + optimizer = COPRO(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) optimizer.track_stats = True # Enable statistics tracking - + student = SimpleModule("input -> output") - optimized_student = optimizer.compile(student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) + optimized_student = optimizer.compile( + student, trainset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False} + ) # Verify that statistics have been tracked - assert hasattr(optimized_student, 'total_calls'), "Optimizer did not track total metric calls" + assert hasattr(optimized_student, "total_calls"), "Optimizer did not track total metric calls" assert optimized_student.total_calls > 0, "Optimizer reported no metric calls" - + # Check if the results_best and results_latest contain valid statistics - assert 'results_best' in optimized_student.__dict__, "Optimizer did not track the best results" - assert 'results_latest' in optimized_student.__dict__, "Optimizer did not track the latest results" + assert "results_best" in optimized_student.__dict__, "Optimizer did not track the best results" + assert "results_latest" in optimized_student.__dict__, "Optimizer did not track the latest results" assert len(optimized_student.results_best) > 0, "Optimizer did not properly populate the best results statistics" - assert len(optimized_student.results_latest) > 0, "Optimizer did not properly populate the latest results statistics" + assert ( + len(optimized_student.results_latest) > 0 + ), "Optimizer did not properly populate the latest results statistics" # Additional detailed checks can be added here to verify the contents of the tracked statistics diff --git a/tests/teleprompt/test_mipro_optimizer.py b/tests/teleprompt/test_mipro_optimizer.py index 17e94a580e..b586068f93 100644 --- a/tests/teleprompt/test_mipro_optimizer.py +++ b/tests/teleprompt/test_mipro_optimizer.py @@ -1,11 +1,13 @@ +import re import textwrap + import pytest -import re + import dspy from dsp.modules import LM +from dspy import Example from dspy.teleprompt.signature_opt_bayesian import MIPRO from dspy.utils.dummies import DummyLM -from dspy import Example # Define a simple metric function for testing @@ -13,6 +15,7 @@ def simple_metric(example, prediction, trace=None): # Simplified metric for testing: true if prediction matches expected output return example.output == prediction.output + # Some example data capitals = { "Germany": "Berlin", @@ -31,10 +34,11 @@ def simple_metric(example, prediction, trace=None): # Example training and validation sets trainset = [ Example(input="What is the color of the sky?", output="blue").with_inputs("input"), - Example( - input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!" 
- ).with_inputs("input"), -] + [Example(input=f"What is the capital of {country}?", output=capital).with_inputs("input") for country, capital in capitals.items()] + Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), +] + [ + Example(input=f"What is the capital of {country}?", output=capital).with_inputs("input") + for country, capital in capitals.items() +] class ConditionalLM(LM): @@ -73,10 +77,10 @@ def basic_request(self, prompt, num_candidates=1, **kwargs): # For other questions, the model will answer with the last word of the question. else: answer = current_question.split()[-1] - + answer = "think deeply.\nOutput: " + answer - RED, GREEN, RESET = '\033[91m', '\033[92m', '\033[0m' + RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" print("=== DummyLM ===") print(prompt, end="") print(f"{RED}{answer}{RESET}") @@ -108,20 +112,14 @@ def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs): def get_convo(self, index): """get the prompt + anwer from the ith message""" - return self.history[index]['prompt'] \ - + " " \ - + self.history[index]['response']['choices'][0]['text'] + return self.history[index]["prompt"] + " " + self.history[index]["response"]["choices"][0]["text"] def test_bayesian_signature_optimizer_initialization(): - optimizer = MIPRO( - metric=simple_metric, num_candidates=10, init_temperature=1.4, verbose=True, track_stats=True - ) + optimizer = MIPRO(metric=simple_metric, num_candidates=10, init_temperature=1.4, verbose=True, track_stats=True) assert optimizer.metric == simple_metric, "Metric not correctly initialized" assert optimizer.num_candidates == 10, "Incorrect 'num_candidates' parameter initialization" - assert ( - optimizer.init_temperature == 1.4 - ), "Initial temperature not correctly initialized" + assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" assert optimizer.verbose is True, "Verbose flag not correctly initialized" assert optimizer.track_stats is True, "Track stats flag not correctly initialized" @@ -165,9 +163,7 @@ def test_signature_optimizer_optimization_process(): def test_signature_optimizer_bad_lm(): - dspy.settings.configure( - lm=DummyLM([f"Optimized instruction {i}" for i in range(30)]) - ) + dspy.settings.configure(lm=DummyLM([f"Optimized instruction {i}" for i in range(30)])) student = SimpleModule(signature="input -> output") optimizer = MIPRO( metric=simple_metric, @@ -228,40 +224,23 @@ def test_optimization_and_output_verification(): assert prediction.output == "Madrid" - expected_lm_output = textwrap.dedent( - """\ - Input: - - --- - - Follow the following format. - - Input: ${input} - Reasoning: Let's think step by step in order to ${produce the output}. We ... - Output: ${output} - - --- - - Input: What is the capital of France? - Reasoning: Let's think step by step in order to think deeply. - Output: Paris - - --- - - Input: What is the capital of Norway? - Reasoning: Let's think step by step in order to think deeply. - Output: Oslo - - --- - - Input: What does the fox say? - Output: Ring-ding-ding-ding-dingeringeding! - - --- - - Input: What is the capital of Spain? - Reasoning: Let's think step by step in order to think deeply. 
- Output: Madrid""" - ) - - assert lm.get_convo(-1) == expected_lm_output \ No newline at end of file + for message in lm.get_convo(-1)[0]: + print("----") + print(message["content"]) + print("----") + assert lm.get_convo(-1)[0] == [ + { + "role": "system", + "content": textwrap.dedent( + """\ + """ + ), + }, + { + "role": "user", + "content": textwrap.dedent( + """\ + """ + ), + }, + ] diff --git a/tests/teleprompt/test_random_search.py b/tests/teleprompt/test_random_search.py index e76178c80d..7da4069aaa 100644 --- a/tests/teleprompt/test_random_search.py +++ b/tests/teleprompt/test_random_search.py @@ -1,8 +1,9 @@ import dspy -from dspy.predict import Predict -from dspy.utils.dummies import DummyLM from dspy import Example +from dspy.predict import Predict from dspy.teleprompt import BootstrapFewShotWithRandomSearch +from dspy.utils.dummies import DummyLM + class SimpleModule(dspy.Module): def __init__(self, signature): @@ -12,9 +13,11 @@ def __init__(self, signature): def forward(self, **kwargs): return self.predictor(**kwargs) + def simple_metric(example, prediction, trace=None): return example.output == prediction.output + def test_basic_workflow(): """Test to ensure the basic compile flow runs without errors.""" student = SimpleModule("input -> output") @@ -34,4 +37,3 @@ def test_basic_workflow(): Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), ] optimizer.compile(student, teacher=teacher, trainset=trainset) - From f817dd368f9983b9de6d3328dbd10a52f05e7427 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sun, 6 Oct 2024 05:00:22 +0000 Subject: [PATCH 06/17] feat(dspy): revert removal of whitespace --- tests/functional/test_functional.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/functional/test_functional.py b/tests/functional/test_functional.py index b7d3a14872..01bc9bc531 100644 --- a/tests/functional/test_functional.py +++ b/tests/functional/test_functional.py @@ -405,6 +405,7 @@ def flight_information(email: str) -> TravelInformation: assert flight_information(email="Some email") == TravelInformation( origin="JFK", destination="LAX", date=datetime.date(2022, 12, 25) ) + assert lm.get_convo(-1)[0] == [ { "role": "system", @@ -434,9 +435,9 @@ def flight_information(email: str) -> TravelInformation: [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `email`, produce the fields `flight_information`.""" - ), + ), # noqa }, { "role": "user", @@ -507,9 +508,9 @@ def get_user_details() -> UserDetails: [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Make a very succinct json object that validates with the following schema""" - ), + ), # noqa }, { "role": "user", @@ -752,9 +753,9 @@ class ScoredSignature(dspy.Signature): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `attempted_signatures`, produce the fields `proposed_signature`.""" - ), + ), # noqa }, { "role": "user", @@ -820,9 +821,9 @@ class QuestionSignature(dspy.Signature): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `topic`, produce the fields `question`.""" - ), + ), # noqa }, { "role": "user", @@ -975,9 +976,9 @@ def test_demos(): [[ ## completed ## ]] - In 
adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `input`, produce the fields `output`.""" - ), + ), # noqa }, { "role": "user", From 476b86580997f5cc7a7302b588bc83403fc4ec02 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sun, 6 Oct 2024 06:03:20 +0000 Subject: [PATCH 07/17] feat(dspy): update remaining tests --- .../dsp_LM/teleprompt/test_mipro_optimizer.py | 24 -- tests/predict/test_chain_of_thought.py | 34 ++- .../test_chain_of_thought_with_hint.py | 72 ++++- tests/predict/test_multi_chain_comparison.py | 2 +- tests/predict/test_predict.py | 68 +++-- tests/predict/test_program_of_thought.py | 164 +++++++++--- tests/predict/test_react.py | 57 +--- tests/predict/test_retry.py | 28 +- tests/signatures/test_signature.py | 12 +- tests/teleprompt/test_bootstrap.py | 4 +- tests/teleprompt/test_copro_optimizer.py | 4 +- tests/teleprompt/test_mipro_optimizer.py | 246 ------------------ 12 files changed, 293 insertions(+), 422 deletions(-) delete mode 100644 tests/teleprompt/test_mipro_optimizer.py diff --git a/tests/dsp_LM/teleprompt/test_mipro_optimizer.py b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py index 7536470b98..695dc28073 100644 --- a/tests/dsp_LM/teleprompt/test_mipro_optimizer.py +++ b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py @@ -210,28 +210,4 @@ def test_optimization_and_output_verification(): test_input = "What is the capital of Spain?" prediction = optimized_student(input=test_input) - print("CORRECT ANSWER") - print(lm.get_convo(-1)) - assert prediction.output == "Madrid" - - for message in lm.get_convo(-1)[0]: - print("----") - print(message["content"]) - print("----") - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - """ - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - """ - ), - }, - ] diff --git a/tests/predict/test_chain_of_thought.py b/tests/predict/test_chain_of_thought.py index 78cf692649..51fd88f4ee 100644 --- a/tests/predict/test_chain_of_thought.py +++ b/tests/predict/test_chain_of_thought.py @@ -6,32 +6,56 @@ def test_initialization_with_string_signature(): - lm = DummyLM(["find the number after 1", "2"]) + lm = DummyLM(["[[ ## reasoning ## ]]\nfind the number after 1\n\n[[ ## answer ## ]]\n2"]) dspy.settings.configure(lm=lm) predict = ChainOfThought("question -> answer") assert list(predict.extended_signature.output_fields.keys()) == [ - "rationale", + "reasoning", "answer", ] assert predict(question="What is 1+1?").answer == "2" for message in lm.get_convo(-1)[0]: print("----") - print(message["content"]) + print(textwrap.indent(message["content"], " ")) print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", "content": textwrap.dedent( """\ - """ + Your input fields are: + 1. `question` (str) + + Your output fields are: + 1. `reasoning` (str) + 2. `answer` (str) + + All interactions will be structured in the following way, with the appropriate values filled in. + + [[ ## question ## ]] + {question} + + [[ ## reasoning ## ]] + {reasoning} + + [[ ## answer ## ]] + {answer} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the fields `question`, produce the fields `answer`.""" ), }, { "role": "user", "content": textwrap.dedent( """\ - """ + [[ ## question ## ]] + What is 1+1? 
+ + Respond with the corresponding output fields, starting with the field `reasoning`, then `answer`, and then ending with the marker for `completed`.""" ), }, ] diff --git a/tests/predict/test_chain_of_thought_with_hint.py b/tests/predict/test_chain_of_thought_with_hint.py index 99c7f8e3b5..4ae6128176 100644 --- a/tests/predict/test_chain_of_thought_with_hint.py +++ b/tests/predict/test_chain_of_thought_with_hint.py @@ -17,30 +17,54 @@ def test_cot_with_no_hint(): ] assert predict(question="What is 1+1?").answer == "2" - for message in lm.get_convo(-1)[0]: - print("----") - print(message["content"]) - print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", "content": textwrap.dedent( """\ - """ + Your input fields are: + 1. `question` (str) + + Your output fields are: + 1. `rationale` (str): ${produce the answer}. We ... + 2. `answer` (str) + + All interactions will be structured in the following way, with the appropriate values filled in. + + [[ ## question ## ]] + {question} + + [[ ## rationale ## ]] + {rationale} + + [[ ## answer ## ]] + {answer} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the fields `question`, produce the fields `answer`.""" ), }, { "role": "user", "content": textwrap.dedent( """\ - """ + [[ ## question ## ]] + What is 1+1? + + Respond with the corresponding output fields, starting with the field `rationale`, then `answer`, and then ending with the marker for `completed`.""" ), }, ] def test_cot_with_hint(): - lm = DummyLM(["[[ ## rationale ## ]]\nfind the number after 1\n\n[[ ## answer ## ]]\n2"]) + lm = DummyLM( + [ + "[[ ## rationale ## ]]\nfind the number after 1\n\n[[ ## hint ## ]]\nIs it helicopter?\n\n[[ ## answer ## ]]\n2" + ] + ) dspy.settings.configure(lm=lm) predict = ChainOfThoughtWithHint("question -> answer") assert list(predict.extended_signature2.output_fields.keys()) == [ @@ -52,21 +76,49 @@ def test_cot_with_hint(): for message in lm.get_convo(-1)[0]: print("----") - print(message["content"]) + print(textwrap.indent(message["content"], " ")) print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", "content": textwrap.dedent( """\ - """ + Your input fields are: + 1. `question` (str) + + Your output fields are: + 1. `rationale` (str): ${produce the answer}. We ... + 2. `hint` (str) + 3. `answer` (str) + + All interactions will be structured in the following way, with the appropriate values filled in. + + [[ ## question ## ]] + {question} + + [[ ## rationale ## ]] + {rationale} + + [[ ## hint ## ]] + {hint} + + [[ ## answer ## ]] + {answer} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the fields `question`, produce the fields `answer`.""" ), }, { "role": "user", "content": textwrap.dedent( """\ - """ + [[ ## question ## ]] + What is 1+1? + + Respond with the corresponding output fields, starting with the field `rationale`, then `hint`, then `answer`, and then ending with the marker for `completed`.""" ), }, ] diff --git a/tests/predict/test_multi_chain_comparison.py b/tests/predict/test_multi_chain_comparison.py index 8c936a2d80..87e2af1a72 100644 --- a/tests/predict/test_multi_chain_comparison.py +++ b/tests/predict/test_multi_chain_comparison.py @@ -30,7 +30,7 @@ class BasicQA(dspy.Signature): # Call the MultiChainComparison on the completions question = "What is the color of the sky?" 
- lm = DummyLM(["my rationale", "blue"]) + lm = DummyLM(["[[ ## rationale ## ]]\nmy rationale\n\n[[ ## answer ## ]]\nblue"]) dspy.settings.configure(lm=lm) final_pred = compare_answers(completions, question=question) diff --git a/tests/predict/test_predict.py b/tests/predict/test_predict.py index 55a37e4b3e..8ccb1970d5 100644 --- a/tests/predict/test_predict.py +++ b/tests/predict/test_predict.py @@ -41,28 +41,48 @@ def test_lm_after_dump_and_load_state(): def test_call_method(): predict_instance = Predict("input -> output") - lm = DummyLM(["test output"]) + lm = DummyLM(["[[ ## output ## ]]\ntest output"]) dspy.settings.configure(lm=lm) result = predict_instance(input="test input") assert result.output == "test output" for message in lm.get_convo(-1)[0]: print("----") - print(message["content"]) + print(textwrap.indent(message["content"], " ")) print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", "content": textwrap.dedent( """\ - """ + Your input fields are: + 1. `input` (str) + + Your output fields are: + 1. `output` (str) + + All interactions will be structured in the following way, with the appropriate values filled in. + + [[ ## input ## ]] + {input} + + [[ ## output ## ]] + {output} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the fields `input`, produce the fields `output`.""" ), }, { "role": "user", "content": textwrap.dedent( """\ - """ + [[ ## input ## ]] + test input + + Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" ), }, ] @@ -150,14 +170,16 @@ class Output(pydantic.BaseModel): def test_forward_method(): program = Predict("question -> answer") - dspy.settings.configure(lm=DummyLM([])) + dspy.settings.configure(lm=DummyLM(["[[ ## answer ## ]]\nNo more responses"])) result = program(question="What is 1+1?").answer assert result == "No more responses" def test_forward_method2(): program = Predict("question -> answer1, answer2") - dspy.settings.configure(lm=DummyLM(["my first answer", "my second answer"])) + dspy.settings.configure( + lm=DummyLM(["[[ ## answer1 ## ]]\nmy first answer\n\n[[ ## answer2 ## ]]\nmy second answer"]) + ) result = program(question="What is 1+1?") assert result.answer1 == "my first answer" assert result.answer2 == "my second answer" @@ -172,7 +194,7 @@ def test_config_management(): def test_multi_output(): program = Predict("question -> answer", n=2) - dspy.settings.configure(lm=DummyLM(["my first answer", "my second answer"])) + dspy.settings.configure(lm=DummyLM(["[[ ## answer ## ]]\nmy first answer", "[[ ## answer ## ]]\nmy second answer"])) results = program(question="What is 1+1?") assert results.completions.answer[0] == "my first answer" assert results.completions.answer[1] == "my second answer" @@ -183,8 +205,8 @@ def test_multi_output2(): dspy.settings.configure( lm=DummyLM( [ - "my 0 answer\nAnswer 2: my 2 answer", - "my 1 answer\nAnswer 2: my 3 answer", + "[[ ## answer1 ## ]]\nmy 0 answer\n\n[[ ## answer2 ## ]]\nmy 2 answer", + "[[ ## answer1 ## ]]\nmy 1 answer\n\n[[ ## answer2 ## ]]\nmy 3 answer", ], ) ) @@ -215,27 +237,43 @@ class OutputOnlySignature(dspy.Signature): predictor = Predict(OutputOnlySignature) - lm = DummyLM(["short answer"]) + lm = DummyLM(["[[ ## output ## ]]\nshort answer"]) dspy.settings.configure(lm=lm) assert predictor().output == "short answer" for message in lm.get_convo(-1)[0]: print("----") - print(message["content"]) + print(textwrap.indent(message["content"], " ")) print("----") 
assert lm.get_convo(-1)[0] == [ { "role": "system", "content": textwrap.dedent( """\ - """ + Your input fields are: + + + Your output fields are: + 1. `output` (str) + + All interactions will be structured in the following way, with the appropriate values filled in. + + + + [[ ## output ## ]] + {output} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the fields , produce the fields `output`.""" ), }, { "role": "user", - "content": textwrap.dedent( - """\ - """ + "content": ( + "Respond with the corresponding output fields, " + "starting with the field `output`, and then ending with the marker for `completed`." ), }, ] diff --git a/tests/predict/test_program_of_thought.py b/tests/predict/test_program_of_thought.py index 2980272f9a..5bb6547539 100644 --- a/tests/predict/test_program_of_thought.py +++ b/tests/predict/test_program_of_thought.py @@ -11,96 +11,194 @@ class BasicQA(Signature): def test_pot_code_generation(): - pot = ProgramOfThought(BasicQA) lm = DummyLM( [ - "Reason_A", - "```python\nresult = 1+1\n```", - "Reason_B", - "2", + "[[ ## reasoning ## ]]\nReason_A\n\n[[ ## generated_code ## ]]\n```python\nresult = 1+1\n```", + "[[ ## reasoning ## ]]\nReason_B\n\n[[ ## answer ## ]]\n2", ] ) dspy.settings.configure(lm=lm) + pot = ProgramOfThought(BasicQA) res = pot(question="What is 1+1?") assert res.answer == "2" - for message in lm.get_convo(-1)[0]: - print("----") - print(message["content"]) - print("----") + assert lm.get_convo(-1)[0] == [ { "role": "system", "content": textwrap.dedent( """\ - """ + Your input fields are: + 1. `question` (str) + 2. `final_generated_code` (str): python code that answers the question + 3. `code_output` (str): output of previously-generated python code + + Your output fields are: + 1. `reasoning` (str) + 2. `answer` (str): often between 1 and 5 words + + All interactions will be structured in the following way, with the appropriate values filled in. + + [[ ## question ## ]] + {question} + + [[ ## final_generated_code ## ]] + {final_generated_code} + + [[ ## code_output ## ]] + {code_output} + + [[ ## reasoning ## ]] + {reasoning} + + [[ ## answer ## ]] + {answer} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`.""" ), }, { "role": "user", "content": textwrap.dedent( """\ - """ + [[ ## question ## ]] + What is 1+1? 
+ + [[ ## final_generated_code ## ]] + result = 1+1 + + [[ ## code_output ## ]] + 2 + + Respond with the corresponding output fields, starting with the field `reasoning`, then `answer`, and then ending with the marker for `completed`.""" ), }, ] def test_pot_code_generation_with_error(): - pot = ProgramOfThought(BasicQA) lm = DummyLM( [ - "Reason_A", - "```python\nresult = 1+0/0\n```", - "Reason_B", # Error: division by zero - "```python\nresult = 1+1\n```", - "Reason_C", - "2", + "[[ ## reasoning ## ]]\nReason_A\n\n[[ ## generated_code ## ]]\n```python\nresult = 1+0/0\n```", + "[[ ## reasoning ## ]]\nReason_B\n\n[[ ## generated_code ## ]]\n```python\nresult = 1+1\n```", + "[[ ## reasoning ## ]]\nReason_C\n\n[[ ## answer ## ]]\n2", ] ) dspy.settings.configure(lm=lm) + + pot = ProgramOfThought(BasicQA) res = pot(question="What is 1+1?") assert res.answer == "2" # The first code example failed - for message in lm.get_convo(2)[0]: - print("----") - print(message["content"]) - print("----") - assert lm.get_convo(2)[0] == [ + assert lm.get_convo(1)[0] == [ { "role": "system", "content": textwrap.dedent( """\ - """ + Your input fields are: + 1. `question` (str) + 2. `previous_code` (str): previously-generated python code that errored + 3. `error` (str): error message from previously-generated python code + + Your output fields are: + 1. `reasoning` (str) + 2. `generated_code` (str): python code that answers the question + + All interactions will be structured in the following way, with the appropriate values filled in. + + [[ ## question ## ]] + {question} + + [[ ## previous_code ## ]] + {previous_code} + + [[ ## error ## ]] + {error} + + [[ ## reasoning ## ]] + {reasoning} + + [[ ## generated_code ## ]] + {generated_code} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + You are given `question`, `previous_code`, `error` due to an error in previous code. + Your task is to correct the error and provide the new `generated_code`.""" ), }, { "role": "user", "content": textwrap.dedent( """\ - """ + [[ ## question ## ]] + What is 1+1? + + [[ ## previous_code ## ]] + result = 1+0/0 + + [[ ## error ## ]] + division by zero + + Respond with the corresponding output fields, starting with the field `reasoning`, then `generated_code`, and then ending with the marker for `completed`.""" ), }, ] - - # The second code example succeeded - for message in lm.get_convo(-1)[0]: - print("----") - print(message["content"]) - print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", "content": textwrap.dedent( """\ - """ + Your input fields are: + 1. `question` (str) + 2. `final_generated_code` (str): python code that answers the question + 3. `code_output` (str): output of previously-generated python code + + Your output fields are: + 1. `reasoning` (str) + 2. `answer` (str): often between 1 and 5 words + + All interactions will be structured in the following way, with the appropriate values filled in. + + [[ ## question ## ]] + {question} + + [[ ## final_generated_code ## ]] + {final_generated_code} + + [[ ## code_output ## ]] + {code_output} + + [[ ## reasoning ## ]] + {reasoning} + + [[ ## answer ## ]] + {answer} + + [[ ## completed ## ]] + + In adhering to this structure, your objective is: + Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`.""" ), }, { "role": "user", "content": textwrap.dedent( """\ - """ + [[ ## question ## ]] + What is 1+1? 
+ + [[ ## final_generated_code ## ]] + result = 1+1 + + [[ ## code_output ## ]] + 2 + + Respond with the corresponding output fields, starting with the field `reasoning`, then `answer`, and then ending with the marker for `completed`.""" ), }, ] diff --git a/tests/predict/test_react.py b/tests/predict/test_react.py index 9b187471b4..8ee626aeb3 100644 --- a/tests/predict/test_react.py +++ b/tests/predict/test_react.py @@ -8,8 +8,7 @@ def test_example_no_tools(): # Createa a simple dataset which the model will use with the Retrieve tool. lm = dspy.utils.DummyLM( [ - "Initial thoughts", # Thought_1 - "Finish[blue]", # Action_1 + "[[ ## Thought_1 ## ]]\nInitial thoughts\n\n[[ ## Action_1 ## ]]\nFinish[blue]", ] ) dspy.settings.configure(lm=lm, rm=dummy_rm()) @@ -24,28 +23,13 @@ def test_example_no_tools(): result = program(question=question) assert result.answer == "blue" - # For debugging - print("---") - for row in lm.history: - print(row["prompt"]) - print("Response:", row["response"]["choices"][0]["text"]) - print("---") - - assert lm.get_convo(-1).endswith( - "Question: What is the color of the sky?\n" - "Thought 1: Initial thoughts\n" - "Action 1: Finish[blue]" - ) - def test_example_search(): # Createa a simple dataset which the model will use with the Retrieve tool. lm = dspy.utils.DummyLM( [ - "Initial thoughts", # Thought_1 - "Search[the color of the sky]", # Thought_1 - "More thoughts", # Thought_2 - "Finish[blue]", # Action_2 + "[[ ## Thought_1 ## ]]\nInitial thoughts\n\n[[ ## Action_1 ## ]]\nSearch[the color of the sky]", + "[[ ## Thought_2 ## ]]\nMore thoughts\n\n[[ ## Action_2 ## ]]\nFinish[blue]\n\n", ] ) rm = dummy_rm( @@ -72,21 +56,6 @@ def test_example_search(): result = program(question=question) assert result.answer == "blue" - # For debugging - print(lm.get_convo(-1)) - - assert lm.get_convo(-1).endswith( - "Question: What is the color of the sky?\n\n" - "Thought 1: Initial thoughts\n\n" - "Action 1: Search[the color of the sky]\n\n" - "Observation 1:\n" - "[1] «We all know the color of the sky is blue.»\n" - "[2] «Somethng about the sky colors»\n" - "[3] «This sentence is completely irellevant to answer the question.»\n\n" - "Thought 2: More thoughts\n\n" - "Action 2: Finish[blue]" - ) - class DummyTool1: name = "Tool1" @@ -122,12 +91,9 @@ def __call__(self, *args, **kwargs): def test_custom_tools(): lm = dspy.utils.DummyLM( [ - "Initial thoughts", - "Tool1[foo]", - "More thoughts", - "Tool2[bar]", - "Even more thoughts", - "Finish[baz]", + "[[ ## Thought_1 ## ]]\nInitial thoughts\n\n[[ ## Action_1 ## ]]\nTool1[foo]", + "[[ ## Thought_2 ## ]]\nMore thoughts\n\n[[ ## Action_2 ## ]]\nTool2[bar]", + "[[ ## Thought_3 ## ]]\nEven more thoughts\n\n[[ ## Action_3 ## ]]\nFinish[baz]", ] ) dspy.settings.configure(lm=lm) @@ -143,17 +109,6 @@ def test_custom_tools(): # each tool should be called only once assert tool1.num_calls == 1 assert tool2.num_calls == 1 - assert lm.get_convo(-1).endswith( - "Question: What is the color of the sky?\n\n" - "Thought 1: Initial thoughts\n\n" - "Action 1: Tool1[foo]\n\n" - "Observation 1: tool 1 output\n\n" - "Thought 2: More thoughts\n\n" - "Action 2: Tool2[bar]\n\n" - "Observation 2: tool 2 output\n\n" - "Thought 3: Even more thoughts\n\n" - "Action 3: Finish[baz]" - ) def test_signature_instructions(): diff --git a/tests/predict/test_retry.py b/tests/predict/test_retry.py index 5ceee8a627..16050e355d 100644 --- a/tests/predict/test_retry.py +++ b/tests/predict/test_retry.py @@ -14,7 +14,7 @@ def test_retry_simple(): assert f"past_{field}" in 
retry_module.new_signature.input_fields assert "feedback" in retry_module.new_signature.input_fields - lm = DummyLM(["blue"]) + lm = DummyLM(["[[ ## answer ## ]]\nblue"]) dspy.settings.configure(lm=lm) result = retry_module.forward( question="What color is the sky?", @@ -23,18 +23,10 @@ def test_retry_simple(): ) assert result.answer == "blue" - print(lm.get_convo(-1)) - assert lm.get_convo(-1).endswith( - "Question: What color is the sky?\n\n" - "Previous Answer: red\n\n" - "Instructions: Try harder\n\n" - "Answer: blue" - ) - def test_retry_forward_with_feedback(): # First we make a mistake, then we fix it - lm = DummyLM(["red", "blue"]) + lm = DummyLM(["[[ ## answer ## ]]\nred", "[[ ## answer ## ]]\nblue"]) dspy.settings.configure(lm=lm, trace=[]) class SimpleModule(dspy.Module): @@ -58,18 +50,10 @@ def forward(self, **kwargs): assert result.answer == "blue" - print(lm.get_convo(-1)) - assert lm.get_convo(-1).endswith( - "Question: What color is the sky?\n\n" - "Previous Answer: red\n\n" - "Instructions: Please think harder\n\n" - "Answer: blue" - ) - def test_retry_forward_with_typed_predictor(): # First we make a mistake, then we fix it - lm = DummyLM(['{"answer":"red"}', '{"answer":"blue"}']) + lm = DummyLM(['[[ ## output ## ]]\n{"answer":"red"}', '[[ ## output ## ]]\n{"answer":"blue"}']) dspy.settings.configure(lm=lm, trace=[]) class AnswerQuestion(dspy.Signature): @@ -103,9 +87,3 @@ def forward(self, **kwargs): result = program(question="What color is the sky?") assert result.answer == "blue" - assert lm.get_convo(-1).endswith( - 'Input: {"question":"What color is the sky?"}\n\n' - 'Previous Output: {"answer":"red"}\n\n' - 'Instructions: Please think harder\n\n' - 'Output: {"answer":"blue"}' - ) diff --git a/tests/signatures/test_signature.py b/tests/signatures/test_signature.py index 52f6f74777..292bafdf25 100644 --- a/tests/signatures/test_signature.py +++ b/tests/signatures/test_signature.py @@ -182,6 +182,9 @@ class SubSignature(Signature): def test_multiline_instructions(): + lm = DummyLM(["[[ ## output ## ]]\nshort answer"]) + dspy.settings.configure(lm=lm) + class MySignature(Signature): """First line Second line @@ -190,15 +193,8 @@ class MySignature(Signature): output = OutputField() predictor = dspy.Predict(MySignature) - - lm = DummyLM(["[[ ## output ## ]]\nshort answer"]) - dspy.settings.configure(lm=lm) assert predictor().output == "short answer" - for message in lm.get_convo(-1)[0]: - print("----") - print(message["content"]) - print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", @@ -219,7 +215,7 @@ class MySignature(Signature): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: First line Second line Third line""" diff --git a/tests/teleprompt/test_bootstrap.py b/tests/teleprompt/test_bootstrap.py index 05431a632e..ed3e614469 100644 --- a/tests/teleprompt/test_bootstrap.py +++ b/tests/teleprompt/test_bootstrap.py @@ -82,7 +82,7 @@ def test_bootstrap_effectiveness(): for message in lm.get_convo(-1)[0]: print("----") - print(message["content"]) + print(textwrap.indent(message["content"], " ")) print("----") assert lm.get_convo(-1)[0] == [ { @@ -105,7 +105,7 @@ def test_bootstrap_effectiveness(): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `input`, produce the fields `output`.""" ), }, diff --git a/tests/teleprompt/test_copro_optimizer.py b/tests/teleprompt/test_copro_optimizer.py index 
2618367cac..7f391f3515 100644 --- a/tests/teleprompt/test_copro_optimizer.py +++ b/tests/teleprompt/test_copro_optimizer.py @@ -123,7 +123,7 @@ def test_optimization_and_output_verification(): for message in lm.get_convo(-1)[0]: print("----") - print(message["content"]) + print(textwrap.indent(message["content"], " ")) print("----") assert lm.get_convo(-1)[0] == [ { @@ -150,7 +150,7 @@ def test_optimization_and_output_verification(): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Optimized Prompt""" ), }, diff --git a/tests/teleprompt/test_mipro_optimizer.py b/tests/teleprompt/test_mipro_optimizer.py deleted file mode 100644 index b586068f93..0000000000 --- a/tests/teleprompt/test_mipro_optimizer.py +++ /dev/null @@ -1,246 +0,0 @@ -import re -import textwrap - -import pytest - -import dspy -from dsp.modules import LM -from dspy import Example -from dspy.teleprompt.signature_opt_bayesian import MIPRO -from dspy.utils.dummies import DummyLM - - -# Define a simple metric function for testing -def simple_metric(example, prediction, trace=None): - # Simplified metric for testing: true if prediction matches expected output - return example.output == prediction.output - - -# Some example data -capitals = { - "Germany": "Berlin", - "France": "Paris", - "Denmark": "Copenhagen", - "Sweden": "Stockholm", - "Norway": "Oslo", -} -# Not used for training data -extra_capitals = { - "Spain": "Madrid", - "Portugal": "Lisbon", - "Italy": "Rome", -} - -# Example training and validation sets -trainset = [ - Example(input="What is the color of the sky?", output="blue").with_inputs("input"), - Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), -] + [ - Example(input=f"What is the capital of {country}?", output=capital).with_inputs("input") - for country, capital in capitals.items() -] - - -class ConditionalLM(LM): - def __init__(self): - super().__init__("conditional-lm") - - def basic_request(self, prompt, num_candidates=1, **kwargs): - # If we are in the "optimization" stage, we don't say much. - if prompt.endswith("Observations:"): - answer = " (*silence*)" - elif prompt.endswith("Proposed Instruction:"): - answer = " Input: " - elif prompt.endswith("Proposed Prefix For Output Field:"): - answer = " Output: " - elif prompt.endswith("Summary:"): - answer = " summarizing..." - else: - pairs = re.findall(r"Input: (.*?)\n(?:Reasoning:.*?\n)?Output: (.*?)\n", prompt, re.DOTALL) - - # breakpoint() - print("PROMPT:", prompt) - print("PAIRS:", pairs) - - last = re.search(r"Input: (.*)\nReasoning: (.*)$", prompt) - current_question = last.group(1) - - if match := re.match(r"What is the capital of (.*?)\?", current_question): - country = match.group(1) - # If we had a previous example of a question about a capital, the model - # has learned the format, and will answer with question correctly. - if any("capital" in question for question, _ in pairs): - answer = (capitals | extra_capitals)[country] - # Otherwise, it is confused and will answer with the country's name. - else: - answer = country - # For other questions, the model will answer with the last word of the question. 
- else: - answer = current_question.split()[-1] - - answer = "think deeply.\nOutput: " + answer - - RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" - print("=== DummyLM ===") - print(prompt, end="") - print(f"{RED}{answer}{RESET}") - print("===") - - dummy_response = {"choices": []} - for _ in range(num_candidates): - dummy_response["choices"].append( - { - "text": answer, - "finish_reason": "done", - } - ) - - # Simulate processing and storing the request and response. - history_entry = { - "prompt": prompt, - "response": dummy_response, - "kwargs": kwargs, - "raw_kwargs": kwargs, - } - self.history.append(history_entry) - - return dummy_response - - def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs): - response = self.basic_request(prompt, **kwargs) - return [choice["text"] for choice in response["choices"]] - - def get_convo(self, index): - """get the prompt + anwer from the ith message""" - return self.history[index]["prompt"] + " " + self.history[index]["response"]["choices"][0]["text"] - - -def test_bayesian_signature_optimizer_initialization(): - optimizer = MIPRO(metric=simple_metric, num_candidates=10, init_temperature=1.4, verbose=True, track_stats=True) - assert optimizer.metric == simple_metric, "Metric not correctly initialized" - assert optimizer.num_candidates == 10, "Incorrect 'num_candidates' parameter initialization" - assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" - assert optimizer.verbose is True, "Verbose flag not correctly initialized" - assert optimizer.track_stats is True, "Track stats flag not correctly initialized" - - -class SimpleModule(dspy.Module): - def __init__(self, signature): - super().__init__() - # SignatureOptimizer doesn't work with dspy.Predict - self.predictor = dspy.ChainOfThought(signature) - - def forward(self, **kwargs): - return self.predictor(**kwargs) - - -def test_signature_optimizer_optimization_process(): - lm = ConditionalLM() - dspy.settings.configure(lm=lm) - - student = SimpleModule(signature="input -> output") - - optimizer = MIPRO( - metric=simple_metric, - num_candidates=10, - init_temperature=1.4, - verbose=False, - track_stats=False, - ) - - # Adjustments: Include required parameters for the compile method - optimized_student = optimizer.compile( - student=student, - trainset=trainset, - num_trials=10, - max_bootstrapped_demos=3, - max_labeled_demos=5, - eval_kwargs={"num_threads": 1, "display_progress": False}, - requires_permission_to_run=False, - ) - - assert len(optimized_student.predictor.demos) == 5 - - -def test_signature_optimizer_bad_lm(): - dspy.settings.configure(lm=DummyLM([f"Optimized instruction {i}" for i in range(30)])) - student = SimpleModule(signature="input -> output") - optimizer = MIPRO( - metric=simple_metric, - num_candidates=10, - init_temperature=1.4, - verbose=False, - track_stats=False, - ) - - # Krista: when the code tries to generate bootstrapped examples, the examples are generated using DummyLM, - # which only outputs "Optimized instruction i" this means that none of the bootstrapped examples are successful, - # and therefore the set of examples that we're using to generate new prompts is empty - with pytest.raises(ValueError): - _optimized_student = optimizer.compile( - student=student, - trainset=trainset, - num_trials=10, - max_bootstrapped_demos=3, - max_labeled_demos=5, - eval_kwargs={"num_threads": 1, "display_progress": False}, - requires_permission_to_run=False, - ) - - -def 
test_optimization_and_output_verification(): - # Make a language model that is always right, except on the last - # example in the train set. - lm = ConditionalLM() - dspy.settings.configure(lm=lm) - - optimizer = MIPRO( - metric=simple_metric, - num_candidates=10, - init_temperature=1.4, - verbose=False, - track_stats=True, - ) - - student = SimpleModule("input -> output") - - # Compile the student with the optimizer - optimized_student = optimizer.compile( - student=student, - trainset=trainset, - num_trials=4, - max_bootstrapped_demos=2, - max_labeled_demos=3, - eval_kwargs={"num_threads": 1, "display_progress": False}, - requires_permission_to_run=False, - ) - - # Simulate calling the optimized student with a new input - test_input = "What is the capital of Spain?" - prediction = optimized_student(input=test_input) - - print("CORRECT ANSWER") - print(lm.get_convo(-1)) - - assert prediction.output == "Madrid" - - for message in lm.get_convo(-1)[0]: - print("----") - print(message["content"]) - print("----") - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - """ - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - """ - ), - }, - ] From 789b7a1d1a371504cec2e92d32661265d9b8be59 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sun, 6 Oct 2024 06:06:07 +0000 Subject: [PATCH 08/17] feat(dspy): revert changes to MIPRO tests --- .../dsp_LM/teleprompt/test_mipro_optimizer.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/dsp_LM/teleprompt/test_mipro_optimizer.py b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py index 695dc28073..9ed1b594fd 100644 --- a/tests/dsp_LM/teleprompt/test_mipro_optimizer.py +++ b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py @@ -115,6 +115,15 @@ def get_convo(self, index): return self.history[index]["prompt"] + " " + self.history[index]["response"]["choices"][0]["text"] +def test_bayesian_signature_optimizer_initialization(): + optimizer = MIPRO(metric=simple_metric, num_candidates=10, init_temperature=1.4, verbose=True, track_stats=True) + assert optimizer.metric == simple_metric, "Metric not correctly initialized" + assert optimizer.num_candidates == 10, "Incorrect 'num_candidates' parameter initialization" + assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" + assert optimizer.verbose is True, "Verbose flag not correctly initialized" + assert optimizer.track_stats is True, "Track stats flag not correctly initialized" + + class SimpleModule(dspy.Module): def __init__(self, signature): super().__init__() @@ -210,4 +219,45 @@ def test_optimization_and_output_verification(): test_input = "What is the capital of Spain?" prediction = optimized_student(input=test_input) + print("CORRECT ANSWER") + print(lm.get_convo(-1)) + assert prediction.output == "Madrid" + + expected_lm_output = textwrap.dedent( + """\ + Input: + + --- + + Follow the following format. + + Input: ${input} + Reasoning: Let's think step by step in order to ${produce the output}. We ... + Output: ${output} + + --- + + Input: What is the capital of France? + Reasoning: Let's think step by step in order to think deeply. + Output: Paris + + --- + + Input: What is the capital of Norway? + Reasoning: Let's think step by step in order to think deeply. + Output: Oslo + + --- + + Input: What does the fox say? + Output: Ring-ding-ding-ding-dingeringeding! + + --- + + Input: What is the capital of Spain? + Reasoning: Let's think step by step in order to think deeply. 
+ Output: Madrid""" + ) + + assert lm.get_convo(-1) == expected_lm_output From 03d8f2b2acaf0c9d80da0ff01ba913e12b8e3dc5 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sun, 6 Oct 2024 06:15:52 +0000 Subject: [PATCH 09/17] feat(dspy): clear settings between tests --- dsp/utils/settings.py | 42 ++++++++++++++++++++++-------------------- tests/conftest.py | 13 +++++++++++++ 2 files changed, 35 insertions(+), 20 deletions(-) create mode 100644 tests/conftest.py diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index df073c7075..ea02059e48 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -3,6 +3,27 @@ from dsp.utils.utils import dotdict +DEFAULT_CONFIG = dotdict( + lm=None, + adapter=None, + rm=None, + branch_idx=0, + reranker=None, + compiled_lm=None, + force_reuse_cached_compilation=False, + compiling=False, + skip_logprobs=False, + trace=[], + release=0, + bypass_assert=False, + bypass_suggest=False, + assert_failures=0, + suggest_failures=0, + langchain_history=[], + experimental=False, + backoff_time=10, +) + class Settings: """DSP configuration settings.""" @@ -25,26 +46,7 @@ def __new__(cls): # TODO: remove first-class support for re-ranker and potentially combine with RM to form a pipeline of sorts # eg: RetrieveThenRerankPipeline(RetrievalModel, Reranker) # downstream operations like dsp.retrieve would use configs from the defined pipeline. - config = dotdict( - lm=None, - adapter=None, - rm=None, - branch_idx=0, - reranker=None, - compiled_lm=None, - force_reuse_cached_compilation=False, - compiling=False, # TODO: can probably be removed - skip_logprobs=False, - trace=[], - release=0, - bypass_assert=False, - bypass_suggest=False, - assert_failures=0, - suggest_failures=0, - langchain_history=[], - experimental=False, - backoff_time = 10 - ) + config = DEFAULT_CONFIG cls._instance.__append(config) return cls._instance diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000000..0bb74ac90e --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,13 @@ +import pytest + +import dspy +from dsp.utils.settings import DEFAULT_CONFIG + + +@pytest.fixture(autouse=True) +def clear_settings(): + """Ensures that the settings are cleared after each test.""" + + yield + + dspy.settings.configure(**DEFAULT_CONFIG, inherit_config=False) From 843d63bd8eadd153a6ddc2a7f5cd74f01f8dff42 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sun, 6 Oct 2024 06:22:42 +0000 Subject: [PATCH 10/17] feat(dspy): rm print --- tests/predict/test_chain_of_thought.py | 4 ---- tests/predict/test_chain_of_thought_with_hint.py | 4 ---- tests/predict/test_predict.py | 8 -------- tests/teleprompt/test_bootstrap.py | 4 ---- tests/teleprompt/test_copro_optimizer.py | 4 ---- 5 files changed, 24 deletions(-) diff --git a/tests/predict/test_chain_of_thought.py b/tests/predict/test_chain_of_thought.py index 51fd88f4ee..bdc9122c7e 100644 --- a/tests/predict/test_chain_of_thought.py +++ b/tests/predict/test_chain_of_thought.py @@ -15,10 +15,6 @@ def test_initialization_with_string_signature(): ] assert predict(question="What is 1+1?").answer == "2" - for message in lm.get_convo(-1)[0]: - print("----") - print(textwrap.indent(message["content"], " ")) - print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", diff --git a/tests/predict/test_chain_of_thought_with_hint.py b/tests/predict/test_chain_of_thought_with_hint.py index 4ae6128176..3fff5d2cb2 100644 --- a/tests/predict/test_chain_of_thought_with_hint.py +++ b/tests/predict/test_chain_of_thought_with_hint.py @@ -74,10 
+74,6 @@ def test_cot_with_hint(): ] assert predict(question="What is 1+1?", hint="think small").answer == "2" - for message in lm.get_convo(-1)[0]: - print("----") - print(textwrap.indent(message["content"], " ")) - print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", diff --git a/tests/predict/test_predict.py b/tests/predict/test_predict.py index 8ccb1970d5..cf499474ee 100644 --- a/tests/predict/test_predict.py +++ b/tests/predict/test_predict.py @@ -46,10 +46,6 @@ def test_call_method(): result = predict_instance(input="test input") assert result.output == "test output" - for message in lm.get_convo(-1)[0]: - print("----") - print(textwrap.indent(message["content"], " ")) - print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", @@ -241,10 +237,6 @@ class OutputOnlySignature(dspy.Signature): dspy.settings.configure(lm=lm) assert predictor().output == "short answer" - for message in lm.get_convo(-1)[0]: - print("----") - print(textwrap.indent(message["content"], " ")) - print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", diff --git a/tests/teleprompt/test_bootstrap.py b/tests/teleprompt/test_bootstrap.py index ed3e614469..ebf23f3b48 100644 --- a/tests/teleprompt/test_bootstrap.py +++ b/tests/teleprompt/test_bootstrap.py @@ -80,10 +80,6 @@ def test_bootstrap_effectiveness(): prediction = compiled_student(input=trainset[0].input) assert prediction.output == trainset[0].output - for message in lm.get_convo(-1)[0]: - print("----") - print(textwrap.indent(message["content"], " ")) - print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", diff --git a/tests/teleprompt/test_copro_optimizer.py b/tests/teleprompt/test_copro_optimizer.py index 7f391f3515..301106dae8 100644 --- a/tests/teleprompt/test_copro_optimizer.py +++ b/tests/teleprompt/test_copro_optimizer.py @@ -121,10 +121,6 @@ def test_optimization_and_output_verification(): assert prediction.output == "Paris" - for message in lm.get_convo(-1)[0]: - print("----") - print(textwrap.indent(message["content"], " ")) - print("----") assert lm.get_convo(-1)[0] == [ { "role": "system", From 6d7c968bd11b9bc94ecef45dc7dff8e407975390 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sun, 6 Oct 2024 06:38:47 +0000 Subject: [PATCH 11/17] feat(dspy): handle case where type origin doesn't have a __name__ attr --- dspy/adapters/chat_adapter.py | 97 +++++++++++++++++------------ tests/functional/test_functional.py | 22 +++---- 2 files changed, 67 insertions(+), 52 deletions(-) diff --git a/dspy/adapters/chat_adapter.py b/dspy/adapters/chat_adapter.py index e2c3abeba3..261c15667c 100644 --- a/dspy/adapters/chat_adapter.py +++ b/dspy/adapters/chat_adapter.py @@ -1,13 +1,14 @@ -import re import ast import json +import re import textwrap +from typing import get_args, get_origin from pydantic import TypeAdapter + from .base import Adapter -from typing import get_origin, get_args -field_header_pattern = re.compile(r'\[\[ ## (\w+) ## \]\]') +field_header_pattern = re.compile(r"\[\[ ## (\w+) ## \]\]") class ChatAdapter(Adapter): @@ -20,9 +21,11 @@ def format(self, signature, demos, inputs): # Extract demos where some of the output_fields are not filled in. 
incomplete_demos = [demo for demo in demos if not all(k in demo for k in signature.fields)] complete_demos = [demo for demo in demos if demo not in incomplete_demos] - incomplete_demos = [demo for demo in incomplete_demos \ - if any(k in demo for k in signature.input_fields) and \ - any(k in demo for k in signature.output_fields)] + incomplete_demos = [ + demo + for demo in incomplete_demos + if any(k in demo for k in signature.input_fields) and any(k in demo for k in signature.output_fields) + ] demos = incomplete_demos + complete_demos @@ -31,20 +34,22 @@ def format(self, signature, demos, inputs): for demo in demos: messages.append(format_turn(signature, demo, role="user", incomplete=demo in incomplete_demos)) messages.append(format_turn(signature, demo, role="assistant", incomplete=demo in incomplete_demos)) - + messages.append(format_turn(signature, inputs, role="user")) return messages - + def parse(self, signature, completion, _parse_values=True): sections = [(None, [])] for line in completion.splitlines(): match = field_header_pattern.match(line.strip()) - if match: sections.append((match.group(1), [])) - else: sections[-1][1].append(line) + if match: + sections.append((match.group(1), [])) + else: + sections[-1][1].append(line) - sections = [(k, '\n'.join(v).strip()) for k, v in sections] + sections = [(k, "\n".join(v).strip()) for k, v in sections] fields = {} for k, v in sections: @@ -52,23 +57,29 @@ def parse(self, signature, completion, _parse_values=True): try: fields[k] = parse_value(v, signature.output_fields[k].annotation) if _parse_values else v except Exception as e: - raise ValueError(f"Error parsing field {k}: {e}.\n\n\t\tOn attempting to parse the value\n```\n{v}\n```") + raise ValueError( + f"Error parsing field {k}: {e}.\n\n\t\tOn attempting to parse the value\n```\n{v}\n```" + ) if fields.keys() != signature.output_fields.keys(): raise ValueError(f"Expected {signature.output_fields.keys()} but got {fields.keys()}") return fields + def format_blob(blob): - if '\n' not in blob and "«" not in blob and "»" not in blob: return f"«{blob}»" + if "\n" not in blob and "«" not in blob and "»" not in blob: + return f"«{blob}»" - modified_blob = blob.replace('\n', '\n ') + modified_blob = blob.replace("\n", "\n ") return f"«««\n {modified_blob}\n»»»" def format_list(items): - if len(items) == 0: return "N/A" - if len(items) == 1: return format_blob(items[0]) + if len(items) == 0: + return "N/A" + if len(items) == 1: + return format_blob(items[0]) return "\n".join([f"[{idx+1}] {format_blob(txt)}" for idx, txt in enumerate(items)]) @@ -79,21 +90,25 @@ def format_fields(fields): v = v if not isinstance(v, list) else format_list(v) output.append(f"[[ ## {k} ## ]]\n{v}") - return '\n\n'.join(output).strip() - + return "\n\n".join(output).strip() + def parse_value(value, annotation): - if annotation is str: return str(value) + if annotation is str: + return str(value) parsed_value = value if isinstance(value, str): - try: parsed_value = json.loads(value) + try: + parsed_value = json.loads(value) except json.JSONDecodeError: - try: parsed_value = ast.literal_eval(value) - except (ValueError, SyntaxError): parsed_value = value + try: + parsed_value = ast.literal_eval(value) + except (ValueError, SyntaxError): + parsed_value = value return TypeAdapter(annotation).validate_python(parsed_value) -def format_turn(signature, values, role, incomplete=False): +def format_turn(signature, values, role, incomplete=False): content = [] if role == "user": @@ -101,42 +116,46 @@ def 
format_turn(signature, values, role, incomplete=False): if incomplete: content.append("This is an example of the task, though some input or output fields are not supplied.") else: - field_names, values = list(signature.output_fields.keys()) + ['completed'], {**values, 'completed': ''} + field_names, values = list(signature.output_fields.keys()) + ["completed"], {**values, "completed": ""} if not incomplete: if not set(values).issuperset(set(field_names)): raise ValueError(f"Expected {field_names} but got {values.keys()}") - + content.append(format_fields({k: values.get(k, "Not supplied for this particular example.") for k in field_names})) if role == "user": - content.append("Respond with the corresponding output fields, starting with the field " + - ", then ".join(f"`{f}`" for f in signature.output_fields) + - ", and then ending with the marker for `completed`.") + content.append( + "Respond with the corresponding output fields, starting with the field " + + ", then ".join(f"`{f}`" for f in signature.output_fields) + + ", and then ending with the marker for `completed`." + ) - return {"role": role, "content": '\n\n'.join(content).strip()} + return {"role": role, "content": "\n\n".join(content).strip()} def get_annotation_name(annotation): origin = get_origin(annotation) args = get_args(annotation) if origin is None: - if hasattr(annotation, '__name__'): + if hasattr(annotation, "__name__"): return annotation.__name__ else: return str(annotation) else: - args_str = ', '.join(get_annotation_name(arg) for arg in args) - return f"{origin.__name__}[{args_str}]" + args_str = ", ".join(get_annotation_name(arg) for arg in args) + return f"{get_annotation_name(origin)}[{args_str}]" + def enumerate_fields(fields): parts = [] for idx, (k, v) in enumerate(fields.items()): parts.append(f"{idx+1}. `{k}`") parts[-1] += f" ({get_annotation_name(v.annotation)})" - parts[-1] += f": {v.json_schema_extra['desc']}" if v.json_schema_extra['desc'] != f'${{{k}}}' else '' + parts[-1] += f": {v.json_schema_extra['desc']}" if v.json_schema_extra["desc"] != f"${{{k}}}" else "" + + return "\n".join(parts).strip() - return '\n'.join(parts).strip() def prepare_instructions(signature): parts = [] @@ -144,12 +163,12 @@ def prepare_instructions(signature): parts.append("Your output fields are:\n" + enumerate_fields(signature.output_fields)) parts.append("All interactions will be structured in the following way, with the appropriate values filled in.") - parts.append(format_fields({f : f"{{{f}}}" for f in signature.input_fields})) - parts.append(format_fields({f : f"{{{f}}}" for f in signature.output_fields})) - parts.append(format_fields({'completed' : ""})) + parts.append(format_fields({f: f"{{{f}}}" for f in signature.input_fields})) + parts.append(format_fields({f: f"{{{f}}}" for f in signature.output_fields})) + parts.append(format_fields({"completed": ""})) instructions = textwrap.dedent(signature.instructions) - objective = ('\n' + ' ' * 8).join([''] + instructions.splitlines()) + objective = ("\n" + " " * 8).join([""] + instructions.splitlines()) parts.append(f"In adhering to this structure, your objective is: {objective}") # parts.append("You will receive some input fields in each interaction. 
" + @@ -157,4 +176,4 @@ def prepare_instructions(signature): # ", then ".join(f"`{f}`" for f in signature.output_fields) + # ", and then ending with the marker for `completed`.") - return '\n\n'.join(parts).strip() + return "\n\n".join(parts).strip() diff --git a/tests/functional/test_functional.py b/tests/functional/test_functional.py index 01bc9bc531..0341bdede7 100644 --- a/tests/functional/test_functional.py +++ b/tests/functional/test_functional.py @@ -435,7 +435,7 @@ def flight_information(email: str) -> TravelInformation: [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `email`, produce the fields `flight_information`.""" ), # noqa }, @@ -508,7 +508,7 @@ def get_user_details() -> UserDetails: [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Make a very succinct json object that validates with the following schema""" ), # noqa }, @@ -590,8 +590,7 @@ def test_parse_type_string(): def test_literal(): - pytest.skip("This test is not working as Literal type does not have a __name__ attribute") - lm = DummyLM(['[[ ## output ## ]]\n"2"', '[[ ## output ## ]]\n"3"']) + lm = DummyLM(['[[ ## f ## ]]\n"2"', '[[ ## f ## ]]\n"3"']) dspy.settings.configure(lm=lm) @predictor @@ -602,8 +601,7 @@ def f() -> Literal["2", "3"]: def test_literal_missmatch(): - pytest.skip("This test is not working as Literal type does not have a __name__ attribute") - lm = DummyLM([f'[[ ## output ## ]]\n"{i}"' for i in range(5, 100)]) + lm = DummyLM([f'[[ ## f ## ]]\n"{i}"' for i in range(5, 100)]) dspy.settings.configure(lm=lm) @predictor(max_retries=1) @@ -617,8 +615,7 @@ def f() -> Literal["2", "3"]: def test_literal_int(): - pytest.skip("This test is not working as Literal type does not have a __name__ attribute") - lm = DummyLM(["[[ ## output ## ]]\n2", "[[ ## output ## ]]\n3"]) + lm = DummyLM(["[[ ## f ## ]]\n2", "[[ ## f ## ]]\n3"]) dspy.settings.configure(lm=lm) @predictor @@ -629,8 +626,7 @@ def f() -> Literal[2, 3]: def test_literal_int_missmatch(): - pytest.skip("This test is not working as Literal type does not have a __name__ attribute") - lm = DummyLM([f"[[ ## output ## ]]\n{i}" for i in range(5, 100)]) + lm = DummyLM([f"[[ ## f ## ]]\n{i}" for i in range(5, 100)]) dspy.settings.configure(lm=lm) @predictor(max_retries=1) @@ -753,7 +749,7 @@ class ScoredSignature(dspy.Signature): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `attempted_signatures`, produce the fields `proposed_signature`.""" ), # noqa }, @@ -821,7 +817,7 @@ class QuestionSignature(dspy.Signature): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `topic`, produce the fields `question`.""" ), # noqa }, @@ -976,7 +972,7 @@ def test_demos(): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `input`, produce the fields `output`.""" ), # noqa }, From 5221ab3081974f542a64d4f923793752e6e56ed9 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sun, 6 Oct 2024 06:48:47 +0000 Subject: [PATCH 12/17] feat(dspy): whitespace --- tests/functional/test_functional.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/functional/test_functional.py 
b/tests/functional/test_functional.py index 0341bdede7..1f0242763a 100644 --- a/tests/functional/test_functional.py +++ b/tests/functional/test_functional.py @@ -435,7 +435,7 @@ def flight_information(email: str) -> TravelInformation: [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `email`, produce the fields `flight_information`.""" ), # noqa }, @@ -508,7 +508,7 @@ def get_user_details() -> UserDetails: [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Make a very succinct json object that validates with the following schema""" ), # noqa }, @@ -749,7 +749,7 @@ class ScoredSignature(dspy.Signature): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `attempted_signatures`, produce the fields `proposed_signature`.""" ), # noqa }, @@ -817,7 +817,7 @@ class QuestionSignature(dspy.Signature): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `topic`, produce the fields `question`.""" ), # noqa }, @@ -972,7 +972,7 @@ def test_demos(): [[ ## completed ## ]] - In adhering to this structure, your objective is: + In adhering to this structure, your objective is: Given the fields `input`, produce the fields `output`.""" ), # noqa }, From dafd8ff8f5fa3ce5747fc0de485cb6ce1a165a3d Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sun, 6 Oct 2024 19:31:29 +0100 Subject: [PATCH 13/17] deepcopy(DEFAULT_CONFIG) to avoid reference --- dsp/utils/settings.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index ea02059e48..66fb4b1ab5 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -1,4 +1,5 @@ -import threading +import threading\ +from copy import deepcopy from contextlib import contextmanager from dsp.utils.utils import dotdict @@ -46,8 +47,9 @@ def __new__(cls): # TODO: remove first-class support for re-ranker and potentially combine with RM to form a pipeline of sorts # eg: RetrieveThenRerankPipeline(RetrievalModel, Reranker) # downstream operations like dsp.retrieve would use configs from the defined pipeline. 
- config = DEFAULT_CONFIG - cls._instance.__append(config) + + # make a deepcopy of the default config to avoid modifying the default config + cls._instance.__append(deepcopy(DEFAULT_CONFIG)) return cls._instance From 5e03bc4fb996fba10a1aad84d8e09a58c538338e Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Sun, 6 Oct 2024 19:34:29 +0100 Subject: [PATCH 14/17] deepcopy(DEFAULT_CONFIG) to avoid reference --- dsp/utils/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index 66fb4b1ab5..61fc37ae7e 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -1,4 +1,4 @@ -import threading\ +import threading from copy import deepcopy from contextlib import contextmanager From 53b24c84c736013fd88f02312d629d33df856cd2 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Mon, 7 Oct 2024 11:17:23 +0000 Subject: [PATCH 15/17] feat(dspy): remove files where DSPDummyLM not used --- dsp/modules/dummy_lm.py | 2 +- tests/dsp_LM/evaluate/test_metrics.py | 36 ---- tests/dsp_LM/examples/test_baleen.py | 122 -------------- tests/dsp_LM/functional/test_functional.py | 2 +- .../dsp_LM/functional/test_signature_typed.py | 155 ------------------ tests/dsp_LM/modules/test_aws_models.py | 70 -------- .../dsp_LM/modules/test_cloudflare_models.py | 61 ------- tests/dsp_LM/modules/test_hf_model.py | 31 ---- .../modules/vectorizer/test_fastembed.py | 43 ----- tests/dsp_LM/predict/test_aggregation.py | 43 ----- tests/dsp_LM/predict/test_knn.py | 51 ------ tests/dsp_LM/predict/test_langchain.py | 55 ------- tests/dsp_LM/primitives/test_example.py | 110 ------------- tests/dsp_LM/primitives/test_module.py | 49 ------ .../primitives/test_python_interpreter.py | 53 ------ .../retrieve/integration_test_pgvectorrm.py | 94 ----------- tests/dsp_LM/retrieve/test_llama_index_rm.py | 4 +- tests/dsp_LM/teleprompt/test_bootstrap.py | 4 +- tests/dsp_LM/teleprompt/test_ensemble.py | 59 ------- tests/dsp_LM/teleprompt/test_knn_fewshot.py | 4 +- .../dsp_LM/teleprompt/test_mipro_optimizer.py | 4 +- 21 files changed, 10 insertions(+), 1042 deletions(-) delete mode 100644 tests/dsp_LM/evaluate/test_metrics.py delete mode 100644 tests/dsp_LM/examples/test_baleen.py delete mode 100644 tests/dsp_LM/functional/test_signature_typed.py delete mode 100644 tests/dsp_LM/modules/test_aws_models.py delete mode 100644 tests/dsp_LM/modules/test_cloudflare_models.py delete mode 100644 tests/dsp_LM/modules/test_hf_model.py delete mode 100644 tests/dsp_LM/modules/vectorizer/test_fastembed.py delete mode 100644 tests/dsp_LM/predict/test_aggregation.py delete mode 100644 tests/dsp_LM/predict/test_knn.py delete mode 100644 tests/dsp_LM/predict/test_langchain.py delete mode 100644 tests/dsp_LM/primitives/test_example.py delete mode 100644 tests/dsp_LM/primitives/test_module.py delete mode 100644 tests/dsp_LM/primitives/test_python_interpreter.py delete mode 100644 tests/dsp_LM/retrieve/integration_test_pgvectorrm.py delete mode 100644 tests/dsp_LM/teleprompt/test_ensemble.py diff --git a/dsp/modules/dummy_lm.py b/dsp/modules/dummy_lm.py index 0509004c72..49f35fa72a 100644 --- a/dsp/modules/dummy_lm.py +++ b/dsp/modules/dummy_lm.py @@ -5,7 +5,7 @@ # This testing module was moved in PR #735 to patch Arize Phoenix logging -class DspDummyLM(LM): +class DSPDummyLM(LM): """Dummy language model for unit testing purposes.""" def __init__(self, answers: Union[list[str], dict[str, str]], follow_examples: bool = False): diff --git a/tests/dsp_LM/evaluate/test_metrics.py 
b/tests/dsp_LM/evaluate/test_metrics.py deleted file mode 100644 index 04e32e68ca..0000000000 --- a/tests/dsp_LM/evaluate/test_metrics.py +++ /dev/null @@ -1,36 +0,0 @@ -# FILEPATH: /Users/ahle/repos/dspy/tests/evaluate/test_metrics.py - -import dsp -import dspy -from dspy.evaluate.metrics import answer_exact_match -from dspy.predict import Predict - - -def test_answer_exact_match_string(): - example = dspy.Example( - question="What is 1+1?", - answer="2", - ).with_inputs("question") - pred = Predict("question -> answer") - pred.answer = "2" - assert answer_exact_match(example, pred) - - -def test_answer_exact_match_list(): - example = dspy.Example( - question="What is 1+1?", - answer=["2", "two"], - ).with_inputs("question") - pred = Predict("question -> answer") - pred.answer = "2" - assert answer_exact_match(example, pred) - - -def test_answer_exact_match_no_match(): - example = dspy.Example( - question="What is 1+1?", - answer="2", - ).with_inputs("question") - pred = Predict("question -> answer") - pred.answer = "3" - assert not answer_exact_match(example, pred) diff --git a/tests/dsp_LM/examples/test_baleen.py b/tests/dsp_LM/examples/test_baleen.py deleted file mode 100644 index f0b8042699..0000000000 --- a/tests/dsp_LM/examples/test_baleen.py +++ /dev/null @@ -1,122 +0,0 @@ -import pytest - -import dspy -import dspy.evaluate -from dsp.utils import deduplicate -from dspy.datasets import HotPotQA -from dspy.evaluate.evaluate import Evaluate -from dspy.teleprompt.bootstrap import BootstrapFewShot - - -class GenerateAnswer(dspy.Signature): - """Answer questions with short factoid answers.""" - - context = dspy.InputField(desc="may contain relevant facts") - question = dspy.InputField() - answer = dspy.OutputField(desc="often between 1 and 5 words") - - -class GenerateSearchQuery(dspy.Signature): - """Write a simple search query that will help answer a complex question.""" - - context = dspy.InputField(desc="may contain relevant facts") - question = dspy.InputField() - query = dspy.OutputField() - - -class SimplifiedBaleen(dspy.Module): - def __init__(self, passages_per_hop=3, max_hops=2): - super().__init__() - - self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)] - self.retrieve = dspy.Retrieve(k=passages_per_hop) - self.generate_answer = dspy.ChainOfThought(GenerateAnswer) - self.max_hops = max_hops - - def forward(self, question): - context = [] - - for hop in range(self.max_hops): - query = self.generate_query[hop](context=context, question=question).query - passages = self.retrieve(query).passages - context = deduplicate(context + passages) - - pred = self.generate_answer(context=context, question=question) - return dspy.Prediction(context=context, answer=pred.answer) - - -def load_hotpotqa(): - # Load the dataset. - dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0) - # Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata. - trainset = [x.with_inputs("question") for x in dataset.train] - devset = [x.with_inputs("question") for x in dataset.dev] - return trainset, devset - - -# @pytest.mark.slow_test -# TODO: Find a way to make this test run without openai -def _test_baleen(): - lm = dspy.OpenAI(model="gpt-3.5-turbo") - rm = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts") - dspy.settings.configure(lm=lm, rm=rm) - - # Ask any question you like to this simple RAG program. 
- my_question = "How many storeys are in the castle that David Gregory inherited?" - - # Get the prediction. This contains `pred.context` and `pred.answer`. - uncompiled_baleen = SimplifiedBaleen() # uncompiled (i.e., zero-shot) program - pred = uncompiled_baleen(my_question) - - assert pred.answer == "five" - - -def validate_context_and_answer_and_hops(example, pred, trace=None): - if not dspy.evaluate.answer_exact_match(example, pred): - return False - if not dspy.evaluate.answer_passage_match(example, pred): - return False - - hops = [example.question] + [outputs.query for *_, outputs in trace if "query" in outputs] - - if max([len(h) for h in hops]) > 100: - return False - if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))): - return False - - return True - - -def gold_passages_retrieved(example, pred, trace=None): - gold_titles = set(map(dspy.evaluate.normalize_text, example["gold_titles"])) - found_titles = set(map(dspy.evaluate.normalize_text, [c.split(" | ")[0] for c in pred.context])) - - return gold_titles.issubset(found_titles) - - -# @pytest.mark.slow_test -# TODO: Find a way to make this test run without the slow hotpotqa dataset -def _test_compiled_baleen(): - trainset, devset = load_hotpotqa() - lm = dspy.OpenAI(model="gpt-3.5-turbo") - rm = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts") - dspy.settings.configure(lm=lm, rm=rm) - - uncompiled_baleen = SimplifiedBaleen() # uncompiled (i.e., zero-shot) program - - teleprompter = BootstrapFewShot(metric=validate_context_and_answer_and_hops) - compiled_baleen = teleprompter.compile( - SimplifiedBaleen(), - teacher=SimplifiedBaleen(passages_per_hop=2), - trainset=trainset, - ) - - evaluate_on_hotpotqa = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5) - uncompiled_baleen_retrieval_score = evaluate_on_hotpotqa( - uncompiled_baleen, metric=gold_passages_retrieved, display=False - ) - # assert uncompiled_baleen_retrieval_score / 100 == 18 / 50 - - compiled_baleen_retrieval_score = evaluate_on_hotpotqa(compiled_baleen, metric=gold_passages_retrieved) - # assert compiled_baleen_retrieval_score / 100 == 27 / 50 - assert uncompiled_baleen_retrieval_score < compiled_baleen_retrieval_score diff --git a/tests/dsp_LM/functional/test_functional.py b/tests/dsp_LM/functional/test_functional.py index bc844a82c2..31d0416b7a 100644 --- a/tests/dsp_LM/functional/test_functional.py +++ b/tests/dsp_LM/functional/test_functional.py @@ -202,7 +202,7 @@ def simple_metric(example, prediction, trace=None): assert demos[0].output == trainset[0].output # Test the compiled student's prediction. - # We are using a DspDummyLM with follow_examples=True, which means that + # We are using a DSPDummyLM with follow_examples=True, which means that # even though it would normally reply with "Ring-ding-ding-ding-dingeringeding!" # on the second output, if it seems an example that perfectly matches the # prompt, it will use that instead. That is why we expect "blue" here. 
diff --git a/tests/dsp_LM/functional/test_signature_typed.py b/tests/dsp_LM/functional/test_signature_typed.py deleted file mode 100644 index 6e875d851d..0000000000 --- a/tests/dsp_LM/functional/test_signature_typed.py +++ /dev/null @@ -1,155 +0,0 @@ -from typing import Any, Optional, Union - -import pydantic -import pytest - -import dspy -from dspy.functional import TypedPredictor -from dspy.signatures.signature import signature_to_template - - -def get_field_and_parser(signature: dspy.Signature) -> tuple[Any, Any]: - module = TypedPredictor(signature) - signature = module._prepare_signature() - assert "answer" in signature.fields, "'answer' not in signature.fields" - field = signature.fields.get("answer") - parser = field.json_schema_extra.get("parser") - return field, parser - - -class Mysubmodel(pydantic.BaseModel): - sub_floating: float - - -class MyModel(pydantic.BaseModel): - floating: float - string: str - boolean: bool - integer: int - optional: Optional[str] - sequence_of_strings: list[str] - union: Union[str, float] - submodel: Mysubmodel - optional_submodel: Optional[Mysubmodel] - optional_existing_submodule: Optional[Mysubmodel] - - -def build_model_instance() -> MyModel: - return MyModel( - floating=3.14, - string="foobar", - boolean=True, - integer=42, - optional=None, - sequence_of_strings=["foo", "bar"], - union=3.14, - submodel=Mysubmodel(sub_floating=42.42), - optional_submodel=None, - optional_existing_submodule=Mysubmodel(sub_floating=42.42), - ) - - -@pytest.mark.parametrize( - "test_type,serialized, expected", [(str, "foo", "foo"), (int, "42", 42), (float, "42.42", 42.42)] -) -def test_basic_types(test_type: type, serialized: str, expected: Any): - class MySignature(dspy.Signature): - question: str = dspy.InputField() - answer: test_type = dspy.OutputField() - - _, parser = get_field_and_parser(MySignature) - assert parser is test_type, "Parser is not correct for 'answer'" - assert parser(serialized) == expected, f"{test_type}({serialized})!= {expected}" - - -def test_boolean(): - class MySignature(dspy.Signature): - question: str = dspy.InputField() - answer: bool = dspy.OutputField() - - _, parser = get_field_and_parser(MySignature) - assert parser("true"), f"Parsing 'true' failed" - assert not parser("false"), f"Parsing 'false' failed" - - -@pytest.mark.parametrize( - "test_type,serialized, expected", - [(list[str], '["foo", "bar"]', ["foo", "bar"]), (tuple[int, float], "[42, 3.14]", (42, 3.14))], -) -def test_sequences(test_type: type, serialized: str, expected: Any): - class MySignature(dspy.Signature): - question: str = dspy.InputField() - answer: test_type = dspy.OutputField() - - _, parser = get_field_and_parser(MySignature) - - assert parser(serialized) == expected, f"Parsing {expected} failed" - - -@pytest.mark.parametrize( - "test_type,serialized, expected", - [ - (Optional[str], '"foobar"', "foobar"), - (Optional[str], "null", None), - (Union[str, float], "3.14", 3.14), - (Union[str, bool], "true", True), - ], -) -def test_unions(test_type: type, serialized: str, expected: Any): - class MySignature(dspy.Signature): - question: str = dspy.InputField() - answer: test_type = dspy.OutputField() - - _, parser = get_field_and_parser(MySignature) - - assert parser(serialized) == expected, f"Parsing {expected} failed" - - -def test_pydantic(): - class MySignature(dspy.Signature): - question: str = dspy.InputField() - answer: MyModel = dspy.OutputField() - - _, parser = get_field_and_parser(MySignature) - - instance = build_model_instance() - parsed_instance = 
parser(instance.model_dump_json()) - - assert parsed_instance == instance, f"{instance} != {parsed_instance}" - - -def test_optional_pydantic(): - class MySignature(dspy.Signature): - question: str = dspy.InputField() - answer: Optional[MyModel] = dspy.OutputField() - - _, parser = get_field_and_parser(MySignature) - - instance = build_model_instance() - parsed_instance = parser(instance.model_dump_json()) - assert parsed_instance == instance, f"{instance} != {parsed_instance}" - - # Check null case - parsed_instance = parser("null") - assert parsed_instance == None, "Optional[MyModel] should be None" - - -def test_dataclass(): - from dataclasses import dataclass - - @dataclass(frozen=True) - class MyDataclass: - string: str - number: int - floating: float - boolean: bool - - class MySignature(dspy.Signature): - question: str = dspy.InputField() - answer: MyDataclass = dspy.OutputField() - - _, parser = get_field_and_parser(MySignature) - - instance = MyDataclass("foobar", 42, 3.14, True) - parsed_instance = parser('{"string": "foobar", "number": 42, "floating": 3.14, "boolean": true}') - assert parsed_instance == instance, f"{instance} != {parsed_instance}" diff --git a/tests/dsp_LM/modules/test_aws_models.py b/tests/dsp_LM/modules/test_aws_models.py deleted file mode 100644 index b6e018b337..0000000000 --- a/tests/dsp_LM/modules/test_aws_models.py +++ /dev/null @@ -1,70 +0,0 @@ -"""Tests for AWS models. -Note: Requires configuration of your AWS credentials with the AWS CLI and creating sagemaker endpoints. -TODO: Create mock fixtures for pytest to remove the need for AWS credentials and endpoints. -""" - -import dsp -import dspy - - -def get_lm(lm_provider: str, model_path: str, **kwargs) -> dsp.modules.lm.LM: - """get the language model""" - # extract model vendor and name from model name - # Model path format is / - model_vendor = model_path.split("/")[0] - model_name = model_path.split("/")[1] - - if lm_provider == "Bedrock": - bedrock = dspy.Bedrock(region_name="us-west-2") - if model_vendor == "mistral": - return dspy.AWSMistral(bedrock, model_name, **kwargs) - elif model_vendor == "anthropic": - return dspy.AWSAnthropic(bedrock, model_name, **kwargs) - elif model_vendor == "meta": - return dspy.AWSMeta(bedrock, model_name, **kwargs) - else: - raise ValueError( - "Model vendor missing or unsupported: Model path format is /" - ) - elif lm_provider == "Sagemaker": - sagemaker = dspy.Sagemaker(region_name="us-west-2") - if model_vendor == "mistral": - return dspy.AWSMistral(sagemaker, model_name, **kwargs) - elif model_vendor == "meta": - return dspy.AWSMeta(sagemaker, model_name, **kwargs) - else: - raise ValueError( - "Model vendor missing or unsupported: Model path format is /" - ) - else: - raise ValueError(f"Unsupported model: {model_name}") - - -def run_tests(): - """Test the providers and models""" - # Configure your AWS credentials with the AWS CLI before running this script - provider_model_tuples = [ - ("Bedrock", "mistral/mistral.mixtral-8x7b-instruct-v0:1"), - ("Bedrock", "anthropic/anthropic.claude-3-haiku-20240307-v1:0"), - ("Bedrock", "anthropic/anthropic.claude-3-sonnet-20240229-v1:0"), - ("Bedrock", "meta/meta.llama2-70b-chat-v1"), - ("Bedrock", "meta/meta.llama3-8b-instruct-v1:0"), - ("Bedrock", "meta/meta.llama3-70b-instruct-v1:0"), - # ('Sagemaker', 'mistral/'), # REPLACE YOUR_ENDPOINT_NAME with your sagemaker endpoint - ] - - predict_func = dspy.Predict("question -> answer") - for provider, model_path in provider_model_tuples: - print(f"Provider: {provider}, Model: 
{model_path}") - lm = get_lm(provider, model_path) - with dspy.context(lm=lm): - question = "What is the capital of France?" - answer = predict_func(question=question).answer - print(f"Question: {question}\nAnswer: {answer}") - print("---------------------------------") - lm.inspect_history() - print("---------------------------------\n") - - -if __name__ == "__main__": - run_tests() diff --git a/tests/dsp_LM/modules/test_cloudflare_models.py b/tests/dsp_LM/modules/test_cloudflare_models.py deleted file mode 100644 index 188d2424e4..0000000000 --- a/tests/dsp_LM/modules/test_cloudflare_models.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Tests for Cloudflare models. -Note: Requires configuration of your Cloudflare account_id and api_key. -""" - -import dspy - -models = { - "@cf/qwen/qwen1.5-0.5b-chat": "https://huggingface.co/qwen/qwen1.5-0.5b-chat", - "@hf/meta-llama/meta-llama-3-8b-instruct": "https://llama.meta.com", - "@hf/nexusflow/starling-lm-7b-beta": "https://huggingface.co/Nexusflow/Starling-LM-7B-beta", - "@cf/meta/llama-3-8b-instruct": "https://llama.meta.com", - "@hf/thebloke/neural-chat-7b-v3-1-awq": "", - "@cf/meta/llama-2-7b-chat-fp16": "https://ai.meta.com/llama/", - "@cf/mistral/mistral-7b-instruct-v0.1": "https://mistral.ai/news/announcing-mistral-7b/", - "@cf/tinyllama/tinyllama-1.1b-chat-v1.0": "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "@hf/mistral/mistral-7b-instruct-v0.2": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2", - "@cf/fblgit/una-cybertron-7b-v2-bf16": "", - "@hf/thebloke/codellama-7b-instruct-awq": "https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-AWQ", - "@cf/thebloke/discolm-german-7b-v1-awq": "https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-AWQ", - "@cf/meta/llama-2-7b-chat-int8": "https://ai.meta.com/llama/", - "@hf/thebloke/mistral-7b-instruct-v0.1-awq": "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-AWQ", - "@hf/thebloke/openchat_3.5-awq": "", - "@cf/qwen/qwen1.5-7b-chat-awq": "https://huggingface.co/qwen/qwen1.5-7b-chat-awq", - "@hf/thebloke/llama-2-13b-chat-awq": "https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ", - "@hf/thebloke/deepseek-coder-6.7b-base-awq": "", - "@hf/thebloke/openhermes-2.5-mistral-7b-awq": "", - "@hf/thebloke/deepseek-coder-6.7b-instruct-awq": "", - "@cf/deepseek-ai/deepseek-math-7b-instruct": "https://huggingface.co/deepseek-ai/deepseek-math-7b-instruct", - "@cf/tiiuae/falcon-7b-instruct": "https://huggingface.co/tiiuae/falcon-7b-instruct", - "@hf/nousresearch/hermes-2-pro-mistral-7b": "https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B", - "@hf/thebloke/zephyr-7b-beta-awq": "https://huggingface.co/TheBloke/zephyr-7B-beta-AWQ", - "@cf/qwen/qwen1.5-1.8b-chat": "https://huggingface.co/qwen/qwen1.5-1.8b-chat", - "@cf/defog/sqlcoder-7b-2": "https://huggingface.co/defog/sqlcoder-7b-2", - "@cf/microsoft/phi-2": "https://huggingface.co/microsoft/phi-2", - "@hf/google/gemma-7b-it": "https://ai.google.dev/gemma/docs", -} - - -def get_lm(name: str): # -> dspy.LM: - return dspy.CloudflareAI(model=name) - - -def run_tests(): - """Test the providers and models""" - # Configure your AWS credentials with the AWS CLI before running this script - models - - predict_func = dspy.Predict("question -> answer") - for model_name in models.keys(): - lm = get_lm(model_name) - with dspy.context(lm=lm): - question = "What is the capital of France?" 
- answer = predict_func(question=question).answer - print(f"Question: {question}\nAnswer: {answer}") - print("---------------------------------") - lm.inspect_history() - print("---------------------------------\n") - - -if __name__ == "__main__": - run_tests() diff --git a/tests/dsp_LM/modules/test_hf_model.py b/tests/dsp_LM/modules/test_hf_model.py deleted file mode 100644 index 0b06c80429..0000000000 --- a/tests/dsp_LM/modules/test_hf_model.py +++ /dev/null @@ -1,31 +0,0 @@ -from pytest_mock.plugin import MockerFixture -from transformers import AutoModelForSeq2SeqLM - -import dspy - - -class MockConfig: - def __init__(self, architectures: list[str]): - self.architectures = architectures - - -def test_load_gated_model(mocker: MockerFixture): - conf = MockConfig(architectures=["ConditionalGeneration"]) - mocker.patch("transformers.AutoModelForSeq2SeqLM.from_pretrained") - mocker.patch("transformers.AutoConfig.from_pretrained", return_value=conf) - mocker.patch("transformers.AutoTokenizer.from_pretrained") - - some_token = "asdfasdfasdf" - model = "google/gemma-7b" - _ = dspy.HFModel(model, token=some_token) - AutoModelForSeq2SeqLM.from_pretrained.assert_called_with(model, device_map="auto", token=some_token) - - -def test_load_ungated_model(mocker: MockerFixture): - conf = MockConfig(architectures=["ConditionalGeneration"]) - mocker.patch("transformers.AutoModelForSeq2SeqLM.from_pretrained") - mocker.patch("transformers.AutoConfig.from_pretrained", return_value=conf) - mocker.patch("transformers.AutoTokenizer.from_pretrained") - _ = dspy.HFModel("openai-community/gpt2") - # no token used in automodel - AutoModelForSeq2SeqLM.from_pretrained.assert_called_with("openai-community/gpt2", device_map="auto", token=None) diff --git a/tests/dsp_LM/modules/vectorizer/test_fastembed.py b/tests/dsp_LM/modules/vectorizer/test_fastembed.py deleted file mode 100644 index e9a7335a4f..0000000000 --- a/tests/dsp_LM/modules/vectorizer/test_fastembed.py +++ /dev/null @@ -1,43 +0,0 @@ -import pytest - -from dsp.modules.sentence_vectorizer import FastEmbedVectorizer -from dspy.primitives.example import Example - -# Skip the test if the 'fastembed' package is not installed -pytest.importorskip("fastembed", reason="'fastembed' is not installed. Use `pip install fastembed` to install it.") - - -@pytest.mark.parametrize( - "n_dims,model_name", [(384, "BAAI/bge-small-en-v1.5"), (512, "jinaai/jina-embeddings-v2-small-en")] -) -def test_fastembed_with_examples(n_dims, model_name): - vectorizer = FastEmbedVectorizer(model_name) - - examples = [ - Example(query="What's the price today?", response="The price is $10.00").with_inputs("query", "response"), - Example(query="What's the weather today?", response="The weather is sunny").with_inputs("query", "response"), - Example(query="Who was leading the team?", response="It was Jim. 
Rather enthusiastic guy.").with_inputs( - "query", "response" - ), - ] - - embeddings = vectorizer(examples) - - assert embeddings.shape == (len(examples), n_dims) - - -@pytest.mark.parametrize( - "n_dims,model_name", [(384, "BAAI/bge-small-en-v1.5"), (512, "jinaai/jina-embeddings-v2-small-en")] -) -def test_fastembed_with_strings(n_dims, model_name): - vectorizer = FastEmbedVectorizer(model_name) - - inputs = [ - "Jonathan Kent is a fictional character appearing in American comic books published by DC Comics.", - "Clark Kent is a fictional character appearing in American comic books published by DC Comics.", - "Martha Kent is a fictional character appearing in American comic books published by DC Comics.", - ] - - embeddings = vectorizer(inputs) - - assert embeddings.shape == (len(inputs), n_dims) diff --git a/tests/dsp_LM/predict/test_aggregation.py b/tests/dsp_LM/predict/test_aggregation.py deleted file mode 100644 index eb1d975368..0000000000 --- a/tests/dsp_LM/predict/test_aggregation.py +++ /dev/null @@ -1,43 +0,0 @@ -from dsp.utils import normalize_text -from dspy.predict.aggregation import majority -from dspy.primitives.prediction import Completions, Prediction - - -def test_majority_with_prediction(): - prediction = Prediction.from_completions([{"answer": "2"}, {"answer": "2"}, {"answer": "3"}]) - result = majority(prediction) - assert result.completions[0]["answer"] == "2" - - -def test_majority_with_completions(): - completions = Completions([{"answer": "2"}, {"answer": "2"}, {"answer": "3"}]) - result = majority(completions) - assert result.completions[0]["answer"] == "2" - - -def test_majority_with_list(): - completions = [{"answer": "2"}, {"answer": "2"}, {"answer": "3"}] - result = majority(completions) - assert result.completions[0]["answer"] == "2" - - -def test_majority_with_normalize(): - completions = [{"answer": "2"}, {"answer": " 2"}, {"answer": "3"}] - result = majority(completions, normalize=normalize_text) - assert result.completions[0]["answer"] == "2" - - -def test_majority_with_field(): - completions = [ - {"answer": "2", "other": "1"}, - {"answer": "2", "other": "1"}, - {"answer": "3", "other": "2"}, - ] - result = majority(completions, field="other") - assert result.completions[0]["other"] == "1" - - -def test_majority_with_no_majority(): - completions = [{"answer": "2"}, {"answer": "3"}, {"answer": "4"}] - result = majority(completions) - assert result.completions[0]["answer"] == "2" # The first completion is returned in case of a tie diff --git a/tests/dsp_LM/predict/test_knn.py b/tests/dsp_LM/predict/test_knn.py deleted file mode 100644 index 7f35b42e4f..0000000000 --- a/tests/dsp_LM/predict/test_knn.py +++ /dev/null @@ -1,51 +0,0 @@ -import numpy as np -import pytest - -import dsp -import dspy -from dspy.predict import KNN -from dspy.utils import DummyVectorizer - - -def mock_example(question: str, answer: str) -> dsp.Example: - """Creates a mock DSP example with specified question and answer.""" - return dspy.Example(question=question, answer=answer).with_inputs("question") - - -@pytest.fixture -def setup_knn(): - """Sets up a KNN instance with a mocked vectorizer for testing.""" - dsp.SentenceTransformersVectorizer = DummyVectorizer - trainset = [ - mock_example("What is the capital of France?", "Paris"), - mock_example("What is the largest ocean?", "Pacific"), - mock_example("What is 2+2?", "4"), - ] - knn = KNN(k=2, trainset=trainset) - return knn - - -def test_knn_initialization(setup_knn): - """Tests the KNN initialization and checks if the trainset 
vectors are correctly created.""" - knn = setup_knn - assert knn.k == 2, "Incorrect k value" - assert len(knn.trainset_vectors) == 3, "Incorrect size of trainset vectors" - assert isinstance(knn.trainset_vectors, np.ndarray), "Trainset vectors should be a NumPy array" - - -def test_knn_query(setup_knn): - """Tests the KNN query functionality for retrieving the nearest neighbors.""" - knn = setup_knn - query = {"question": "What is 3+3?"} # A query close to "What is 2+2?" - nearest_samples = knn(**query) - assert len(nearest_samples) == 2, "Incorrect number of nearest samples returned" - assert nearest_samples[0].answer == "4", "Incorrect nearest sample returned" - - -def test_knn_query_specificity(setup_knn): - """Tests the KNN query functionality for specificity of returned examples.""" - knn = setup_knn - query = {"question": "What is the capital of Germany?"} # A query close to "What is the capital of France?" - nearest_samples = knn(**query) - assert len(nearest_samples) == 2, "Incorrect number of nearest samples returned" - assert "Paris" in [sample.answer for sample in nearest_samples], "Expected Paris to be a nearest sample answer" diff --git a/tests/dsp_LM/predict/test_langchain.py b/tests/dsp_LM/predict/test_langchain.py deleted file mode 100644 index 89fa1d75d9..0000000000 --- a/tests/dsp_LM/predict/test_langchain.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest - -pytest.importorskip("langchain") - -import os - -from langchain import hub -from langchain_chroma import Chroma -from langchain_community.embeddings import FakeEmbeddings -from langchain_core.documents import Document -from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnablePassthrough -from langchain_openai import ChatOpenAI -from langchain_text_splitters import RecursiveCharacterTextSplitter - -from dspy.predict.langchain import LangChainModule, LangChainPredict - - -def test_copying_module(): - os.environ["OPENAI_API_KEY"] = "fake-key" - llm = ChatOpenAI(model="gpt-4o-mini") - docs = [Document(page_content="Hello, world!", metadata={"source": "https://example.com"})] - - text_splitter = RecursiveCharacterTextSplitter(chunk_size=10, chunk_overlap=10) - splits = text_splitter.split_documents(docs) - vectorstore = Chroma.from_documents(documents=splits, embedding=FakeEmbeddings(size=5)) - - # Retrieve and generate using the relevant snippets of the blog. - retriever = vectorstore.as_retriever() - - prompt = hub.pull("rlm/rag-prompt") - - def format_docs(docs): - return "\n\n".join(doc.page_content for doc in docs) - - rag_chain = ( - {"context": retriever | format_docs, "question": RunnablePassthrough()} - | LangChainPredict(prompt, llm) - | StrOutputParser() - ) - # Now we wrap it in LangChainModule. - rag_dspy_module = LangChainModule(rag_chain) - - copied_module = rag_dspy_module.reset_copy() - assert len(copied_module.chain.steps) == len(rag_dspy_module.chain.steps) - for module, copied_module in zip(rag_dspy_module.chain.steps, copied_module.chain.steps): - if isinstance(module, LangChainPredict): - # The LangChainPredict modules are deep copied. - assert module != copied_module - assert module.langchain_llm.model_name == copied_module.langchain_llm.model_name - else: - # The rest of the modules are just copied by reference. - assert module == copied_module - # Clean up. 
- os.environ["OPENAI_API_KEY"] = None diff --git a/tests/dsp_LM/primitives/test_example.py b/tests/dsp_LM/primitives/test_example.py deleted file mode 100644 index 39ad0b749f..0000000000 --- a/tests/dsp_LM/primitives/test_example.py +++ /dev/null @@ -1,110 +0,0 @@ -import pytest - -from dspy import Example - - -def test_example_initialization(): - example = Example(a=1, b=2) - assert example.a == 1 - assert example.b == 2 - - -def test_example_initialization_from_base(): - base = Example(a=1, b=2) - example = Example(base=base, c=3) - assert example.a == 1 - assert example.b == 2 - assert example.c == 3 - - -def test_example_initialization_from_dict(): - base_dict = {"a": 1, "b": 2} - example = Example(base=base_dict, c=3) - assert example.a == 1 - assert example.b == 2 - assert example.c == 3 - - -def test_example_set_get_item(): - example = Example() - example["a"] = 1 - assert example["a"] == 1 - - -def test_example_attribute_access(): - example = Example(a=1) - assert example.a == 1 - example.a = 2 - assert example.a == 2 - - -def test_example_deletion(): - example = Example(a=1, b=2) - del example["a"] - with pytest.raises(AttributeError): - _ = example.a - - -def test_example_len(): - example = Example(a=1, b=2, dspy_hidden=3) - assert len(example) == 2 - - -def test_example_repr_str(): - example = Example(a=1) - assert repr(example) == "Example({'a': 1}) (input_keys=None)" - assert str(example) == "Example({'a': 1}) (input_keys=None)" - - -def test_example_eq(): - example1 = Example(a=1, b=2) - example2 = Example(a=1, b=2) - assert example1 == example2 - assert example1 != "" - - -def test_example_hash(): - example1 = Example(a=1, b=2) - example2 = Example(a=1, b=2) - assert hash(example1) == hash(example2) - - -def test_example_keys_values_items(): - example = Example(a=1, b=2, dspy_hidden=3) - assert set(example.keys()) == {"a", "b"} - assert 1 in example.values() - assert ("b", 2) in example.items() - - -def test_example_get(): - example = Example(a=1, b=2) - assert example.get("a") == 1 - assert example.get("c", "default") == "default" - - -def test_example_with_inputs(): - example = Example(a=1, b=2).with_inputs("a") - assert example._input_keys == {"a"} - - -def test_example_inputs_labels(): - example = Example(a=1, b=2).with_inputs("a") - inputs = example.inputs() - assert inputs.toDict() == {"a": 1} - labels = example.labels() - assert labels.toDict() == {"b": 2} - - -def test_example_copy_without(): - example = Example(a=1, b=2) - copied = example.copy(c=3) - assert copied.a == 1 - assert copied.c == 3 - without_a = copied.without("a") - with pytest.raises(AttributeError): - _ = without_a.a - - -def test_example_to_dict(): - example = Example(a=1, b=2) - assert example.toDict() == {"a": 1, "b": 2} diff --git a/tests/dsp_LM/primitives/test_module.py b/tests/dsp_LM/primitives/test_module.py deleted file mode 100644 index 4ba6df220b..0000000000 --- a/tests/dsp_LM/primitives/test_module.py +++ /dev/null @@ -1,49 +0,0 @@ -import threading - -import dspy - - -def test_deepcopy_basic(): - signature = dspy.Signature("q -> a") - cot = dspy.ChainOfThought(signature) - cot_copy = cot.deepcopy() - assert len(cot.parameters()) == len(cot_copy.parameters()) - # Parameters should be different objects with the same values. 
- assert id(cot.parameters()[0]) != id(cot_copy.parameters()[0]) - assert cot.parameters()[0].__dict__ == cot_copy.parameters()[0].__dict__ - - -def test_deepcopy_with_uncopyable_modules(): - class CustomClass(dspy.Module): - def __init__(self): - self.lock = threading.Lock() # Non-copyable object. - self.cot = dspy.ChainOfThought(dspy.Signature("q -> a")) - - model = CustomClass() - model_copy = model.deepcopy() - assert len(model.parameters()) == len(model_copy.parameters()) - # The lock should be refer to the same object (shallow copy). - assert id(model.lock) == id(model_copy.lock) - # Parameters should be different objects with the same values. - assert id(model.parameters()[0]) != id(model_copy.parameters()[0]) - assert model.parameters()[0].__dict__ == model_copy.parameters()[0].__dict__ - - -def test_deepcopy_with_nested_modules(): - class CustomClass1(dspy.Module): - def __init__(self): - self.lock = threading.Lock() # Non-copyable object. - self.cot = dspy.ChainOfThought(dspy.Signature("q -> a")) - - class CustomClass2(dspy.Module): - def __init__(self): - self.submodel = CustomClass1() - - model = CustomClass2() - model_copy = model.deepcopy() - assert len(model.parameters()) == len(model_copy.parameters()) - # The lock should be refer to the same object (shallow copy). - assert id(model.submodel.lock) == id(model_copy.submodel.lock) - # Parameters should be different objects with the same values. - assert id(model.parameters()[0]) != id(model_copy.parameters()[0]) - assert model.parameters()[0].__dict__ == model_copy.parameters()[0].__dict__ diff --git a/tests/dsp_LM/primitives/test_python_interpreter.py b/tests/dsp_LM/primitives/test_python_interpreter.py deleted file mode 100644 index a1c0713887..0000000000 --- a/tests/dsp_LM/primitives/test_python_interpreter.py +++ /dev/null @@ -1,53 +0,0 @@ -import pytest - -from dspy.primitives.python_interpreter import CodePrompt, PythonInterpreter, TextPrompt - - -def test_execute_simple_code(): - interpreter = PythonInterpreter(action_space={"print": print}) - code = "print('Hello, World!')" - result = interpreter.execute(code) - assert result is None, "Simple print statement should return None" - - -def test_action_space_limitation(): - def func(string): - pass - - interpreter = PythonInterpreter(action_space={}) - code = "func('This should not execute')" - with pytest.raises(Exception): - interpreter.execute(code) - - -def test_import_whitelist(): - interpreter = PythonInterpreter(action_space={}, import_white_list=["math"]) - code = "import math\nresult = math.sqrt(4)" - result = interpreter.execute(code) - assert result == 2, "Should be able to import and use math.sqrt" - - -def test_fuzzy_variable_matching(): - interpreter = PythonInterpreter(action_space={}) - code = "result = number + 1" - result = interpreter.execute(code, fuzz_state={"number": 4}) - assert result == 5, "Fuzzy variable matching should work" - - -def test_text_prompt_keyword_extraction(): - prompt = TextPrompt("Hello {name}, how are you?") - assert "name" in prompt.key_words, "Keyword 'name' should be extracted" - - -def test_text_prompt_formatting(): - prompt = TextPrompt("Hello {name}, how are you?") - formatted = prompt.format(name="Alice") - assert formatted == "Hello Alice, how are you?", "Should format with provided value" - - -def test_code_prompt_execution(): - action_space = {"len": len} - interpreter = PythonInterpreter(action_space=action_space) - code_prompt = CodePrompt("result = len('hello')") - result, _ = code_prompt.execute(interpreter) - assert 
result == 5, "Code execution should return the length of 'hello'" diff --git a/tests/dsp_LM/retrieve/integration_test_pgvectorrm.py b/tests/dsp_LM/retrieve/integration_test_pgvectorrm.py deleted file mode 100644 index 524209bea5..0000000000 --- a/tests/dsp_LM/retrieve/integration_test_pgvectorrm.py +++ /dev/null @@ -1,94 +0,0 @@ -"""Instructions: -Add to dev container features: - "ghcr.io/itsmechlark/features/postgresql:1": {}, - "ghcr.io/robbert229/devcontainer-features/postgresql-client:1": {} -Add to .personalization.sh: - sudo apt install -y postgresql-16-pgvector - - sudo /etc/init.d/postgresql restart - - psql -v ON_ERROR_STOP=1 --user ${PGUSER} < dict: "index": index, "retriever": retriever, "rm": rm, - "lm": DspDummyLM(answers=dummyset), + "lm": DSPDummyLM(answers=dummyset), "trainset": trainset, "devset": devset, } diff --git a/tests/dsp_LM/teleprompt/test_bootstrap.py b/tests/dsp_LM/teleprompt/test_bootstrap.py index 6bc41ec610..936daf8e4b 100644 --- a/tests/dsp_LM/teleprompt/test_bootstrap.py +++ b/tests/dsp_LM/teleprompt/test_bootstrap.py @@ -65,7 +65,7 @@ def test_bootstrap_effectiveness(): assert compiled_student.predictor.demos[0].output == trainset[0].output # Test the compiled student's prediction. - # We are using a DspDummyLM with follow_examples=True, which means that + # We are using a DSPDummyLM with follow_examples=True, which means that # even though it would normally reply with "Ring-ding-ding-ding-dingeringeding!" # on the second output, if it seems an example that perfectly matches the # prompt, it will use that instead. That is why we expect "blue" here. @@ -115,7 +115,7 @@ def forward(self, **kwargs): student = SimpleModule("input -> output") teacher = BuggyModule("input -> output") - # Setup DspDummyLM to simulate an error scenario + # Setup DSPDummyLM to simulate an error scenario lm = DSPDummyLM( [ "Initial thoughts", # Simulate initial teacher's prediction diff --git a/tests/dsp_LM/teleprompt/test_ensemble.py b/tests/dsp_LM/teleprompt/test_ensemble.py deleted file mode 100644 index c779e87823..0000000000 --- a/tests/dsp_LM/teleprompt/test_ensemble.py +++ /dev/null @@ -1,59 +0,0 @@ -import pytest - -import dspy -from dspy.teleprompt import Ensemble - - -class MockProgram(dspy.Module): - def __init__(self, output): - super().__init__() - self.output = output - - def forward(self, *args, **kwargs): - return self.output - - -# Simple reduction function to test with -def mock_reduce_fn(outputs): - return sum(outputs) / len(outputs) - - -def test_ensemble_without_reduction(): - """Test that Ensemble correctly combines outputs without applying a reduce_fn.""" - programs = [MockProgram(i) for i in range(5)] - ensemble = Ensemble() - ensembled_program = ensemble.compile(programs) - - outputs = ensembled_program() - assert len(outputs) == 5, "Ensemble did not combine the correct number of outputs" - - -def test_ensemble_with_reduction(): - """Test that Ensemble correctly applies a reduce_fn to combine outputs.""" - programs = [MockProgram(i) for i in range(5)] - ensemble = Ensemble(reduce_fn=mock_reduce_fn) - ensembled_program = ensemble.compile(programs) - - output = ensembled_program() - expected_output = sum(range(5)) / 5 - assert output == expected_output, "Ensemble did not correctly apply the reduce_fn" - - -def test_ensemble_with_size_limitation(): - """Test that specifying a size limits the number of programs used in the ensemble.""" - programs = [MockProgram(i) for i in range(10)] - ensemble_size = 3 - ensemble = Ensemble(size=ensemble_size) - 
ensembled_program = ensemble.compile(programs) - - outputs = ensembled_program() - assert len(outputs) == ensemble_size, "Ensemble did not respect the specified size limitation" - - -def test_ensemble_deterministic_behavior(): - """Verify that the Ensemble class raises an assertion for deterministic behavior.""" - with pytest.raises( - AssertionError, - match="TODO: Implement example hashing for deterministic ensemble.", - ): - Ensemble(deterministic=True) diff --git a/tests/dsp_LM/teleprompt/test_knn_fewshot.py b/tests/dsp_LM/teleprompt/test_knn_fewshot.py index 771e01d557..97c2dbbe3d 100644 --- a/tests/dsp_LM/teleprompt/test_knn_fewshot.py +++ b/tests/dsp_LM/teleprompt/test_knn_fewshot.py @@ -43,7 +43,7 @@ def _test_knn_few_shot_compile(setup_knn_few_shot): student = SimpleModule("input -> output") teacher = SimpleModule("input -> output") # Assuming teacher uses the same module type - # Setup DspDummyLM with a response for a query similar to one of the training examples + # Setup DSPDummyLM with a response for a query similar to one of the training examples lm = DSPDummyLM(["Madrid", "10"]) dspy.settings.configure(lm=lm) # Responses for the capital of Spain and the result of 5+5) @@ -60,6 +60,6 @@ def _test_knn_few_shot_compile(setup_knn_few_shot): print("CONVO") print(lm.get_convo(-1)) - # Validate that the output corresponds to one of the expected DspDummyLM responses + # Validate that the output corresponds to one of the expected DSPDummyLM responses # This assumes the compiled_student's forward method will execute the predictor with the given query assert output in ["Madrid", "10"], "The compiled student did not return the correct output based on the query" diff --git a/tests/dsp_LM/teleprompt/test_mipro_optimizer.py b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py index 9ed1b594fd..86d8c00d0d 100644 --- a/tests/dsp_LM/teleprompt/test_mipro_optimizer.py +++ b/tests/dsp_LM/teleprompt/test_mipro_optimizer.py @@ -81,7 +81,7 @@ def basic_request(self, prompt, num_candidates=1, **kwargs): answer = "think deeply.\nOutput: " + answer RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m" - print("=== DspDummyLM ===") + print("=== DSPDummyLM ===") print(prompt, end="") print(f"{RED}{answer}{RESET}") print("===") @@ -173,7 +173,7 @@ def test_signature_optimizer_bad_lm(): track_stats=False, ) - # Krista: when the code tries to generate bootstrapped examples, the examples are generated using DspDummyLM, + # Krista: when the code tries to generate bootstrapped examples, the examples are generated using DSPDummyLM, # which only outputs "Optimized instruction i" this means that none of the bootstrapped examples are successful, # and therefore the set of examples that we're using to generate new prompts is empty with pytest.raises(ValueError): From 738ea87a348ac284a325c160a7f455ba0cd2518b Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Mon, 7 Oct 2024 14:47:15 +0000 Subject: [PATCH 16/17] feat(dspy): update DummyLM to use ChatAdapter's format_field to format outputs --- dspy/utils/dummies.py | 11 +- tests/evaluate/test_evaluate.py | 16 +- tests/functional/test_functional.py | 383 +++--------------- tests/functional/test_signature_opt_typed.py | 14 +- tests/predict/test_chain_of_thought.py | 43 +- .../test_chain_of_thought_with_hint.py | 94 +---- tests/predict/test_multi_chain_comparison.py | 2 +- tests/predict/test_predict.py | 86 +--- tests/predict/test_program_of_thought.py | 176 +------- tests/predict/test_react.py | 12 +- tests/predict/test_retry.py | 12 +- tests/primitives/test_program.py 
| 4 +- tests/signatures/test_signature.py | 34 +- tests/teleprompt/test_bootstrap.py | 67 +-- tests/teleprompt/test_copro_optimizer.py | 74 +--- 15 files changed, 138 insertions(+), 890 deletions(-) diff --git a/dspy/utils/dummies.py b/dspy/utils/dummies.py index 3288facf5f..bdc51f0aa0 100644 --- a/dspy/utils/dummies.py +++ b/dspy/utils/dummies.py @@ -7,7 +7,7 @@ from dsp.modules import LM as DSPLM from dsp.utils.utils import dotdict -from dspy.adapters.chat_adapter import field_header_pattern +from dspy.adapters.chat_adapter import field_header_pattern, format_fields from dspy.clients.lm import LM @@ -98,7 +98,7 @@ def get_convo(self, index) -> str: class DummyLM(LM): - def __init__(self, answers: Union[list[str], dict[str, str]], follow_examples: bool = False): + def __init__(self, answers: Union[list[dict[str, str]], dict[str, dict[str, str]]], follow_examples: bool = False): super().__init__("dummy", "chat", 0.0, 1000, True) self.answers = answers if isinstance(answers, list): @@ -133,10 +133,13 @@ def __call__(self, prompt=None, messages=None, **kwargs): outputs.append(self._use_example(messages)) elif isinstance(self.answers, dict): outputs.append( - next((v for k, v in self.answers.items() if k in messages[-1]["content"]), "No more responses") + next( + (format_fields(v) for k, v in self.answers.items() if k in messages[-1]["content"]), + "No more responses", + ) ) else: - outputs.append(next(self.answers, "No more responses")) + outputs.append(format_fields(next(self.answers, {"answer": "No more responses"}))) # Logging, with removed api key & where `cost` is None on cache hit. kwargs = {k: v for k, v in kwargs.items() if not k.startswith("api_")} diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py index 01c3e5739b..a8552c7aa6 100644 --- a/tests/evaluate/test_evaluate.py +++ b/tests/evaluate/test_evaluate.py @@ -37,8 +37,8 @@ def test_evaluate_call(): dspy.settings.configure( lm=DummyLM( { - "What is 1+1?": "[[ ## answer ## ]]\n2", - "What is 2+2?": "[[ ## answer ## ]]\n4", + "What is 1+1?": {"answer": "2"}, + "What is 2+2?": {"answer": "4"}, } ) ) @@ -55,9 +55,7 @@ def test_evaluate_call(): def test_multithread_evaluate_call(): - dspy.settings.configure( - lm=DummyLM({"What is 1+1?": "[[ ## answer ## ]]\n2", "What is 2+2?": "[[ ## answer ## ]]\n4"}) - ) + dspy.settings.configure(lm=DummyLM({"What is 1+1?": {"answer": "2"}, "What is 2+2?": {"answer": "4"}})) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") assert program(question="What is 1+1?").answer == "2" @@ -80,9 +78,7 @@ def __call__(self, *args, **kwargs): time.sleep(1) return super().__call__(*args, **kwargs) - dspy.settings.configure( - lm=SlowLM({"What is 1+1?": "[[ ## answer ## ]]\n2", "What is 2+2?": "[[ ## answer ## ]]\n4"}) - ) + dspy.settings.configure(lm=SlowLM({"What is 1+1?": {"answer": "2"}, "What is 2+2?": {"answer": "4"}})) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") @@ -112,9 +108,7 @@ def sleep_then_interrupt(): def test_evaluate_call_bad(): - dspy.settings.configure( - lm=DummyLM({"What is 1+1?": "[[ ## answer ## ]]\n0", "What is 2+2?": "[[ ## answer ## ]]\n0"}) - ) + dspy.settings.configure(lm=DummyLM({"What is 1+1?": {"answer": "0"}, "What is 2+2?": {"answer": "0"}})) devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] program = Predict("question -> answer") ev = Evaluate( diff --git 
a/tests/functional/test_functional.py b/tests/functional/test_functional.py index 1f0242763a..61ac8c15a1 100644 --- a/tests/functional/test_functional.py +++ b/tests/functional/test_functional.py @@ -21,7 +21,7 @@ def hard_question(topic: str) -> str: """Think of a hard factual question about a topic.""" expected = "What is the speed of light?" - lm = DummyLM([f"[[ ## hard_question ## ]]\n{expected}"]) + lm = DummyLM([{"hard_question": expected}]) dspy.settings.configure(lm=lm) question = hard_question(topic="Physics") @@ -36,7 +36,7 @@ def hard_questions(topics: List[str]) -> List[str]: pass expected = ["What is the speed of light?", "What is the speed of sound?"] - lm = DummyLM(['[[ ## hard_questions ## ]]\n["What is the speed of light?", "What is the speed of sound?"]']) + lm = DummyLM([{"hard_questions": '["What is the speed of light?", "What is the speed of sound?"]'}]) dspy.settings.configure(lm=lm) question = hard_questions(topics=["Physics", "Music"]) @@ -54,7 +54,7 @@ def hard_question(topic: str) -> Question: """Think of a hard factual question about a topic.""" expected = "What is the speed of light?" - lm = DummyLM([f'[[ ## hard_question ## ]]\n{{"value": "{expected}"}}']) + lm = DummyLM([{"hard_question": f'{{"value": "{expected}"}}'}]) dspy.settings.configure(lm=lm) question = hard_question(topic="Physics") @@ -75,7 +75,7 @@ def answer(question: Question) -> Answer: pass question = Question(value="What is the speed of light?") - lm = DummyLM([f'[[ ## answer ## ]]\n{{"value": "3e8"}}']) + lm = DummyLM([{"answer": '{"value": "3e8"}'}]) dspy.settings.configure(lm=lm) result = answer(question=question) @@ -110,10 +110,10 @@ def forward(self, **kwargs): lm = DummyLM( [ - "[[ ## hard_question ## ]]\nWhat is the speed of light?", - "[[ ## reasoning ## ]]\nSome bad reasoning, 3e8 m/s.\n\n[[ ## answer ## ]]\n3e8", # Bad answer 1 - "[[ ## json_object ## ]]\n{...}", # Model is asked to create an example - f"[[ ## reasoning ## ]]\nSome good reasoning, 3e8 m/s.\n\n[[ ## answer ## ]]\n{expected.model_dump_json()}", # Good answer + {"hard_question": "What is the speed of light?"}, + {"reasoning": "Some bad reasoning, 3e8 m/s.", "answer": "3e8"}, # Bad answer 1 + {"json_object": "{...}"}, # Model is asked to create an example + {"reasoning": "Some good reasoning, 3e8 m/s.", "answer": f"{expected.model_dump_json()}"}, # Good answer ] ) dspy.settings.configure(lm=lm) @@ -143,7 +143,7 @@ class MySignature(dspy.Signature): expected = "What is the speed of light?" lm = DummyLM( [ - f"[[ ## output ## ]]\n{Question(value=expected).model_dump_json()}", + {"output": f"{Question(value=expected).model_dump_json()}"}, ] ) dspy.settings.configure(lm=lm) @@ -242,24 +242,6 @@ def simple_metric(example, prediction, trace=None): prediction = compiled_student(input=trainset[0].input) assert prediction == trainset[0].output - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": "Your input fields are:\n1. `input` (str)\n\nYour output fields are:\n1. 
`output` (str)\n\nAll interactions will be structured in the following way, with the appropriate values filled in.\n\n[[ ## input ## ]]\n{input}\n\n[[ ## output ## ]]\n{output}\n\n[[ ## completed ## ]]\n\nIn adhering to this structure, your objective is: \n Given the fields `input`, produce the fields `output`.", - }, - { - "role": "user", - "content": "[[ ## input ## ]]\nWhat is the color of the sky?\n\nRespond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.", - }, - {"role": "assistant", "content": "[[ ## output ## ]]\nblue\n\n[[ ## completed ## ]]"}, - { - "role": "user", - "content": "[[ ## input ## ]]\nWhat is the color of the sky?\n\nRespond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.", - }, - ] - - assert lm.get_convo(-1)[1] == ["[[ ## output ## ]]\nblue\n\n[[ ## completed ## ]]"] - def test_regex(): class TravelInformation(BaseModel): @@ -281,11 +263,11 @@ def flight_information(email: str) -> TravelInformation: lm = DummyLM( [ # Example with a bad origin code. - '[[ ## flight_information ## ]]\n{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + {"flight_information": '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}'}, # Example to help the model understand - "[[ ## json_object ## ]]\n{...}", + {"json_object": "{...}"}, # Fixed - '[[ ## flight_information ## ]]\n{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + {"flight_information": '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}'}, ] ) dspy.settings.configure(lm=lm) @@ -341,11 +323,12 @@ def flight_information(email: str) -> TravelInformation: [ # Example with a bad origin code. ( - "[[ ## flight_information ## ]]\nHere is your json: " - "{" - '"origin": {"code":"JFK", "lat":40.6446, "lon":-73.7797}, ' - '"destination": {"code":"LAX", "lat":33.942791, "lon":-118.410042}, ' - '"date": "2022-12-25"}' + { + "flight_information": "Here is your json: {" + '"origin": {"code":"JFK", "lat":40.6446, "lon":-73.7797}, ' + '"destination": {"code":"LAX", "lat":33.942791, "lon":-118.410042}, ' + '"date": "2022-12-25"}' + } ), ] ) @@ -370,8 +353,8 @@ def flight_information(email: str) -> TravelInformation: lm = DummyLM( [ - '[[ ## flight_information ## ]]\n{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', - '[[ ## flight_information ## ]]\n{"origin": "JFK", "destination": "LAX", "date": "bad date"}', + {"flight_information": '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}'}, + {"flight_information": '{"origin": "JFK", "destination": "LAX", "date": "bad date"}'}, ] ) dspy.settings.configure(lm=lm) @@ -393,11 +376,11 @@ def flight_information(email: str) -> TravelInformation: lm = DummyLM( [ # First origin is wrong, then destination, then all is good - '[[ ## flight_information ## ]]\n{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', - "[[ ## json_object ## ]]\n{...}", # Example to help the model understand - '[[ ## flight_information ## ]]\n{"origin": "JFK", "destination": "LA0", "date": "2022-12-25"}', - "[[ ## json_object ## ]]\n{...}", # Example to help the model understand - '[[ ## flight_information ## ]]\n{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + {"flight_information": '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}'}, + {"json_object": "{...}"}, # Example to help the model understand + {"flight_information": '{"origin": "JFK", "destination": "LA0", 
"date": "2022-12-25"}'}, + {"json_object": "{...}"}, # Example to help the model understand + {"flight_information": '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}'}, ] ) dspy.settings.configure(lm=lm) @@ -406,57 +389,6 @@ def flight_information(email: str) -> TravelInformation: origin="JFK", destination="LAX", date=datetime.date(2022, 12, 25) ) - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `email` (str) - 2. `error_flight_information_0` (str): An error to avoid in the future - 3. `error_flight_information_1` (str): An error to avoid in the future - - Your output fields are: - 1. `flight_information` (TravelInformation): ${flight_information}. Respond with a single JSON object. JSON Schema: {"properties": {"origin": {"pattern": "^[A-Z]{3}$", "title": "Origin", "type": "string"}, "destination": {"pattern": "^[A-Z]{3}$", "title": "Destination", "type": "string"}, "date": {"format": "date", "title": "Date", "type": "string"}}, "required": ["origin", "destination", "date"], "title": "TravelInformation", "type": "object"} - - All interactions will be structured in the following way, with the appropriate values filled in. - - [[ ## email ## ]] - {email} - - [[ ## error_flight_information_0 ## ]] - {error_flight_information_0} - - [[ ## error_flight_information_1 ## ]] - {error_flight_information_1} - - [[ ## flight_information ## ]] - {flight_information} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields `email`, produce the fields `flight_information`.""" - ), # noqa - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## email ## ]] - Some email - - [[ ## error_flight_information_0 ## ]] - String should match pattern '^[A-Z]{3}$': origin (error type: string_pattern_mismatch) - - [[ ## error_flight_information_1 ## ]] - String should match pattern '^[A-Z]{3}$': destination (error type: string_pattern_mismatch) - - Respond with the corresponding output fields, starting with the field `flight_information`, and then ending with the marker for `completed`.""" - ), - }, - ] - def test_field_validator(): class UserDetails(BaseModel): @@ -476,54 +408,12 @@ def get_user_details() -> UserDetails: # Keep making the mistake (lower case name) until we run # out of retries. - lm = DummyLM( - [ - '[[ ## get_user_details ## ]]\n{"name": "lower case name", "age": 25}', - ] - * 10 - ) + lm = DummyLM([{"get_user_details": '{"name": "lower case name", "age": 25}'}] * 10) dspy.settings.configure(lm=lm) with pytest.raises(ValueError): get_user_details() - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `json_schema` (str) - - Your output fields are: - 1. `json_object` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. 
- - [[ ## json_schema ## ]] - {json_schema} - - [[ ## json_object ## ]] - {json_object} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Make a very succinct json object that validates with the following schema""" - ), # noqa - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## json_schema ## ]] - {"properties": {"name": {"title": "Name", "type": "string"}, "age": {"title": "Age", "type": "integer"}}, "required": ["name", "age"], "title": "UserDetails", "type": "object"} - - Respond with the corresponding output fields, starting with the field `json_object`, and then ending with the marker for `completed`.""" - ), - }, - ] - def test_annotated_field(): @predictor @@ -531,7 +421,7 @@ def test(input: Annotated[str, Field(description="description")]) -> Annotated[f pass # First try 0, which fails, then try 0.5, which passes - lm = DummyLM(["[[ ## test ## ]]\n0", "[[ ## test ## ]]\n0.5"]) + lm = DummyLM([{"test": "0"}, {"test": "0.5"}]) dspy.settings.configure(lm=lm) output = test(input="input") @@ -540,7 +430,7 @@ def test(input: Annotated[str, Field(description="description")]) -> Annotated[f def test_multiple_outputs(): - lm = DummyLM([f"[[ ## output ## ]]\n{i}" for i in range(100)]) + lm = DummyLM([{"output": f"{i}"} for i in range(100)]) dspy.settings.configure(lm=lm) test = TypedPredictor("input -> output") @@ -549,7 +439,7 @@ def test_multiple_outputs(): def test_multiple_outputs_int(): - lm = DummyLM([f"[[ ## output ## ]]\n{i}" for i in range(100)]) + lm = DummyLM([{"output": f"{i}"} for i in range(100)]) dspy.settings.configure(lm=lm) class TestSignature(dspy.Signature): @@ -566,9 +456,9 @@ def test_multiple_outputs_int_cot(): # Note: Multiple outputs only work when the language model "speculatively" generates all the outputs in one go. 
lm = DummyLM( [ - "[[ ## reasoning ## ]]\nthoughts 0\n\n[[ ## output ## ]]\n0\n", - "[[ ## reasoning ## ]]\nthoughts 1\n\n[[ ## output ## ]]\n1\n", - "[[ ## reasoning ## ]]\nthoughts 2\n\n[[ ## output ## ]]\n2\n", + {"reasoning": "thoughts 0", "output": "0"}, + {"reasoning": "thoughts 1", "output": "1"}, + {"reasoning": "thoughts 2", "output": "2"}, ] ) dspy.settings.configure(lm=lm) @@ -580,7 +470,7 @@ def test_multiple_outputs_int_cot(): def test_parse_type_string(): - lm = DummyLM([f"[[ ## output ## ]]\n{i}" for i in range(100)]) + lm = DummyLM([{"output": f"{i}"} for i in range(100)]) dspy.settings.configure(lm=lm) test = TypedPredictor("input:int -> output:int") @@ -590,7 +480,7 @@ def test_parse_type_string(): def test_literal(): - lm = DummyLM(['[[ ## f ## ]]\n"2"', '[[ ## f ## ]]\n"3"']) + lm = DummyLM([{"f": '"2"'}, {"f": '"3"'}]) dspy.settings.configure(lm=lm) @predictor @@ -601,7 +491,7 @@ def f() -> Literal["2", "3"]: def test_literal_missmatch(): - lm = DummyLM([f'[[ ## f ## ]]\n"{i}"' for i in range(5, 100)]) + lm = DummyLM([{"f": f"{i}"} for i in range(5, 100)]) dspy.settings.configure(lm=lm) @predictor(max_retries=1) @@ -615,7 +505,7 @@ def f() -> Literal["2", "3"]: def test_literal_int(): - lm = DummyLM(["[[ ## f ## ]]\n2", "[[ ## f ## ]]\n3"]) + lm = DummyLM([{"f": "2"}, {"f": "3"}]) dspy.settings.configure(lm=lm) @predictor @@ -626,7 +516,7 @@ def f() -> Literal[2, 3]: def test_literal_int_missmatch(): - lm = DummyLM([f"[[ ## f ## ]]\n{i}" for i in range(5, 100)]) + lm = DummyLM([{"f": f"{i}"} for i in range(5, 100)]) dspy.settings.configure(lm=lm) @predictor(max_retries=1) @@ -645,8 +535,8 @@ class SimpleOutput(dspy.Signature): lm = DummyLM( [ - "[[ ## output ## ]]\n2.1", # Bad output - "[[ ## output ## ]]\n0.5", # Good output + {"output": "2.1"}, # Bad output + {"output": "0.5"}, # Good output ] ) dspy.settings.configure(lm=lm) @@ -668,12 +558,12 @@ class ExampleSignature(dspy.Signature): lm = DummyLM( [ - '[[ ## fact ## ]]\n{"fact": "The sky is blue", "varacity": true}', - '[[ ## fact ## ]]\n{"fact": "The sky is green", "varacity": false}', - '[[ ## fact ## ]]\n{"fact": "The sky is red", "varacity": true}', - '[[ ## fact ## ]]\n{"fact": "The earth is flat", "varacity": false}', - '[[ ## fact ## ]]\n{"fact": "The earth is round", "varacity": true}', - '[[ ## fact ## ]]\n{"fact": "The earth is a cube", "varacity": false}', + {"fact": '{"fact": "The sky is blue", "varacity": true}'}, + {"fact": '{"fact": "The sky is green", "varacity": false}'}, + {"fact": '{"fact": "The sky is red", "varacity": true}'}, + {"fact": '{"fact": "The earth is flat", "varacity": false}'}, + {"fact": '{"fact": "The earth is round", "varacity": true}'}, + {"fact": '{"fact": "The earth is a cube", "varacity": false}'}, ] ) dspy.settings.configure(lm=lm) @@ -709,7 +599,7 @@ class ScoredSignature(dspy.Signature): program = TypedChainOfThought(ScoredSignature) - lm = DummyLM(["[[ ## reasoning ## ]]\nThoughts\n\n[[ ## proposed_signature ## ]]\nOutput"]) + lm = DummyLM([{"reasoning": "Thoughts", "proposed_signature": "Output"}]) dspy.settings.configure(lm=lm) output = program( @@ -724,49 +614,6 @@ class ScoredSignature(dspy.Signature): assert output == "Output" - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `attempted_signatures` (list[ScoredString]) - - Your output fields are: - 1. `reasoning` (str): ${produce the proposed_signature}. We ... - 2. 
`proposed_signature` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. - - [[ ## attempted_signatures ## ]] - {attempted_signatures} - - [[ ## reasoning ## ]] - {reasoning} - - [[ ## proposed_signature ## ]] - {proposed_signature} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields `attempted_signatures`, produce the fields `proposed_signature`.""" - ), # noqa - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## attempted_signatures ## ]] - [1] «string='string 1' score=0.5» - [2] «string='string 2' score=0.4» - [3] «string='string 3' score=0.3» - - Respond with the corresponding output fields, starting with the field `reasoning`, then `proposed_signature`, and then ending with the marker for `completed`.""" - ), - }, - ] - def test_custom_reasoning_field(): class Question(pydantic.BaseModel): @@ -784,7 +631,7 @@ class QuestionSignature(dspy.Signature): program = TypedChainOfThought(QuestionSignature, reasoning=reasoning) expected = "What is the speed of light?" - lm = DummyLM([f'[[ ## reasoning ## ]]\nThoughts\n\n[[ ## question ## ]]\n{{"value": "{expected}"}}']) + lm = DummyLM([{"reasoning": "Thoughts", "question": f'{{"value": "{expected}"}}'}]) dspy.settings.configure(lm=lm) output = program(topic="Physics") @@ -792,47 +639,6 @@ class QuestionSignature(dspy.Signature): assert isinstance(output.question, Question) assert output.question.value == expected - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `topic` (str) - - Your output fields are: - 1. `reasoning` (str): ${topic}, we should ... - 2. `question` (Question): ${question}. Respond with a single JSON object. JSON Schema: {"properties": {"value": {"title": "Value", "type": "string"}}, "required": ["value"], "title": "Question", "type": "object"} - - All interactions will be structured in the following way, with the appropriate values filled in. 
- - [[ ## topic ## ]] - {topic} - - [[ ## reasoning ## ]] - {reasoning} - - [[ ## question ## ]] - {question} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields `topic`, produce the fields `question`.""" - ), # noqa - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## topic ## ]] - Physics - - Respond with the corresponding output fields, starting with the field `reasoning`, then `question`, and then ending with the marker for `completed`.""" - ), - }, - ] - def test_generic_signature(): T = TypeVar("T") @@ -845,7 +651,7 @@ class GenericSignature(dspy.Signature, Generic[T]): predictor = TypedPredictor(GenericSignature[int]) assert predictor.signature.instructions == "My signature" - lm = DummyLM(["[[ ## output ## ]]\n23"]) + lm = DummyLM([{"output": "23"}]) dspy.settings.configure(lm=lm) assert predictor().output == 23 @@ -858,7 +664,7 @@ class ValidatedSignature(dspy.Signature): @pydantic.field_validator("a") @classmethod def space_in_a(cls, a: str) -> str: - if not " " in a: + if " " not in a: raise ValueError("a must contain a space") return a @@ -883,10 +689,10 @@ def next_square(n: int) -> Annotated[int, AfterValidator(check_square)]: lm = DummyLM( [ - "[[ ## next_square ## ]]\n3", - "[[ ## is_square ## ]]\nFalse", - "[[ ## next_square ## ]]\n4", - "[[ ## is_square ## ]]\nTrue", + {"next_square": "3"}, + {"is_square": "False"}, + {"next_square": "4"}, + {"is_square": "True"}, ] ) dspy.settings.configure(lm=lm) @@ -909,7 +715,7 @@ class MySignature(dspy.Signature): n: int = dspy.InputField() next_square: Annotated[int, AfterValidator(is_square)] = dspy.OutputField() - lm = DummyLM(["[[ ## next_square ## ]]\n3", "[[ ## next_square ## ]]\n4"]) + lm = DummyLM([{"next_square": "3"}, {"next_square": "4"}]) dspy.settings.configure(lm=lm) m = TypedPredictor(MySignature)(n=2).next_square @@ -928,7 +734,7 @@ def is_square(n: int) -> int: def next_square(n: int) -> Annotated[int, AfterValidator(is_square)]: """What is the next square number after n?""" - lm = DummyLM(["[[ ## next_square ## ]]\n3", "[[ ## next_square ## ]]\n4"]) + lm = DummyLM([{"next_square": "3"}, {"next_square": "4"}]) dspy.settings.configure(lm=lm) m = next_square(n=2) @@ -946,68 +752,11 @@ def test_demos(): trainset=[ex.with_inputs("input") for ex in demos], ) - lm = DummyLM(["[[ ## output ## ]]\nParis"]) + lm = DummyLM([{"output": "Paris"}]) dspy.settings.configure(lm=lm) assert program(input="What is the capital of France?").output == "Paris" - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `input` (str) - - Your output fields are: - 1. `output` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. - - [[ ## input ## ]] - {input} - - [[ ## output ## ]] - {output} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields `input`, produce the fields `output`.""" - ), # noqa - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## input ## ]] - What is the speed of light? - - Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" - ), - }, - { - "role": "assistant", - "content": textwrap.dedent( - """\ - [[ ## output ## ]] - 3e8 - - [[ ## completed ## ]]""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## input ## ]] - What is the capital of France? 
- - Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" - ), - }, - ] - def test_demos_missing_input_in_demo(): demos = [dspy.Example(input="What is the speed of light?", output="3e8")] @@ -1015,7 +764,7 @@ def test_demos_missing_input_in_demo(): student=dspy.TypedPredictor("input -> output, thoughts"), trainset=[ex.with_inputs("input") for ex in demos], ) - lm = DummyLM(["[[ ## thoughts ## ]]\nMy thoughts\n\n[[ ## output ## ]]\nParis"]) + lm = DummyLM([{"thoughts": "My thoughts", "output": "Paris"}]) dspy.settings.configure(lm=lm) assert program(input="What is the capital of France?").output == "Paris" @@ -1024,10 +773,10 @@ def test_conlist(): dspy.settings.configure( lm=DummyLM( [ - "[[ ## make_numbers ## ]]\n[]", - "[[ ## make_numbers ## ]]\n[1]", - "[[ ## make_numbers ## ]]\n[1, 2]", - "[[ ## make_numbers ## ]]\n[1, 2, 3]", + {"make_numbers": "[]"}, + {"make_numbers": "[1]"}, + {"make_numbers": "[1, 2]"}, + {"make_numbers": "[1, 2, 3]"}, ] ) ) @@ -1043,10 +792,10 @@ def test_conlist2(): dspy.settings.configure( lm=DummyLM( [ - "[[ ## output ## ]]\n[]", - "[[ ## output ## ]]\n[1]", - "[[ ## output ## ]]\n[1, 2]", - "[[ ## output ## ]]\n[1, 2, 3]", + {"output": "[]"}, + {"output": "[1]"}, + {"output": "[1, 2]"}, + {"output": "[1, 2, 3]"}, ] ) ) @@ -1067,7 +816,7 @@ def check_cateogry(self): raise ValueError(f"category not in {self.allowed_categories}") return self - lm = DummyLM(["[[ ## category ## ]]\nhorse", "[[ ## category ## ]]\ndog"]) + lm = DummyLM([{"category": "horse"}, {"category": "dog"}]) dspy.settings.configure(lm=lm) predictor = TypedPredictor(MySignature) diff --git a/tests/functional/test_signature_opt_typed.py b/tests/functional/test_signature_opt_typed.py index 125cec30b7..6778f6b694 100644 --- a/tests/functional/test_signature_opt_typed.py +++ b/tests/functional/test_signature_opt_typed.py @@ -1,7 +1,5 @@ import json -from typing import Generic, TypeVar -import pydantic from pydantic_core import to_jsonable_python import dspy @@ -103,12 +101,14 @@ class BasicQA(dspy.Signature): question: str = dspy.InputField() answer: str = dspy.OutputField() - qa_model = DummyLM(["[[ ## answer ## ]]\nfoo"] * 100) + qa_model = DummyLM([{"answer": "foo"}] * 100) prompt_model = DummyLM( [ + { + "reasoning": "some thoughts", + "proposed_signatures": '[{"instructions": "I", "question_desc": "$q", "question_prefix": "Q:", "answer_desc": "$a", "answer_prefix": "A:"}]', + } # Seed prompts - "[[ ## reasoning ## ]]\nsome thoughts\n\n" - '[[ ## proposed_signatures ## ]]\n[{"instructions": "I", "question_desc": "$q", "question_prefix": "Q:", "answer_desc": "$a", "answer_prefix": "A:"}]', ] ) dspy.settings.configure(lm=qa_model) @@ -165,8 +165,8 @@ class ExpectedSignature2(dspy.Signature): qa_model = DummyLM([]) prompt_model = DummyLM( [ - f"[[ ## reasoning ## ]]\nsome thoughts\n\n[[ ## proposed_signatures ## ]]\n{json.dumps([to_jsonable_python(info1)])}", - f"[[ ## reasoning ## ]]\nsome thoughts\n\n[[ ## proposed_signatures ## ]]\n{json.dumps([to_jsonable_python(info2)])}", + {"reasoning": "some thoughts", "proposed_signatures": json.dumps([to_jsonable_python(info1)])}, + {"reasoning": "some thoughts", "proposed_signatures": json.dumps([to_jsonable_python(info2)])}, ] ) dspy.settings.configure(lm=qa_model) diff --git a/tests/predict/test_chain_of_thought.py b/tests/predict/test_chain_of_thought.py index bdc9122c7e..2451bef664 100644 --- a/tests/predict/test_chain_of_thought.py +++ 
b/tests/predict/test_chain_of_thought.py @@ -6,7 +6,7 @@ def test_initialization_with_string_signature(): - lm = DummyLM(["[[ ## reasoning ## ]]\nfind the number after 1\n\n[[ ## answer ## ]]\n2"]) + lm = DummyLM([{"reasoning": "find the number after 1", "answer": "2"}]) dspy.settings.configure(lm=lm) predict = ChainOfThought("question -> answer") assert list(predict.extended_signature.output_fields.keys()) == [ @@ -14,44 +14,3 @@ def test_initialization_with_string_signature(): "answer", ] assert predict(question="What is 1+1?").answer == "2" - - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `question` (str) - - Your output fields are: - 1. `reasoning` (str) - 2. `answer` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. - - [[ ## question ## ]] - {question} - - [[ ## reasoning ## ]] - {reasoning} - - [[ ## answer ## ]] - {answer} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields `question`, produce the fields `answer`.""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## question ## ]] - What is 1+1? - - Respond with the corresponding output fields, starting with the field `reasoning`, then `answer`, and then ending with the marker for `completed`.""" - ), - }, - ] diff --git a/tests/predict/test_chain_of_thought_with_hint.py b/tests/predict/test_chain_of_thought_with_hint.py index 3fff5d2cb2..77c4f9ac2b 100644 --- a/tests/predict/test_chain_of_thought_with_hint.py +++ b/tests/predict/test_chain_of_thought_with_hint.py @@ -6,7 +6,7 @@ def test_cot_with_no_hint(): - lm = DummyLM(["[[ ## rationale ## ]]\nfind the number after 1\n\n[[ ## answer ## ]]\n2"]) + lm = DummyLM([{"rationale": "find the number after 1", "answer": "2"}]) dspy.settings.configure(lm=lm) predict = ChainOfThoughtWithHint("question -> answer") # Check output fields have the right order @@ -17,54 +17,9 @@ def test_cot_with_no_hint(): ] assert predict(question="What is 1+1?").answer == "2" - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `question` (str) - - Your output fields are: - 1. `rationale` (str): ${produce the answer}. We ... - 2. `answer` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. - - [[ ## question ## ]] - {question} - - [[ ## rationale ## ]] - {rationale} - - [[ ## answer ## ]] - {answer} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields `question`, produce the fields `answer`.""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## question ## ]] - What is 1+1? 
- - Respond with the corresponding output fields, starting with the field `rationale`, then `answer`, and then ending with the marker for `completed`.""" - ), - }, - ] - def test_cot_with_hint(): - lm = DummyLM( - [ - "[[ ## rationale ## ]]\nfind the number after 1\n\n[[ ## hint ## ]]\nIs it helicopter?\n\n[[ ## answer ## ]]\n2" - ] - ) + lm = DummyLM([{"rationale": "find the number after 1", "hint": "Is it helicopter?", "answer": "2"}]) dspy.settings.configure(lm=lm) predict = ChainOfThoughtWithHint("question -> answer") assert list(predict.extended_signature2.output_fields.keys()) == [ @@ -73,48 +28,3 @@ def test_cot_with_hint(): "answer", ] assert predict(question="What is 1+1?", hint="think small").answer == "2" - - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `question` (str) - - Your output fields are: - 1. `rationale` (str): ${produce the answer}. We ... - 2. `hint` (str) - 3. `answer` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. - - [[ ## question ## ]] - {question} - - [[ ## rationale ## ]] - {rationale} - - [[ ## hint ## ]] - {hint} - - [[ ## answer ## ]] - {answer} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields `question`, produce the fields `answer`.""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## question ## ]] - What is 1+1? - - Respond with the corresponding output fields, starting with the field `rationale`, then `hint`, then `answer`, and then ending with the marker for `completed`.""" - ), - }, - ] diff --git a/tests/predict/test_multi_chain_comparison.py b/tests/predict/test_multi_chain_comparison.py index 87e2af1a72..8354f64d07 100644 --- a/tests/predict/test_multi_chain_comparison.py +++ b/tests/predict/test_multi_chain_comparison.py @@ -30,7 +30,7 @@ class BasicQA(dspy.Signature): # Call the MultiChainComparison on the completions question = "What is the color of the sky?" - lm = DummyLM(["[[ ## rationale ## ]]\nmy rationale\n\n[[ ## answer ## ]]\nblue"]) + lm = DummyLM([{"rationale": "my rationale", "answer": "blue"}]) dspy.settings.configure(lm=lm) final_pred = compare_answers(completions, question=question) diff --git a/tests/predict/test_predict.py b/tests/predict/test_predict.py index cf499474ee..b04e087e8b 100644 --- a/tests/predict/test_predict.py +++ b/tests/predict/test_predict.py @@ -41,48 +41,11 @@ def test_lm_after_dump_and_load_state(): def test_call_method(): predict_instance = Predict("input -> output") - lm = DummyLM(["[[ ## output ## ]]\ntest output"]) + lm = DummyLM([{"output": "test output"}]) dspy.settings.configure(lm=lm) result = predict_instance(input="test input") assert result.output == "test output" - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `input` (str) - - Your output fields are: - 1. `output` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. 
- - [[ ## input ## ]] - {input} - - [[ ## output ## ]] - {output} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields `input`, produce the fields `output`.""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## input ## ]] - test input - - Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" - ), - }, - ] - def test_instructions_after_dump_and_load_state(): predict_instance = Predict(Signature("input -> output", "original instructions")) @@ -166,16 +129,14 @@ class Output(pydantic.BaseModel): def test_forward_method(): program = Predict("question -> answer") - dspy.settings.configure(lm=DummyLM(["[[ ## answer ## ]]\nNo more responses"])) + dspy.settings.configure(lm=DummyLM([{"answer": "No more responses"}])) result = program(question="What is 1+1?").answer assert result == "No more responses" def test_forward_method2(): program = Predict("question -> answer1, answer2") - dspy.settings.configure( - lm=DummyLM(["[[ ## answer1 ## ]]\nmy first answer\n\n[[ ## answer2 ## ]]\nmy second answer"]) - ) + dspy.settings.configure(lm=DummyLM([{"answer1": "my first answer", "answer2": "my second answer"}])) result = program(question="What is 1+1?") assert result.answer1 == "my first answer" assert result.answer2 == "my second answer" @@ -190,7 +151,7 @@ def test_config_management(): def test_multi_output(): program = Predict("question -> answer", n=2) - dspy.settings.configure(lm=DummyLM(["[[ ## answer ## ]]\nmy first answer", "[[ ## answer ## ]]\nmy second answer"])) + dspy.settings.configure(lm=DummyLM([{"answer": "my first answer"}, {"answer": "my second answer"}])) results = program(question="What is 1+1?") assert results.completions.answer[0] == "my first answer" assert results.completions.answer[1] == "my second answer" @@ -201,8 +162,8 @@ def test_multi_output2(): dspy.settings.configure( lm=DummyLM( [ - "[[ ## answer1 ## ]]\nmy 0 answer\n\n[[ ## answer2 ## ]]\nmy 2 answer", - "[[ ## answer1 ## ]]\nmy 1 answer\n\n[[ ## answer2 ## ]]\nmy 3 answer", + {"answer1": "my 0 answer", "answer2": "my 2 answer"}, + {"answer1": "my 1 answer", "answer2": "my 3 answer"}, ], ) ) @@ -233,39 +194,6 @@ class OutputOnlySignature(dspy.Signature): predictor = Predict(OutputOnlySignature) - lm = DummyLM(["[[ ## output ## ]]\nshort answer"]) + lm = DummyLM([{"output": "short answer"}]) dspy.settings.configure(lm=lm) assert predictor().output == "short answer" - - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - - - Your output fields are: - 1. `output` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. - - - - [[ ## output ## ]] - {output} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields , produce the fields `output`.""" - ), - }, - { - "role": "user", - "content": ( - "Respond with the corresponding output fields, " - "starting with the field `output`, and then ending with the marker for `completed`." 
- ), - }, - ] diff --git a/tests/predict/test_program_of_thought.py b/tests/predict/test_program_of_thought.py index 5bb6547539..007c4e664b 100644 --- a/tests/predict/test_program_of_thought.py +++ b/tests/predict/test_program_of_thought.py @@ -13,8 +13,8 @@ class BasicQA(Signature): def test_pot_code_generation(): lm = DummyLM( [ - "[[ ## reasoning ## ]]\nReason_A\n\n[[ ## generated_code ## ]]\n```python\nresult = 1+1\n```", - "[[ ## reasoning ## ]]\nReason_B\n\n[[ ## answer ## ]]\n2", + {"reasoning": "Reason_A", "generated_code": "```python\nresult = 1+1\n```"}, + {"reasoning": "Reason_B", "answer": "2"}, ] ) dspy.settings.configure(lm=lm) @@ -22,68 +22,13 @@ def test_pot_code_generation(): res = pot(question="What is 1+1?") assert res.answer == "2" - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `question` (str) - 2. `final_generated_code` (str): python code that answers the question - 3. `code_output` (str): output of previously-generated python code - - Your output fields are: - 1. `reasoning` (str) - 2. `answer` (str): often between 1 and 5 words - - All interactions will be structured in the following way, with the appropriate values filled in. - - [[ ## question ## ]] - {question} - - [[ ## final_generated_code ## ]] - {final_generated_code} - - [[ ## code_output ## ]] - {code_output} - - [[ ## reasoning ## ]] - {reasoning} - - [[ ## answer ## ]] - {answer} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`.""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## question ## ]] - What is 1+1? - - [[ ## final_generated_code ## ]] - result = 1+1 - - [[ ## code_output ## ]] - 2 - - Respond with the corresponding output fields, starting with the field `reasoning`, then `answer`, and then ending with the marker for `completed`.""" - ), - }, - ] - def test_pot_code_generation_with_error(): lm = DummyLM( [ - "[[ ## reasoning ## ]]\nReason_A\n\n[[ ## generated_code ## ]]\n```python\nresult = 1+0/0\n```", - "[[ ## reasoning ## ]]\nReason_B\n\n[[ ## generated_code ## ]]\n```python\nresult = 1+1\n```", - "[[ ## reasoning ## ]]\nReason_C\n\n[[ ## answer ## ]]\n2", + {"reasoning": "Reason_A", "generated_code": "```python\nresult = 1+0/0\n```"}, + {"reasoning": "Reason_B", "generated_code": "```python\nresult = 1+1\n```"}, + {"reasoning": "Reason_C", "answer": "2"}, ] ) dspy.settings.configure(lm=lm) @@ -91,114 +36,3 @@ def test_pot_code_generation_with_error(): pot = ProgramOfThought(BasicQA) res = pot(question="What is 1+1?") assert res.answer == "2" - - # The first code example failed - assert lm.get_convo(1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `question` (str) - 2. `previous_code` (str): previously-generated python code that errored - 3. `error` (str): error message from previously-generated python code - - Your output fields are: - 1. `reasoning` (str) - 2. `generated_code` (str): python code that answers the question - - All interactions will be structured in the following way, with the appropriate values filled in. 
- - [[ ## question ## ]] - {question} - - [[ ## previous_code ## ]] - {previous_code} - - [[ ## error ## ]] - {error} - - [[ ## reasoning ## ]] - {reasoning} - - [[ ## generated_code ## ]] - {generated_code} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - You are given `question`, `previous_code`, `error` due to an error in previous code. - Your task is to correct the error and provide the new `generated_code`.""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## question ## ]] - What is 1+1? - - [[ ## previous_code ## ]] - result = 1+0/0 - - [[ ## error ## ]] - division by zero - - Respond with the corresponding output fields, starting with the field `reasoning`, then `generated_code`, and then ending with the marker for `completed`.""" - ), - }, - ] - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `question` (str) - 2. `final_generated_code` (str): python code that answers the question - 3. `code_output` (str): output of previously-generated python code - - Your output fields are: - 1. `reasoning` (str) - 2. `answer` (str): often between 1 and 5 words - - All interactions will be structured in the following way, with the appropriate values filled in. - - [[ ## question ## ]] - {question} - - [[ ## final_generated_code ## ]] - {final_generated_code} - - [[ ## code_output ## ]] - {code_output} - - [[ ## reasoning ## ]] - {reasoning} - - [[ ## answer ## ]] - {answer} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`.""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## question ## ]] - What is 1+1? - - [[ ## final_generated_code ## ]] - result = 1+1 - - [[ ## code_output ## ]] - 2 - - Respond with the corresponding output fields, starting with the field `reasoning`, then `answer`, and then ending with the marker for `completed`.""" - ), - }, - ] diff --git a/tests/predict/test_react.py b/tests/predict/test_react.py index 8ee626aeb3..2573abcb8d 100644 --- a/tests/predict/test_react.py +++ b/tests/predict/test_react.py @@ -8,7 +8,7 @@ def test_example_no_tools(): # Createa a simple dataset which the model will use with the Retrieve tool. lm = dspy.utils.DummyLM( [ - "[[ ## Thought_1 ## ]]\nInitial thoughts\n\n[[ ## Action_1 ## ]]\nFinish[blue]", + {"Thought_1": "Initial thoughts", "Action_1": "Finish[blue]"}, ] ) dspy.settings.configure(lm=lm, rm=dummy_rm()) @@ -28,8 +28,8 @@ def test_example_search(): # Createa a simple dataset which the model will use with the Retrieve tool. 
lm = dspy.utils.DummyLM( [ - "[[ ## Thought_1 ## ]]\nInitial thoughts\n\n[[ ## Action_1 ## ]]\nSearch[the color of the sky]", - "[[ ## Thought_2 ## ]]\nMore thoughts\n\n[[ ## Action_2 ## ]]\nFinish[blue]\n\n", + {"Thought_1": "Initial thoughts", "Action_1": "Search[the color of the sky]"}, + {"Thought_2": "More thoughts", "Action_2": "Finish[blue]\n\n"}, ] ) rm = dummy_rm( @@ -91,9 +91,9 @@ def __call__(self, *args, **kwargs): def test_custom_tools(): lm = dspy.utils.DummyLM( [ - "[[ ## Thought_1 ## ]]\nInitial thoughts\n\n[[ ## Action_1 ## ]]\nTool1[foo]", - "[[ ## Thought_2 ## ]]\nMore thoughts\n\n[[ ## Action_2 ## ]]\nTool2[bar]", - "[[ ## Thought_3 ## ]]\nEven more thoughts\n\n[[ ## Action_3 ## ]]\nFinish[baz]", + {"Thought_1": "Initial thoughts", "Action_1": "Tool1[foo]"}, + {"Thought_2": "More thoughts", "Action_2": "Tool2[bar]"}, + {"Thought_3": "Even more thoughts", "Action_3": "Finish[baz]"}, ] ) dspy.settings.configure(lm=lm) diff --git a/tests/predict/test_retry.py b/tests/predict/test_retry.py index 16050e355d..687a18dbf7 100644 --- a/tests/predict/test_retry.py +++ b/tests/predict/test_retry.py @@ -1,8 +1,10 @@ import functools + +import pydantic + import dspy -from dspy.utils import DummyLM from dspy.primitives.assertions import assert_transform_module, backtrack_handler -import pydantic +from dspy.utils import DummyLM def test_retry_simple(): @@ -14,7 +16,7 @@ def test_retry_simple(): assert f"past_{field}" in retry_module.new_signature.input_fields assert "feedback" in retry_module.new_signature.input_fields - lm = DummyLM(["[[ ## answer ## ]]\nblue"]) + lm = DummyLM([{"answer": "blue"}]) dspy.settings.configure(lm=lm) result = retry_module.forward( question="What color is the sky?", @@ -26,7 +28,7 @@ def test_retry_simple(): def test_retry_forward_with_feedback(): # First we make a mistake, then we fix it - lm = DummyLM(["[[ ## answer ## ]]\nred", "[[ ## answer ## ]]\nblue"]) + lm = DummyLM([{"answer": "red"}, {"answer": "blue"}]) dspy.settings.configure(lm=lm, trace=[]) class SimpleModule(dspy.Module): @@ -53,7 +55,7 @@ def forward(self, **kwargs): def test_retry_forward_with_typed_predictor(): # First we make a mistake, then we fix it - lm = DummyLM(['[[ ## output ## ]]\n{"answer":"red"}', '[[ ## output ## ]]\n{"answer":"blue"}']) + lm = DummyLM([{"output": '{"answer":"red"}'}, {"output": '{"answer":"blue"}'}]) dspy.settings.configure(lm=lm, trace=[]) class AnswerQuestion(dspy.Signature): diff --git a/tests/primitives/test_program.py b/tests/primitives/test_program.py index 9f271e992b..ea6633682f 100644 --- a/tests/primitives/test_program.py +++ b/tests/primitives/test_program.py @@ -39,8 +39,8 @@ def test_forward(): dspy.settings.configure( lm=DummyLM( { - "What is 1+1?": "[[ ## query ## ]]\nlet me check", - "let me check": "[[ ## answer ## ]]\n2", + "What is 1+1?": {"query": "let me check"}, + "let me check": {"answer": "2"}, } ) ) diff --git a/tests/signatures/test_signature.py b/tests/signatures/test_signature.py index 292bafdf25..8fb9c5ff24 100644 --- a/tests/signatures/test_signature.py +++ b/tests/signatures/test_signature.py @@ -182,7 +182,7 @@ class SubSignature(Signature): def test_multiline_instructions(): - lm = DummyLM(["[[ ## output ## ]]\nshort answer"]) + lm = DummyLM([{"output": "short answer"}]) dspy.settings.configure(lm=lm) class MySignature(Signature): @@ -195,38 +195,6 @@ class MySignature(Signature): predictor = dspy.Predict(MySignature) assert predictor().output == "short answer" - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - 
"content": textwrap.dedent( - """\ - Your input fields are: - - - Your output fields are: - 1. `output` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. - - - - [[ ## output ## ]] - {output} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - First line - Second line - Third line""" - ), - }, - { - "role": "user", - "content": "Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.", - }, - ] - def test_replaced_by_replace_context_manager(): class SignatureOne(Signature): diff --git a/tests/teleprompt/test_bootstrap.py b/tests/teleprompt/test_bootstrap.py index ebf23f3b48..a112947a4b 100644 --- a/tests/teleprompt/test_bootstrap.py +++ b/tests/teleprompt/test_bootstrap.py @@ -59,9 +59,7 @@ def test_bootstrap_effectiveness(): # This test verifies if the bootstrapping process improves the student's predictions student = SimpleModule("input -> output") teacher = SimpleModule("input -> output") - lm = DummyLM( - ["[[ ## output ## ]]\nblue", "[[ ## output ## ]]\nRing-ding-ding-ding-dingeringeding!"], follow_examples=True - ) + lm = DummyLM([{"output": "blue"}, {"output": "Ring-ding-ding-ding-dingeringeding!"}], follow_examples=True) dspy.settings.configure(lm=lm, trace=[]) bootstrap = BootstrapFewShot(metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1) @@ -80,63 +78,6 @@ def test_bootstrap_effectiveness(): prediction = compiled_student(input=trainset[0].input) assert prediction.output == trainset[0].output - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `input` (str) - - Your output fields are: - 1. `output` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. - - [[ ## input ## ]] - {input} - - [[ ## output ## ]] - {output} - - [[ ## completed ## ]] - - In adhering to this structure, your objective is: - Given the fields `input`, produce the fields `output`.""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## input ## ]] - What is the color of the sky? - - Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" - ), - }, - { - "role": "assistant", - "content": textwrap.dedent( - """\ - [[ ## output ## ]] - blue - - [[ ## completed ## ]]""" - ), - }, - { - "role": "user", - "content": textwrap.dedent( - """\ - [[ ## input ## ]] - What is the color of the sky? 
- - Respond with the corresponding output fields, starting with the field `output`, and then ending with the marker for `completed`.""" - ), - }, - ] - def test_error_handling_during_bootstrap(): """ @@ -157,7 +98,7 @@ def forward(self, **kwargs): # Setup DummyLM to simulate an error scenario lm = DummyLM( [ - "[[ ## output ## ]]\nInitial thoughts", # Simulate initial teacher's prediction + {"output": "Initial thoughts"}, # Simulate initial teacher's prediction ] ) dspy.settings.configure(lm=lm) @@ -182,8 +123,8 @@ def test_validation_set_usage(): lm = DummyLM( [ - "[[ ## output ## ]]\nInitial thoughts", - "[[ ## output ## ]]\nFinish[blue]", # Expected output for both training and validation + {"output": "Initial thoughts"}, + {"output": "Finish[blue]"}, # Expected output for both training and validation ] ) dspy.settings.configure(lm=lm) diff --git a/tests/teleprompt/test_copro_optimizer.py b/tests/teleprompt/test_copro_optimizer.py index 301106dae8..c031d3c7a3 100644 --- a/tests/teleprompt/test_copro_optimizer.py +++ b/tests/teleprompt/test_copro_optimizer.py @@ -1,5 +1,3 @@ -import textwrap - import dspy from dspy import Example from dspy.teleprompt.signature_opt import COPRO @@ -44,8 +42,10 @@ def test_signature_optimizer_optimization_process(): dspy.settings.configure( lm=DummyLM( [ - "[[ ## proposed_instruction ## ]]\nOptimized instruction 1\n\n" - "[[ ## proposed_prefix_for_output_field ## ]]\nOptimized instruction 2", + { + "proposed_instruction": "Optimized instruction 1", + "proposed_prefix_for_output_field": "Optimized instruction 2", + }, ] ) ) @@ -72,8 +72,10 @@ def test_signature_optimizer_statistics_tracking(): dspy.settings.configure( lm=DummyLM( [ - "[[ ## proposed_instruction ## ]]\nOptimized instruction 1\n\n" - "[[ ## proposed_prefix_for_output_field ## ]]\nOptimized instruction 2", + { + "proposed_instruction": "Optimized instruction 1", + "proposed_prefix_for_output_field": "Optimized instruction 2", + }, ] ) ) @@ -93,14 +95,14 @@ def test_signature_optimizer_statistics_tracking(): def test_optimization_and_output_verification(): lm = DummyLM( [ - "[[ ## proposed_instruction ## ]]\nOptimized Prompt\n\n[[ ## proposed_prefix_for_output_field ## ]]\nOptimized Prefix", - "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", - "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", - "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", - "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", - "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", - "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", - "[[ ## reasoning ## ]]\nfrance\n\n[[ ## output ## ]]\nParis", + {"proposed_instruction": "Optimized Prompt", "proposed_prefix_for_output_field": "Optimized Prefix"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, + {"reasoning": "france", "output": "Paris"}, ] ) dspy.settings.configure(lm=lm) @@ -121,54 +123,12 @@ def test_optimization_and_output_verification(): assert prediction.output == "Paris" - assert lm.get_convo(-1)[0] == [ - { - "role": "system", - "content": textwrap.dedent( - """\ - Your input fields are: - 1. `input` (str) - - Your output fields are: - 1. `reasoning` (str) - 2. `output` (str) - - All interactions will be structured in the following way, with the appropriate values filled in. 
-
-                [[ ## input ## ]]
-                {input}
-
-                [[ ## reasoning ## ]]
-                {reasoning}
-
-                [[ ## output ## ]]
-                {output}
-
-                [[ ## completed ## ]]
-
-                In adhering to this structure, your objective is: 
-                Optimized Prompt"""
-            ),
-        },
-        {
-            "role": "user",
-            "content": textwrap.dedent(
-                """\
-                [[ ## input ## ]]
-                What is the capital of France?
-
-                Respond with the corresponding output fields, starting with the field `reasoning`, then `output`, and then ending with the marker for `completed`."""
-            ),
-        },
-    ]
-
-
 def test_statistics_tracking_during_optimization():
     dspy.settings.configure(
         lm=DummyLM(
             [
-                "[[ ## proposed_instruction ## ]]\nOptimized Prompt\n\n"
-                "[[ ## proposed_prefix_for_output_field ## ]]\nOptimized Prefix",
+                {"proposed_instruction": "Optimized Prompt", "proposed_prefix_for_output_field": "Optimized Prefix"},
             ]
         )
     )

From ead37b635fa34b7b814207a7ab10641f271f8d39 Mon Sep 17 00:00:00 2001
From: Michael Jones
Date: Mon, 7 Oct 2024 15:05:50 +0000
Subject: [PATCH 17/17] feat(dspy): add docstring for DummyLM

---
 dspy/utils/dummies.py | 47 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/dspy/utils/dummies.py b/dspy/utils/dummies.py
index bdc51f0aa0..b26cdda043 100644
--- a/dspy/utils/dummies.py
+++ b/dspy/utils/dummies.py
@@ -98,6 +98,53 @@ def get_convo(self, index) -> str:
 
 
 class DummyLM(LM):
+    """
+    Dummy language model for unit testing purposes.
+
+    Three modes of operation:
+
+    ## 1. List of dictionaries
+
+    If a list of dictionaries is provided, the dummy model will return the next dictionary
+    in the list for each request, formatted according to the `format_fields` function
+    from the chat adapter.
+
+    ```python
+    lm = DummyLM([{"answer": "red"}, {"answer": "blue"}])
+    dspy.settings.configure(lm=lm)
+    predictor("What color is the sky?")
+    # Output: "[[## answer ##]]\nred"
+    predictor("What color is the sky?")
+    # Output: "[[## answer ##]]\nblue"
+    ```
+
+    ## 2. Dictionary of dictionaries
+
+    If a dictionary of dictionaries is provided, the dummy model will return the value
+    corresponding to the key that is contained within the final message of the prompt,
+    formatted according to the `format_fields` function from the chat adapter.
+
+    ```python
+    lm = DummyLM({"What color is the sky?": {"answer": "blue"}})
+    dspy.settings.configure(lm=lm)
+    predictor("What color is the sky?")
+    # Output: "[[## answer ##]]\nblue"
+    ```
+
+    ## 3. Follow examples
+
+    If `follow_examples` is set to True and the prompt contains an example whose input exactly matches the prompt,
+    the dummy model will return the output from that example.
+
+    ```python
+    lm = DummyLM([{"answer": "red"}], follow_examples=True)
+    dspy.settings.configure(lm=lm)
+    predictor("What color is the sky?", demos=dspy.Example(input="What color is the sky?", output="blue"))
+    # Output: "[[## answer ##]]\nblue"
+    ```
+
+    """
+
     def __init__(self, answers: Union[list[dict[str, str]], dict[str, dict[str, str]]], follow_examples: bool = False):
         super().__init__("dummy", "chat", 0.0, 1000, True)
         self.answers = answers