From 2d218aa0588ae8a5c2488496c045ca9db6f0efb1 Mon Sep 17 00:00:00 2001
From: Thomas Dybdahl Ahle
Date: Sat, 24 Feb 2024 19:42:32 -0800
Subject: [PATCH 1/6] new Signature class + tests + types

Updated load/dump state to use new predict signature
---
 .gitignore | 5 +-
 dsp/modules/aws_lm.py | 50 +--
 dsp/templates/template_v2.py | 2 +-
 dspy/functional/__init__.py | 1 +
 dspy/functional/functional.py | 327 ++++++++++++++
 dspy/predict/__init__.py | 1 +
 dspy/predict/aggregation.py | 10 +-
 dspy/predict/chain_of_thought.py | 25 +-
 dspy/predict/chain_of_thought_with_hint.py | 27 +-
 dspy/predict/langchain.py | 2 +
 dspy/predict/multi_chain_comparison.py | 47 +-
 dspy/predict/predict.py | 83 ++--
 dspy/predict/program_of_thought.py | 168 +++++---
 dspy/predict/react.py | 95 +++--
 dspy/predict/retry.py | 41 +-
 dspy/primitives/assertions.py | 13 +-
 dspy/primitives/python_interpreter.py | 10 +-
 dspy/signatures/field.py | 60 ++-
 dspy/signatures/signature.py | 356 ++++++++++------
 dspy/teleprompt/bootstrap.py | 2 +-
 dspy/teleprompt/finetune.py | 7 +-
 dspy/teleprompt/signature_opt.py | 47 +-
 dspy/teleprompt/signature_opt_bayesian.py | 51 ++-
 dspy/utils/__init__.py | 1 +
 dspy/utils/dummies.py | 144 +++++++
 examples/longformqa/DSPy_LongFormQA_Cache | 1 +
 .../longformqa/longformqa_assertions.ipynb | 43 +-
 pyproject.toml | 21 +-
 requirements.txt | 2 +-
 setup.py | 12 +-
 tests/evaluate/test_evaluate.py | 59 +++
 tests/evaluate/test_metrics.py | 32 ++
 tests/examples/test_baleen.py | 136 ++++++
 tests/functional/test_functional.py | 401 ++++++++++++++++++
 tests/predict/test_aggregation.py | 47 ++
 tests/predict/test_chain_of_thought.py | 35 ++
 .../test_chain_of_thought_with_hint.py | 42 ++
 tests/predict/test_knn.py | 55 +++
 tests/predict/test_multi_chain_comparison.py | 38 ++
 tests/predict/test_predict.py | 91 ++++
 tests/predict/test_program_of_thought.py | 121 ++++++
 tests/predict/test_react.py | 86 ++++
 tests/predict/test_retry.py | 66 +++
 tests/primitives/test_example.py | 108 +++++
 tests/primitives/test_program.py | 66 +++
 tests/primitives/test_python_interpreter.py | 44 ++
 tests/signatures/test_signature.py | 166 ++++++++
 tests/teleprompt/test_bootstrap.py | 180 ++++++++
 tests/teleprompt/test_ensemble.py | 60 +++
 tests/teleprompt/test_finetune.py | 1 +
 tests/teleprompt/test_knn_fewshot.py | 72 ++++
 tests/teleprompt/test_signature_opt.py | 121 ++++++
 .../teleprompt/test_signature_opt_bayesian.py | 176 ++++++++
 53 files changed, 3378 insertions(+), 479 deletions(-)
 create mode 100644 dspy/functional/__init__.py
 create mode 100644 dspy/functional/functional.py
 create mode 100644 dspy/utils/__init__.py
 create mode 100644 dspy/utils/dummies.py
 create mode 160000 examples/longformqa/DSPy_LongFormQA_Cache
 create mode 100644 tests/evaluate/test_evaluate.py
 create mode 100644 tests/evaluate/test_metrics.py
 create mode 100644 tests/examples/test_baleen.py
 create mode 100644 tests/functional/test_functional.py
 create mode 100644 tests/predict/test_aggregation.py
 create mode 100644 tests/predict/test_chain_of_thought.py
 create mode 100644 tests/predict/test_chain_of_thought_with_hint.py
 create mode 100644 tests/predict/test_knn.py
 create mode 100644 tests/predict/test_multi_chain_comparison.py
 create mode 100644 tests/predict/test_predict.py
 create mode 100644 tests/predict/test_program_of_thought.py
 create mode 100644 tests/predict/test_react.py
 create mode 100644 tests/predict/test_retry.py
 create mode 100644 tests/primitives/test_example.py
 create mode 100644 tests/primitives/test_program.py
 create mode 100644
tests/primitives/test_python_interpreter.py create mode 100644 tests/signatures/test_signature.py create mode 100644 tests/teleprompt/test_bootstrap.py create mode 100644 tests/teleprompt/test_ensemble.py create mode 100644 tests/teleprompt/test_finetune.py create mode 100644 tests/teleprompt/test_knn_fewshot.py create mode 100644 tests/teleprompt/test_signature_opt.py create mode 100644 tests/teleprompt/test_signature_opt_bayesian.py diff --git a/.gitignore b/.gitignore index c47837feb8..7d9919d9aa 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,9 @@ __pycache__/ *.py[cod] *$py.class +# Vim +*.swp + # Jupyter Notebook .ipynb_checkpoints # notebooks/ @@ -42,4 +45,4 @@ finetuning_ckpts/ .idea assertion.log *.log -*.db \ No newline at end of file +*.db diff --git a/dsp/modules/aws_lm.py b/dsp/modules/aws_lm.py index 00906282a8..05a733851c 100644 --- a/dsp/modules/aws_lm.py +++ b/dsp/modules/aws_lm.py @@ -28,7 +28,6 @@ def __init__( max_new_tokens: int, truncate_long_prompts: bool = False, input_output_ratio: int = 3, - batch_n: bool = True, ) -> None: """_summary_ @@ -41,7 +40,6 @@ def __init__( input_output_ratio (int, optional): The rough size of the number of input tokens to output tokens in the worst case. Defaults to 3. temperature (float, optional): _description_. Defaults to 0.0. truncate_long_prompts (bool, optional): If True, remove extremely long inputs to context. Defaults to False. - batch_n (bool, False): If False, call the LM N times rather than batching. Not all AWS models support the n parameter. """ super().__init__(model=model) # AWS doesn't have an equivalent of max_tokens so let's clarify @@ -50,10 +48,9 @@ def __init__( self._max_new_tokens: int = max_new_tokens self._model_name: str = model self._truncate_long_prompt_prompts: bool = truncate_long_prompts - self._batch_n: bool = batch_n import boto3 - + self.predictor = boto3.client(service_name, region_name=region_name) @abstractmethod @@ -75,7 +72,7 @@ def _sanitize_kwargs(self, query_kwargs: dict[str, Any]) -> dict[str, Any]: return query_kwargs @abstractmethod - def _call_model(self, body: str) -> str | list[str]: + def _call_model(self, body: str) -> str: """Call model, get generated input without the formatted prompt""" pass @@ -85,20 +82,7 @@ def _extract_input_parameters( ) -> dict[str, str | float | int]: pass - def _simple_api_call(self, formatted_prompt: str, **kwargs) -> str | list[str]: - body = self._create_body(formatted_prompt, **kwargs) - json_body = json.dumps(body) - llm_out: str | list[str] = self._call_model(json_body) - if isinstance(llm_out, str): - llm_out = llm_out.replace(formatted_prompt, "") - else: - llm_out = [generated.replace(formatted_prompt, "") for generated in llm_out] - self.history.append( - {"prompt": formatted_prompt, "response": llm_out, "kwargs": body} - ) - return llm_out - - def basic_request(self, prompt, **kwargs) -> str | list[str]: + def basic_request(self, prompt, **kwargs) -> str: """Query the endpoint.""" # Remove any texts that are too long @@ -108,28 +92,16 @@ def basic_request(self, prompt, **kwargs) -> str | list[str]: formatted_prompt = self._format_prompt(truncated_prompt) else: formatted_prompt = self._format_prompt((prompt)) + body = self._create_body(formatted_prompt, **kwargs) + json_body: str = json.dumps(body) - llm_out: str | list[str] - if "n" in kwargs.keys(): - if self._batch_n: - llm_out = self._simple_api_call( - formatted_prompt=formatted_prompt, **kwargs - ) - else: - del kwargs["n"] - llm_out = [] - for _ in range(0, kwargs["n"]): - generated: 
str | list[str] = self._simple_api_call( - formatted_prompt=formatted_prompt, **kwargs - ) - if isinstance(generated, str): - llm_out.append(generated) - else: - raise TypeError("Error, list type was returned from LM call") - else: - llm_out = self._simple_api_call(formatted_prompt=formatted_prompt, **kwargs) + generated: str = self._call_model(json_body) + + self.history.append( + {"prompt": formatted_prompt, "response": generated, "kwargs": body} + ) - return llm_out + return generated.replace(formatted_prompt, "") def _estimate_tokens(self, text: str) -> int: return len(text) * CHARS2TOKENS diff --git a/dsp/templates/template_v2.py b/dsp/templates/template_v2.py index 0e1b368854..f61085dd05 100644 --- a/dsp/templates/template_v2.py +++ b/dsp/templates/template_v2.py @@ -91,8 +91,8 @@ def query(self, example: Example, is_demo: bool = False) -> str: if field.input_variable in self.format_handlers: format_handler = self.format_handlers[field.input_variable] else: - def format_handler(x): + assert type(x) == str, f"Need format_handler for {field.input_variable} of type {type(x)}" return " ".join(x.split()) formatted_value = format_handler(example[field.input_variable]) diff --git a/dspy/functional/__init__.py b/dspy/functional/__init__.py new file mode 100644 index 0000000000..11fb1bc4d4 --- /dev/null +++ b/dspy/functional/__init__.py @@ -0,0 +1 @@ +from .functional import cot, predictor, FunctionalModule, TypedPredictor diff --git a/dspy/functional/functional.py b/dspy/functional/functional.py new file mode 100644 index 0000000000..c4388ee862 --- /dev/null +++ b/dspy/functional/functional.py @@ -0,0 +1,327 @@ +import inspect, os, openai, dspy, typing, pydantic +from typing import Annotated +import typing +from dsp.templates import passages2text +import json + + +MAX_RETRIES = 3 + + +def predictor(func): + signature = _func_to_signature(func) + return TypedPredictor(signature, chain_of_thought=False, simple_output=True) + + +def cot(func): + signature = _func_to_signature(func) + return TypedPredictor(signature, chain_of_thought=True, simple_output=True) + + +class FunctionalModule(dspy.Module): + def __init__(self): + super().__init__() + for name in dir(self): + attr = getattr(self, name) + if isinstance(attr, dspy.Module): + self.__dict__[name] = attr.copy() + + +class TypedPredictor(dspy.Module): + def __init__(self, signature, chain_of_thought=False, simple_output=False): + super().__init__() + self.signature = signature + self.predictor = dspy.Predict(signature) + self.chain_of_thought = chain_of_thought + self.simple_output = simple_output + + def copy(self): + return TypedPredictor(self.signature, self.chain_of_thought, self.simple_output) + + def _prepare_signature(self): + """Add formats and parsers to the signature fields, based on the type + annotations of the fields.""" + signature = self.signature + for name, field in self.signature.fields.items(): + is_output = field.json_schema_extra["__dspy_field_type"] == "output" + type_ = field.annotation + if is_output: + if type_ in (str, int, float, bool): + signature = signature.with_updated_fields( + name, + desc=field.json_schema_extra.get("desc", "") + + (f". 
Respond with a single {type_.__name__} value"), + format=lambda x: x if isinstance(x, str) else str(x), + parser=type_, + ) + else: + # Anything else we wrap in a pydantic object + unwrap = lambda x: x + if not inspect.isclass(type_) or not issubclass( + type_, pydantic.BaseModel + ): + type_ = pydantic.create_model( + "Output", value=(type_, ...), __base__=pydantic.BaseModel + ) + unwrap = lambda x: x.value + signature = signature.with_updated_fields( + name, + desc=field.json_schema_extra.get("desc", "") + + ( + f". Respond with a single JSON object using the schema " + + json.dumps(type_.model_json_schema()) + ), + format=lambda x: x if isinstance(x, str) else x.json(), + parser=lambda x: unwrap( + type_.model_validate_json(_unwrap_json(x)) + ), + ) + else: # If input field + format = lambda x: x if isinstance(x, str) else str(x) + if type_ in (list[str], tuple[str]): + format = passages2text + elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel): + format = lambda x: x if isinstance(x, str) else x.json() + signature = signature.with_updated_fields(name, format=format) + + if self.chain_of_thought: + output_keys = ", ".join(signature.output_fields.keys()) + signature = signature.prepend( + "reasoning", + dspy.OutputField( + prefix="Reasoning: Let's think step by step in order to", + desc="${produce the " + output_keys + "}. We ...", + ), + ) + return signature + + def forward(self, **kwargs): + modified_kwargs = kwargs.copy() + signature = self._prepare_signature() + for try_i in range(MAX_RETRIES): + result = self.predictor(**modified_kwargs, new_signature=signature) + errors = {} + parsed_results = {} + # Parse the outputs + for name, field in signature.output_fields.items(): + try: + value = getattr(result, name) + parser = field.json_schema_extra.get("parser", lambda x: x) + parsed_results[name] = parser(value) + except (pydantic.ValidationError, ValueError) as e: + errors[name] = e + if errors: + # Add new fields for each error + for name, error in errors.items(): + modified_kwargs[f"error_{name}_{try_i}"] = str(error) + signature = signature.append( + f"error_{name}_{try_i}", + dspy.InputField( + prefix=f"Past Error " + + (f"({name}):" if try_i == 0 else f"({name}, {try_i+1}):"), + desc="An error to avoid in the future", + ), + ) + else: + # If there are no errors, we return the parsed results + for name, value in parsed_results.items(): + setattr(result, name, value) + if self.simple_output: + *_, last_output = signature.output_fields.keys() + return result[last_output] + return result + raise ValueError("Too many retries") + + +def _func_to_signature(func): + """Make a dspy.Signature based on a function definition.""" + sig = inspect.signature(func) + annotations = typing.get_type_hints(func) + output_key = func.__name__ + instructions = func.__doc__ + fields = {} + + # Input fields + for param in sig.parameters.values(): + if param.name == "self": + continue + # We default to str as the type of the input + annotation = annotations.get(param.name, str) + kwargs = {} + if typing.get_origin(annotation) is Annotated: + annotation, kwargs["desc"] = typing.get_args(annotation) + fields[param.name] = (annotation, dspy.InputField(**kwargs)) + + # Output field + kwargs = {} + annotation = annotations.get("return", str) + if typing.get_origin(annotation) is Annotated: + annotation, kwargs["desc"] = typing.get_args(annotation) + fields[output_key] = (annotation, dspy.OutputField(**kwargs)) + + return dspy.Signature(fields, instructions) + + +def _unwrap_json(output): + 
output = output.strip() + if output.startswith("```"): + if not output.startswith("```json"): + raise ValueError("json output should start with ```json") + if not output.endswith("```"): + raise ValueError("json output should end with ```") + output = output[7:-3].strip() + if not output.startswith("{") or not output.endswith("}"): + raise ValueError("json output should start and end with { and }") + return output + + +################################################################################ +# Example usage +################################################################################ + + +def main(): + class Answer(pydantic.BaseModel): + value: float + certainty: float + comments: list[str] = pydantic.Field( + description="At least two comments about the answer" + ) + + class QA(dspy.Module): + @predictor + def hard_question(self, topic: str) -> str: + """Think of a hard factual question about a topic. It should be answerable with a number.""" + + @cot + def answer(self, question: Annotated[str, "Question to answer"]) -> Answer: + pass + + def forward(self, **kwargs): + question = self.hard_question(**kwargs) + return (question, self.answer(question=question)) + + openai.api_key = os.getenv("OPENAI_API_KEY") + lm = dspy.OpenAI(model="gpt-3.5-turbo", max_tokens=4000) + # lm = dspy.OpenAI(model="gpt-4", max_tokens=4000) + # lm = dspy.OpenAI(model="gpt-4-preview-1106", max_tokens=4000) + with dspy.context(lm=lm): + qa = QA() + question, answer = qa(topic="Physics") + # lm.inspect_history(n=5) + + print("Question:", question) + print("Answer:", answer) + + +################################################################################ +# HotpotQA example with SimpleBaleen +################################################################################ + + +def validate_context_and_answer_and_hops(example, pred, trace=None): + if not dspy.evaluate.answer_exact_match(example, pred): + return False + if not dspy.evaluate.answer_passage_match(example, pred): + return False + + hops = [example.question] + [ + outputs.query for *_, outputs in trace if "query" in outputs + ] + + if max([len(h) for h in hops]) > 100: + return False + if any( + dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) + for idx in range(2, len(hops)) + ): + return False + + return True + + +def gold_passages_retrieved(example, pred, trace=None): + gold_titles = set(map(dspy.evaluate.normalize_text, example["gold_titles"])) + found_titles = set( + map(dspy.evaluate.normalize_text, [c.split(" | ")[0] for c in pred.context]) + ) + + return gold_titles.issubset(found_titles) + + +def hotpot(): + from dsp.utils import deduplicate + import dspy.evaluate + from dspy.datasets import HotPotQA + from dspy.evaluate.evaluate import Evaluate + from dspy.teleprompt.bootstrap import BootstrapFewShot + + print("Load the dataset.") + dataset = HotPotQA( + train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0 + ) + trainset = [x.with_inputs("question") for x in dataset.train] + devset = [x.with_inputs("question") for x in dataset.dev] + print("Done") + + class SimplifiedBaleen(FunctionalModule): + def __init__(self, passages_per_hop=3, max_hops=1): + super().__init__() + self.retrieve = dspy.Retrieve(k=passages_per_hop) + self.max_hops = max_hops + + @cot + def generate_query(self, context: list[str], question) -> str: + """Write a simple search query that will help answer a complex question.""" + pass + + @cot + def generate_answer(self, context: list[str], question) -> str: + """Answer 
questions with short factoid answers.""" + pass + + def forward(self, question): + context = [] + + for hop in range(self.max_hops): + query = self.generate_query(context=context, question=question) + passages = self.retrieve(query).passages + context = deduplicate(context + passages) + + answer = self.generate_answer(context=context, question=question) + return dspy.Prediction(context=context, answer=answer) + + openai.api_key = os.getenv("OPENAI_API_KEY") + rm = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts") + lm = dspy.OpenAI(model="gpt-3.5-turbo", max_tokens=4000) + dspy.settings.configure(lm=lm, rm=rm, trace=[]) + + evaluate_on_hotpotqa = Evaluate( + devset=devset, num_threads=10, display_progress=True, display_table=5 + ) + + # uncompiled (i.e., zero-shot) program + uncompiled_baleen = SimplifiedBaleen() + print( + "Uncompiled Baleen retrieval score:", + evaluate_on_hotpotqa(uncompiled_baleen, metric=gold_passages_retrieved), + ) + + # compiled (i.e., few-shot) program + compiled_baleen = BootstrapFewShot( + metric=validate_context_and_answer_and_hops + ).compile( + SimplifiedBaleen(), + teacher=SimplifiedBaleen(passages_per_hop=2), + trainset=trainset, + ) + print( + "Compiled Baleen retrieval score:", + evaluate_on_hotpotqa(compiled_baleen, metric=gold_passages_retrieved), + ) + # lm.inspect_history(n=5) + + +if __name__ == "__main__": + # main() + hotpot() diff --git a/dspy/predict/__init__.py b/dspy/predict/__init__.py index f646ab69f6..8b0770150b 100644 --- a/dspy/predict/__init__.py +++ b/dspy/predict/__init__.py @@ -6,3 +6,4 @@ from .aggregation import majority from .program_of_thought import ProgramOfThought from .retry import Retry +from .knn import KNN \ No newline at end of file diff --git a/dspy/predict/aggregation.py b/dspy/predict/aggregation.py index 2212900c2d..ca4154aa2d 100644 --- a/dspy/predict/aggregation.py +++ b/dspy/predict/aggregation.py @@ -26,10 +26,11 @@ def majority(prediction_or_completions, normalize=default_normalize, field=None) except: signature = None - try: - field = field if field else signature.fields[-1].output_variable - except: - field = field if field else list(completions[0].keys())[-1] + if not field: + if signature: + field = signature.output_fields[-1] + else: + field = list(completions[0].keys())[-1] # Normalize normalize = normalize if normalize else lambda x: x @@ -51,5 +52,4 @@ def majority(prediction_or_completions, normalize=default_normalize, field=None) # if input_type == Prediction: return Prediction.from_completions([completion], signature=signature) - return Completions([completion], signature=signature) diff --git a/dspy/predict/chain_of_thought.py b/dspy/predict/chain_of_thought.py index fceb3b6517..7d50d8a562 100644 --- a/dspy/predict/chain_of_thought.py +++ b/dspy/predict/chain_of_thought.py @@ -1,6 +1,7 @@ -import dsp +import dsp, dspy +from dspy.signatures.signature import ensure_signature -from .predict import Predict +from .predict import Predict, signature_to_template # TODO: FIXME: Insert this right before the *first* output field. Also rewrite this to use the new signature system. 
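(Illustration only, not part of this patch.) A minimal sketch of how the typed decorators from the new dspy/functional module above are meant to be called directly, assuming an LM has already been configured with dspy.settings.configure(lm=...); the function names here are hypothetical:

    from typing import Annotated
    from dspy.functional import predictor, cot

    @predictor
    def capital(country: Annotated[str, "Name of a country"]) -> str:
        """Return the capital city of the given country."""

    @cot
    def population(city: str) -> int:
        """Estimate the population of the given city."""

    # Each call runs through TypedPredictor, which builds the Signature from the
    # annotations, parses the typed output, and retries with error feedback
    # (up to MAX_RETRIES) if parsing or validation fails.
    city = capital(country="France")
    people = population(city=city)
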
@@ -33,24 +34,15 @@ def __init__(self, signature, rationale_type=None, activated=True, **config): self.activated = activated - signature = self.signature - *keys, last_key = signature.kwargs.keys() + signature = ensure_signature(self.signature) + *_keys, last_key = signature.output_fields.keys() - DEFAULT_RATIONALE_TYPE = dsp.Type( + rationale_type = rationale_type or dspy.OutputField( prefix="Reasoning: Let's think step by step in order to", desc="${produce the " + last_key + "}. We ...", ) - rationale_type = rationale_type or DEFAULT_RATIONALE_TYPE - - extended_kwargs = {key: signature.kwargs[key] for key in keys} - extended_kwargs.update( - {"rationale": rationale_type, last_key: signature.kwargs[last_key]} - ) - - self.extended_signature = dsp.Template( - signature.instructions, **extended_kwargs - ) + self.extended_signature = signature.prepend("rationale", rationale_type, type_=str) def forward(self, **kwargs): new_signature = kwargs.pop("new_signature", None) @@ -62,7 +54,8 @@ def forward(self, **kwargs): else: signature = self.signature else: - signature = dsp.Template(self.signature.instructions, **new_signature) + signature = new_signature + # template = dsp.Template(self.signature.instructions, **new_signature) return super().forward(signature=signature, **kwargs) diff --git a/dspy/predict/chain_of_thought_with_hint.py b/dspy/predict/chain_of_thought_with_hint.py index b968d0bd95..83d5b5b4b4 100644 --- a/dspy/predict/chain_of_thought_with_hint.py +++ b/dspy/predict/chain_of_thought_with_hint.py @@ -1,4 +1,4 @@ -import dsp +import dsp, dspy from .predict import Predict @@ -9,27 +9,18 @@ class ChainOfThoughtWithHint(Predict): def __init__(self, signature, rationale_type=None, activated=True, **config): super().__init__(signature, **config) - self.activated = activated - signature = self.signature - *keys, last_key = signature.kwargs.keys() - - DEFAULT_HINT_TYPE = dsp.Type(prefix="Hint:", desc="${hint}") - DEFAULT_RATIONALE_TYPE = dsp.Type(prefix="Reasoning: Let's think step by step in order to", - desc="${produce the " + last_key + "}. We ...") + *keys, last_key = signature.fields.keys() + rationale_type = rationale_type or dspy.OutputField( + prefix="Reasoning: Let's think step by step in order to", + desc="${produce the " + last_key + "}. 
We ...", + ) + self.extended_signature1 = self.signature.insert(-2, "rationale", rationale_type, type_=str) - rationale_type = rationale_type or DEFAULT_RATIONALE_TYPE - - extended_kwargs1 = {key: signature.kwargs[key] for key in keys} - extended_kwargs1.update({'rationale': rationale_type, last_key: signature.kwargs[last_key]}) - - extended_kwargs2 = {key: signature.kwargs[key] for key in keys} - extended_kwargs2.update({'hint': DEFAULT_HINT_TYPE, 'rationale': rationale_type, last_key: signature.kwargs[last_key]}) - - self.extended_signature1 = dsp.Template(signature.instructions, **extended_kwargs1) - self.extended_signature2 = dsp.Template(signature.instructions, **extended_kwargs2) + DEFAULT_HINT_TYPE = dspy.OutputField() + self.extended_signature2 = self.extended_signature1.insert(-2, "hint", DEFAULT_HINT_TYPE, type_=str) def forward(self, **kwargs): signature = self.signature diff --git a/dspy/predict/langchain.py b/dspy/predict/langchain.py index e3ddd37cec..4be855e8db 100644 --- a/dspy/predict/langchain.py +++ b/dspy/predict/langchain.py @@ -13,6 +13,8 @@ from langchain_core.pydantic_v1 import Extra from langchain_core.runnables import Runnable +# TODO: This class is currently hard to test, because it hardcodes gpt-4 usage: +# gpt4T = dspy.OpenAI(model='gpt-4-1106-preview', max_tokens=4000, model_type='chat') class Template2Signature(dspy.Signature): """You are a processor for prompts. I will give you a prompt template (Python f-string) for an arbitrary task for other LMs. diff --git a/dspy/predict/multi_chain_comparison.py b/dspy/predict/multi_chain_comparison.py index 99c2b43c5a..89fc732979 100644 --- a/dspy/predict/multi_chain_comparison.py +++ b/dspy/predict/multi_chain_comparison.py @@ -1,38 +1,55 @@ +import dspy +from dspy.signatures.signature import ensure_signature from .predict import Predict from ..primitives.program import Module import dsp + class MultiChainComparison(Module): def __init__(self, signature, M=3, temperature=0.7, **config): super().__init__() self.M = M - signature = Predict(signature).signature - *keys, last_key = signature.kwargs.keys() + signature = ensure_signature(signature) - extended_kwargs = {key: signature.kwargs[key] for key in keys} + *_, self.last_key = signature.output_fields.keys() for idx in range(M): - candidate_type = dsp.Type(prefix=f"Student Attempt #{idx+1}:", desc="${reasoning attempt}") - extended_kwargs.update({f'reasoning_attempt_{idx+1}': candidate_type}) - - rationale_type = dsp.Type(prefix="Accurate Reasoning: Thank you everyone. Let's now holistically", desc="${corrected reasoning}") - extended_kwargs.update({'rationale': rationale_type, last_key: signature.kwargs[last_key]}) + signature = signature.append( + f"reasoning_attempt_{idx+1}", + dspy.InputField( + prefix=f"Student Attempt #{idx+1}:", desc="${reasoning attempt}" + ), + ) + + signature = signature.prepend( + "rationale", + dspy.OutputField( + prefix="Accurate Reasoning: Thank you everyone. 
Let's now holistically", + desc="${corrected reasoning}", + ), + ) - signature = dsp.Template(signature.instructions, **extended_kwargs) self.predict = Predict(signature, temperature=temperature, **config) - self.last_key = last_key - + def forward(self, completions, **kwargs): attempts = [] for c in completions: - rationale = c.rationale.strip().split('\n')[0].strip() - answer = c[self.last_key].strip().split('\n')[0].strip() - attempts.append(f"«I'm trying to {rationale} I'm not sure but my prediction is {answer}»") + rationale = c.rationale.strip().split("\n")[0].strip() + answer = c[self.last_key].strip().split("\n")[0].strip() + attempts.append( + f"«I'm trying to {rationale} I'm not sure but my prediction is {answer}»" + ) assert len(attempts) == self.M, len(attempts) - kwargs = {**{f'reasoning_attempt_{idx+1}': attempt for idx, attempt in enumerate(attempts)}, **kwargs} + kwargs = { + **{ + f"reasoning_attempt_{idx+1}": attempt + for idx, attempt in enumerate(attempts) + }, + **kwargs, + } return self.predict(**kwargs) diff --git a/dspy/predict/predict.py b/dspy/predict/predict.py index c68ac1a5a8..3823a72a78 100644 --- a/dspy/predict/predict.py +++ b/dspy/predict/predict.py @@ -3,42 +3,16 @@ from dspy.predict.parameter import Parameter from dspy.primitives.prediction import Prediction -from dspy.signatures.field import InputField, OutputField -from dspy.signatures.signature import infer_prefix +from dspy.signatures.signature import ensure_signature, signature_to_template class Predict(Parameter): def __init__(self, signature, **config): self.stage = random.randbytes(8).hex() - self.signature = signature #.signature + self.signature = ensure_signature(signature) self.config = config self.reset() - # if the signature is a string - if isinstance(signature, str): - inputs, outputs = signature.split("->") - inputs, outputs = inputs.split(","), outputs.split(",") - inputs, outputs = [field.strip() for field in inputs], [field.strip() for field in outputs] - - assert all(len(field.split()) == 1 for field in (inputs + outputs)) - - inputs_ = ', '.join([f"`{field}`" for field in inputs]) - outputs_ = ', '.join([f"`{field}`" for field in outputs]) - - instructions = f"""Given the fields {inputs_}, produce the fields {outputs_}.""" - - inputs = {k: InputField() for k in inputs} - outputs = {k: OutputField() for k in outputs} - - for k, v in inputs.items(): - v.finalize(k, infer_prefix(k)) - - for k, v in outputs.items(): - v.finalize(k, infer_prefix(k)) - - self.signature = dsp.Template(instructions, **inputs, **outputs) - - def reset(self): self.lm = None self.traces = [] @@ -51,43 +25,47 @@ def dump_state(self): # Cache the signature instructions and the last field's name. state["signature_instructions"] = self.signature.instructions - state["signature_prefix"] = self.signature.fields[-1].name + + *_, last_key = self.signature.fields.keys() + state["signature_prefix"] = self.signature.fields[last_key].json_schema_extra['prefix'] return state def load_state(self, state): for name, value in state.items(): setattr(self, name, value) - + # Reconstruct the signature. 
if "signature_instructions" in state: instructions = state["signature_instructions"] - self.signature.instructions = instructions - + self.signature = self.signature.with_instructions(instructions) + if "signature_prefix" in state: prefix = state["signature_prefix"] - self.signature.fields[-1] = self.signature.fields[-1]._replace(name=prefix) - + *_, last_key = self.signature.fields.keys() + self.signature = self.signature.with_updated_fields(last_key, prefix=prefix) + def __call__(self, **kwargs): return self.forward(**kwargs) - + def forward(self, **kwargs): # Extract the three privileged keyword arguments. - new_signature = kwargs.pop("new_signature", None) - signature = kwargs.pop("signature", self.signature) + new_signature = ensure_signature(kwargs.pop("new_signature", None)) + signature = ensure_signature(kwargs.pop("signature", self.signature)) demos = kwargs.pop("demos", self.demos) config = dict(**self.config, **kwargs.pop("config", {})) # Get the right LM to use. lm = kwargs.pop("lm", self.lm) or dsp.settings.lm + assert lm is not None, "No LM is loaded." # If temperature is 0.0 but its n > 1, set temperature to 0.7. temperature = config.get("temperature", None) - temperature = lm.kwargs['temperature'] if temperature is None else temperature + temperature = lm.kwargs["temperature"] if temperature is None else temperature num_generations = config.get("n", None) if num_generations is None: - num_generations = lm.kwargs.get('n', lm.kwargs.get('num_generations', None)) + num_generations = lm.kwargs.get("n", lm.kwargs.get("num_generations", None)) if (temperature is None or temperature <= 0.15) and num_generations > 1: config["temperature"] = 0.7 @@ -98,25 +76,35 @@ def forward(self, **kwargs): x = dsp.Example(demos=demos, **kwargs) if new_signature is not None: - signature = dsp.Template(signature.instructions, **new_signature) + signature = new_signature + + assert all(k in kwargs for k in signature.input_fields), "Not all input fields were provided." + + # Switch to legacy format for dsp.generate + template = signature_to_template(signature) if self.lm is None: - x, C = dsp.generate(signature, **config)(x, stage=self.stage) + x, C = dsp.generate(template, **config)(x, stage=self.stage) else: + # Note: query_only=True means the instructions and examples are not included. + # I'm not really sure why we'd want to do that, but it's there. with dsp.settings.context(lm=self.lm, query_only=True): - # print(f"using lm = {self.lm} !") - x, C = dsp.generate(signature, **config)(x, stage=self.stage) + x, C = dsp.generate(template, **config)(x, stage=self.stage) + + assert self.stage in x, "The generated (input, output) example was not stored" completions = [] for c in C: completions.append({}) - for field in signature.fields: + for field in template.fields: if field.output_variable not in kwargs.keys(): - completions[-1][field.output_variable] = getattr(c, field.output_variable) + completions[-1][field.output_variable] = getattr( + c, field.output_variable + ) pred = Prediction.from_completions(completions, signature=signature) - + if kwargs.pop("_trace", True) and dsp.settings.trace is not None: trace = dsp.settings.trace trace.append((self, {**kwargs}, pred)) @@ -125,7 +113,7 @@ def forward(self, **kwargs): def update_config(self, **kwargs): self.config = {**self.config, **kwargs} - + def get_config(self): return self.config @@ -133,7 +121,6 @@ def __repr__(self): return f"{self.__class__.__name__}({self.signature})" - # TODO: get some defaults during init from the context window? 
# # TODO: FIXME: Hmm, I guess expected behavior is that contexts can # affect execution. Well, we need to determine whether context dominates, __init__ demoninates, or forward dominates. diff --git a/dspy/predict/program_of_thought.py b/dspy/predict/program_of_thought.py index 65d1613b3d..516c1129cf 100644 --- a/dspy/predict/program_of_thought.py +++ b/dspy/predict/program_of_thought.py @@ -1,94 +1,159 @@ import dsp import dspy +from dspy.signatures.signature import ensure_signature from ..primitives.program import Module from ..primitives.python_interpreter import CodePrompt, PythonInterpreter import re + class ProgramOfThought(Module): def __init__(self, signature, max_iters=3): super().__init__() - self.signature = signature = dspy.Predict(signature).signature + self.signature = signature = ensure_signature(signature) self.max_iters = max_iters - self.input_fields = signature.input_fields() - self.output_fields = signature.output_fields() + self.input_fields = signature.input_fields + self.output_fields = signature.output_fields - inputs_ = ', '.join([f"`{field_name}`" for field_name in self.input_fields.keys()]) - outputs_ = ', '.join([f"`{field_name}`" for field_name in self.output_fields.keys()]) + inputs_ = ", ".join( + [f"`{field_name}`" for field_name in self.input_fields.keys()] + ) + outputs_ = ", ".join( + [f"`{field_name}`" for field_name in self.output_fields.keys()] + ) assert len(self.output_fields) == 1, "PoT only supports one output field." - + instr = [] - instr.append(f"You will be given {inputs_} and you will respond with {outputs_}.") - instr.append(f"Generating executable Python code that programmatically computes the correct {outputs_}.") - instr.append(f"After you're done with the computation, make sure the last line in your code evaluates to the correct value for {outputs_}.") - instr = '\n'.join(instr) - - self.code_generate = dspy.ChainOfThought(dsp.Template(self._generate_instruction('generate'), **self._generate_signature('generate'))) - self.code_regenerate = dspy.ChainOfThought(dsp.Template(self._generate_instruction('regenerate'), **self._generate_signature('regenerate'))) - self.generate_answer = dspy.ChainOfThought(dsp.Template(self._generate_instruction('answer'), **self._generate_signature('answer'))) + instr.append( + f"You will be given {inputs_} and you will respond with {outputs_}." + ) + instr.append( + f"Generating executable Python code that programmatically computes the correct {outputs_}." + ) + instr.append( + f"After you're done with the computation, make sure the last line in your code evaluates to the correct value for {outputs_}." 
+ ) + instr = "\n".join(instr) + + self.code_generate = dspy.ChainOfThought( + dspy.Signature( + self._generate_signature("generate").fields, + self._generate_instruction("generate"), + ) + ) + self.code_regenerate = dspy.ChainOfThought( + dspy.Signature( + self._generate_signature("regenerate").fields, + self._generate_instruction("regenerate"), + ) + ) + self.generate_answer = dspy.ChainOfThought( + dspy.Signature( + self._generate_signature("answer").fields, + self._generate_instruction("answer"), + ) + ) def _generate_signature(self, mode): signature_dict = dict(self.input_fields) fields_for_mode = { - 'generate': { - 'generated_code': dspy.OutputField(prefix="Code:", desc="python code that answers the question", format=str) + "generate": { + "generated_code": dspy.OutputField( + prefix="Code:", + desc="python code that answers the question", + format=str, + ) + }, + "regenerate": { + "previous_code": dspy.InputField( + prefix="Previous Code:", + desc="previously-generated python code that errored", + format=str, + ), + "error": dspy.InputField( + prefix="Error:", + desc="error message from previously-generated python code", + ), + "generated_code": dspy.OutputField( + prefix="Code:", + desc="python code that answers the question", + format=str, + ), }, - 'regenerate': { - 'previous_code': dspy.InputField(prefix="Previous Code:", desc="previously-generated python code that errored", format=str), - 'error': dspy.InputField(prefix="Error:", desc="error message from previously-generated python code"), - 'generated_code': dspy.OutputField(prefix="Code:", desc="python code that answers the question", format=str) + "answer": { + "final_generated_code": dspy.InputField( + prefix="Code:", + desc="python code that answers the question", + format=str, + ), + "code_output": dspy.InputField( + prefix="Code Output:", + desc="output of previously-generated python code", + ), + "answer": self.signature.fields["answer"], }, - 'answer': { - 'final_generated_code': dspy.InputField(prefix="Code:", desc="python code that answers the question", format=str), - 'code_output': dspy.InputField(prefix="Code Output:", desc="output of previously-generated python code"), - 'answer': self.signature.kwargs["answer"] - } } signature_dict.update(fields_for_mode[mode]) - return signature_dict + return dspy.Signature(signature_dict) def _generate_instruction(self, mode): - mode_inputs = ', '.join([f"`{field_name}`" for field_name in self._generate_signature(mode).keys() if isinstance(self._generate_signature(mode)[field_name], dspy.InputField)]) - mode_outputs = ', '.join([f"`{field_name}`" for field_name in self._generate_signature(mode).keys() if isinstance(self._generate_signature(mode)[field_name], dspy.OutputField)]) - if mode == 'generate': + mode_inputs = ", ".join( + [ + f"`{field_name}`" + for field_name in self._generate_signature(mode).input_fields + ] + ) + mode_outputs = ", ".join( + [ + f"`{field_name}`" + for field_name in self._generate_signature(mode).output_fields + ] + ) + if mode == "generate": instr = [ f"You will be given {mode_inputs} and you will respond with {mode_outputs}.", f"Generating executable Python code that programmatically computes the correct {mode_outputs}.", - f"After you're done with the computation, make sure the last line in your code evaluates to the correct value for {mode_outputs}." 
+ f"After you're done with the computation, make sure the last line in your code evaluates to the correct value for {mode_outputs}.", ] - elif mode == 'regenerate': + elif mode == "regenerate": instr = [ f"You are given {mode_inputs} due to an error in previous code.", - f"Your task is to correct the error and provide the new {mode_outputs}." + f"Your task is to correct the error and provide the new {mode_outputs}.", ] else: # mode == 'answer' instr = [ f"Given the final code {mode_inputs}, provide the final {mode_outputs}." ] - return '\n'.join(instr) + return "\n".join(instr) def parse_code(self, code_data): - code = code_data.get('generated_code', '').split('---', 1)[0].split('\n\n\n', 1)[0] - code_match = re.search(r'```python[ \n](.*?)[ \n]```?', code, re.DOTALL) - code_block = (code_match.group(1) if code_match else code).replace('\\n', '\n') + code = ( + code_data.get("generated_code", "").split("---", 1)[0].split("\n\n\n", 1)[0] + ) + code_match = re.search(r"```python[ \n](.*?)[ \n]```?", code, re.DOTALL) + code_block = (code_match.group(1) if code_match else code).replace("\\n", "\n") if not code_block: return code, "Error: Empty code after parsing." - if "\n" not in code_block and code_block.count('=') > 1: + if "\n" not in code_block and code_block.count("=") > 1: return code, "Error: Code format is not correct." - lines = code_block.split('\n') - last_line_match = re.match(r'^(\w+)\s*=', lines[-1].strip()) + lines = code_block.split("\n") + last_line_match = re.match(r"^(\w+)\s*=", lines[-1].strip()) if last_line_match and len(lines) > 1: - code_block += '\n' + last_line_match.group(1) + code_block += "\n" + last_line_match.group(1) else: - code_block = re.sub(r'([a-zA-Z_]\w* *=.*?)(?=[a-zA-Z_]\w* *=)', r'\1\n', code_block) - code_block = re.sub(r'([a-zA-Z_]\w* *=.*?)([a-zA-Z_]\w*)$', r'\1\n\2', code_block) + code_block = re.sub( + r"([a-zA-Z_]\w* *=.*?)(?=[a-zA-Z_]\w* *=)", r"\1\n", code_block + ) + code_block = re.sub( + r"([a-zA-Z_]\w* *=.*?)([a-zA-Z_]\w*)$", r"\1\n\2", code_block + ) return code_block, None def execute_code(self, code): if not code: - return code, None, 'Error: Empty code before execution.' + return code, None, "Error: Empty code before execution." code_prompt = CodePrompt(code, code_type="python") interpreter = PythonInterpreter(action_space={"print": print}) try: @@ -96,19 +161,26 @@ def execute_code(self, code): return code, output, None except Exception as e: return code, None, str(e) - + def forward(self, **kwargs): code_data = self.code_generate(question=kwargs["question"]) parsed_code, error = self.parse_code(code_data) + # FIXME: Don't try to execute the code if it didn't parse code, output, error = self.execute_code(parsed_code) hop = 0 while hop < self.max_iters and error: - print('Error in code execution') - code_data = self.code_regenerate(question=kwargs["question"], previous_code=code, error=error) + print("Error in code execution") + code_data = self.code_regenerate( + question=kwargs["question"], previous_code=code, error=error + ) parsed_code, error = self.parse_code(code_data) + # FIXME: Don't try to execute the code if it didn't parse + code, output, error = self.execute_code(parsed_code) hop += 1 if hop == self.max_iters: - print('Max hops reached. Error persists.') + print("Max hops reached. 
Error persists.") return None - answer_gen_result = self.generate_answer(question=kwargs["question"], final_generated_code=code, code_output=output) + answer_gen_result = self.generate_answer( + question=kwargs["question"], final_generated_code=code, code_output=output + ) return answer_gen_result diff --git a/dspy/predict/react.py b/dspy/predict/react.py index 7dc3d1bd94..ef24c6aca0 100644 --- a/dspy/predict/react.py +++ b/dspy/predict/react.py @@ -1,5 +1,6 @@ import dsp import dspy +from dspy.signatures.signature import ensure_signature from ..primitives.program import Module from .predict import Predict @@ -10,35 +11,43 @@ class ReAct(Module): def __init__(self, signature, max_iters=5, num_results=3, tools=None): super().__init__() - self.signature = signature = dspy.Predict(signature).signature + self.signature = signature = ensure_signature(signature) self.max_iters = max_iters self.tools = tools or [dspy.Retrieve(k=num_results)] - self.tools = {tool.name: tool for tool in self.tools} #if isinstance(self.tools, list) else self.tools + self.tools = {tool.name: tool for tool in self.tools} - self.input_fields = {k: v for k, v in self.signature.kwargs.items() if isinstance(v, dspy.InputField)} - self.output_fields = {k: v for k, v in self.signature.kwargs.items() if isinstance(v, dspy.OutputField)} + self.input_fields = self.signature.input_fields + self.output_fields = self.signature.output_fields - inputs, outputs = signature.fields[:-1], signature.fields[-1:] + assert len(self.output_fields) == 1, "ReAct only supports one output field." - inputs_ = ', '.join([f"`{field.input_variable}`" for field in inputs]) - outputs_ = ', '.join([f"`{field.output_variable}`" for field in outputs]) + inputs_ = ", ".join([f"`{k}`" for k in self.input_fields.keys()]) + outputs_ = ", ".join([f"`{k}`" for k in self.output_fields.keys()]) - assert len(outputs) == 1, "ReAct only supports one output field." 
+ instr = [ + f"You will be given {inputs_} and you will respond with {outputs_}.\n", + "To do this, you will interleave Thought, Action, and Observation steps.\n", + "Thought can reason about the current situation, and Action can be the following types:\n", + ] - instr = [] - instr.append(f"You will be given {inputs_} and you will respond with {outputs_}.\n") - instr.append("To do this, you will interleave Thought, Action, and Observation steps.\n") - instr.append("Thought can reason about the current situation, and Action can be the following types:\n") - - self.tools['Finish'] = dspy.Example(name="Finish", input_variable=outputs_.strip('`'), desc=f"returns the final {outputs_} and finishes the task") + self.tools["Finish"] = dspy.Example( + name="Finish", + input_variable=outputs_.strip("`"), + desc=f"returns the final {outputs_} and finishes the task", + ) for idx, tool in enumerate(self.tools): tool = self.tools[tool] - instr.append(f"({idx+1}) {tool.name}[{tool.input_variable}], which {tool.desc}") - - instr = '\n'.join(instr) - self.react = [Predict(dsp.Template(instr, **self._generate_signature(i))) for i in range(1, max_iters + 1)] + instr.append( + f"({idx+1}) {tool.name}[{tool.input_variable}], which {tool.desc}" + ) + + instr = "\n".join(instr) + self.react = [ + Predict(dspy.Signature(self._generate_signature(i), instr)) + for i in range(1, max_iters + 1) + ] def _generate_signature(self, iters): signature_dict = {} @@ -46,25 +55,42 @@ def _generate_signature(self, iters): signature_dict[key] = val for j in range(1, iters + 1): - signature_dict[f"Thought_{j}"] = dspy.OutputField(prefix=f"Thought {j}:", desc="next steps to take based on last observation") - - tool_list = ' or '.join([f"{tool.name}[{tool.input_variable}]" for tool in self.tools.values() if tool.name != 'Finish']) - signature_dict[f"Action_{j}"] = dspy.OutputField(prefix=f"Action {j}:", desc=f"always either {tool_list} or, when done, Finish[answer]") + signature_dict[f"Thought_{j}"] = dspy.OutputField( + prefix=f"Thought {j}:", + desc="next steps to take based on last observation", + ) + + tool_list = " or ".join( + [ + f"{tool.name}[{tool.input_variable}]" + for tool in self.tools.values() + if tool.name != "Finish" + ] + ) + signature_dict[f"Action_{j}"] = dspy.OutputField( + prefix=f"Action {j}:", + desc=f"always either {tool_list} or, when done, Finish[answer]", + ) if j < iters: - signature_dict[f"Observation_{j}"] = dspy.OutputField(prefix=f"Observation {j}:", desc="observations based on action", format=dsp.passages2text) + signature_dict[f"Observation_{j}"] = dspy.OutputField( + prefix=f"Observation {j}:", + desc="observations based on action", + format=dsp.passages2text, + ) return signature_dict - + def act(self, output, hop): try: action = output[f"Action_{hop+1}"] - action_name, action_val = action.strip().split('\n')[0].split('[', 1) - action_val = action_val.rsplit(']', 1)[0] + action_name, action_val = action.strip().split("\n")[0].split("[", 1) + action_val = action_val.rsplit("]", 1)[0] - if action_name == 'Finish': return action_val + if action_name == "Finish": + return action_val - try: + try: output[f"Observation_{hop+1}"] = self.tools[action_name](action_val).passages except AttributeError: # Handle the case where 'passages' attribute is missing @@ -72,8 +98,10 @@ def act(self, output, hop): output[f"Observation_{hop+1}"] = self.tools[action_name](action_val) except Exception as e: - output[f"Observation_{hop+1}"] = "Failed to parse action. Bad formatting or incorrect action name." 
- + output[f"Observation_{hop+1}"] = ( + "Failed to parse action. Bad formatting or incorrect action name." + ) + raise e def forward(self, **kwargs): args = {key: kwargs[key] for key in self.input_fields.keys() if key in kwargs} @@ -81,9 +109,10 @@ def forward(self, **kwargs): for hop in range(self.max_iters): # with dspy.settings.context(show_guidelines=(i <= 2)): output = self.react[hop](**args) - - if action_val := self.act(output, hop): break + + if action_val := self.act(output, hop): + break args.update(output) # assumes only 1 output field for now - TODO: handling for multiple output fields - return dspy.Prediction(**{list(self.output_fields.keys())[0]: action_val or ''}) + return dspy.Prediction(**{list(self.output_fields.keys())[0]: action_val or ""}) diff --git a/dspy/predict/retry.py b/dspy/predict/retry.py index b8f06633bf..af1d37f98b 100644 --- a/dspy/predict/retry.py +++ b/dspy/predict/retry.py @@ -9,41 +9,36 @@ class Retry(Predict): def __init__(self, module): super().__init__(module.signature) self.module = module - self.original_signature = module.signature.signature + self.original_signature = module.signature self.original_forward = module.forward self.new_signature = self._create_new_signature(self.original_signature) - def _create_new_signature(self, original_signature): - extended_signature = {} - input_fields = original_signature.input_fields() - output_fields = original_signature.output_fields() - modified_output_fields = {} - - for key, value in output_fields.items(): - modified_output_fields[f"past_{key}"] = dspy.InputField( - prefix="Past " + value.prefix, + def _create_new_signature(self, signature): + # Add "Past" input fields for each output field + for key, value in signature.output_fields.items(): + signature = signature.append(f"past_{key}", dspy.InputField( + prefix="Past " + value.json_schema_extra["prefix"], desc="past output with errors", - format=value.format, - ) - - extended_signature.update(input_fields) - extended_signature.update(modified_output_fields) + format=value.json_schema_extra.get("format"), + )) - extended_signature["feedback"] = dspy.InputField( + signature = signature.append("feedback", dspy.InputField( prefix="Instructions:", desc="Some instructions you must satisfy", format=str, - ) - extended_signature.update(output_fields) + )) - return extended_signature + return signature - def forward(self, *args, **kwargs): - for key, value in kwargs["past_outputs"].items(): + def forward(self, *, past_outputs, **kwargs): + # Convert the dict past_outputs={"answer": ...} to kwargs + # {past_answer=..., ...} + for key, value in past_outputs.items(): past_key = f"past_{key}" - if past_key in self.new_signature: + if past_key in self.new_signature.input_fields: kwargs[past_key] = value - del kwargs["past_outputs"] + # Tell the wrapped module to use the new signature. + # Note: This only works if the wrapped module is a Predict or ChainOfThought. 
kwargs["new_signature"] = self.new_signature return self.original_forward(**kwargs) diff --git a/dspy/primitives/assertions.py b/dspy/primitives/assertions.py index 5f89896e24..feb54005c3 100644 --- a/dspy/primitives/assertions.py +++ b/dspy/primitives/assertions.py @@ -238,7 +238,6 @@ def wrapper(*args, **kwargs): else: try: dsp.settings.trace.clear() - # print("backtrack", dspy.settings.backtrack_to) result = func(*args, **kwargs) break except (DSPySuggestionError, DSPyAssertionError) as e: @@ -282,13 +281,13 @@ def wrapper(*args, **kwargs): dspy.settings.backtrack_to ].append(error_msg) - output_fields = vars(error_state[0].signature.signature) + # assert isinstance(error_state[0].signature, dspy.Signature) + output_fields = error_state[0].signature.output_fields past_outputs = {} - for field_name, field_obj in output_fields.items(): - if isinstance(field_obj, dspy.OutputField): - past_outputs[field_name] = getattr( - error_state[2], field_name, None - ) + for field_name in output_fields.keys(): + past_outputs[field_name] = getattr( + error_state[2], field_name, None + ) # save latest failure trace for predictor per suggestion error_ip = error_state[1] diff --git a/dspy/primitives/python_interpreter.py b/dspy/primitives/python_interpreter.py index f05ec01115..11ae2795a7 100644 --- a/dspy/primitives/python_interpreter.py +++ b/dspy/primitives/python_interpreter.py @@ -14,6 +14,7 @@ import ast import difflib import importlib +import re import typing import inspect from typing import ( @@ -506,10 +507,11 @@ class TextPrompt(str): @property def key_words(self) -> Set[str]: - r"""Returns a set of strings representing the keywords in the prompt. - """ - from camel.utils import get_prompt_template_key_words - return get_prompt_template_key_words(self) + """Returns a set of strings representing the keywords in the prompt.""" + # Regex to find format placeholders within the string, excluding escaped braces + pattern = re.compile(r"\{([^{}]+)\}") + found = pattern.findall(self) + return set(found) def format(self, *args: Any, **kwargs: Any) -> 'TextPrompt': r"""Overrides the built-in :obj:`str.format` method to allow for diff --git a/dspy/signatures/field.py b/dspy/signatures/field.py index 848439b6d4..7822c625c0 100644 --- a/dspy/signatures/field.py +++ b/dspy/signatures/field.py @@ -1,31 +1,71 @@ -import re -import dsp +import pydantic -class Field: + +def move_kwargs(**kwargs): + # Pydantic doesn't allow arbitrary arguments to be given to fields, + # but asks that + # > any extra data you want to add to the JSON schema should be passed + # > as a dictionary to the json_schema_extra keyword argument. 
+ # See: https://docs.pydantic.dev/2.6/migration/#changes-to-pydanticfield + pydantic_kwargs = {} + json_schema_extra = {} + for k, v in kwargs.items(): + if k in ["desc", "prefix", "format", "parser", "__dspy_field_type"]: + json_schema_extra[k] = v + else: + pydantic_kwargs[k] = v + pydantic_kwargs["json_schema_extra"] = json_schema_extra + return pydantic_kwargs + + +def InputField(**kwargs): + return pydantic.Field(**move_kwargs(**kwargs, __dspy_field_type="input")) + + +def OutputField(**kwargs): + return pydantic.Field(**move_kwargs(**kwargs, __dspy_field_type="output")) + + +def new_to_old_field(field): + return ( + OldInputField + if field.json_schema_extra["__dspy_field_type"] == "input" + else OldOutputField + )( + prefix=field.json_schema_extra["prefix"], + desc=field.json_schema_extra["desc"], + format=field.json_schema_extra.get("format"), + ) + + +class OldField: """A more ergonomic datatype that infers prefix and desc if omitted.""" + def __init__(self, *, prefix=None, desc=None, input, format=None): self.prefix = prefix # This can be None initially and set later self.desc = desc self.format = format - + def finalize(self, key, inferred_prefix): """Set the prefix if it's not provided explicitly.""" if self.prefix is None: self.prefix = inferred_prefix + ":" - + if self.desc is None: - self.desc = f'${{{key}}}' - + self.desc = f"${{{key}}}" + def __repr__(self): return f"{self.__class__.__name__}(prefix={self.prefix}, desc={self.desc})" - + def __eq__(self, __value: object) -> bool: return self.__dict__ == __value.__dict__ -class InputField(Field): + +class OldInputField(OldField): def __init__(self, *, prefix=None, desc=None, format=None): super().__init__(prefix=prefix, desc=desc, input=True, format=format) -class OutputField(Field): + +class OldOutputField(OldField): def __init__(self, *, prefix=None, desc=None, format=None): super().__init__(prefix=prefix, desc=desc, input=False, format=format) diff --git a/dspy/signatures/signature.py b/dspy/signatures/signature.py index ea31a46d4e..b73c956d78 100644 --- a/dspy/signatures/signature.py +++ b/dspy/signatures/signature.py @@ -1,176 +1,256 @@ -import re +from copy import deepcopy import dsp +from pydantic import BaseModel, Field, create_model +from typing import Type, Union, Dict, Tuple +import re -from .field import Field, InputField, OutputField -import threading +from dspy.signatures.field import InputField, OutputField, new_to_old_field -class SignatureMeta(type): - _thread_local_storage = threading.local() - class _SignatureNamespace: - def __init__(self, fields): - for key, value in fields.items(): - setattr(self, key, value) +def signature_to_template(signature): + """Convert from new to legacy format""" + return dsp.Template( + signature.instructions, + **{name: new_to_old_field(field) for name, field in signature.fields.items()}, + ) - def input_fields(self): - return {k: v for k, v in self.__dict__.items() if isinstance(v, InputField)} - def output_fields(self): - return {k: v for k, v in self.__dict__.items() if isinstance(v, OutputField)} - +def _default_instructions(cls): + inputs_ = ", ".join([f"`{field}`" for field in cls.input_fields.keys()]) + outputs_ = ", ".join([f"`{field}`" for field in cls.output_fields.keys()]) + return f"Given the fields {inputs_}, produce the fields {outputs_}." 
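(Illustration only, not part of this patch.) The new SignatureMeta defined below backs the usual class-style declaration: omitted annotations default to str, prefixes are inferred from the field names, and a missing docstring falls back to _default_instructions. A minimal sketch:

    import dspy

    class BasicQA(dspy.Signature):
        """Answer questions with short factoid answers."""
        question = dspy.InputField()
        answer = dspy.OutputField(desc="often between 1 and 5 words")

    BasicQA.signature     # "question -> answer"
    BasicQA.instructions  # the docstring above
    # The inferred prefix for `question` is "Question:"; its desc defaults to "${question}".
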
- def __new__(cls, name, bases, class_dict): - type_attributes = {} - for k, v in list(class_dict.items()): - if isinstance(v, Field): - v.finalize(k, infer_prefix(k)) - type_attributes[k] = v - del class_dict[k] +class SignatureMeta(type(BaseModel)): + def __new__(mcs, name, bases, namespace, **kwargs): + # Set `str` as the default type for all fields + raw_annotations = namespace.get("__annotations__", {}) + for name, field in namespace.items(): + if not name.startswith("__") and name not in raw_annotations: + raw_annotations[name] = str + namespace["__annotations__"] = raw_annotations - instructions = class_dict.get('__doc__') or "" + # Let Pydantic do its thing + cls = super().__new__(mcs, name, bases, namespace, **kwargs) - new_class = super().__new__(cls, name, bases, class_dict) + if cls.__doc__ is None: + cls.__doc__ = _default_instructions(cls) - # Attach the _SignatureNamespace directly to the class - setattr(new_class, 'signature', cls._SignatureNamespace(type_attributes)) + # Ensure all fields are declared with InputField or OutputField + cls._validate_fields() - # Create and attach the template directly to the class - setattr(new_class, '_template', dsp.Template(instructions=instructions, **type_attributes)) + # Ensure all fields have a prefix + for name, field in cls.model_fields.items(): + if "prefix" not in field.json_schema_extra: + field.json_schema_extra["prefix"] = infer_prefix(name) + ":" + if "desc" not in field.json_schema_extra: + field.json_schema_extra["desc"] = f"${{{name}}}" - return new_class + return cls + + def _validate_fields(cls): + for name, field in cls.model_fields.items(): + extra = field.json_schema_extra or {} + field_type = extra.get("__dspy_field_type") + if field_type not in ["input", "output"]: + raise TypeError( + f"Field '{name}' in '{cls.__name__}' must be declared with InputField or OutputField." 
+ ) @property - def kwargs(cls): - return cls.signature.fields - - def __call__(cls, *args, **kwargs): - if len(args) == 1 and isinstance(args[0], str): - instance = super(SignatureMeta, cls).__call__(*args, **kwargs) - return instance - #old - return cls._template(*args, **kwargs) - - def __getattr__(cls, attr): - # Redirect attribute access to the template object when accessed on the class directly - if attr not in cls.__dict__: - return getattr(cls._template, attr) - return super().__getattr__(attr) - -class Signature(metaclass=SignatureMeta): - def __init__(self, signature: str = "", instructions: str = ""): - self.signature = signature - self.instructions = instructions - self.fields = {} - self.parse_structure() - - def __getattr__(self, attr): - if attr not in self.__dict__: - return getattr(self.__class__, attr) - return super().__getattr__(attr) + def signature(cls) -> str: + in_args = ", ".join(cls.input_fields.keys()) + out_args = ", ".join(cls.output_fields.keys()) + return f"{in_args} -> {out_args}" @property - def kwargs(self): - return {k: v for k, v in self.fields.items()} - - def parse_structure(self): - inputs_str, outputs_str = self.signature.split("->") - for name in inputs_str.split(","): - self.add_field(name.strip(), InputField()) - for name in outputs_str.split(","): - self.add_field(name.strip(), OutputField()) - - def attach(self, **kwargs): - for key, (prefix, desc) in kwargs.items(): - field_type = self.fields.get(key) - if not field_type: - raise ValueError(f"{key} does not exist in this signature") - field_map = { - InputField: InputField(prefix=prefix, desc=desc), - OutputField: OutputField(prefix=prefix, desc=desc) - } - self.fields[key] = field_map.get(type(field_type)) - return self - - def add_field(self, field_name: str, field_type, position="append"): - if field_name in self.fields: - raise ValueError(f"{field_name} already exists in fields.") - if isinstance(field_type, (InputField, OutputField)): - field_instance = field_type - else: - raise ValueError(f"non-existent {field_type}.") - if isinstance(field_instance, InputField) and position == "append": - input_fields = self.input_fields() - if input_fields: - last_input_key = list(input_fields.keys())[-1] - index = list(self.fields.keys()).index(last_input_key) + 1 - self.fields = {**dict(list(self.fields.items())[:index]), field_name: field_instance, **dict(list(self.fields.items())[index:])} - else: - self.fields[field_name] = field_instance - elif isinstance(field_instance, OutputField) and position == "prepend": - output_fields = self.output_fields() - if output_fields: - first_output_key = list(output_fields.keys())[0] - index = list(self.fields.keys()).index(first_output_key) - self.fields = {**dict(list(self.fields.items())[:index]), field_name: field_instance, **dict(list(self.fields.items())[index:])} - else: - self.fields[field_name] = field_instance - elif position == "prepend": - self.fields = {field_name: field_instance, **self.fields} - elif position == "append": - self.fields[field_name] = field_instance + def instructions(cls) -> str: + return getattr(cls, "__doc__", "") + + def with_instructions(cls, instructions: str): + return create_model( + cls.__name__, __base__=Signature, __doc__=instructions, **cls.fields + ) + + @property + def fields(cls): + # Make sure to give input fields before output fields + return {**cls.input_fields, **cls.output_fields} + + def with_updated_fields(cls, name, **kwargs): + """Returns a new Signature type with the field, name, updated + with 
fields[name].json_schema_extra[key] = value.""" + fields_copy = deepcopy(cls.fields) + fields_copy[name].json_schema_extra = { + **fields_copy[name].json_schema_extra, + **kwargs, + } + return create_model( + cls.__name__, __base__=Signature, __doc__=cls.instructions, **fields_copy + ) + + @property + def input_fields(cls): + return cls._get_fields_with_type("input") + + @property + def output_fields(cls): + return cls._get_fields_with_type("output") + + def _get_fields_with_type(cls, field_type): + return { + k: v + for k, v in cls.model_fields.items() + if v.json_schema_extra["__dspy_field_type"] == field_type + } + + def prepend(cls, name, field, type_=None): + return cls.insert(0, name, field, type_) + + def append(cls, name, field, type_=None): + return cls.insert(-1, name, field, type_) + + def insert(cls, index: int, name: str, field, type_: Type = None): + # It's posisble to set the type as annotation=type in pydantic.Field(...) + # But this may be annoying for users, so we allow them to pass the type + if type_ is not None: + field.annotation = type_ + + input_fields = list(cls.input_fields.items()) + output_fields = list(cls.output_fields.items()) + + # Choose the list to insert into based on the field type + lst = ( + input_fields + if field.json_schema_extra["__dspy_field_type"] == "input" + else output_fields + ) + # We support negative insert indices + if index < 0: + index += len(lst) + 1 + if index < 0 or index > len(lst): + raise ValueError(f"Invalid index: {index}") + lst.insert(index, (name, field)) + + new_fields = dict(input_fields + output_fields) + new_signature = create_model( + cls.__name__ + "'", __base__=Signature, **new_fields + ) + new_signature.__doc__ = cls.instructions + return new_signature + + def _parse_signature(cls, signature: str) -> Tuple[Type, Field]: + pattern = r"^\s*[\w\s,]+\s*->\s*[\w\s,]+\s*$" + if not re.match(pattern, signature): + raise ValueError(f"Invalid signature format: '{signature}'") + + fields = {} + inputs_str, outputs_str = map(str.strip, signature.split("->")) + inputs = [v.strip() for v in inputs_str.split(",") if v.strip()] + outputs = [v.strip() for v in outputs_str.split(",") if v.strip()] + for name in inputs: + fields[name] = (str, InputField()) + for name in outputs: + fields[name] = (str, OutputField()) + + return fields + + def __call__( + cls, + signature: Union[str, Dict[str, Tuple[type, Field]]], + instructions: str = None, + ): + """ + Creates a new Signature type with the given fields and instructions. + Note: + Even though we're calling a type, we're not making an instance of the type. + In general we don't allow instances of Signature types to be made. The call + syntax is only for your convenience. + Parameters: + signature: Format: "input1, input2 -> output1, output2" + instructions: Optional prompt for the signature. + """ + + if isinstance(signature, str): + fields = cls._parse_signature(signature) else: - raise ValueError(f"invalid field addition. 
Please verify that your field name: {field_name}, field_type: {field_type}, and expected position: {position} are correct.") + fields = signature + + # Default prompt when no instructions are provided + if instructions is None: + sig = Signature(signature, "") # Simple way to parse input/output fields + instructions = _default_instructions(sig) + + signature = create_model("Signature", __base__=Signature, **fields) + signature.__doc__ = instructions + return signature - def input_fields(self): - return {k: v for k, v in self.fields.items() if isinstance(v, InputField)} + def equals(cls, other): + """Compare the JSON schema of two Pydantic models.""" + if not isinstance(other, type) or not issubclass(other, BaseModel): + return False + if cls.instructions != other.instructions: + return False + for name in cls.fields.keys() | other.fields.keys(): + if name not in other.fields or name not in cls.fields: + return False + # TODO: Should we compare the fields? + return True - def output_fields(self): - return {k: v for k, v in self.fields.items() if isinstance(v, OutputField)} + def __repr__(cls): + """ + Outputs something on the form: + Signature(question, context -> answer + question: str = InputField(desc="..."), + context: List[str] = InputField(desc="..."), + answer: int = OutputField(desc="..."), + ) + """ + field_reprs = [] + for name, field in cls.fields.items(): + field_reprs.append(f"{name} = Field({field})") + field_repr = "\n ".join(field_reprs) + return ( + f"Signature({cls.signature}\n" + f" instructions={repr(cls.instructions)}\n" + f" {field_repr}\n)" + ) - def __repr__(self): - s = [] - for name, _ in self.fields.items(): - value = getattr(self, name, None) - if value: - s.append(f"- {name} = {value}") - else: - s.append(f"- {name} = [field not attached]") - return f'{self.__class__.__name__}\n' + '\n'.join(s) - def __eq__(self, __value: object) -> bool: - return self._template == __value._template +class Signature(BaseModel, metaclass=SignatureMeta): + pass +def ensure_signature(signature): + if signature is None: + return None + if isinstance(signature, str): + return Signature(signature) + return signature + def infer_prefix(attribute_name: str) -> str: """Infers a prefix from an attribute name.""" - + # Convert camelCase to snake_case, but handle sequences of capital letters properly - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', attribute_name) - intermediate_name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1) + s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", attribute_name) + intermediate_name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1) # Insert underscores around numbers to ensure spaces in the final output - with_underscores_around_numbers = re.sub('([a-zA-Z])(\d)', r'\1_\2', intermediate_name) - with_underscores_around_numbers = re.sub('(\d)([a-zA-Z])', r'\1_\2', with_underscores_around_numbers) + with_underscores_around_numbers = re.sub( + r"([a-zA-Z])(\d)", r"\1_\2", intermediate_name + ) + with_underscores_around_numbers = re.sub( + r"(\d)([a-zA-Z])", r"\1_\2", with_underscores_around_numbers + ) # Convert snake_case to 'Proper Title Case', but ensure acronyms are uppercased - words = with_underscores_around_numbers.split('_') + words = with_underscores_around_numbers.split("_") title_cased_words = [] for word in words: if word.isupper(): title_cased_words.append(word) else: title_cased_words.append(word.capitalize()) - - return ' '.join(title_cased_words) - -### Testing the function -assert infer_prefix('someAttributeName42IsCool') == 'Some Attribute Name 42 Is Cool' -assert 
infer_prefix('version2Update') == 'Version 2 Update' -assert infer_prefix('modelT45Enhanced') == 'Model T 45 Enhanced' -assert infer_prefix('someAttributeName') == 'Some Attribute Name' -assert infer_prefix('some_attribute_name') == 'Some Attribute Name' -assert infer_prefix('URLAddress') == 'URL Address' -assert infer_prefix('isHTTPSecure') == 'Is HTTP Secure' -assert infer_prefix('isHTTPSSecure123') == 'Is HTTPS Secure 123' \ No newline at end of file + + return " ".join(title_cased_words) diff --git a/dspy/teleprompt/bootstrap.py b/dspy/teleprompt/bootstrap.py index b885bea0f7..c4804d63d2 100644 --- a/dspy/teleprompt/bootstrap.py +++ b/dspy/teleprompt/bootstrap.py @@ -80,7 +80,7 @@ def _prepare_predictor_mappings(self): for (name1, predictor1), (name2, predictor2) in zip(student.named_predictors(), teacher.named_predictors()): assert name1 == name2, "Student and teacher must have the same program structure." - assert predictor1.signature == predictor2.signature, f"Student and teacher must have the same signatures. {type(predictor1.signature)} != {type(predictor2.signature)}" + assert predictor1.signature.equals(predictor2.signature), f"Student and teacher must have the same signatures. {type(predictor1.signature)} != {type(predictor2.signature)}" assert id(predictor1) != id(predictor2), "Student and teacher must be different objects." name2predictor[name1] = None # dict(student=predictor1, teacher=predictor2) diff --git a/dspy/teleprompt/finetune.py b/dspy/teleprompt/finetune.py index 82fdf530d8..a56adaecbe 100644 --- a/dspy/teleprompt/finetune.py +++ b/dspy/teleprompt/finetune.py @@ -7,6 +7,8 @@ import ujson from datasets.fingerprint import Hasher +from dspy.signatures.signature import signature_to_template + # from dspy.primitives import Example from .teleprompt import Teleprompter @@ -84,8 +86,9 @@ def compile(self, student, *, teacher=None, trainset, valset=None, demo = dict(demo) # TODO: FIXME: generalize. 
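Putting the signature.py rewrite above together, a hedged usage sketch (illustrative only; the example signature and field names are invented, and only behaviour exercised elsewhere in this patch is assumed):

    import dspy
    from dspy.signatures.signature import infer_prefix, signature_to_template

    # String form: each field defaults to `str`, and the docstring defaults to
    # "Given the fields `question`, produce the fields `answer`.".
    QA = dspy.Signature("question -> answer")
    assert QA.signature == "question -> answer"
    assert list(QA.input_fields.keys()) == ["question"]
    assert list(QA.output_fields.keys()) == ["answer"]

    # Class form: equivalent when field names and instructions match.
    class BasicQA(dspy.Signature):
        question = dspy.InputField()
        answer = dspy.OutputField(desc="often between 1 and 5 words")

    assert QA.equals(BasicQA)

    # Prefixes are inferred from attribute names; the module-level asserts that
    # used to sit next to infer_prefix are gone, but the behaviour is unchanged.
    assert infer_prefix("someAttributeName42IsCool") == "Some Attribute Name 42 Is Cool"
    assert infer_prefix("isHTTPSecure") == "Is HTTP Secure"

    # Legacy consumers (e.g. the finetuning and teleprompter hunks in this patch)
    # convert back to a dsp.Template explicitly.
    template = signature_to_template(QA)

Note the design shift this enables in the teleprompter changes below: instead of mutating `fields[-1]` in place, optimizers call `with_instructions(...)` and `with_updated_fields(...)`, each of which returns a fresh Signature type.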
- completion = demo.pop(predictor.signature.fields[-1].output_variable) - prompt = predictor.signature.query(dsp.Example(demos=[], **demo)).strip() + template = signature_to_template(predictor.signature) + completion = demo.pop(template.fields[-1].output_variable) + prompt = template.query(dsp.Example(demos=[], **demo)).strip() finetune_data[name_].append(dict(prompt=prompt, completion=completion)) diff --git a/dspy/teleprompt/signature_opt.py b/dspy/teleprompt/signature_opt.py index 4c047b3daf..512372f00b 100644 --- a/dspy/teleprompt/signature_opt.py +++ b/dspy/teleprompt/signature_opt.py @@ -48,6 +48,8 @@ class GenerateInstructionGivenAttempts(dspy.Signature): class SignatureOptimizer(Teleprompter): def __init__(self, prompt_model=None, metric=None, breadth=10, depth=3, init_temperature=1.4, verbose=False, track_stats=False): + if breadth <= 1: + raise ValueError("Breadth must be greater than 1") self.metric = metric self.breadth = breadth self.depth = depth @@ -60,7 +62,9 @@ def _check_candidates_equal(self, candidate1, candidate2): for p1, p2 in zip(candidate1["program"].predictors(), candidate2["program"].predictors()): if not p1.extended_signature.instructions == p2.extended_signature.instructions: return False - if not p1.extended_signature.fields[-1] == p2.extended_signature.fields[-1]: + *_, p1_last_field = p1.extended_signature.fields.values() + *_, p2_last_field = p2.extended_signature.fields.values() + if not p1_last_field == p2_last_field: return False return True @@ -103,12 +107,13 @@ def compile(self, student, *, devset, eval_kwargs): for predictor in module.predictors(): basic_instruction = None basic_prefix = None + *_, last_key = predictor.extended_signature.fields.keys() if (hasattr(predictor, 'extended_signature')): basic_instruction = predictor.extended_signature.instructions - basic_prefix = predictor.extended_signature.fields[-1].name + basic_prefix = predictor.extended_signature.fields[last_key].json_schema_extra['prefix'] else: basic_instruction = predictor.extended_signature1.instructions - basic_prefix = predictor.extended_signature1.fields[-1].name + basic_prefix = predictor.extended_signature1.fields[last_key].json_schema_extra['prefix'] if self.prompt_model: with dspy.settings.context(lm=self.prompt_model): instruct = dspy.Predict(BasicGenerateInstruction, n=self.breadth-1, temperature=self.init_temperature)(basic_instruction=basic_instruction) @@ -146,13 +151,19 @@ def compile(self, student, *, devset, eval_kwargs): # Set this new module with our instruction / prefix if (hasattr(p_new, 'extended_signature')): - p_new.extended_signature.instructions = instruction - p_new.extended_signature.fields[-1] = p_new.extended_signature.fields[-1]._replace(name=prefix) + *_, last_key = p_new.extended_signature.fields.keys() + p_new.extended_signature = p_new.extended_signature \ + .with_instructions(instruction) \ + .with_updated_fields(last_key, prefix=prefix) else: - p_new.extended_signature1.instructions = instruction - p_new.extended_signature1.fields[-1] = p_new.extended_signature1.fields[-1]._replace(name=prefix) - p_new.extended_signature2.instructions = instruction - p_new.extended_signature2.fields[-1] = p_new.extended_signature2.fields[-1]._replace(name=prefix) + *_, last_key = p_new.extended_signature1.fields.keys() + p_new.extended_signature1 = p_new.extended_signature1 \ + .with_instructions(instruction) \ + .with_updated_fields(last_key, prefix=prefix) + *_, last_key = p_new.extended_signature2.fields.keys() + p_new.extended_signature2 = 
p_new.extended_signature2 \ + .with_instructions(instruction) \ + .with_updated_fields(last_key, prefix=prefix) # Score the instruction / prefix if self.verbose: print(f"----------------") @@ -203,13 +214,19 @@ def compile(self, student, *, devset, eval_kwargs): # to ensure the next round of scores reflect the best possible version best_candidate = max(evaluated_candidates[id(p_old)].values(), key=lambda candidate: candidate['score']) if (hasattr(p_new, 'extended_signature')): - p_new.extended_signature.instructions = best_candidate["instruction"] - p_new.extended_signature.fields[-1] = p_new.extended_signature.fields[-1]._replace(name=best_candidate["prefix"]) + *_, last_key = p_old.extended_signature.fields.keys() + p_new.extended_signature = p_new.extended_signature \ + .with_instructions(best_candidate["instruction"]) \ + .with_updated_fields(last_key, prefix=best_candidate["prefix"]) else: - p_new.extended_signature1.instructions = best_candidate["instruction"] - p_new.extended_signature1.fields[-1] = p_new.extended_signature1.fields[-1]._replace(name=best_candidate["prefix"]) - p_new.extended_signature2.instructions = best_candidate["instruction"] - p_new.extended_signature2.fields[-1] = p_new.extended_signature2.fields[-1]._replace(name=best_candidate["prefix"]) + *_, last_key1 = p_old.extended_signature1.fields.keys() + p_new.extended_signature1 = p_new.extended_signature \ + .with_instructions(best_candidate["instruction"]) \ + .with_updated_fields(last_key1, prefix=best_candidate["prefix"]) + *_, last_key2 = p_old.extended_signature2.fields.keys() + p_new.extended_signature2 = p_new.extended_signature \ + .with_instructions(best_candidate["instruction"]) \ + .with_updated_fields(last_key2, prefix=best_candidate["prefix"]) if self.verbose: print(f"Updating Predictor {id(p_old)} to:\ni: {best_candidate['instruction']}\np: {best_candidate['prefix']}") if self.verbose: print(f"Full predictor with update: ") for i,predictor in enumerate(module_clone.predictors()): diff --git a/dspy/teleprompt/signature_opt_bayesian.py b/dspy/teleprompt/signature_opt_bayesian.py index 68d7aacf0e..045461fcea 100644 --- a/dspy/teleprompt/signature_opt_bayesian.py +++ b/dspy/teleprompt/signature_opt_bayesian.py @@ -1,5 +1,6 @@ import dsp import dspy +from dspy.signatures.signature import signature_to_template from dspy.teleprompt.teleprompt import Teleprompter from dspy.signatures import Signature from dspy.evaluate.evaluate import Evaluate @@ -114,10 +115,12 @@ def _print_full_program(self, program): if self.verbose: print(f"Predictor {i}") if (hasattr(predictor, 'extended_signature')): if self.verbose: print(f"i: {predictor.extended_signature.instructions}") - if self.verbose: print(f"p: {predictor.extended_signature.fields[-1].name}") + *_, last_field = predictor.extended_signature.fields.values() + if self.verbose: print(f"p: {last_field.json_schema_extra['prefix']}") else: if self.verbose: print(f"i: {predictor.extended_signature1.instructions}") - if self.verbose: print(f"p: {predictor.extended_signature1.fields[-1].name}") + *_, last_field = predictor.extended_signature1.fields.values() + if self.verbose: print(f"p: {last_field.json_schema_extra['prefix']}") if self.verbose: print("\n") def _print_model_history(self, model, n=1): @@ -186,8 +189,8 @@ def _generate_first_N_candidates(self, module, N, view_data, view_examples, demo if example["augmented"]: if example_set_i not in example_set: example_set[example_set_i] = [] - fields_to_use = predictor.signature.fields - input_variable_names = 
[field.input_variable for field in fields_to_use] + fields_to_use = signature_to_template(predictor.signature).fields + input_variable_names = list(predictor.signature.input_fields.keys()) example_with_only_signature_fields = {key: value for key, value in example.items() if key in input_variable_names} example_string = self._create_example_string(fields_to_use, example_with_only_signature_fields) example_set[example_set_i].append(example_string) @@ -202,16 +205,28 @@ def _generate_first_N_candidates(self, module, N, view_data, view_examples, demo basic_prefix = None if (hasattr(predictor, 'extended_signature')): basic_instruction = predictor.extended_signature.instructions - basic_prefix = predictor.extended_signature.fields[-1].name + *_, last_field = predictor.extended_signature.fields.values() + basic_prefix = last_field.json_schema_extra["prefix"] else: basic_instruction = predictor.extended_signature1.instructions - basic_prefix = predictor.extended_signature1.fields[-1].name + *_, last_field = predictor.extended_signature1.fields.values() + basic_prefix = last_field.json_schema_extra["prefix"] with dspy.settings.context(lm=self.prompt_model): # Data & Examples if view_data and view_examples: + if 1 not in example_sets[id(predictor)].keys(): + raise ValueError("No examples found for the given predictor") instruct = None - for i in range(1,self.n): - new_instruct = dspy.Predict(BasicGenerateInstructionWithExamplesAndDataObservations, n=1, temperature=self.init_temperature)(basic_instruction=basic_instruction, observations=self.observations, examples=example_sets[id(predictor)][i]) + for i in range(1, self.n): + new_instruct = dspy.Predict( + BasicGenerateInstructionWithExamplesAndDataObservations, + n=1, + temperature=self.init_temperature + )( + basic_instruction=basic_instruction, + observations=self.observations, + examples=example_sets[id(predictor)][i] + ) if not instruct: instruct = new_instruct else: @@ -224,7 +239,14 @@ def _generate_first_N_candidates(self, module, N, view_data, view_examples, demo elif view_examples: instruct = None for i in range(1,self.n): # Note: skip over the first example set which is empty - new_instruct = dspy.Predict(BasicGenerateInstructionWithExamples, n=1, temperature=self.init_temperature)(basic_instruction=basic_instruction, examples=example_sets[id(predictor)][i]) + new_instruct = dspy.Predict( + BasicGenerateInstructionWithExamples, + n=1, + temperature=self.init_temperature + )( + basic_instruction=basic_instruction, + examples=example_sets[id(predictor)][i] + ) if not instruct: instruct = new_instruct else: @@ -314,8 +336,10 @@ def objective(trial): selected_prefix = selected_candidate.proposed_prefix_for_output_field.strip('"').strip() # Use this candidates in our program - p_new.extended_signature.instructions = selected_instruction - p_new.extended_signature.fields[-1] = p_new.extended_signature.fields[-1]._replace(name=selected_prefix) + *_, last_field = p_new.extended_signature.fields.keys() + p_new.extended_signature = p_new.extended_signature \ + .with_instructions(selected_instruction) \ + .with_updated_fields(last_field, prefix=selected_prefix) # Get the selected demos selected_demos = p_demo_candidates[demos_idx] @@ -353,8 +377,9 @@ def objective(trial): trial_num += 1 raise optuna.TrialPruned() - if self.verbose: print(f"Fully evaled score: {curr_weighted_avg_score}") - self._print_model_history(self.task_model, n=1) + if self.verbose: + print(f"Fully evaled score: {curr_weighted_avg_score}") + 
self._print_model_history(self.task_model, n=1) score = curr_weighted_avg_score trial_logs[trial_num]["score"] = curr_weighted_avg_score diff --git a/dspy/utils/__init__.py b/dspy/utils/__init__.py new file mode 100644 index 0000000000..c6f239df08 --- /dev/null +++ b/dspy/utils/__init__.py @@ -0,0 +1 @@ +from .dummies import * \ No newline at end of file diff --git a/dspy/utils/dummies.py b/dspy/utils/dummies.py new file mode 100644 index 0000000000..a0c997145b --- /dev/null +++ b/dspy/utils/dummies.py @@ -0,0 +1,144 @@ +import random +from dsp.modules import LM +from typing import List, Union, Dict +import numpy as np +from dsp.utils.utils import dotdict +import re + + +class DummyLM(LM): + """Dummy language model for unit testing purposes.""" + + def __init__(self, answers: Union[List[str], Dict[str,str]], follow_examples: bool = False): + """ + Initializes the dummy language model. + Parameters: + - answers: A list of strings or a dictionary with string keys and values. + - follow_examples: If True, and the prompt contains an example exactly equal to the prompt, + the dummy model will return the next string in the list for each request. + If a list is provided, the dummy model will return the next string in the list for each request. + If a dictionary is provided, the dummy model will return the value corresponding to the key that matches the prompt. + """ + super().__init__("dummy-model") + self.provider = "dummy" + self.answers = answers + self.follow_examples = follow_examples + + def basic_request(self, prompt, n=1, **kwargs): + """Generates a dummy response based on the prompt.""" + dummy_response = {"choices": []} + for _ in range(n): + answer = None + + if self.follow_examples: + prefix = prompt.split("\n")[-1] + _instructions, _format, *examples, _output = prompt.split("\n---\n") + examples_str = "\n".join(examples) + possible_answers = re.findall(prefix + r"\s*(.*)", examples_str) + if possible_answers: + # We take the last answer, as the first one is just from + # the "Follow the following format" section. + answer = possible_answers[-1] + print(f"DummyLM got found previous example for {prefix} with value {answer=}") + else: + print(f"DummyLM couldn't find previous example for {prefix=}") + + if answer is None: + if isinstance(self.answers, dict): + answer = next((v for k, v in self.answers.items() if k in prompt), None) + else: + if len(self.answers) > 0: + answer = self.answers[0] + self.answers = self.answers[1:] + + if answer is None: + answer = "No more responses" + + # Mimic the structure of a real language model response. + dummy_response["choices"].append({ + "text": answer, + "finish_reason": "simulated completion", + }) + + RED, GREEN, RESET = '\033[91m', '\033[92m', '\033[0m' + print("=== DummyLM ===") + print(prompt, end="") + print(f"{RED}{answer}{RESET}") + print("===") + + # Simulate processing and storing the request and response. + history_entry = { + "prompt": prompt, + "response": dummy_response, + "kwargs": kwargs, + "raw_kwargs": kwargs, + } + self.history.append(history_entry) + + return dummy_response + + def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs): + """Retrieves dummy completions.""" + response = self.basic_request(prompt, **kwargs) + choices = response["choices"] + + # Filter choices and return text completions. 
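+        # Note: only_completed and return_sorted are accepted for interface
+        # compatibility; the dummy simply returns the text of every choice.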
+ completions = [choice["text"] for choice in choices] + + return completions + + def get_convo(self, index): + """Get the prompt + anwer from the ith message""" + return self.history[index]['prompt'] \ + + " " \ + + self.history[index]['response']['choices'][0]['text'] + + +def dummy_rm(passages=()): + if not passages: + def inner(query:str, *, k:int, **kwargs): + assert False, "No passages defined" + return inner + max_length = max(map(len, passages)) + 100 + vectorizer = DummyVectorizer(max_length) + passage_vecs = vectorizer(passages) + def inner(query:str, *, k:int, **kwargs): + assert k <= len(passages) + query_vec = vectorizer([query])[0] + scores = passage_vecs @ query_vec + largest_idx = (-scores).argsort()[:k] + #return dspy.Prediction(passages=[passages[i] for i in largest_idx]) + return [dotdict(dict(long_text=passages[i])) for i in largest_idx] + return inner + + +class DummyVectorizer: + """Simple vectorizer based on n-grams""" + def __init__(self, max_length=100, n_gram=2): + self.max_length = max_length + self.n_gram = n_gram + self.P = 10**9 + 7 # A large prime number + random.seed(123) + self.coeffs = [random.randrange(1, self.P) for _ in range(n_gram)] + + def _hash(self, gram): + """Hashes a string using a polynomial hash function""" + h = 1 + for coeff, c in zip(self.coeffs, gram): + h = h * coeff + ord(c) + h %= self.P + return h % self.max_length + + def __call__(self, texts: List[str]) -> np.ndarray: + vecs = [] + for text in texts: + grams = [text[i:i+self.n_gram] for i in range(len(text) - self.n_gram + 1)] + vec = [0] * self.max_length + for gram in grams: + vec[self._hash(gram)] += 1 + vecs.append(vec) + + vecs = np.array(vecs, dtype=np.float32) + vecs -= np.mean(vecs, axis=1, keepdims=True) + vecs /= np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-10 # Added epsilon to avoid division by zero + return vecs diff --git a/examples/longformqa/DSPy_LongFormQA_Cache b/examples/longformqa/DSPy_LongFormQA_Cache new file mode 160000 index 0000000000..595ce1f7df --- /dev/null +++ b/examples/longformqa/DSPy_LongFormQA_Cache @@ -0,0 +1 @@ +Subproject commit 595ce1f7dfd71dd925795d5fa07f36c9f13a4c29 diff --git a/examples/longformqa/longformqa_assertions.ipynb b/examples/longformqa/longformqa_assertions.ipynb index d3059b1cbc..40df53a244 100644 --- a/examples/longformqa/longformqa_assertions.ipynb +++ b/examples/longformqa/longformqa_assertions.ipynb @@ -6,9 +6,7 @@ "source": [ "\"DSPy7\n", "\n", - "## **DSPy Assertions**: Asserting Computational Constraints on Foundation \n", - "\n", - "### **LongFormQA**: Generating long-form length responses to answer questions" + "## **DSPy Assertions**: Asserting Computational Constraints on Foundation Models" ] }, { @@ -87,7 +85,6 @@ "if not \"dspy-ai\" in {pkg.key for pkg in pkg_resources.working_set}:\n", " !pip install -U pip\n", " !pip install dspy-ai\n", - " !pip install openai~=0.28.1\n", " !pip install -e $repo_path\n", "\n", "import dspy\n", @@ -555,7 +552,7 @@ "\n", "We can also leverage **DSPy**'s advanced compiling features to enhance our program's performance. \n", "\n", - "For this, we utilize the `BootstrapFewShotWithRandomSearch` teleprompter, which automatically incorporates few-shot demonstrations and conducts a random search over a candidate set to output the best compiled program. We evaluate this over the `answer_correctness` metric as our ultimate goal is indeed to generate correct answers to the `HotPotQA` questions from the paragraphs, aiming to optimize both intrinsic and extrinsic metrics as a result. 
\n", + "For this, we utilize the `BootstrapFewShotWithRandomSearch` teleprompter, which automatically incorporates few-shot demonstrations and conducts a random search over a candidate set to output the best compiled program. We evaluate this over the `answer_correctness` metric as our ultimate goal is indeed to generate correct answers to the `HotPotQA` questions from the paragraphs, aiming to optimize both instrinsic and extrinsic metrics as a result. \n", "\n", "Let's evaluate this on the LongFormQA program first:" ] @@ -594,24 +591,24 @@ ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note** This pipeline on the other hand sets both the teacher and student with `LongFormQAWithAssertions()` to ensure the teacher correctly instructs the student with the right bootstrapped examples and the student has the chance to self-correct with **Assertions** for any examples that are still deemed incorrect." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "longformqa = LongFormQA()\n", - "teleprompter = BootstrapFewShotWithRandomSearch(metric = answer_correctness, max_bootstrapped_demos=2, num_candidate_programs=6)\n", - "cited_longformqa_student_teacher = teleprompter.compile(student=assert_transform_module(LongFormQAWithAssertions().map_named_predictors(Retry), backtrack_handler), teacher = assert_transform_module(LongFormQAWithAssertions().map_named_predictors(Retry), backtrack_handler), trainset=trainset, valset=devset[:100])\n", - "evaluate(cited_longformqa_student_teacher)" - ] - } + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note** This pipeline on the other hand sets both the teacher and student with `LongFormQAWithAssertions()` to ensure the teacher correctly instructs the student with the right bootstrapped examples and the student has the chance to self-correct with **Assertions** for any examples that are still deemed incorrect." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "longformqa = LongFormQA()\n", + "teleprompter = BootstrapFewShotWithRandomSearch(metric = answer_correctness, max_bootstrapped_demos=2, num_candidate_programs=6)\n", + "cited_longformqa_student_teacher = teleprompter.compile(student=assert_transform_module(LongFormQAWithAssertions().map_named_predictors(Retry), backtrack_handler), teacher = assert_transform_module(LongFormQAWithAssertions().map_named_predictors(Retry), backtrack_handler), trainset=trainset, valset=devset[:100])\n", + "evaluate(cited_longformqa_student_teacher)" + ] + } ], "metadata": { "kernelspec": { diff --git a/pyproject.toml b/pyproject.toml index 8878b279aa..6db47e0ad6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "dspy-ai" -version = "2.1.10" +version = "2.1.9" description = "DSPy" readme = "README.md" authors = [{name = "Omar Khattab", email = "okhattab@stanford.edu"}] @@ -26,17 +26,16 @@ dependencies = [ "regex~=2023.10.3", "ujson~=5.8.0", "tqdm~=4.66.1", - "datasets~=2.14.6,<3.0.0", + "datasets~=2.14.6", "requests~=2.31.0", "optuna~=3.4.0", ] [project.optional-dependencies] -chromadb = ["chromadb~=0.4.14"] +pinecone = ["pinecone-client~=2.2.4"] qdrant = ["qdrant-client~=1.6.2", "fastembed~=0.1.0"] +chromadb = ["chromadb~=0.4.14"] marqo = ["marqo"] -pinecone = ["pinecone-client~=2.2.4"] -weaviate = ["weaviate-client~=3.26.1"] docs = [ "sphinx>=4.3.0", "furo>=2023.3.27", @@ -81,12 +80,11 @@ tqdm = "^4.66.1" datasets = "^2.14.6" requests = "^2.31.0" optuna = "^3.4.0" -chromadb = {version = "^0.4.14", optional = true} +pinecone-client = {version = "^2.2.4", optional = true} +qdrant-client = {version = "^1.6.2", optional = true} fastembed = {version = "^0.1.0", optional = true} +chromadb = {version = "^0.4.14", optional = true} marqo = {version = "*", optional = true} -qdrant-client = {version = "^1.6.2", optional = true} -pinecone-client = {version = "^2.2.4", optional = true} -weaviate-client = {version = "^3.26.1", optional=true} sphinx = {version = ">=4.3.0", optional = true} furo = {version = ">=2023.3.27", optional = true} docutils = {version = "<0.17", optional = true} @@ -101,11 +99,10 @@ sphinx-reredirects = {version = "^0.1.2", optional = true} sphinx-automodapi = {version = "0.16.0", optional = true} [tool.poetry.extras] -chromadb = ["chromadb"] +pinecone = ["pinecone-client"] qdrant = ["qdrant-client", "fastembed"] +chromadb = ["chromadb"] marqo = ["marqo"] -pinecone = ["pinecone-client"] -weaviate = ["weaviate-client"] docs = [ "sphinx", "furo", diff --git a/requirements.txt b/requirements.txt index 8dc43dd62c..c853c2ef3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ ujson tqdm datasets requests -optuna +optuna \ No newline at end of file diff --git a/setup.py b/setup.py index 6c5175e60d..f50a568d5f 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="dspy-ai", - version="2.1.10", + version="2.1.9", description="DSPy", long_description=long_description, long_description_content_type='text/markdown', @@ -21,13 +21,13 @@ packages=find_packages(include=['dsp.*', 'dspy.*', 'dsp', 'dspy']), python_requires='>=3.9', install_requires=requirements, - extras_require={ - "chromadb": ["chromadb~=0.4.14"], - "qdrant": ["qdrant-client", "fastembed"], - "marqo": ["marqo"], - "mongodb": ["pymongo~=3.12.0"], + extras_require={ "pinecone": ["pinecone-client~=2.2.4"], + "qdrant": 
["qdrant-client~=1.6.2", "fastembed~=0.1.0"], + "chromadb": ["chromadb~=0.4.14"], + "marqo": ["marqo"], "weaviate": ["weaviate-client~=3.26.1"], + "mongodb": ["pymongo~=3.12.0"], }, classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py new file mode 100644 index 0000000000..3af3ae9dab --- /dev/null +++ b/tests/evaluate/test_evaluate.py @@ -0,0 +1,59 @@ +import dsp, dspy +from dspy.evaluate.evaluate import Evaluate +from dspy.evaluate.metrics import answer_exact_match +from dspy.predict import Predict +from dspy.utils.dummies import DummyLM + +def new_example(question, answer): + """Helper function to create a new example.""" + return dspy.Example( + question=question, + answer=answer, + ).with_inputs("question") + +def test_evaluate_initialization(): + devset = [new_example("What is 1+1?", "2")] + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + ) + assert ev.devset == devset + assert ev.metric == answer_exact_match + assert ev.num_threads == len(devset) + assert ev.display_progress == False + +def test_evaluate_call(): + dspy.settings.configure(lm=DummyLM({"What is 1+1?": "2", "What is 2+2?": "4"})) + devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + assert program(question="What is 1+1?").answer == "2" + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + ) + score = ev(program) + assert score == 100.0 + +def test_evaluate_call_bad(): + dspy.settings.configure(lm=DummyLM({"What is 1+1?": "0", "What is 2+2?": "0"})) + devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")] + program = Predict("question -> answer") + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + ) + score = ev(program) + assert score == 0.0 + +def test_evaluate_display_table(): + devset = [new_example("What is 1+1?", "2")] + ev = Evaluate( + devset=devset, + metric=answer_exact_match, + display_table=True, + ) + assert ev.display_table == True + diff --git a/tests/evaluate/test_metrics.py b/tests/evaluate/test_metrics.py new file mode 100644 index 0000000000..f04148251b --- /dev/null +++ b/tests/evaluate/test_metrics.py @@ -0,0 +1,32 @@ +# FILEPATH: /Users/ahle/repos/dspy/tests/evaluate/test_metrics.py + +import dsp, dspy +from dspy.evaluate.metrics import answer_exact_match +from dspy.predict import Predict + +def test_answer_exact_match_string(): + example = dspy.Example( + question="What is 1+1?", + answer="2", + ).with_inputs("question") + pred = Predict("question -> answer") + pred.answer = "2" + assert answer_exact_match(example, pred) + +def test_answer_exact_match_list(): + example = dspy.Example( + question="What is 1+1?", + answer=["2", "two"], + ).with_inputs("question") + pred = Predict("question -> answer") + pred.answer = "2" + assert answer_exact_match(example, pred) + +def test_answer_exact_match_no_match(): + example = dspy.Example( + question="What is 1+1?", + answer="2", + ).with_inputs("question") + pred = Predict("question -> answer") + pred.answer = "3" + assert not answer_exact_match(example, pred) \ No newline at end of file diff --git a/tests/examples/test_baleen.py b/tests/examples/test_baleen.py new file mode 100644 index 0000000000..ab14458444 --- /dev/null +++ b/tests/examples/test_baleen.py @@ -0,0 +1,136 @@ +import pytest +from dsp.utils import deduplicate +import dspy.evaluate +import dspy +from dspy.datasets 
import HotPotQA +from dspy.evaluate.evaluate import Evaluate +from dspy.teleprompt.bootstrap import BootstrapFewShot + + +class GenerateAnswer(dspy.Signature): + """Answer questions with short factoid answers.""" + + context = dspy.InputField(desc="may contain relevant facts") + question = dspy.InputField() + answer = dspy.OutputField(desc="often between 1 and 5 words") + + +class GenerateSearchQuery(dspy.Signature): + """Write a simple search query that will help answer a complex question.""" + + context = dspy.InputField(desc="may contain relevant facts") + question = dspy.InputField() + query = dspy.OutputField() + + +class SimplifiedBaleen(dspy.Module): + def __init__(self, passages_per_hop=3, max_hops=2): + super().__init__() + + self.generate_query = [ + dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops) + ] + self.retrieve = dspy.Retrieve(k=passages_per_hop) + self.generate_answer = dspy.ChainOfThought(GenerateAnswer) + self.max_hops = max_hops + + def forward(self, question): + context = [] + + for hop in range(self.max_hops): + query = self.generate_query[hop](context=context, question=question).query + passages = self.retrieve(query).passages + context = deduplicate(context + passages) + + pred = self.generate_answer(context=context, question=question) + return dspy.Prediction(context=context, answer=pred.answer) + + +def load_hotpotqa(): + # Load the dataset. + dataset = HotPotQA( + train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0 + ) + # Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata. + trainset = [x.with_inputs("question") for x in dataset.train] + devset = [x.with_inputs("question") for x in dataset.dev] + return trainset, devset + + +# @pytest.mark.slow_test +# TODO: Find a way to make this test run without openai +def _test_baleen(): + lm = dspy.OpenAI(model="gpt-3.5-turbo") + rm = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts") + dspy.settings.configure(lm=lm, rm=rm) + + # Ask any question you like to this simple RAG program. + my_question = "How many storeys are in the castle that David Gregory inherited?" + + # Get the prediction. This contains `pred.context` and `pred.answer`. 
+ uncompiled_baleen = SimplifiedBaleen() # uncompiled (i.e., zero-shot) program + pred = uncompiled_baleen(my_question) + + assert pred.answer == "five" + + +def validate_context_and_answer_and_hops(example, pred, trace=None): + if not dspy.evaluate.answer_exact_match(example, pred): + return False + if not dspy.evaluate.answer_passage_match(example, pred): + return False + + hops = [example.question] + [ + outputs.query for *_, outputs in trace if "query" in outputs + ] + + if max([len(h) for h in hops]) > 100: + return False + if any( + dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) + for idx in range(2, len(hops)) + ): + return False + + return True + + +def gold_passages_retrieved(example, pred, trace=None): + gold_titles = set(map(dspy.evaluate.normalize_text, example["gold_titles"])) + found_titles = set( + map(dspy.evaluate.normalize_text, [c.split(" | ")[0] for c in pred.context]) + ) + + return gold_titles.issubset(found_titles) + + +# @pytest.mark.slow_test +# TODO: Find a way to make this test run without the slow hotpotqa dataset +def _test_compiled_baleen(): + trainset, devset = load_hotpotqa() + lm = dspy.OpenAI(model="gpt-3.5-turbo") + rm = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts") + dspy.settings.configure(lm=lm, rm=rm) + + uncompiled_baleen = SimplifiedBaleen() # uncompiled (i.e., zero-shot) program + + teleprompter = BootstrapFewShot(metric=validate_context_and_answer_and_hops) + compiled_baleen = teleprompter.compile( + SimplifiedBaleen(), + teacher=SimplifiedBaleen(passages_per_hop=2), + trainset=trainset, + ) + + evaluate_on_hotpotqa = Evaluate( + devset=devset, num_threads=1, display_progress=True, display_table=5 + ) + uncompiled_baleen_retrieval_score = evaluate_on_hotpotqa( + uncompiled_baleen, metric=gold_passages_retrieved, display=False + ) + # assert uncompiled_baleen_retrieval_score / 100 == 18 / 50 + + compiled_baleen_retrieval_score = evaluate_on_hotpotqa( + compiled_baleen, metric=gold_passages_retrieved + ) + # assert compiled_baleen_retrieval_score / 100 == 27 / 50 + assert uncompiled_baleen_retrieval_score < compiled_baleen_retrieval_score \ No newline at end of file diff --git a/tests/functional/test_functional.py b/tests/functional/test_functional.py new file mode 100644 index 0000000000..8ed180cff4 --- /dev/null +++ b/tests/functional/test_functional.py @@ -0,0 +1,401 @@ +import datetime +import json +import textwrap +import pydantic +from pydantic import Field, BaseModel, field_validator +from typing import Annotated + +import pytest + +import dspy +from dspy.functional import predictor, cot, FunctionalModule, TypedPredictor +from dspy.primitives.example import Example +from dspy.teleprompt.bootstrap import BootstrapFewShot +from dspy.utils.dummies import DummyLM + + +def test_simple(): + @predictor + def hard_question(topic: str) -> str: + """Think of a hard factual question about a topic.""" + + expected = "What is the speed of light?" + lm = DummyLM([expected]) + dspy.settings.configure(lm=lm) + + question = hard_question(topic="Physics") + lm.inspect_history(n=2) + + assert question == expected + + +def test_simple_type(): + class Question(pydantic.BaseModel): + value: str + + @predictor + def hard_question(topic: str) -> Question: + """Think of a hard factual question about a topic.""" + + expected = "What is the speed of light?" 
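+    # Since the return annotation is a pydantic model, the completion is parsed
+    # as JSON; hence the JSON-formatted dummy reply below.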
+ lm = DummyLM([f'{{"value": "{expected}"}}']) + dspy.settings.configure(lm=lm) + + question = hard_question(topic="Physics") + + assert isinstance(question, Question) + assert question.value == expected + + +def test_simple_type_input(): + class Question(pydantic.BaseModel): + value: str + + class Answer(pydantic.BaseModel): + value: str + + @predictor + def answer(question: Question) -> Answer: + pass + + question = Question(value="What is the speed of light?") + lm = DummyLM([f'{{"value": "3e8"}}']) + dspy.settings.configure(lm=lm) + + result = answer(question=question) + + assert result == Answer(value="3e8") + + +def test_simple_class(): + class Answer(pydantic.BaseModel): + value: float + certainty: float + comments: list[str] = pydantic.Field( + description="At least two comments about the answer" + ) + + class QA(dspy.Module): + @predictor + def hard_question(self, topic: str) -> str: + """Think of a hard factual question about a topic. It should be answerable with a number.""" + + @cot + def answer(self, question: Annotated[str, "Question to answer"]) -> Answer: + pass + + def forward(self, **kwargs): + question = self.hard_question(**kwargs) + return (question, self.answer(question=question)) + + expected = Answer( + value=3e8, + certainty=0.9, + comments=["It is the speed of light", "It is a constant"], + ) + + lm = DummyLM( + [ + "What is the speed of light?", + "Some bad reasoning, 3e8 m/s.", + "3e8", # Bad answer 1 + "Some good reasoning...", + expected.model_dump_json(), # Good answer + ] + ) + dspy.settings.configure(lm=lm) + + qa = QA() + question, answer = qa(topic="Physics") + + assert question == "What is the speed of light?" + assert answer == expected + + +def test_simple_oop(): + class Question(pydantic.BaseModel): + value: str + + class MySignature(dspy.Signature): + topic: str = dspy.InputField() + output: Question = dspy.OutputField() + + # Run the signature + program = TypedPredictor(MySignature) + expected = "What is the speed of light?" + lm = DummyLM( + [ + Question(value=expected).model_dump_json(), + ] + ) + dspy.settings.configure(lm=lm) + + question = program(topic="Physics").output + + assert isinstance(question, Question) + assert question.value == expected + + +def test_equivalent_signatures(): + class ClassSignature(dspy.Signature): + input: str = dspy.InputField() + output: str = dspy.OutputField() + + @predictor + def output(input: str) -> str: + pass + + function_signature = output.predictor.signature + + simple_signature = dspy.Signature("input -> output") + + assert ClassSignature.equals(function_signature) + assert ClassSignature.equals(simple_signature) + + +def test_named_params(): + class QA(FunctionalModule): + @predictor + def hard_question(self, topic: str) -> str: + """Think of a hard factual question about a topic. 
It should be answerable with a number.""" + + @cot + def answer(self, question: str) -> str: + pass + + qa = QA() + named_predictors = list(qa.named_predictors()) + assert len(named_predictors) == 2 + names, _ = zip(*qa.named_predictors()) + assert set(names) == {"hard_question.predictor", "answer.predictor"} + + +def test_bootstrap_effectiveness(): + class SimpleModule(FunctionalModule): + @predictor + def output(self, input: str) -> str: + pass + + def forward(self, **kwargs): + return self.output(**kwargs) + + def simple_metric(example, prediction, trace=None): + return example.output == prediction.output + + examples = [ + ex.with_inputs("input") + for ex in ( + Example(input="What is the color of the sky?", output="blue"), + Example( + input="What does the fox say?", + output="Ring-ding-ding-ding-dingeringeding!", + ), + ) + ] + trainset = [examples[0]] + valset = [examples[1]] + + # This test verifies if the bootstrapping process improves the student's predictions + student = SimpleModule() + teacher = SimpleModule() + assert student.output.predictor.signature.equals(teacher.output.predictor.signature) + + lm = DummyLM(["blue", "Ring-ding-ding-ding-dingeringeding!"], follow_examples=True) + dspy.settings.configure(lm=lm, trace=[]) + + bootstrap = BootstrapFewShot( + metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 + ) + compiled_student = bootstrap.compile(student, teacher=teacher, trainset=trainset) + + lm.inspect_history(n=2) + + # Check that the compiled student has the correct demos + assert len(compiled_student.output.predictor.demos) == 1 + assert compiled_student.output.predictor.demos[0].input == trainset[0].input + assert compiled_student.output.predictor.demos[0].output == trainset[0].output + + # Test the compiled student's prediction. + # We are using a DummyLM with follow_examples=True, which means that + # even though it would normally reply with "Ring-ding-ding-ding-dingeringeding!" + # on the second output, if it seems an example that perfectly matches the + # prompt, it will use that instead. That is why we expect "blue" here. + prediction = compiled_student(input=trainset[0].input) + assert prediction == trainset[0].output + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Follow the following format. + + Input: ${input} + Output: ${output}. Respond with a single str value + + --- + + Input: What is the color of the sky? + Output: blue + + --- + + Input: What is the color of the sky? + Output: blue""" + ) + + +def test_regex(): + class TravelInformation(BaseModel): + origin: str = Field(pattern=r"^[A-Z]{3}$") + destination: str = Field(pattern=r"^[A-Z]{3}$") + date: datetime.date + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + email = textwrap.dedent( + """\ + We're excited to welcome you aboard your upcoming flight from + John F. Kennedy International Airport (JFK) to Los Angeles International Airport (LAX) + on December 25, 2022. Here's everything you need to know before you take off: ... + """ + ) + lm = DummyLM( + [ + # Example with a bad origin code. 
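+            # ("JF0" breaks the ^[A-Z]{3}$ pattern, so a corrected retry is expected next.)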
+ '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + # Fixed + '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + ] + ) + dspy.settings.configure(lm=lm) + + assert flight_information(email=email) == TravelInformation( + origin="JFK", destination="LAX", date=datetime.date(2022, 12, 25) + ) + + +def test_raises(): + class TravelInformation(BaseModel): + origin: str = Field(pattern=r"^[A-Z]{3}$") + destination: str = Field(pattern=r"^[A-Z]{3}$") + date: datetime.date + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + lm = DummyLM( + [ + "A list of bad inputs", + '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + '{"origin": "JFK", "destination": "LAX", "date": "bad date"}', + ] + ) + dspy.settings.configure(lm=lm) + + with pytest.raises(ValueError): + flight_information(email="Some email") + + +def test_multi_errors(): + class TravelInformation(BaseModel): + origin: str = Field(pattern=r"^[A-Z]{3}$") + destination: str = Field(pattern=r"^[A-Z]{3}$") + date: datetime.date + + @predictor + def flight_information(email: str) -> TravelInformation: + pass + + lm = DummyLM( + [ + # First origin is wrong, then destination, then all is good + '{"origin": "JF0", "destination": "LAX", "date": "2022-12-25"}', + '{"origin": "JFK", "destination": "LA0", "date": "2022-12-25"}', + '{"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}', + ] + ) + dspy.settings.configure(lm=lm) + + assert flight_information(email="Some email") == TravelInformation( + origin="JFK", destination="LAX", date=datetime.date(2022, 12, 25) + ) + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `email`, produce the fields `flight_information`. + + --- + + Follow the following format. + + Email: ${email} + + Past Error (flight_information): An error to avoid in the future + + Past Error (flight_information, 2): An error to avoid in the future + + Flight Information: ${flight_information}. Respond with a single JSON object using the schema {"properties": {"origin": {"pattern": "^[A-Z]{3}$", "title": "Origin", "type": "string"}, "destination": {"pattern": "^[A-Z]{3}$", "title": "Destination", "type": "string"}, "date": {"format": "date", "title": "Date", "type": "string"}}, "required": ["origin", "destination", "date"], "title": "TravelInformation", "type": "object"} + + --- + + Email: Some email + + Past Error (flight_information): 1 validation error for TravelInformation origin String should match pattern '^[A-Z]{3}$' [type=string_pattern_mismatch, input_value='JF0', input_type=str] For further information visit https://errors.pydantic.dev/2.5/v/string_pattern_mismatch + + Past Error (flight_information, 2): 1 validation error for TravelInformation destination String should match pattern '^[A-Z]{3}$' [type=string_pattern_mismatch, input_value='LA0', input_type=str] For further information visit https://errors.pydantic.dev/2.5/v/string_pattern_mismatch + + Flight Information: {"origin": "JFK", "destination": "LAX", "date": "2022-12-25"}""" + ) + + +def test_field_validator(): + class UserDetails(BaseModel): + name: str + age: int + + @field_validator("name") + @classmethod + def validate_name(cls, v): + if v.upper() != v: + raise ValueError("Name must be in uppercase.") + return v + + @predictor + def get_user_details() -> UserDetails: + pass + + # Keep making the mistake (lower case name) until we run + # out of retries. 
+ lm = DummyLM( + [ + '{"name": "lower case name", "age": 25}', + ] + * 10 + ) + dspy.settings.configure(lm=lm) + + with pytest.raises(ValueError): + get_user_details() + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields , produce the fields `get_user_details`. + + --- + + Follow the following format. + + Past Error (get_user_details): An error to avoid in the future + Past Error (get_user_details, 2): An error to avoid in the future + Get User Details: ${get_user_details}. Respond with a single JSON object using the schema {"properties": {"name": {"title": "Name", "type": "string"}, "age": {"title": "Age", "type": "integer"}}, "required": ["name", "age"], "title": "UserDetails", "type": "object"} + + --- + + Past Error (get_user_details): 1 validation error for UserDetails name Value error, Name must be in uppercase. [type=value_error, input_value='lower case name', input_type=str] For further information visit https://errors.pydantic.dev/2.5/v/value_error + Past Error (get_user_details, 2): 1 validation error for UserDetails name Value error, Name must be in uppercase. [type=value_error, input_value='lower case name', input_type=str] For further information visit https://errors.pydantic.dev/2.5/v/value_error + Get User Details: {"name": "lower case name", "age": 25}""" + ) diff --git a/tests/predict/test_aggregation.py b/tests/predict/test_aggregation.py new file mode 100644 index 0000000000..2c5f705fe6 --- /dev/null +++ b/tests/predict/test_aggregation.py @@ -0,0 +1,47 @@ +from dspy.predict.aggregation import majority +from dspy.primitives.prediction import Prediction, Completions +from dsp.utils import normalize_text + + +def test_majority_with_prediction(): + prediction = Prediction.from_completions( + [{"answer": "2"}, {"answer": "2"}, {"answer": "3"}] + ) + result = majority(prediction) + assert result.completions[0]["answer"] == "2" + + +def test_majority_with_completions(): + completions = Completions([{"answer": "2"}, {"answer": "2"}, {"answer": "3"}]) + result = majority(completions) + assert result.completions[0]["answer"] == "2" + + +def test_majority_with_list(): + completions = [{"answer": "2"}, {"answer": "2"}, {"answer": "3"}] + result = majority(completions) + assert result.completions[0]["answer"] == "2" + + +def test_majority_with_normalize(): + completions = [{"answer": "2"}, {"answer": " 2"}, {"answer": "3"}] + result = majority(completions, normalize=normalize_text) + assert result.completions[0]["answer"] == "2" + + +def test_majority_with_field(): + completions = [ + {"answer": "2", "other": "1"}, + {"answer": "2", "other": "1"}, + {"answer": "3", "other": "2"}, + ] + result = majority(completions, field="other") + assert result.completions[0]["other"] == "1" + + +def test_majority_with_no_majority(): + completions = [{"answer": "2"}, {"answer": "3"}, {"answer": "4"}] + result = majority(completions) + assert ( + result.completions[0]["answer"] == "2" + ) # The first completion is returned in case of a tie diff --git a/tests/predict/test_chain_of_thought.py b/tests/predict/test_chain_of_thought.py new file mode 100644 index 0000000000..c1d08e729c --- /dev/null +++ b/tests/predict/test_chain_of_thought.py @@ -0,0 +1,35 @@ +import textwrap +import dspy +from dspy import ChainOfThought +from dspy.utils import DummyLM + + +def test_initialization_with_string_signature(): + lm = DummyLM(["find the number after 1", "2"]) + dspy.settings.configure(lm=lm) + predict = ChainOfThought("question -> answer") + assert 
list(predict.extended_signature.output_fields.keys()) == [ + "rationale", + "answer", + ] + assert predict(question="What is 1+1?").answer == "2" + + print(lm.get_convo(-1)) + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `question`, produce the fields `answer`. + + --- + + Follow the following format. + + Question: ${question} + Reasoning: Let's think step by step in order to ${produce the answer}. We ... + Answer: ${answer} + + --- + + Question: What is 1+1? + Reasoning: Let's think step by step in order to find the number after 1 + Answer: 2""" + ) diff --git a/tests/predict/test_chain_of_thought_with_hint.py b/tests/predict/test_chain_of_thought_with_hint.py new file mode 100644 index 0000000000..b5e62425dc --- /dev/null +++ b/tests/predict/test_chain_of_thought_with_hint.py @@ -0,0 +1,42 @@ +import dspy +from dspy import ChainOfThoughtWithHint +from dspy.utils import DummyLM + + +def test_cot_with_no_hint(): + lm = DummyLM(["find the number after 1", "2"]) + dspy.settings.configure(lm=lm) + predict = ChainOfThoughtWithHint("question -> answer") + assert list(predict.extended_signature2.output_fields.keys()) == [ + "rationale", + "hint", + "answer", + ] + assert predict(question="What is 1+1?").answer == "2" + + final_convo = lm.get_convo(-1) + assert final_convo.endswith( + "Question: What is 1+1?\n" + "Reasoning: Let's think step by step in order to find the number after 1\n" + "Answer: 2" + ) + + +def test_cot_with_hint(): + lm = DummyLM(["find the number after 1", "2"]) + dspy.settings.configure(lm=lm) + predict = ChainOfThoughtWithHint("question -> answer") + assert list(predict.extended_signature2.output_fields.keys()) == [ + "rationale", + "hint", + "answer", + ] + assert predict(question="What is 1+1?", hint="think small").answer == "2" + + final_convo = lm.get_convo(-1) + assert final_convo.endswith( + "Question: What is 1+1?\n\n" + "Reasoning: Let's think step by step in order to find the number after 1\n\n" + "Hint: think small\n\n" + "Answer: 2" + ) diff --git a/tests/predict/test_knn.py b/tests/predict/test_knn.py new file mode 100644 index 0000000000..62cf96682b --- /dev/null +++ b/tests/predict/test_knn.py @@ -0,0 +1,55 @@ +import pytest +import numpy as np +import dsp, dspy +from dspy.utils import DummyVectorizer +from dspy.predict import KNN + + +def mock_example(question: str, answer: str) -> dsp.Example: + """Creates a mock DSP example with specified question and answer.""" + return dspy.Example(question=question, answer=answer).with_inputs("question") + + +@pytest.fixture +def setup_knn(): + """Sets up a KNN instance with a mocked vectorizer for testing.""" + dsp.SentenceTransformersVectorizer = DummyVectorizer + trainset = [ + mock_example("What is the capital of France?", "Paris"), + mock_example("What is the largest ocean?", "Pacific"), + mock_example("What is 2+2?", "4"), + ] + knn = KNN(k=2, trainset=trainset) + return knn + + +def test_knn_initialization(setup_knn): + """Tests the KNN initialization and checks if the trainset vectors are correctly created.""" + knn = setup_knn + assert knn.k == 2, "Incorrect k value" + assert len(knn.trainset_vectors) == 3, "Incorrect size of trainset vectors" + assert isinstance( + knn.trainset_vectors, np.ndarray + ), "Trainset vectors should be a NumPy array" + + +def test_knn_query(setup_knn): + """Tests the KNN query functionality for retrieving the nearest neighbors.""" + knn = setup_knn + query = {"question": "What is 3+3?"} # A query close to "What is 2+2?" 
+ nearest_samples = knn(**query) + assert len(nearest_samples) == 2, "Incorrect number of nearest samples returned" + assert nearest_samples[0].answer == "4", "Incorrect nearest sample returned" + + +def test_knn_query_specificity(setup_knn): + """Tests the KNN query functionality for specificity of returned examples.""" + knn = setup_knn + query = { + "question": "What is the capital of Germany?" + } # A query close to "What is the capital of France?" + nearest_samples = knn(**query) + assert len(nearest_samples) == 2, "Incorrect number of nearest samples returned" + assert "Paris" in [ + sample.answer for sample in nearest_samples + ], "Expected Paris to be a nearest sample answer" diff --git a/tests/predict/test_multi_chain_comparison.py b/tests/predict/test_multi_chain_comparison.py new file mode 100644 index 0000000000..8c936a2d80 --- /dev/null +++ b/tests/predict/test_multi_chain_comparison.py @@ -0,0 +1,38 @@ +import dspy +from dspy.utils.dummies import DummyLM + + +def test_basic_example(): + class BasicQA(dspy.Signature): + """Answer questions with short factoid answers.""" + + question = dspy.InputField() + answer = dspy.OutputField(desc="often between 1 and 5 words") + + # Example completions generated by a model for reference + completions = [ + dspy.Prediction( + rationale="I recall that during clear days, the sky often appears this color.", + answer="blue", + ), + dspy.Prediction( + rationale="Based on common knowledge, I believe the sky is typically seen as this color.", + answer="green", + ), + dspy.Prediction( + rationale="From images and depictions in media, the sky is frequently represented with this hue.", + answer="blue", + ), + ] + + # Pass signature to MultiChainComparison module + compare_answers = dspy.MultiChainComparison(BasicQA) + + # Call the MultiChainComparison on the completions + question = "What is the color of the sky?" + lm = DummyLM(["my rationale", "blue"]) + dspy.settings.configure(lm=lm) + final_pred = compare_answers(completions, question=question) + + assert final_pred.rationale == "my rationale" + assert final_pred.answer == "blue" diff --git a/tests/predict/test_predict.py b/tests/predict/test_predict.py new file mode 100644 index 0000000000..e44b3a135c --- /dev/null +++ b/tests/predict/test_predict.py @@ -0,0 +1,91 @@ +import dspy +from dspy import Predict, Signature +from dspy.utils.dummies import DummyLM + + +def test_initialization_with_string_signature(): + signature_string = "input1, input2 -> output" + predict = Predict(signature_string) + expected_instruction = ( + "Given the fields `input1`, `input2`, produce the fields `output`." 
+ ) + assert predict.signature.instructions == expected_instruction + assert predict.signature.instructions == Signature(signature_string).instructions + + +def test_reset_method(): + predict_instance = Predict("input -> output") + predict_instance.lm = "modified" + predict_instance.traces = ["trace"] + predict_instance.train = ["train"] + predict_instance.demos = ["demo"] + predict_instance.reset() + assert predict_instance.lm is None + assert predict_instance.traces == [] + assert predict_instance.train == [] + assert predict_instance.demos == [] + + +def test_dump_and_load_state(): + predict_instance = Predict("input -> output") + predict_instance.lm = "lm_state" + dumped_state = predict_instance.dump_state() + new_instance = Predict("input -> output") + new_instance.load_state(dumped_state) + assert new_instance.lm == "lm_state" + + +def test_call_method(): + predict_instance = Predict("input -> output") + lm = DummyLM(["test output"]) + dspy.settings.configure(lm=lm) + result = predict_instance(input="test input") + assert result.output == "test output" + assert lm.get_convo(-1) == ( + "Given the fields `input`, produce the fields `output`.\n" + "\n---\n\n" + "Follow the following format.\n\n" + "Input: ${input}\n" + "Output: ${output}\n" + "\n---\n\n" + "Input: test input\n" + "Output: test output" + ) + + +def test_dump_load_state(): + predict_instance = Predict(Signature("input -> output", "original instructions")) + dumped_state = predict_instance.dump_state() + new_instance = Predict(Signature("input -> output", "new instructions")) + new_instance.load_state(dumped_state) + assert new_instance.signature.instructions == "original instructions" + + +def test_forward_method(): + program = Predict("question -> answer") + dspy.settings.configure(lm=DummyLM([])) + result = program(question="What is 1+1?").answer + assert result == "No more responses" + + +def test_forward_method2(): + program = Predict("question -> answer1, answer2") + dspy.settings.configure(lm=DummyLM(["my first answer", "my second answer"])) + result = program(question="What is 1+1?") + assert result.answer1 == "my first answer" + assert result.answer2 == "my second answer" + + +def test_config_management(): + predict_instance = Predict("input -> output") + predict_instance.update_config(new_key="value") + config = predict_instance.get_config() + assert "new_key" in config and config["new_key"] == "value" + + +def test_multi_output(): + program = Predict("question -> answer", n=2) + dspy.settings.configure(lm=DummyLM(["my first answer", "my second answer"])) + results = program(question="What is 1+1?") + assert results.completions.answer[0] == "my first answer" + assert results.completions.answer[1] == "my second answer" diff --git a/tests/predict/test_program_of_thought.py b/tests/predict/test_program_of_thought.py new file mode 100644 index 0000000000..2aa153a1d6 --- /dev/null +++ b/tests/predict/test_program_of_thought.py @@ -0,0 +1,121 @@ +from dspy import Signature, ProgramOfThought +import dspy +from dspy.utils import DummyLM +import textwrap + +class BasicQA(Signature): + question = dspy.InputField() + answer = dspy.OutputField(desc="often between 1 and 5 words") + +def test_pot_code_generation(): + pot = ProgramOfThought(BasicQA) + lm = DummyLM([ + "Reason_A", + "```python\nresult = 1+1\n```", + "Reason_B", + "2", + ]) + dspy.settings.configure(lm=lm) + res = pot(question="What is 1+1?") + assert res.answer == "2" + assert lm.get_convo(index=-1) == textwrap.dedent("""\ + Given the final code `question`, 
`final_generated_code`, `code_output`, provide the final `answer`.
+
+ ---
+
+ Follow the following format.
+
+ Question: ${question}
+
+ Code: python code that answers the question
+
+ Code Output: output of previously-generated python code
+
+ Reasoning: Let's think step by step in order to ${produce the answer}. We ...
+
+ Answer: often between 1 and 5 words
+
+ ---
+
+ Question: What is 1+1?
+
+ Code: result = 1+1
+
+ Code Output: 2
+
+ Reasoning: Let's think step by step in order to Reason_B
+
+ Answer: 2""")
+
+def test_pot_code_generation_with_error():
+ pot = ProgramOfThought(BasicQA)
+ lm = DummyLM([
+ "Reason_A",
+ "```python\nresult = 1+0/0\n```",
+ "Reason_B", # Error: division by zero
+ "```python\nresult = 1+1\n```",
+ "Reason_C",
+ "2",
+ ])
+ dspy.settings.configure(lm=lm)
+ res = pot(question="What is 1+1?")
+ assert res.answer == "2"
+
+ # The first code example failed
+ assert lm.get_convo(index=2) == textwrap.dedent("""\
+ You are given `question`, `previous_code`, `error` due to an error in previous code.
+ Your task is to correct the error and provide the new `generated_code`.
+
+ ---
+
+ Follow the following format.
+
+ Question: ${question}
+
+ Previous Code: previously-generated python code that errored
+
+ Error: error message from previously-generated python code
+
+ Reasoning: Let's think step by step in order to ${produce the generated_code}. We ...
+
+ Code: python code that answers the question
+
+ ---
+
+ Question: What is 1+1?
+
+ Previous Code: result = 1+0/0
+
+ Error: division by zero
+
+ Reasoning: Let's think step by step in order to Reason_B""")
+
+ # The second code example succeeded
+ assert lm.get_convo(-1) == textwrap.dedent("""\
+ Given the final code `question`, `final_generated_code`, `code_output`, provide the final `answer`.
+
+ ---
+
+ Follow the following format.
+
+ Question: ${question}
+
+ Code: python code that answers the question
+
+ Code Output: output of previously-generated python code
+
+ Reasoning: Let's think step by step in order to ${produce the answer}. We ...
+
+ Answer: often between 1 and 5 words
+
+ ---
+
+ Question: What is 1+1?
+
+ Code: result = 1+1
+
+ Code Output: 2
+
+ Reasoning: Let's think step by step in order to Reason_C
+
+ Answer: 2""")
diff --git a/tests/predict/test_react.py b/tests/predict/test_react.py
new file mode 100644
index 0000000000..f28e905e70
--- /dev/null
+++ b/tests/predict/test_react.py
@@ -0,0 +1,86 @@
+import dspy
+from dspy.utils.dummies import dummy_rm
+
+
+def test_example_no_tools():
+ # Create a simple test; the model answers directly without using the Retrieve tool.
+ lm = dspy.utils.DummyLM(
+ [
+ "Initial thoughts", # Thought_1
+ "Finish[blue]", # Action_1
+ ]
+ )
+ dspy.settings.configure(lm=lm, rm=dummy_rm())
+
+ program = dspy.ReAct("question -> answer")
+
+ # Check default tools
+ assert isinstance(program.tools["Finish"], dspy.Example)
+
+ # Call the ReAct module on a particular input
+ question = "What is the color of the sky?"
+ result = program(question=question)
+ assert result.answer == "blue"
+
+ # For debugging
+ print("---")
+ for row in lm.history:
+ print(row["prompt"])
+ print("Response:", row["response"]["choices"][0]["text"])
+ print("---")
+
+ assert lm.get_convo(-1).endswith(
+ "Question: What is the color of the sky?\n"
+ "Thought 1: Initial thoughts\n"
+ "Action 1: Finish[blue]"
+ )
+
+
+def test_example_search():
+ # Create a simple dataset which the model will use with the Retrieve tool.
+ lm = dspy.utils.DummyLM(
+ [
+ "Initial thoughts", # Thought_1
+ "Search[the color of the sky]", # Action_1
+ "More thoughts", # Thought_2
+ "Finish[blue]", # Action_2
+ ]
+ )
+ rm = dummy_rm(
+ [
+ "We all know the color of the sky is blue.",
+ "Something about the sky colors",
+ "This sentence is completely irrelevant to answer the question.",
+ "Let's add some more sentences to act as dummy passages.",
+ "Let's add some more sentences to act as dummy passages.",
+ "Let's add some more sentences to act as dummy passages.",
+ ]
+ )
+ dspy.settings.configure(lm=lm, rm=rm)
+
+ program = dspy.ReAct("question -> answer")
+
+ # Check default tools
+ assert len(program.tools) == 2
+ assert isinstance(program.tools["Search"], dspy.Retrieve)
+ assert isinstance(program.tools["Finish"], dspy.Example)
+
+ # Call the ReAct module on a particular input
+ question = "What is the color of the sky?"
+ result = program(question=question)
+ assert result.answer == "blue"
+
+ # For debugging
+ print(lm.get_convo(-1))
+
+ assert lm.get_convo(-1).endswith(
+ "Question: What is the color of the sky?\n\n"
+ "Thought 1: Initial thoughts\n\n"
+ "Action 1: Search[the color of the sky]\n\n"
+ "Observation 1:\n"
+ "[1] «We all know the color of the sky is blue.»\n"
+ "[2] «Something about the sky colors»\n"
+ "[3] «This sentence is completely irrelevant to answer the question.»\n\n"
+ "Thought 2: More thoughts\n\n"
+ "Action 2: Finish[blue]"
+ )
diff --git a/tests/predict/test_retry.py b/tests/predict/test_retry.py
new file mode 100644
index 0000000000..a125dde296
--- /dev/null
+++ b/tests/predict/test_retry.py
@@ -0,0 +1,66 @@
+import functools
+import dspy
+from dspy.utils import DummyLM
+from dspy.primitives.assertions import assert_transform_module, backtrack_handler
+
+
+def test_retry_simple():
+ predict = dspy.Predict("question -> answer")
+ retry_module = dspy.Retry(predict)
+
+ # Test Retry has created the correct new signature
+ for field in predict.signature.output_fields:
+ assert f"past_{field}" in retry_module.new_signature.input_fields
+ assert "feedback" in retry_module.new_signature.input_fields
+
+ lm = DummyLM(["blue"])
+ dspy.settings.configure(lm=lm)
+ result = retry_module.forward(
+ question="What color is the sky?",
+ past_outputs={"answer": "red"},
+ feedback="Try harder",
+ )
+ assert result.answer == "blue"
+
+ print(lm.get_convo(-1))
+ assert lm.get_convo(-1).endswith(
+ "Question: What color is the sky?\n\n"
+ "Past Answer: red\n\n"
+ "Instructions: Try harder\n\n"
+ "Answer: blue"
+ )
+
+
+def test_retry_forward_with_feedback():
+ # First we make a mistake, then we fix it
+ lm = DummyLM(["red", "blue"])
+ dspy.settings.configure(lm=lm, trace=[])
+
+ class SimpleModule(dspy.Module):
+ def __init__(self):
+ super().__init__()
+ self.predictor = dspy.Predict("question -> answer")
+
+ def forward(self, **kwargs):
+ result = self.predictor(**kwargs)
+ print(f"SimpleModule got {result.answer=}")
+ dspy.Suggest(result.answer == "blue", "Please think harder")
+ return result
+
+ program = SimpleModule()
+ program = assert_transform_module(
+ program.map_named_predictors(dspy.Retry),
+ functools.partial(backtrack_handler, max_backtracks=1),
+ )
+
+ result = program(question="What color is the sky?")
+
+ assert result.answer == "blue"
+
+ print(lm.get_convo(-1))
+ assert lm.get_convo(-1).endswith(
+ "Question: What color is the sky?\n\n"
+ "Past Answer: red\n\n"
+ "Instructions: Please think harder\n\n"
+ "Answer: blue"
+ )
diff --git a/tests/primitives/test_example.py
b/tests/primitives/test_example.py new file mode 100644 index 0000000000..2f27996a24 --- /dev/null +++ b/tests/primitives/test_example.py @@ -0,0 +1,108 @@ +import pytest +from dspy import Example + + +def test_example_initialization(): + example = Example(a=1, b=2) + assert example.a == 1 + assert example.b == 2 + + +def test_example_initialization_from_base(): + base = Example(a=1, b=2) + example = Example(base=base, c=3) + assert example.a == 1 + assert example.b == 2 + assert example.c == 3 + + +def test_example_initialization_from_dict(): + base_dict = {"a": 1, "b": 2} + example = Example(base=base_dict, c=3) + assert example.a == 1 + assert example.b == 2 + assert example.c == 3 + + +def test_example_set_get_item(): + example = Example() + example["a"] = 1 + assert example["a"] == 1 + + +def test_example_attribute_access(): + example = Example(a=1) + assert example.a == 1 + example.a = 2 + assert example.a == 2 + + +def test_example_deletion(): + example = Example(a=1, b=2) + del example["a"] + with pytest.raises(AttributeError): + _ = example.a + + +def test_example_len(): + example = Example(a=1, b=2, dspy_hidden=3) + assert len(example) == 2 + + +def test_example_repr_str(): + example = Example(a=1) + assert repr(example) == "Example({'a': 1}) (input_keys=None)" + assert str(example) == "Example({'a': 1}) (input_keys=None)" + + +def test_example_eq(): + example1 = Example(a=1, b=2) + example2 = Example(a=1, b=2) + assert example1 == example2 + + +def test_example_hash(): + example1 = Example(a=1, b=2) + example2 = Example(a=1, b=2) + assert hash(example1) == hash(example2) + + +def test_example_keys_values_items(): + example = Example(a=1, b=2, dspy_hidden=3) + assert set(example.keys()) == {"a", "b"} + assert 1 in example.values() + assert ("b", 2) in example.items() + + +def test_example_get(): + example = Example(a=1, b=2) + assert example.get("a") == 1 + assert example.get("c", "default") == "default" + + +def test_example_with_inputs(): + example = Example(a=1, b=2).with_inputs("a") + assert example._input_keys == {"a"} + + +def test_example_inputs_labels(): + example = Example(a=1, b=2).with_inputs("a") + inputs = example.inputs() + assert inputs.toDict() == {"a": 1} + labels = example.labels() + assert labels.toDict() == {"b": 2} + + +def test_example_copy_without(): + example = Example(a=1, b=2) + copied = example.copy(c=3) + assert copied.a == 1 + assert copied.c == 3 + without_a = copied.without("a") + with pytest.raises(AttributeError): + _ = without_a.a + + +def test_example_to_dict(): + example = Example(a=1, b=2) + assert example.toDict() == {"a": 1, "b": 2} diff --git a/tests/primitives/test_program.py b/tests/primitives/test_program.py new file mode 100644 index 0000000000..b1d7c89725 --- /dev/null +++ b/tests/primitives/test_program.py @@ -0,0 +1,66 @@ +import dspy +from dspy.primitives.program import ( + Module, + set_attribute_by_name, +) # Adjust the import based on your file structure +from dspy.utils import DummyLM + + +class HopModule(dspy.Module): + def __init__(self): + super().__init__() + self.predict1 = dspy.Predict("question -> query") + self.predict2 = dspy.Predict("query -> answer") + + def forward(self, question): + query = self.predict1(question=question).query + return self.predict2(query=query) + + +def test_module_initialization(): + module = Module() + assert ( + module._compiled is False + ), "Module _compiled attribute should be False upon initialization" + + +def test_named_predictors(): + module = HopModule() + named_preds = 
module.named_predictors() + assert len(named_preds) == 2, "Should identify correct number of Predict instances" + names, preds = zip(*named_preds) + assert ( + "predict1" in names and "predict2" in names + ), "Named predictors should include 'predict1' and 'predict2'" + + +def test_predictors(): + module = HopModule() + preds = module.predictors() + assert len(preds) == 2, "Should return correct number of Predict instances" + assert all( + isinstance(p, dspy.Predict) for p in preds + ), "All returned items should be instances of PredictMock" + + +def test_forward(): + program = HopModule() + dspy.settings.configure( + lm=DummyLM({"What is 1+1?": "let me check", "let me check": "2"}) + ) + result = program(question="What is 1+1?").answer + assert result == "2" + + +def test_nested_named_predictors(): + class Hop2Module(dspy.Module): + def __init__(self): + super().__init__() + self.hop = HopModule() + + module = Hop2Module() + named_preds = module.named_predictors() + assert len(named_preds) == 2 + names, _preds = zip(*named_preds) + assert "hop.predict1" in names + assert "hop.predict2" in names diff --git a/tests/primitives/test_python_interpreter.py b/tests/primitives/test_python_interpreter.py new file mode 100644 index 0000000000..14b15d5572 --- /dev/null +++ b/tests/primitives/test_python_interpreter.py @@ -0,0 +1,44 @@ +import pytest +from dspy.primitives.python_interpreter import PythonInterpreter, TextPrompt, CodePrompt + +def test_execute_simple_code(): + interpreter = PythonInterpreter(action_space={'print': print}) + code = "print('Hello, World!')" + result = interpreter.execute(code) + assert result is None, "Simple print statement should return None" + +def test_action_space_limitation(): + def func(string): + pass + interpreter = PythonInterpreter(action_space={}) + code = "func('This should not execute')" + with pytest.raises(Exception): + interpreter.execute(code) + +def test_import_whitelist(): + interpreter = PythonInterpreter(action_space={}, import_white_list=['math']) + code = "import math\nresult = math.sqrt(4)" + result = interpreter.execute(code) + assert result == 2, "Should be able to import and use math.sqrt" + +def test_fuzzy_variable_matching(): + interpreter = PythonInterpreter(action_space={}) + code = "result = number + 1" + result = interpreter.execute(code, fuzz_state={'number': 4}) + assert result == 5, "Fuzzy variable matching should work" + +def test_text_prompt_keyword_extraction(): + prompt = TextPrompt("Hello {name}, how are you?") + assert 'name' in prompt.key_words, "Keyword 'name' should be extracted" + +def test_text_prompt_formatting(): + prompt = TextPrompt("Hello {name}, how are you?") + formatted = prompt.format(name="Alice") + assert formatted == "Hello Alice, how are you?", "Should format with provided value" + +def test_code_prompt_execution(): + action_space = {'len': len} + interpreter = PythonInterpreter(action_space=action_space) + code_prompt = CodePrompt("result = len('hello')") + result, _ = code_prompt.execute(interpreter) + assert result == 5, "Code execution should return the length of 'hello'" diff --git a/tests/signatures/test_signature.py b/tests/signatures/test_signature.py new file mode 100644 index 0000000000..b093258540 --- /dev/null +++ b/tests/signatures/test_signature.py @@ -0,0 +1,166 @@ +import pytest +import pydantic +from dspy import Signature, infer_prefix, InputField, OutputField +from typing import List + + +def test_field_types_and_custom_attributes(): + class TestSignature(Signature): + """Instructions""" + + 
input1: str = InputField() + input2: int = InputField() + output1: List[str] = OutputField() + output2 = OutputField() + + assert TestSignature.instructions == "Instructions" + assert TestSignature.input_fields["input1"].annotation == str + assert TestSignature.input_fields["input2"].annotation == int + assert TestSignature.output_fields["output1"].annotation == List[str] + assert TestSignature.output_fields["output2"].annotation == str + + +def test_no_input_output(): + with pytest.raises(TypeError): + + class TestSignature(Signature): + input1: str + + +def test_no_input_output2(): + with pytest.raises(TypeError): + + class TestSignature(Signature): + input1: str = pydantic.Field() + + +def test_all_fields_have_prefix(): + class TestSignature(Signature): + input = InputField(prefix="Modified:") + output = OutputField() + + assert ( + TestSignature.input_fields["input"].json_schema_extra["prefix"] == "Modified:" + ) + assert ( + TestSignature.output_fields["output"].json_schema_extra["prefix"] == "Output:" + ) + + +def test_signature_parsing(): + signature = Signature("input1, input2 -> output") + assert "input1" in signature.input_fields + assert "input2" in signature.input_fields + assert "output" in signature.output_fields + + +def test_with_signature(): + signature1 = Signature("input1, input2 -> output") + signature2 = signature1.with_instructions("This is a test") + assert signature2.instructions == "This is a test" + assert signature1 is not signature2, "The type should be immutable" + + +def test_with_updated_field(): + signature1 = Signature("input1, input2 -> output") + signature2 = signature1.with_updated_fields("input1", prefix="Modified:") + assert signature2.input_fields["input1"].json_schema_extra["prefix"] == "Modified:" + assert signature1.input_fields["input1"].json_schema_extra["prefix"] == "Input 1:" + assert signature1 is not signature2, "The type should be immutable" + for key in signature1.fields.keys(): + if key != "input1": + assert ( + signature1.fields[key].json_schema_extra + == signature2.fields[key].json_schema_extra + ) + assert signature1.instructions == signature2.instructions + + +def test_empty_signature(): + with pytest.raises(ValueError): + Signature("") + + +def test_instructions_signature(): + with pytest.raises(ValueError): + Signature("") + + +def test_signature_instructions(): + sig1 = Signature("input1 -> output1", instructions="This is a test") + assert sig1.instructions == "This is a test" + + +def test_signature_instructions_none(): + sig1 = Signature("a, b -> c") + assert sig1.instructions == f"Given the fields `a`, `b`, produce the fields `c`." 
+ + +def test_signature_from_dict(): + signature = Signature( + {"input1": InputField(), "input2": InputField(), "output": OutputField()} + ) + for k in ["input1", "input2", "output"]: + assert k in signature.fields + assert signature.fields[k].annotation == str + + +def test_signature_from_dict(): + signature = Signature( + {"input1": InputField(), "input2": InputField(), "output": OutputField()} + ) + assert "input1" in signature.input_fields + assert "input2" in signature.input_fields + assert "output" in signature.output_fields + + +def test_signature_equality(): + sig1 = Signature("input1 -> output1") + sig2 = Signature("input1 -> output1") + assert sig1.equals(sig2) + + +def test_signature_inequality(): + sig1 = Signature("input1 -> output1") + sig2 = Signature("input2 -> output2") + assert not sig1.equals(sig2) + + +def test_equality_format(): + class TestSignature(Signature): + input = InputField(format=lambda x: x) + output = OutputField() + + assert TestSignature.equals(TestSignature) + + +def test_signature_reverse(): + sig = Signature("input1 -> output1") + assert sig.signature == "input1 -> output1" + + +def test_insert_field_at_various_positions(): + class InitialSignature(Signature): + input1: str = InputField() + output1: int = OutputField() + + S1 = InitialSignature.prepend("new_input_start", InputField(), str) + S2 = InitialSignature.append("new_input_end", InputField(), str) + assert "new_input_start" == list(S1.input_fields.keys())[0] + assert "new_input_end" == list(S2.input_fields.keys())[-1] + + S3 = InitialSignature.prepend("new_output_start", OutputField(), str) + S4 = InitialSignature.append("new_output_end", OutputField(), str) + assert "new_output_start" == list(S3.output_fields.keys())[0] + assert "new_output_end" == list(S4.output_fields.keys())[-1] + + +def test_infer_prefix(): + assert infer_prefix("someAttributeName42IsCool") == "Some Attribute Name 42 Is Cool" + assert infer_prefix("version2Update") == "Version 2 Update" + assert infer_prefix("modelT45Enhanced") == "Model T 45 Enhanced" + assert infer_prefix("someAttributeName") == "Some Attribute Name" + assert infer_prefix("some_attribute_name") == "Some Attribute Name" + assert infer_prefix("URLAddress") == "URL Address" + assert infer_prefix("isHTTPSecure") == "Is HTTP Secure" + assert infer_prefix("isHTTPSSecure123") == "Is HTTPS Secure 123" diff --git a/tests/teleprompt/test_bootstrap.py b/tests/teleprompt/test_bootstrap.py new file mode 100644 index 0000000000..4758a5aae4 --- /dev/null +++ b/tests/teleprompt/test_bootstrap.py @@ -0,0 +1,180 @@ +import pytest +import dspy +from dspy.predict import Predict +from dspy.utils.dummies import DummyLM +from dspy import Example +from dspy.teleprompt import BootstrapFewShot +import textwrap + + +# Define a simple metric function for testing +def simple_metric(example, prediction, trace=None): + # Simplified metric for testing: true if prediction matches expected output + return example.output == prediction.output + + +examples = [ + Example(input="What is the color of the sky?", output="blue").with_inputs("input"), + Example( + input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!" 
+ ),
+]
+trainset = [examples[0]]
+valset = [examples[1]]
+
+
+def test_bootstrap_initialization():
+ # Initialize BootstrapFewShot with a dummy metric and minimal setup
+ bootstrap = BootstrapFewShot(
+ metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1
+ )
+ assert bootstrap.metric == simple_metric, "Metric not correctly initialized"
+
+
+class SimpleModule(dspy.Module):
+ def __init__(self, signature):
+ super().__init__()
+ self.predictor = Predict(signature)
+
+ def forward(self, **kwargs):
+ return self.predictor(**kwargs)
+
+
+def test_compile_with_predict_instances():
+ # Create Predict instances for student and teacher
+ # Note that dspy.Predict is not itself a module, so we can't use it directly here
+ student = SimpleModule("input -> output")
+ teacher = SimpleModule("input -> output")
+
+ lm = DummyLM(["Initial thoughts", "Finish[blue]"])
+ dspy.settings.configure(lm=lm)
+
+ # Initialize BootstrapFewShot and compile the student
+ bootstrap = BootstrapFewShot(
+ metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1
+ )
+ compiled_student = bootstrap.compile(
+ student, teacher=teacher, trainset=trainset, valset=valset
+ )
+
+ assert compiled_student is not None, "Failed to compile student"
+ assert (
+ hasattr(compiled_student, "_compiled") and compiled_student._compiled
+ ), "Student compilation flag not set"
+
+
+def test_bootstrap_effectiveness():
+ # This test verifies if the bootstrapping process improves the student's predictions
+ student = SimpleModule("input -> output")
+ teacher = SimpleModule("input -> output")
+ lm = DummyLM(["blue", "Ring-ding-ding-ding-dingeringeding!"], follow_examples=True)
+ dspy.settings.configure(lm=lm, trace=[])
+
+ bootstrap = BootstrapFewShot(
+ metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1
+ )
+ compiled_student = bootstrap.compile(
+ student, teacher=teacher, trainset=trainset, valset=valset
+ )
+
+ # Check that the compiled student has the correct demos
+ assert len(compiled_student.predictor.demos) == 1
+ assert compiled_student.predictor.demos[0].input == trainset[0].input
+ assert compiled_student.predictor.demos[0].output == trainset[0].output
+
+ # Test the compiled student's prediction.
+ # We are using a DummyLM with follow_examples=True, which means that
+ # even though it would normally reply with "Ring-ding-ding-ding-dingeringeding!"
+ # on the second output, if it sees an example that perfectly matches the
+ # prompt, it will use that instead. That is why we expect "blue" here.
+ prediction = compiled_student(input=trainset[0].input)
+ assert prediction.output == trainset[0].output
+
+ # For debugging
+ print("Convo")
+ print(lm.get_convo(-1))
+
+ assert lm.get_convo(-1) == textwrap.dedent(
+ """\
+ Given the fields `input`, produce the fields `output`.
+
+ ---
+
+ Follow the following format.
+
+ Input: ${input}
+ Output: ${output}
+
+ ---
+
+ Input: What is the color of the sky?
+ Output: blue
+
+ ---
+
+ Input: What is the color of the sky?
+ Output: blue""" + ) + + +def test_error_handling_during_bootstrap(): + """ + Test to verify error handling during the bootstrapping process + """ + + class BuggyModule(dspy.Module): + def __init__(self, signature): + super().__init__() + self.predictor = Predict(signature) + + def forward(self, **kwargs): + raise RuntimeError("Simulated error") + + student = SimpleModule("input -> output") + teacher = BuggyModule("input -> output") + + # Setup DummyLM to simulate an error scenario + lm = DummyLM( + [ + "Initial thoughts", # Simulate initial teacher's prediction + ] + ) + dspy.settings.configure(lm=lm) + + bootstrap = BootstrapFewShot( + metric=simple_metric, + max_bootstrapped_demos=1, + max_labeled_demos=1, + max_errors=1, + ) + + with pytest.raises(RuntimeError, match="Simulated error"): + bootstrap.compile(student, teacher=teacher, trainset=trainset, valset=valset) + + +def test_validation_set_usage(): + """ + Test to ensure the validation set is correctly used during bootstrapping + """ + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") + + lm = DummyLM( + [ + "Initial thoughts", + "Finish[blue]", # Expected output for both training and validation + ] + ) + dspy.settings.configure(lm=lm) + + bootstrap = BootstrapFewShot( + metric=simple_metric, max_bootstrapped_demos=1, max_labeled_demos=1 + ) + compiled_student = bootstrap.compile( + student, teacher=teacher, trainset=trainset, valset=valset + ) + + # Check that validation examples are part of student's demos after compilation + assert len(compiled_student.predictor.demos) >= len( + valset + ), "Validation set not used in compiled student demos" diff --git a/tests/teleprompt/test_ensemble.py b/tests/teleprompt/test_ensemble.py new file mode 100644 index 0000000000..292176af4f --- /dev/null +++ b/tests/teleprompt/test_ensemble.py @@ -0,0 +1,60 @@ +import pytest +import dspy +from dspy.teleprompt.ensemble import Ensemble + + +class MockProgram(dspy.Module): + def __init__(self, output): + super().__init__() + self.output = output + + def forward(self, *args, **kwargs): + return self.output + + +# Simple reduction function to test with +def mock_reduce_fn(outputs): + return sum(outputs) / len(outputs) + + +def test_ensemble_without_reduction(): + """Test that Ensemble correctly combines outputs without applying a reduce_fn.""" + programs = [MockProgram(i) for i in range(5)] + ensemble = Ensemble() + ensembled_program = ensemble.compile(programs) + + outputs = ensembled_program() + assert len(outputs) == 5, "Ensemble did not combine the correct number of outputs" + + +def test_ensemble_with_reduction(): + """Test that Ensemble correctly applies a reduce_fn to combine outputs.""" + programs = [MockProgram(i) for i in range(5)] + ensemble = Ensemble(reduce_fn=mock_reduce_fn) + ensembled_program = ensemble.compile(programs) + + output = ensembled_program() + expected_output = sum(range(5)) / 5 + assert output == expected_output, "Ensemble did not correctly apply the reduce_fn" + + +def test_ensemble_with_size_limitation(): + """Test that specifying a size limits the number of programs used in the ensemble.""" + programs = [MockProgram(i) for i in range(10)] + ensemble_size = 3 + ensemble = Ensemble(size=ensemble_size) + ensembled_program = ensemble.compile(programs) + + outputs = ensembled_program() + assert ( + len(outputs) == ensemble_size + ), "Ensemble did not respect the specified size limitation" + + +def test_ensemble_deterministic_behavior(): + """Verify that the Ensemble class raises an 
assertion for deterministic behavior.""" + with pytest.raises( + AssertionError, + match="TODO: Implement example hashing for deterministic ensemble.", + ): + Ensemble(deterministic=True) diff --git a/tests/teleprompt/test_finetune.py b/tests/teleprompt/test_finetune.py new file mode 100644 index 0000000000..f87f5c14cb --- /dev/null +++ b/tests/teleprompt/test_finetune.py @@ -0,0 +1 @@ +# TODO \ No newline at end of file diff --git a/tests/teleprompt/test_knn_fewshot.py b/tests/teleprompt/test_knn_fewshot.py new file mode 100644 index 0000000000..b267d3dce8 --- /dev/null +++ b/tests/teleprompt/test_knn_fewshot.py @@ -0,0 +1,72 @@ +import pytest +import dsp, dspy +from dspy.predict.knn import KNN +from dspy.teleprompt.knn_fewshot import KNNFewShot +from dspy.utils.dummies import DummyLM, DummyVectorizer + + +def mock_example(question: str, answer: str) -> dsp.Example: + """Creates a mock DSP example with specified question and answer.""" + return dspy.Example(question=question, answer=answer).with_inputs("question") + + +@pytest.fixture +def setup_knn_few_shot(): + """Sets up a KNNFewShot instance for testing.""" + trainset = [ + mock_example("What is the capital of France?", "Paris"), + mock_example("What is the largest ocean?", "Pacific"), + mock_example("What is 2+2?", "4"), + ] + dsp.SentenceTransformersVectorizer = DummyVectorizer + knn_few_shot = KNNFewShot(KNN, k=2, trainset=trainset) + return knn_few_shot + + +def test_knn_few_shot_initialization(setup_knn_few_shot): + """Tests the KNNFewShot initialization.""" + knn_few_shot = setup_knn_few_shot + assert knn_few_shot.KNN.k == 2, "Incorrect k value for KNN" + assert len(knn_few_shot.KNN.trainset) == 3, "Incorrect trainset size for KNN" + + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + self.predictor = dspy.Predict(signature) + + def forward(self, *args, **kwargs): + return self.predictor(**kwargs) + + def reset_copy(self): + # Creates a new instance of SimpleModule with the same predictor + return SimpleModule(self.predictor.signature) + + +# TODO: Test not working yet +def _test_knn_few_shot_compile(setup_knn_few_shot): + """Tests the compile method of KNNFewShot with SimpleModule as student.""" + student = SimpleModule("input -> output") + teacher = SimpleModule("input -> output") # Assuming teacher uses the same module type + + # Setup DummyLM with a response for a query similar to one of the training examples + lm = DummyLM(["Madrid", "10"]) + dspy.settings.configure(lm=lm) # Responses for the capital of Spain and the result of 5+5) + + knn_few_shot = setup_knn_few_shot + trainset = knn_few_shot.KNN.trainset + compiled_student = knn_few_shot.compile(student, teacher=teacher, trainset=trainset, valset=None) + + assert len(compiled_student.predictor.demos) == 1 + assert compiled_student.predictor.demos[0].input == trainset[0].input + assert compiled_student.predictor.demos[0].output == trainset[0].output + + # Simulate a query that is similar to one of the training examples + output = compiled_student.forward(input = "What is the capital of Spain?").output + + print("CONVO") + print(lm.get_convo(-1)) + + # Validate that the output corresponds to one of the expected DummyLM responses + # This assumes the compiled_student's forward method will execute the predictor with the given query + assert output in ["Madrid", "10"], "The compiled student did not return the correct output based on the query" diff --git a/tests/teleprompt/test_signature_opt.py b/tests/teleprompt/test_signature_opt.py 
new file mode 100644 index 0000000000..d7f3475514 --- /dev/null +++ b/tests/teleprompt/test_signature_opt.py @@ -0,0 +1,121 @@ +import textwrap +import dspy +from dspy.teleprompt.signature_opt import SignatureOptimizer +from dspy.utils.dummies import DummyLM +from dspy import Example + +# Define a simple metric function for testing +def simple_metric(example, prediction): + # Simplified metric for testing: true if prediction matches expected output + return example.output == prediction.output + +# Example training and validation sets +trainset = [ + Example(input="Question: What is the color of the sky?", output="blue").with_inputs("input"), + Example(input="Question: What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!").with_inputs("input"), +] + +def test_signature_optimizer_initialization(): + optimizer = SignatureOptimizer(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + assert optimizer.metric == simple_metric, "Metric not correctly initialized" + assert optimizer.breadth == 2, "Breadth not correctly initialized" + assert optimizer.depth == 1, "Depth not correctly initialized" + assert optimizer.init_temperature == 1.4, "Initial temperature not correctly initialized" + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + # SignatureOptimizer doesn't work with dspy.Predict + self.predictor = dspy.ChainOfThought(signature) + + def forward(self, **kwargs): + return self.predictor(**kwargs) + +def test_signature_optimizer_optimization_process(): + optimizer = SignatureOptimizer(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + dspy.settings.configure(lm=DummyLM(["Optimized instruction 1", "Optimized instruction 2"])) + + student = SimpleModule("input -> output") + + # Assuming the compile method of SignatureOptimizer requires a student module, a development set, and evaluation kwargs + optimized_student = optimizer.compile(student, devset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) + + # Check that the optimized student has been modified from the original + # This check can be more specific based on how the optimization modifies the student + assert optimized_student is not student, "Optimization did not modify the student" + + # Further tests can be added to verify the specifics of the optimization process, + # such as checking the instructions of the optimized student's predictors. 
+ +def test_signature_optimizer_statistics_tracking(): + optimizer = SignatureOptimizer(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + optimizer.track_stats = True # Enable statistics tracking + + dspy.settings.configure(lm=DummyLM(["Optimized instruction"])) + student = SimpleModule("input -> output") + optimized_student = optimizer.compile(student, devset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) + + # Verify that statistics have been tracked and attached to the optimized student + assert hasattr(optimized_student, 'total_calls'), "Total calls statistic not tracked" + assert hasattr(optimized_student, 'results_best'), "Best results statistics not tracked" + +# Assuming the setup_signature_optimizer fixture and simple_metric function are defined as before + +def test_optimization_and_output_verification(): + lm = DummyLM([ + "Optimized Prompt", + "Optimized Prefix", + ]) + dspy.settings.configure(lm=lm) + optimizer = SignatureOptimizer(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + + student = SimpleModule("input -> output") + + # Compile the student with the optimizer + optimized_student = optimizer.compile(student, devset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) + + # Simulate calling the optimized student with a new input + test_input = "What is the capital of France?" + prediction = optimized_student(input=test_input) + + print(lm.get_convo(-1)) + + assert prediction.output == "No more responses" + + assert lm.get_convo(-1) == textwrap.dedent("""\ + Optimized Prompt + + --- + + Follow the following format. + + Input: ${input} + Reasoning: Let's think step by step in order to ${produce the output}. We ... + Optimized Prefix ${output} + + --- + + Input: What is the capital of France? 
+ Reasoning: Let's think step by step in order to No more responses + Optimized Prefix No more responses""") + +def test_statistics_tracking_during_optimization(): + dspy.settings.configure(lm=DummyLM(["Optimized instruction for stats tracking"])) + + optimizer = SignatureOptimizer(metric=simple_metric, breadth=2, depth=1, init_temperature=1.4) + optimizer.track_stats = True # Enable statistics tracking + + student = SimpleModule("input -> output") + optimized_student = optimizer.compile(student, devset=trainset, eval_kwargs={"num_threads": 1, "display_progress": False}) + + # Verify that statistics have been tracked + assert hasattr(optimized_student, 'total_calls'), "Optimizer did not track total metric calls" + assert optimized_student.total_calls > 0, "Optimizer reported no metric calls" + + # Check if the results_best and results_latest contain valid statistics + assert 'results_best' in optimized_student.__dict__, "Optimizer did not track the best results" + assert 'results_latest' in optimized_student.__dict__, "Optimizer did not track the latest results" + assert len(optimized_student.results_best) > 0, "Optimizer did not properly populate the best results statistics" + assert len(optimized_student.results_latest) > 0, "Optimizer did not properly populate the latest results statistics" + + # Additional detailed checks can be added here to verify the contents of the tracked statistics diff --git a/tests/teleprompt/test_signature_opt_bayesian.py b/tests/teleprompt/test_signature_opt_bayesian.py new file mode 100644 index 0000000000..abbcf11d18 --- /dev/null +++ b/tests/teleprompt/test_signature_opt_bayesian.py @@ -0,0 +1,176 @@ +import textwrap +import pytest +import dspy +from dspy.teleprompt.signature_opt_bayesian import BayesianSignatureOptimizer +from dspy.utils.dummies import DummyLM +from dspy import Example + + +# Define a simple metric function for testing +def simple_metric(example, prediction, trace=None): + # Simplified metric for testing: true if prediction matches expected output + return example.output == prediction.output + + +# Example training and validation sets +trainset = [ + Example(input="What is the color of the sky?", output="blue").with_inputs("input"), + Example( + input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!" 
+ ).with_inputs("input"), + Example(input="What is the capital of France?", output="Paris").with_inputs( + "input" + ), + Example(input="What is the capital of Germany?", output="Berlin").with_inputs( + "input" + ), +] + + +def test_bayesian_signature_optimizer_initialization(): + optimizer = BayesianSignatureOptimizer( + metric=simple_metric, n=10, init_temperature=1.4, verbose=True, track_stats=True + ) + assert optimizer.metric == simple_metric, "Metric not correctly initialized" + assert optimizer.n == 10, "Incorrect 'n' parameter initialization" + assert ( + optimizer.init_temperature == 1.4 + ), "Initial temperature not correctly initialized" + assert optimizer.verbose is True, "Verbose flag not correctly initialized" + assert optimizer.track_stats is True, "Track stats flag not correctly initialized" + + +class SimpleModule(dspy.Module): + def __init__(self, signature): + super().__init__() + # SignatureOptimizer doesn't work with dspy.Predict + self.predictor = dspy.ChainOfThought(signature) + + def forward(self, **kwargs): + return self.predictor(**kwargs) + + +def test_signature_optimizer_optimization_process(): + # Make LM that is always right + dspy.settings.configure(lm=DummyLM({ex.input: ex.output for ex in trainset})) + + student = SimpleModule(signature="input -> output") + + optimizer = BayesianSignatureOptimizer( + metric=simple_metric, + n=10, + init_temperature=1.4, + verbose=False, + track_stats=False, + ) + + # Adjustments: Include required parameters for the compile method + optimized_student = optimizer.compile( + student=student, + devset=trainset, + optuna_trials_num=10, + max_bootstrapped_demos=3, + max_labeled_demos=5, + eval_kwargs={"num_threads": 1, "display_progress": False}, + ) + + assert len(optimized_student.predictor.demos) == 4 + + +def test_signature_optimizer_bad_lm(): + dspy.settings.configure( + lm=DummyLM([f"Optimized instruction {i}" for i in range(30)]) + ) + student = SimpleModule(signature="input -> output") + optimizer = BayesianSignatureOptimizer( + metric=simple_metric, + n=10, + init_temperature=1.4, + verbose=False, + track_stats=False, + ) + + # Krista: when the code tries to generate bootstrapped examples, the examples are generated using DummyLM, + # which only outputs "Optimized instruction i" this means that none of the bootstrapped examples are successful, + # and therefore the set of examples that we're using to generate new prompts is empty + with pytest.raises(ValueError): + _optimized_student = optimizer.compile( + student=student, + devset=trainset, + optuna_trials_num=10, + max_bootstrapped_demos=3, + max_labeled_demos=5, + eval_kwargs={"num_threads": 1, "display_progress": False}, + ) + + +def test_optimization_and_output_verification(): + # Make a language model that is always right, except on the last + # example in the train set. + lm = DummyLM({ex.input: ex.output for ex in trainset[:-1]}, follow_examples=True) + dspy.settings.configure(lm=lm) + + optimizer = BayesianSignatureOptimizer( + metric=simple_metric, + n=10, + init_temperature=1.4, + verbose=False, + track_stats=True, + ) + + student = SimpleModule("input -> output") + + # Compile the student with the optimizer + optimized_student = optimizer.compile( + student=student, + devset=trainset, + optuna_trials_num=10, + max_bootstrapped_demos=3, + max_labeled_demos=5, + eval_kwargs={"num_threads": 1, "display_progress": False}, + ) + + # Simulate calling the optimized student with a new input + test_input = "What is the capital of France?" 
+ prediction = optimized_student(input=test_input) + + print("CORRECT ANSWER") + print(lm.get_convo(-1)) + + assert prediction.output == "blue" + + assert lm.get_convo(-1) == textwrap.dedent( + """\ + Given the fields `input`, produce the fields `output`. + + --- + + Input: What does the fox say? + Output: Ring-ding-ding-ding-dingeringeding! + + Input: What is the capital of Germany? + Output: Berlin + + Input: What is the capital of France? + Output: Paris + + --- + + Follow the following format. + + Input: ${input} + Reasoning: Let's think step by step in order to ${produce the output}. We ... + Output: ${output} + + --- + + Input: What is the color of the sky? + Reasoning: Let's think step by step in order to blue + Output: blue + + --- + + Input: What is the capital of France? + Reasoning: Let's think step by step in order to blue + Output: blue""" + ) From 51902e1744011899e70b35274102425318bbd2df Mon Sep 17 00:00:00 2001 From: Thomas D Ahle Date: Sat, 24 Feb 2024 22:38:33 -0800 Subject: [PATCH 2/6] Better dummy lm for bayesian opt --- dspy/functional/functional.py | 4 +- dspy/teleprompt/signature_opt_bayesian.py | 7 +- .../teleprompt/test_signature_opt_bayesian.py | 151 ++++++++++++++---- 3 files changed, 125 insertions(+), 37 deletions(-) diff --git a/dspy/functional/functional.py b/dspy/functional/functional.py index c4388ee862..bc10def220 100644 --- a/dspy/functional/functional.py +++ b/dspy/functional/functional.py @@ -71,7 +71,7 @@ def _prepare_signature(self): f". Respond with a single JSON object using the schema " + json.dumps(type_.model_json_schema()) ), - format=lambda x: x if isinstance(x, str) else x.json(), + format=lambda x: x if isinstance(x, str) else x.model_dump_json(), parser=lambda x: unwrap( type_.model_validate_json(_unwrap_json(x)) ), @@ -81,7 +81,7 @@ def _prepare_signature(self): if type_ in (list[str], tuple[str]): format = passages2text elif inspect.isclass(type_) and issubclass(type_, pydantic.BaseModel): - format = lambda x: x if isinstance(x, str) else x.json() + format = lambda x: x if isinstance(x, str) else x.model_dump_json() signature = signature.with_updated_fields(name, format=format) if self.chain_of_thought: diff --git a/dspy/teleprompt/signature_opt_bayesian.py b/dspy/teleprompt/signature_opt_bayesian.py index 045461fcea..d5245bc931 100644 --- a/dspy/teleprompt/signature_opt_bayesian.py +++ b/dspy/teleprompt/signature_opt_bayesian.py @@ -325,8 +325,11 @@ def objective(trial): p_demo_candidates = demo_candidates[id(p_old)] # Suggest the index of the instruction candidate to use in our trial - instruction_idx = trial.suggest_categorical(f"{id(p_old)}_predictor_instruction",range(len(p_instruction_candidates))) - demos_idx = trial.suggest_categorical(f"{id(p_old)}_predictor_demos",range(len(p_demo_candidates))) + #instruction_idx = trial.suggest_categorical(f"{id(p_old)}_predictor_instruction",range(len(p_instruction_candidates))) + #demos_idx = trial.suggest_categorical(f"{id(p_old)}_predictor_demos",range(len(p_demo_candidates))) + instruction_idx = trial.suggest_int(f"{id(p_old)}_predictor_instruction",low=0, high=len(p_instruction_candidates)-1) + demos_idx = trial.suggest_int(f"{id(p_old)}_predictor_demos",low=0, high=len(p_demo_candidates)-1) + trial_logs[trial_num][f"{id(p_old)}_predictor_instruction"] = instruction_idx trial_logs[trial_num][f"{id(p_old)}_predictor_demos"] = demos_idx diff --git a/tests/teleprompt/test_signature_opt_bayesian.py b/tests/teleprompt/test_signature_opt_bayesian.py index abbcf11d18..0cf655784f 100644 --- 
a/tests/teleprompt/test_signature_opt_bayesian.py +++ b/tests/teleprompt/test_signature_opt_bayesian.py @@ -1,6 +1,8 @@ import textwrap import pytest +import re import dspy +from dsp.modules import LM from dspy.teleprompt.signature_opt_bayesian import BayesianSignatureOptimizer from dspy.utils.dummies import DummyLM from dspy import Example @@ -11,6 +13,20 @@ def simple_metric(example, prediction, trace=None): # Simplified metric for testing: true if prediction matches expected output return example.output == prediction.output +# Some example data +capitals = { + "Germany": "Berlin", + "France": "Paris", + "Denmark": "Copenhagen", + "Sweden": "Stockholm", + "Norway": "Oslo", +} +# Not used for training data +extra_capitals = { + "Spain": "Madrid", + "Portugal": "Lisbon", + "Italy": "Rome", +} # Example training and validation sets trainset = [ @@ -18,13 +34,82 @@ def simple_metric(example, prediction, trace=None): Example( input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!" ).with_inputs("input"), - Example(input="What is the capital of France?", output="Paris").with_inputs( - "input" - ), - Example(input="What is the capital of Germany?", output="Berlin").with_inputs( - "input" - ), -] +] + [Example(input=f"What is the capital of {country}?", output=capital).with_inputs("input") for country, capital in capitals.items()] + + +class ConditionalLM(LM): + def __init__(self): + super().__init__("conditional-lm") + + def basic_request(self, prompt, n=1, **kwargs): + # If we are in the "optimization" stage, we don't say much. + if prompt.endswith("Observations:"): + answer = " (*silence*)" + elif prompt.endswith("Proposed Instruction:"): + answer = " Input: " + elif prompt.endswith("Proposed Prefix For Output Field:"): + answer = " Output: " + elif prompt.endswith("Summary:"): + answer = " summarizing..." + else: + pairs = re.findall(r"Input: (.*)\nOutput: (.*)", prompt) + + print("PROMPT:", prompt) + print("PAIRS:", pairs) + + last = re.search(r"Input: (.*)\nReasoning: (.*)$", prompt) + current_question = last.group(1) + + if match := re.match(r"What is the capital of (.*?)\?", current_question): + country = match.group(1) + # If we had a previous example of a question about a capital, the model + # has learned the format, and will answer with question correctly. + if any("capital" in question for question, _ in pairs): + answer = (capitals | extra_capitals)[country] + # Otherwise, it is confused and will answer with the country's name. + else: + answer = country + # For other questions, the model will answer with the last word of the question. + else: + answer = current_question.split()[-1] + + answer = "think deeply.\nOutput: " + answer + + RED, GREEN, RESET = '\033[91m', '\033[92m', '\033[0m' + print("=== DummyLM ===") + print(prompt, end="") + print(f"{RED}{answer}{RESET}") + print("===") + + dummy_response = {"choices": []} + for _ in range(n): + dummy_response["choices"].append( + { + "text": answer, + "finish_reason": "done", + } + ) + + # Simulate processing and storing the request and response. 
+ history_entry = { + "prompt": prompt, + "response": dummy_response, + "kwargs": kwargs, + "raw_kwargs": kwargs, + } + self.history.append(history_entry) + + return dummy_response + + def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs): + response = self.basic_request(prompt, **kwargs) + return [choice["text"] for choice in response["choices"]] + + def get_convo(self, index): + """get the prompt + anwer from the ith message""" + return self.history[index]['prompt'] \ + + " " \ + + self.history[index]['response']['choices'][0]['text'] def test_bayesian_signature_optimizer_initialization(): @@ -51,8 +136,8 @@ def forward(self, **kwargs): def test_signature_optimizer_optimization_process(): - # Make LM that is always right - dspy.settings.configure(lm=DummyLM({ex.input: ex.output for ex in trainset})) + lm = ConditionalLM() + dspy.settings.configure(lm=lm) student = SimpleModule(signature="input -> output") @@ -74,7 +159,7 @@ def test_signature_optimizer_optimization_process(): eval_kwargs={"num_threads": 1, "display_progress": False}, ) - assert len(optimized_student.predictor.demos) == 4 + assert len(optimized_student.predictor.demos) == 5 def test_signature_optimizer_bad_lm(): @@ -107,7 +192,7 @@ def test_signature_optimizer_bad_lm(): def test_optimization_and_output_verification(): # Make a language model that is always right, except on the last # example in the train set. - lm = DummyLM({ex.input: ex.output for ex in trainset[:-1]}, follow_examples=True) + lm = ConditionalLM() dspy.settings.configure(lm=lm) optimizer = BayesianSignatureOptimizer( @@ -124,35 +209,24 @@ def test_optimization_and_output_verification(): optimized_student = optimizer.compile( student=student, devset=trainset, - optuna_trials_num=10, - max_bootstrapped_demos=3, - max_labeled_demos=5, + optuna_trials_num=4, + max_bootstrapped_demos=2, + max_labeled_demos=3, eval_kwargs={"num_threads": 1, "display_progress": False}, ) # Simulate calling the optimized student with a new input - test_input = "What is the capital of France?" + test_input = "What is the capital of Spain?" prediction = optimized_student(input=test_input) print("CORRECT ANSWER") print(lm.get_convo(-1)) - assert prediction.output == "blue" + assert prediction.output == "Madrid" assert lm.get_convo(-1) == textwrap.dedent( """\ - Given the fields `input`, produce the fields `output`. - - --- - - Input: What does the fox say? - Output: Ring-ding-ding-ding-dingeringeding! - - Input: What is the capital of Germany? - Output: Berlin - - Input: What is the capital of France? - Output: Paris + Input: --- @@ -164,13 +238,24 @@ def test_optimization_and_output_verification(): --- - Input: What is the color of the sky? - Reasoning: Let's think step by step in order to blue - Output: blue + Input: What is the capital of Norway? + Reasoning: Let's think step by step in order to think deeply. + Output: Oslo + + --- + + Input: What is the capital of Sweden? + Reasoning: Let's think step by step in order to think deeply. + Output: Stockholm --- Input: What is the capital of France? - Reasoning: Let's think step by step in order to blue - Output: blue""" + Output: Paris + + --- + + Input: What is the capital of Spain? + Reasoning: Let's think step by step in order to think deeply. 
+ Output: Madrid""" ) From 40b3e35008e5975491f8476546acb52a8d490e64 Mon Sep 17 00:00:00 2001 From: Thomas D Ahle Date: Sat, 24 Feb 2024 22:50:31 -0800 Subject: [PATCH 3/6] Reverted aws-lm --- dsp/modules/aws_lm.py | 50 +++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/dsp/modules/aws_lm.py b/dsp/modules/aws_lm.py index 05a733851c..00906282a8 100644 --- a/dsp/modules/aws_lm.py +++ b/dsp/modules/aws_lm.py @@ -28,6 +28,7 @@ def __init__( max_new_tokens: int, truncate_long_prompts: bool = False, input_output_ratio: int = 3, + batch_n: bool = True, ) -> None: """_summary_ @@ -40,6 +41,7 @@ def __init__( input_output_ratio (int, optional): The rough size of the number of input tokens to output tokens in the worst case. Defaults to 3. temperature (float, optional): _description_. Defaults to 0.0. truncate_long_prompts (bool, optional): If True, remove extremely long inputs to context. Defaults to False. + batch_n (bool, False): If False, call the LM N times rather than batching. Not all AWS models support the n parameter. """ super().__init__(model=model) # AWS doesn't have an equivalent of max_tokens so let's clarify @@ -48,9 +50,10 @@ def __init__( self._max_new_tokens: int = max_new_tokens self._model_name: str = model self._truncate_long_prompt_prompts: bool = truncate_long_prompts + self._batch_n: bool = batch_n import boto3 - + self.predictor = boto3.client(service_name, region_name=region_name) @abstractmethod @@ -72,7 +75,7 @@ def _sanitize_kwargs(self, query_kwargs: dict[str, Any]) -> dict[str, Any]: return query_kwargs @abstractmethod - def _call_model(self, body: str) -> str: + def _call_model(self, body: str) -> str | list[str]: """Call model, get generated input without the formatted prompt""" pass @@ -82,7 +85,20 @@ def _extract_input_parameters( ) -> dict[str, str | float | int]: pass - def basic_request(self, prompt, **kwargs) -> str: + def _simple_api_call(self, formatted_prompt: str, **kwargs) -> str | list[str]: + body = self._create_body(formatted_prompt, **kwargs) + json_body = json.dumps(body) + llm_out: str | list[str] = self._call_model(json_body) + if isinstance(llm_out, str): + llm_out = llm_out.replace(formatted_prompt, "") + else: + llm_out = [generated.replace(formatted_prompt, "") for generated in llm_out] + self.history.append( + {"prompt": formatted_prompt, "response": llm_out, "kwargs": body} + ) + return llm_out + + def basic_request(self, prompt, **kwargs) -> str | list[str]: """Query the endpoint.""" # Remove any texts that are too long @@ -92,16 +108,28 @@ def basic_request(self, prompt, **kwargs) -> str: formatted_prompt = self._format_prompt(truncated_prompt) else: formatted_prompt = self._format_prompt((prompt)) - body = self._create_body(formatted_prompt, **kwargs) - json_body: str = json.dumps(body) - - generated: str = self._call_model(json_body) - self.history.append( - {"prompt": formatted_prompt, "response": generated, "kwargs": body} - ) + llm_out: str | list[str] + if "n" in kwargs.keys(): + if self._batch_n: + llm_out = self._simple_api_call( + formatted_prompt=formatted_prompt, **kwargs + ) + else: + del kwargs["n"] + llm_out = [] + for _ in range(0, kwargs["n"]): + generated: str | list[str] = self._simple_api_call( + formatted_prompt=formatted_prompt, **kwargs + ) + if isinstance(generated, str): + llm_out.append(generated) + else: + raise TypeError("Error, list type was returned from LM call") + else: + llm_out = self._simple_api_call(formatted_prompt=formatted_prompt, **kwargs) - return 
generated.replace(formatted_prompt, "") + return llm_out def _estimate_tokens(self, text: str) -> int: return len(text) * CHARS2TOKENS From 6311b9034a523c32eeb44edfe8ba7077a772a3d3 Mon Sep 17 00:00:00 2001 From: Thomas D Ahle Date: Sat, 24 Feb 2024 22:52:40 -0800 Subject: [PATCH 4/6] Reversed setup files --- pyproject.toml | 21 ++++++++++++--------- requirements.txt | 2 +- setup.py | 12 ++++++------ 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6db47e0ad6..8878b279aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "dspy-ai" -version = "2.1.9" +version = "2.1.10" description = "DSPy" readme = "README.md" authors = [{name = "Omar Khattab", email = "okhattab@stanford.edu"}] @@ -26,16 +26,17 @@ dependencies = [ "regex~=2023.10.3", "ujson~=5.8.0", "tqdm~=4.66.1", - "datasets~=2.14.6", + "datasets~=2.14.6,<3.0.0", "requests~=2.31.0", "optuna~=3.4.0", ] [project.optional-dependencies] -pinecone = ["pinecone-client~=2.2.4"] -qdrant = ["qdrant-client~=1.6.2", "fastembed~=0.1.0"] chromadb = ["chromadb~=0.4.14"] +qdrant = ["qdrant-client~=1.6.2", "fastembed~=0.1.0"] marqo = ["marqo"] +pinecone = ["pinecone-client~=2.2.4"] +weaviate = ["weaviate-client~=3.26.1"] docs = [ "sphinx>=4.3.0", "furo>=2023.3.27", @@ -80,11 +81,12 @@ tqdm = "^4.66.1" datasets = "^2.14.6" requests = "^2.31.0" optuna = "^3.4.0" -pinecone-client = {version = "^2.2.4", optional = true} -qdrant-client = {version = "^1.6.2", optional = true} -fastembed = {version = "^0.1.0", optional = true} chromadb = {version = "^0.4.14", optional = true} +fastembed = {version = "^0.1.0", optional = true} marqo = {version = "*", optional = true} +qdrant-client = {version = "^1.6.2", optional = true} +pinecone-client = {version = "^2.2.4", optional = true} +weaviate-client = {version = "^3.26.1", optional=true} sphinx = {version = ">=4.3.0", optional = true} furo = {version = ">=2023.3.27", optional = true} docutils = {version = "<0.17", optional = true} @@ -99,10 +101,11 @@ sphinx-reredirects = {version = "^0.1.2", optional = true} sphinx-automodapi = {version = "0.16.0", optional = true} [tool.poetry.extras] -pinecone = ["pinecone-client"] -qdrant = ["qdrant-client", "fastembed"] chromadb = ["chromadb"] +qdrant = ["qdrant-client", "fastembed"] marqo = ["marqo"] +pinecone = ["pinecone-client"] +weaviate = ["weaviate-client"] docs = [ "sphinx", "furo", diff --git a/requirements.txt b/requirements.txt index c853c2ef3f..8dc43dd62c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ ujson tqdm datasets requests -optuna \ No newline at end of file +optuna diff --git a/setup.py b/setup.py index f50a568d5f..6c5175e60d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="dspy-ai", - version="2.1.9", + version="2.1.10", description="DSPy", long_description=long_description, long_description_content_type='text/markdown', @@ -21,13 +21,13 @@ packages=find_packages(include=['dsp.*', 'dspy.*', 'dsp', 'dspy']), python_requires='>=3.9', install_requires=requirements, - extras_require={ + extras_require={ + "chromadb": ["chromadb~=0.4.14"], + "qdrant": ["qdrant-client", "fastembed"], + "marqo": ["marqo"], + "mongodb": ["pymongo~=3.12.0"], "pinecone": ["pinecone-client~=2.2.4"], - "qdrant": ["qdrant-client~=1.6.2", "fastembed~=0.1.0"], - "chromadb": ["chromadb~=0.4.14"], - "marqo": ["marqo"], "weaviate": ["weaviate-client~=3.26.1"], - "mongodb": ["pymongo~=3.12.0"], }, classifiers=[ "Development Status 
:: 3 - Alpha", From 4aac55a9e925d6ce5594eb71d1b3f92b9fd9ce25 Mon Sep 17 00:00:00 2001 From: Thomas D Ahle Date: Sat, 24 Feb 2024 22:54:24 -0800 Subject: [PATCH 5/6] Reverted more files --- .../longformqa/longformqa_assertions.ipynb | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/examples/longformqa/longformqa_assertions.ipynb b/examples/longformqa/longformqa_assertions.ipynb index 40df53a244..d3059b1cbc 100644 --- a/examples/longformqa/longformqa_assertions.ipynb +++ b/examples/longformqa/longformqa_assertions.ipynb @@ -6,7 +6,9 @@ "source": [ "\"DSPy7\n", "\n", - "## **DSPy Assertions**: Asserting Computational Constraints on Foundation Models" + "## **DSPy Assertions**: Asserting Computational Constraints on Foundation \n", + "\n", + "### **LongFormQA**: Generating long-form length responses to answer questions" ] }, { @@ -85,6 +87,7 @@ "if not \"dspy-ai\" in {pkg.key for pkg in pkg_resources.working_set}:\n", " !pip install -U pip\n", " !pip install dspy-ai\n", + " !pip install openai~=0.28.1\n", " !pip install -e $repo_path\n", "\n", "import dspy\n", @@ -552,7 +555,7 @@ "\n", "We can also leverage **DSPy**'s advanced compiling features to enhance our program's performance. \n", "\n", - "For this, we utilize the `BootstrapFewShotWithRandomSearch` teleprompter, which automatically incorporates few-shot demonstrations and conducts a random search over a candidate set to output the best compiled program. We evaluate this over the `answer_correctness` metric as our ultimate goal is indeed to generate correct answers to the `HotPotQA` questions from the paragraphs, aiming to optimize both instrinsic and extrinsic metrics as a result. \n", + "For this, we utilize the `BootstrapFewShotWithRandomSearch` teleprompter, which automatically incorporates few-shot demonstrations and conducts a random search over a candidate set to output the best compiled program. We evaluate this over the `answer_correctness` metric as our ultimate goal is indeed to generate correct answers to the `HotPotQA` questions from the paragraphs, aiming to optimize both intrinsic and extrinsic metrics as a result. \n", "\n", "Let's evaluate this on the LongFormQA program first:" ] @@ -591,24 +594,24 @@ ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note** This pipeline on the other hand sets both the teacher and student with `LongFormQAWithAssertions()` to ensure the teacher correctly instructs the student with the right bootstrapped examples and the student has the chance to self-correct with **Assertions** for any examples that are still deemed incorrect." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "longformqa = LongFormQA()\n", - "teleprompter = BootstrapFewShotWithRandomSearch(metric = answer_correctness, max_bootstrapped_demos=2, num_candidate_programs=6)\n", - "cited_longformqa_student_teacher = teleprompter.compile(student=assert_transform_module(LongFormQAWithAssertions().map_named_predictors(Retry), backtrack_handler), teacher = assert_transform_module(LongFormQAWithAssertions().map_named_predictors(Retry), backtrack_handler), trainset=trainset, valset=devset[:100])\n", - "evaluate(cited_longformqa_student_teacher)" - ] - } + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note** This pipeline on the other hand sets both the teacher and student with `LongFormQAWithAssertions()` to ensure the teacher correctly instructs the student with the right bootstrapped examples and the student has the chance to self-correct with **Assertions** for any examples that are still deemed incorrect." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "longformqa = LongFormQA()\n", + "teleprompter = BootstrapFewShotWithRandomSearch(metric = answer_correctness, max_bootstrapped_demos=2, num_candidate_programs=6)\n", + "cited_longformqa_student_teacher = teleprompter.compile(student=assert_transform_module(LongFormQAWithAssertions().map_named_predictors(Retry), backtrack_handler), teacher = assert_transform_module(LongFormQAWithAssertions().map_named_predictors(Retry), backtrack_handler), trainset=trainset, valset=devset[:100])\n", + "evaluate(cited_longformqa_student_teacher)" + ] + } ], "metadata": { "kernelspec": { From 3a0b435efc92dd4e3c9c0db9677e8d119035a82a Mon Sep 17 00:00:00 2001 From: Thomas D Ahle Date: Sun, 25 Feb 2024 10:44:58 -0800 Subject: [PATCH 6/6] Removed submodule --- examples/longformqa/DSPy_LongFormQA_Cache | 1 - 1 file changed, 1 deletion(-) delete mode 160000 examples/longformqa/DSPy_LongFormQA_Cache diff --git a/examples/longformqa/DSPy_LongFormQA_Cache b/examples/longformqa/DSPy_LongFormQA_Cache deleted file mode 160000 index 595ce1f7df..0000000000 --- a/examples/longformqa/DSPy_LongFormQA_Cache +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 595ce1f7dfd71dd925795d5fa07f36c9f13a4c29
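For reference, the flow exercised by the updated end-to-end test in tests/teleprompt/test_signature_opt_bayesian.py boils down to the sketch below. It is illustrative only: the `optimizer` and `student` construction sits outside the hunks shown, so the constructor arguments used here are assumptions, while the `compile` arguments and the expected "Madrid" output are taken directly from the test.

```python
import dspy
from dspy.teleprompt.signature_opt_bayesian import BayesianSignatureOptimizer

# ConditionalLM, SimpleModule, simple_metric and trainset as defined in the
# test module added by this patch.
lm = ConditionalLM()
dspy.settings.configure(lm=lm)

# Assumed setup (not shown in the hunks): a single-predictor student and an
# optimizer driven by simple_metric.
student = SimpleModule(signature="input -> output")
optimizer = BayesianSignatureOptimizer(metric=simple_metric)

optimized_student = optimizer.compile(
    student=student,
    devset=trainset,
    optuna_trials_num=4,
    max_bootstrapped_demos=2,
    max_labeled_demos=3,
    eval_kwargs={"num_threads": 1, "display_progress": False},
)

# Spain is deliberately absent from the training capitals, so a correct answer
# here means the optimizer bootstrapped capital-style demos into the prompt.
prediction = optimized_student(input="What is the capital of Spain?")
assert prediction.output == "Madrid"
```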