From c7d03398a1776bdb79def9f8b54cf578c4021256 Mon Sep 17 00:00:00 2001
From: dbczumar <corey.zumar@databricks.com>
Date: Wed, 23 Oct 2024 12:23:11 -0700
Subject: [PATCH 1/6] fix: put Evaluate predictions in a "prediction" column

Signed-off-by: dbczumar <corey.zumar@databricks.com>
---
 dspy/evaluate/evaluate.py       | 20 ++------------------
 tests/evaluate/test_evaluate.py | 20 +++++++++++++++-----
 2 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index 64f8f8cc4a..b0baed5a73 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -227,7 +227,8 @@ def wrapped_program(example_idx, example):
             results = [(example, prediction, score) for _, example, prediction, score in predicted_devset]
 
         data = [
-            merge_dicts(example, prediction) | {"correct": score} for _, example, prediction, score in predicted_devset
+            dict(example) | {"prediction": prediction, "correct": score}
+            for _, example, prediction, score in predicted_devset
         ]
 
         result_df = pd.DataFrame(data)
@@ -275,23 +276,6 @@ def wrapped_program(example_idx, example):
         return round(100 * ncorrect / ntotal, 2)
 
 
-def merge_dicts(d1, d2) -> dict:
-    merged = {}
-    for k, v in d1.items():
-        if k in d2:
-            merged[f"example_{k}"] = v
-        else:
-            merged[k] = v
-
-    for k, v in d2.items():
-        if k in d1:
-            merged[f"pred_{k}"] = v
-        else:
-            merged[k] = v
-
-    return merged
-
-
 def truncate_cell(content) -> str:
     """Truncate content of a cell to 25 words."""
     words = str(content).split()
diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py
index a8552c7aa6..5062e259d8 100644
--- a/tests/evaluate/test_evaluate.py
+++ b/tests/evaluate/test_evaluate.py
@@ -120,13 +120,22 @@ def test_evaluate_call_bad():
     assert score == 0.0
 
 
+@pytest.mark.parametrize(
+    "program_with_example",
+    [
+        (Predict("question -> answer"), new_example("What is 1+1?", "2")),
+        (
+            Predict("text -> entities"),
+            dspy.Example(text="United States", entities=["United States"]).with_inputs("text"),
+        ),
+    ],
+)
 @pytest.mark.parametrize("display_table", [True, False, 1])
 @pytest.mark.parametrize("is_in_ipython_notebook_environment", [True, False])
-def test_evaluate_display_table(display_table, is_in_ipython_notebook_environment, capfd):
-    devset = [new_example("What is 1+1?", "2")]
-    program = Predict("question -> answer")
+def test_evaluate_display_table(program_with_example, display_table, is_in_ipython_notebook_environment, capfd):
+    program, example = program_with_example
     ev = Evaluate(
-        devset=devset,
+        devset=[example],
         metric=answer_exact_match,
         display_table=display_table,
     )
@@ -140,4 +149,5 @@ def test_evaluate_display_table(display_table, is_in_ipython_notebook_environmen
         if not is_in_ipython_notebook_environment and display_table:
             # In console environments where IPython is not available, the table should be printed
             # to the console
-            assert "What is 1+1?" in out
+            example_input = next(iter(example.inputs().values()))
+            assert example_input in out

From 11aca74d72cb23cf954a43cc7a515cc64f6ca96d Mon Sep 17 00:00:00 2001
From: dbczumar <corey.zumar@databricks.com>
Date: Wed, 23 Oct 2024 12:49:49 -0700
Subject: [PATCH 2/6] fix: test Evaluate table with a program that returns a list

Signed-off-by: dbczumar <corey.zumar@databricks.com>
---
 tests/evaluate/test_evaluate.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py
index 5062e259d8..94713221a9 100644
--- a/tests/evaluate/test_evaluate.py
+++ b/tests/evaluate/test_evaluate.py
@@ -8,6 +8,7 @@
 import dspy
 from dspy.evaluate.evaluate import Evaluate
 from dspy.evaluate.metrics import answer_exact_match
+from dspy.functional import TypedPredictor
 from dspy.predict import Predict
 from dspy.utils.dummies import DummyLM
 
@@ -123,9 +124,8 @@ def test_evaluate_call_bad():
 @pytest.mark.parametrize(
     "program_with_example",
     [
-        (Predict("question -> answer"), new_example("What is 1+1?", "2")),
         (
-            Predict("text -> entities"),
+            lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities,
             dspy.Example(text="United States", entities=["United States"]).with_inputs("text"),
         ),
     ],
@@ -134,9 +134,20 @@ def test_evaluate_call_bad():
 @pytest.mark.parametrize("is_in_ipython_notebook_environment", [True, False])
 def test_evaluate_display_table(program_with_example, display_table, is_in_ipython_notebook_environment, capfd):
     program, example = program_with_example
+    example_input = next(iter(example.inputs().values()))
+    example_output = {key: value for key, value in example.toDict().items() if key not in example.inputs()}
+
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                example_input: example_output,
+            }
+        )
+    )
+
     ev = Evaluate(
         devset=[example],
-        metric=answer_exact_match,
+        metric=lambda example, pred, **kwargs: example == pred,
         display_table=display_table,
     )
     assert ev.display_table == display_table

From 4a1bc0bb20926f32e3ca42a982f340877a30a424 Mon Sep 17 00:00:00 2001
From: dbczumar <corey.zumar@databricks.com>
Date: Wed, 23 Oct 2024 12:54:14 -0700
Subject: [PATCH 3/6] fix: document why the test program returns a bare list

Signed-off-by: dbczumar <corey.zumar@databricks.com>
---
 tests/evaluate/test_evaluate.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py
index 94713221a9..6393da7686 100644
--- a/tests/evaluate/test_evaluate.py
+++ b/tests/evaluate/test_evaluate.py
@@ -125,6 +125,10 @@ def test_evaluate_call_bad():
     "program_with_example",
     [
         (
+            # Create a program that extracts entities from text and returns them as a list,
+            # rather than returning a Predictor() wrapper. This is done intentionally to test
+            # the case where the program does not output a dictionary-like object because
+            # Evaluate() has failed for this case in the past
             lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities,
             dspy.Example(text="United States", entities=["United States"]).with_inputs("text"),
         ),

From 9053c54a1e4e1cbe7891e18299e5b7b8ba7a8208 Mon Sep 17 00:00:00 2001
From: dbczumar <corey.zumar@databricks.com>
Date: Wed, 23 Oct 2024 12:55:46 -0700
Subject: [PATCH 4/6] fix: restore the Predict("question -> answer") test case

Signed-off-by: dbczumar <corey.zumar@databricks.com>
---
 tests/evaluate/test_evaluate.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py
index 6393da7686..7615c70b32 100644
--- a/tests/evaluate/test_evaluate.py
+++ b/tests/evaluate/test_evaluate.py
@@ -124,6 +124,7 @@ def test_evaluate_call_bad():
 @pytest.mark.parametrize(
     "program_with_example",
     [
+        (Predict("question -> answer"), new_example("What is 1+1?", "2")),
         (
             # Create a program that extracts entities from text and returns them as a list,
             # rather than returning a Predictor() wrapper. This is done intentionally to test

From 6c0c17778125283862dcc495ba7e500987f6e0ad Mon Sep 17 00:00:00 2001
From: dbczumar <corey.zumar@databricks.com>
Date: Wed, 23 Oct 2024 17:22:49 -0700
Subject: [PATCH 5/6] fix: restore merge_dicts for dict predictions in Evaluate

Signed-off-by: dbczumar <corey.zumar@databricks.com>
---
 dspy/evaluate/evaluate.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index b0baed5a73..6b2c9ccf91 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -227,7 +227,11 @@ def wrapped_program(example_idx, example):
             results = [(example, prediction, score) for _, example, prediction, score in predicted_devset]
 
         data = [
-            dict(example) | {"prediction": prediction, "correct": score}
+            (
+                merge_dicts(example, prediction) | {"correct": score}
+                if isinstance(prediction, dict)
+                else dict(example) | {"prediction": prediction, "correct": score}
+            )
             for _, example, prediction, score in predicted_devset
         ]
 
@@ -276,6 +280,23 @@ def wrapped_program(example_idx, example):
         return round(100 * ncorrect / ntotal, 2)
 
 
+def merge_dicts(d1, d2) -> dict:
+    """Combine two mappings into one dict without losing colliding keys.
+
+    A key found in both inputs is emitted twice: as ``example_<key>`` with the
+    value from ``d1`` and as ``pred_<key>`` with the value from ``d2``. Keys
+    unique to one input keep their original name.
+    """
+    combined = {}
+    for source, other, prefix in ((d1, d2, "example_"), (d2, d1, "pred_")):
+        for key, value in source.items():
+            # Only keys present on both sides need disambiguating.
+            name = f"{prefix}{key}" if key in other else key
+            combined[name] = value
+
+    return combined
+
+
 def truncate_cell(content) -> str:
     """Truncate content of a cell to 25 words."""
     words = str(content).split()

From ced68ca007e7c762ae87de549835454f0fca6fd1 Mon Sep 17 00:00:00 2001
From: dbczumar <corey.zumar@databricks.com>
Date: Wed, 23 Oct 2024 17:29:36 -0700
Subject: [PATCH 6/6] fix: accept dict-like (non-dict) predictions in Evaluate table

Signed-off-by: dbczumar <corey.zumar@databricks.com>
---
 dspy/evaluate/evaluate.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index 6b2c9ccf91..4ca43ad39b 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -226,10 +226,17 @@ def wrapped_program(example_idx, example):
         if return_outputs:  # Handle the return_outputs logic
             results = [(example, prediction, score) for _, example, prediction, score in predicted_devset]
 
+        def prediction_is_dictlike(prediction):
+            # merge_dicts() calls prediction.items(), so duck-type on that method.
+            # Probing dict(prediction) is not enough: dict() also accepts sequences
+            # of pairs -- e.g. [] or ["US", "UK"] -- which would be routed to
+            # merge_dicts() and fail there with AttributeError on .items().
+            return callable(getattr(prediction, "items", None))
+
         data = [
             (
                 merge_dicts(example, prediction) | {"correct": score}
-                if isinstance(prediction, dict)
+                if prediction_is_dictlike(prediction)
                 else dict(example) | {"prediction": prediction, "correct": score}
             )
             for _, example, prediction, score in predicted_devset