Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion dspy/evaluate/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,20 @@ def wrapped_program(example_idx, example):
if return_outputs: # Handle the return_outputs logic
results = [(example, prediction, score) for _, example, prediction, score in predicted_devset]

def prediction_is_dictlike(prediction):
    """Return ``True`` when *prediction* can be converted with ``dict()``.

    Uses an EAFP probe — simply attempt the conversion — instead of type
    inspection, so mapping types, key/value pair iterables, and objects
    implementing ``keys()`` all count as dict-like. Any exception raised by
    the conversion (``TypeError``, ``ValueError``, or a custom error from an
    exotic object) is taken to mean "not dict-like".
    """
    try:
        dict(prediction)
    except Exception:
        return False
    return True

data = [
merge_dicts(example, prediction) | {"correct": score} for _, example, prediction, score in predicted_devset
(
merge_dicts(example, prediction) | {"correct": score}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I vaguely presuppose that merge_dicts is there for legacy reasons, and that example | prediction may have the same effect, is that false? (it may be false)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, i see, merge_dicts does some kind of disambiguation instead of overwriting

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, exactly

if prediction_is_dictlike(prediction)
else dict(example) | {"prediction": prediction, "correct": score}
)
for _, example, prediction, score in predicted_devset
]

result_df = pd.DataFrame(data)
Expand Down
38 changes: 32 additions & 6 deletions tests/evaluate/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import dspy
from dspy.evaluate.evaluate import Evaluate
from dspy.evaluate.metrics import answer_exact_match
from dspy.functional import TypedPredictor
from dspy.predict import Predict
from dspy.utils.dummies import DummyLM

Expand Down Expand Up @@ -120,14 +121,38 @@ def test_evaluate_call_bad():
assert score == 0.0


@pytest.mark.parametrize(
"program_with_example",
[
(Predict("question -> answer"), new_example("What is 1+1?", "2")),
(
# Create a program that extracts entities from text and returns them as a list,
# rather than returning a Predictor() wrapper. This is done intentionally to test
# the case where the program does not output a dictionary-like object because
# Evaluate() has failed for this case in the past
lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This case fails on main:

...
test_evaluate.py:163:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../dspy/evaluate/evaluate.py:229: in __call__
    data = [
../../dspy/evaluate/evaluate.py:230: in <listcomp>
    merge_dicts(example, prediction) | {"correct": score} for _, example, prediction, score in predicted_devset
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

d1 = Example({'text': 'United States', 'entities': ['United States']}) (input_keys={'text'}), d2 = ['United States']

    def merge_dicts(d1, d2) -> dict:
        merged = {}
        for k, v in d1.items():
            if k in d2:
                merged[f"example_{k}"] = v
            else:
                merged[k] = v

>       for k, v in d2.items():
E       AttributeError: 'list' object has no attribute 'items'

../../dspy/evaluate/evaluate.py:286: AttributeError
================================================================================= warnings summary =================================================================================
../../../miniconda3/envs/default/lib/python3.10/site-packages/pydantic/_internal/_config.py:291
  /Users/corey.zumar/miniconda3/envs/default/lib/python3.10/site-packages/pydantic/_internal/_config.py:291: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
    warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning)

../../../miniconda3/envs/default/lib/python3.10/site-packages/wandb/env.py:16
  /Users/corey.zumar/miniconda3/envs/default/lib/python3.10/site-packages/wandb/env.py:16: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives
    from distutils.util import strtobool

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
============================================================================= short test summary info ==============================================================================
FAILED test_evaluate.py::test_evaluate_display_table[True-True-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[True-False-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[True-1-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[False-True-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[False-False-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[False-1-program_with_example1] - AttributeError: 'list' object has no attribute 'items'

dspy.Example(text="United States", entities=["United States"]).with_inputs("text"),
Comment on lines +129 to +134
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@okhat This appears to be a valid setup for DSPy evaluation - i.e. all other parts of evaluation work with such a lambda except for displaying table output. Let me know if there's a strong reason to disallow this and require all DSPy modules / programs to return Prediction() objects

),
],
)
@pytest.mark.parametrize("display_table", [True, False, 1])
@pytest.mark.parametrize("is_in_ipython_notebook_environment", [True, False])
def test_evaluate_display_table(display_table, is_in_ipython_notebook_environment, capfd):
devset = [new_example("What is 1+1?", "2")]
program = Predict("question -> answer")
def test_evaluate_display_table(program_with_example, display_table, is_in_ipython_notebook_environment, capfd):
program, example = program_with_example
example_input = next(iter(example.inputs().values()))
example_output = {key: value for key, value in example.toDict().items() if key not in example.inputs()}

dspy.settings.configure(
lm=DummyLM(
{
example_input: example_output,
}
)
)

ev = Evaluate(
devset=devset,
metric=answer_exact_match,
devset=[example],
metric=lambda example, pred, **kwargs: example == pred,
display_table=display_table,
)
assert ev.display_table == display_table
Expand All @@ -140,4 +165,5 @@ def test_evaluate_display_table(display_table, is_in_ipython_notebook_environmen
if not is_in_ipython_notebook_environment and display_table:
# In console environments where IPython is not available, the table should be printed
# to the console
assert "What is 1+1?" in out
example_input = next(iter(example.inputs().values()))
assert example_input in out
Loading