diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 6b74645a8f..04a135537f 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -1,10 +1,9 @@ import logging -import tqdm import types from typing import Any import pandas as pd - +import tqdm import dspy from dspy.utils.parallelizer import ParallelExecutor @@ -14,7 +13,7 @@ from IPython.display import display as display except ImportError: - + def display(obj: Any): """ Display the specified Python object in the console. @@ -41,6 +40,7 @@ def HTML(x: str) -> str: logger = logging.getLogger(__name__) + class Evaluate: def __init__( self, @@ -66,7 +66,6 @@ def __init__( self.return_outputs = return_outputs self.provide_traceback = provide_traceback - def __call__( self, program, @@ -131,11 +130,9 @@ def process_item(item): results = [(example, prediction, score) for _, example, prediction, score in predicted_devset] def prediction_is_dictlike(prediction): - try: - dict(prediction) - return True - except Exception: - return False + # Downstream logic for displaying dictionary-like predictions depends solely on the predictions + # having a method called `items()` for iterating through key/value pairs + return callable(getattr(prediction, "items", None)) data = [ ( diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py index 7615c70b32..048d4c08b5 100644 --- a/tests/evaluate/test_evaluate.py +++ b/tests/evaluate/test_evaluate.py @@ -8,7 +8,6 @@ import dspy from dspy.evaluate.evaluate import Evaluate from dspy.evaluate.metrics import answer_exact_match -from dspy.functional import TypedPredictor from dspy.predict import Predict from dspy.utils.dummies import DummyLM @@ -125,14 +124,22 @@ def test_evaluate_call_bad(): "program_with_example", [ (Predict("question -> answer"), new_example("What is 1+1?", "2")), + # Create programs that do not return dictionary-like objects because Evaluate() + # has failed for such cases in the past ( 
- # Create a program that extracts entities from text and returns them as a list, - # rather than returning a Predictor() wrapper. This is done intentionally to test - # the case where the program does not output a dictionary-like object because - # Evaluate() has failed for this case in the past - lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities, + lambda text: Predict("text: str -> entities: List[str]")(text=text).entities, dspy.Example(text="United States", entities=["United States"]).with_inputs("text"), ), + ( + lambda text: Predict("text: str -> entities: List[Dict[str, str]]")(text=text).entities, + dspy.Example(text="United States", entities=[{"name": "United States", "type": "location"}]).with_inputs( + "text" + ), + ), + ( + lambda text: Predict("text: str -> first_word: Tuple[str, int]")(text=text).first_word, + dspy.Example(text="United States", first_word=("United", 6)).with_inputs("text"), + ), ], ) @pytest.mark.parametrize("display_table", [True, False, 1])