Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion dspy/evaluate/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,20 @@ def wrapped_program(example_idx, example):
if return_outputs: # Handle the return_outputs logic
results = [(example, prediction, score) for _, example, prediction, score in predicted_devset]

def prediction_is_dictlike(prediction):
    """Return ``True`` when *prediction* can be converted with ``dict()``.

    Uses an EAFP probe — simply attempt the conversion — instead of type
    inspection, so mapping types, key/value pair iterables, and objects
    implementing ``keys()`` all count as dict-like. Any exception raised by
    the conversion (``TypeError``, ``ValueError``, or a custom error from an
    exotic object) is taken to mean "not dict-like".
    """
    try:
        dict(prediction)
    except Exception:
        return False
    return True

data = [
merge_dicts(example, prediction) | {"correct": score} for _, example, prediction, score in predicted_devset
(
merge_dicts(example, prediction) | {"correct": score}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I vaguely presuppose that merge_dicts is there for legacy reasons, and that example | prediction may have the same effect, is that false? (it may be false)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, i see, merge_dicts does some kind of disambiguation instead of overwriting

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, exactly

if prediction_is_dictlike(prediction)
else dict(example) | {"prediction": prediction, "correct": score}
)
for _, example, prediction, score in predicted_devset
]

result_df = pd.DataFrame(data)
Expand Down
38 changes: 32 additions & 6 deletions tests/evaluate/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import dspy
from dspy.evaluate.evaluate import Evaluate
from dspy.evaluate.metrics import answer_exact_match
from dspy.functional import TypedPredictor
from dspy.predict import Predict
from dspy.utils.dummies import DummyLM

Expand Down Expand Up @@ -120,14 +121,38 @@ def test_evaluate_call_bad():
assert score == 0.0


@pytest.mark.parametrize(
"program_with_example",
[
(Predict("question -> answer"), new_example("What is 1+1?", "2")),
(
# Create a program that extracts entities from text and returns them as a list,
# rather than returning a Predictor() wrapper. This is done intentionally to test
# the case where the program does not output a dictionary-like object because
# Evaluate() has failed for this case in the past
lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This case fails on main:

...
test_evaluate.py:163:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../dspy/evaluate/evaluate.py:229: in __call__
    data = [
../../dspy/evaluate/evaluate.py:230: in <listcomp>
    merge_dicts(example, prediction) | {"correct": score} for _, example, prediction, score in predicted_devset
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

d1 = Example({'text': 'United States', 'entities': ['United States']}) (input_keys={'text'}), d2 = ['United States']

    def merge_dicts(d1, d2) -> dict:
        merged = {}
        for k, v in d1.items():
            if k in d2:
                merged[f"example_{k}"] = v
            else:
                merged[k] = v

>       for k, v in d2.items():
E       AttributeError: 'list' object has no attribute 'items'

../../dspy/evaluate/evaluate.py:286: AttributeError
================================================================================= warnings summary =================================================================================
../../../miniconda3/envs/default/lib/python3.10/site-packages/pydantic/_internal/_config.py:291
  /Users/corey.zumar/miniconda3/envs/default/lib/python3.10/site-packages/pydantic/_internal/_config.py:291: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
    warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning)

../../../miniconda3/envs/default/lib/python3.10/site-packages/wandb/env.py:16
  /Users/corey.zumar/miniconda3/envs/default/lib/python3.10/site-packages/wandb/env.py:16: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives
    from distutils.util import strtobool

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
============================================================================= short test summary info ==============================================================================
FAILED test_evaluate.py::test_evaluate_display_table[True-True-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[True-False-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[True-1-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[False-True-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[False-False-program_with_example1] - AttributeError: 'list' object has no attribute 'items'
FAILED test_evaluate.py::test_evaluate_display_table[False-1-program_with_example1] - AttributeError: 'list' object has no attribute 'items'

dspy.Example(text="United States", entities=["United States"]).with_inputs("text"),
Comment on lines +129 to +134
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@okhat This appears to be a valid setup for DSPy evaluation - i.e. all other parts of evaluation work with such a lambda except for displaying table output. Let me know if there's a strong reason to disallow this and require all DSPy modules / programs to return Prediction() objects

),
],
)
@pytest.mark.parametrize("display_table", [True, False, 1])
@pytest.mark.parametrize("is_in_ipython_notebook_environment", [True, False])
def test_evaluate_display_table(display_table, is_in_ipython_notebook_environment, capfd):
devset = [new_example("What is 1+1?", "2")]
program = Predict("question -> answer")
def test_evaluate_display_table(program_with_example, display_table, is_in_ipython_notebook_environment, capfd):
program, example = program_with_example
example_input = next(iter(example.inputs().values()))
example_output = {key: value for key, value in example.toDict().items() if key not in example.inputs()}

dspy.settings.configure(
lm=DummyLM(
{
example_input: example_output,
}
)
)

ev = Evaluate(
devset=devset,
metric=answer_exact_match,
devset=[example],
metric=lambda example, pred, **kwargs: example == pred,
display_table=display_table,
)
assert ev.display_table == display_table
Expand All @@ -140,4 +165,5 @@ def test_evaluate_display_table(display_table, is_in_ipython_notebook_environmen
if not is_in_ipython_notebook_environment and display_table:
# In console environments where IPython is not available, the table should be printed
# to the console
assert "What is 1+1?" in out
example_input = next(iter(example.inputs().values()))
assert example_input in out
Loading