-
Notifications
You must be signed in to change notification settings - Fork 2.4k
Fix error in Evaluate with display_table=True with outputs that cannot be converted to dict #1682
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,6 +8,7 @@ | |
| import dspy | ||
| from dspy.evaluate.evaluate import Evaluate | ||
| from dspy.evaluate.metrics import answer_exact_match | ||
| from dspy.functional import TypedPredictor | ||
| from dspy.predict import Predict | ||
| from dspy.utils.dummies import DummyLM | ||
|
|
||
|
|
@@ -120,14 +121,38 @@ def test_evaluate_call_bad(): | |
| assert score == 0.0 | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "program_with_example", | ||
| [ | ||
| (Predict("question -> answer"), new_example("What is 1+1?", "2")), | ||
| ( | ||
| # Create a program that extracts entities from text and returns them as a list, | ||
| # rather than returning a Predictor() wrapper. This is done intentionally to test | ||
| # the case where the program does not output a dictionary-like object because | ||
| # Evaluate() has failed for this case in the past | ||
| lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities, | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

This case fails on `main`.
||
| dspy.Example(text="United States", entities=["United States"]).with_inputs("text"), | ||
|
Comment on lines +129 to +134
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

@okhat This appears to be a valid setup for DSPy evaluation — i.e. all other parts of evaluation work with such a program/`Example` pair. |
||
| ), | ||
| ], | ||
| ) | ||
| @pytest.mark.parametrize("display_table", [True, False, 1]) | ||
| @pytest.mark.parametrize("is_in_ipython_notebook_environment", [True, False]) | ||
| def test_evaluate_display_table(display_table, is_in_ipython_notebook_environment, capfd): | ||
| devset = [new_example("What is 1+1?", "2")] | ||
| program = Predict("question -> answer") | ||
| def test_evaluate_display_table(program_with_example, display_table, is_in_ipython_notebook_environment, capfd): | ||
| program, example = program_with_example | ||
| example_input = next(iter(example.inputs().values())) | ||
| example_output = {key: value for key, value in example.toDict().items() if key not in example.inputs()} | ||
|
|
||
| dspy.settings.configure( | ||
| lm=DummyLM( | ||
| { | ||
| example_input: example_output, | ||
| } | ||
| ) | ||
| ) | ||
|
|
||
| ev = Evaluate( | ||
| devset=devset, | ||
| metric=answer_exact_match, | ||
| devset=[example], | ||
| metric=lambda example, pred, **kwargs: example == pred, | ||
| display_table=display_table, | ||
| ) | ||
| assert ev.display_table == display_table | ||
|
|
@@ -140,4 +165,5 @@ def test_evaluate_display_table(display_table, is_in_ipython_notebook_environmen | |
| if not is_in_ipython_notebook_environment and display_table: | ||
| # In console environments where IPython is not available, the table should be printed | ||
| # to the console | ||
| assert "What is 1+1?" in out | ||
| example_input = next(iter(example.inputs().values())) | ||
| assert example_input in out | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I vaguely presuppose that `merge_dicts` is there for legacy reasons, and that
`example | prediction` may have the same effect — is that false? (it may be false)

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I see — `merge_dicts` does some kind of disambiguation instead of overwriting.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, exactly