From c7d03398a1776bdb79def9f8b54cf578c4021256 Mon Sep 17 00:00:00 2001 From: dbczumar Date: Wed, 23 Oct 2024 12:23:11 -0700 Subject: [PATCH 1/6] fix Signed-off-by: dbczumar --- dspy/evaluate/evaluate.py | 20 ++------------------ tests/evaluate/test_evaluate.py | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 64f8f8cc4a..b0baed5a73 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -227,7 +227,8 @@ def wrapped_program(example_idx, example): results = [(example, prediction, score) for _, example, prediction, score in predicted_devset] data = [ - merge_dicts(example, prediction) | {"correct": score} for _, example, prediction, score in predicted_devset + dict(example) | {"prediction": prediction, "correct": score} + for _, example, prediction, score in predicted_devset ] result_df = pd.DataFrame(data) @@ -275,23 +276,6 @@ def wrapped_program(example_idx, example): return round(100 * ncorrect / ntotal, 2) -def merge_dicts(d1, d2) -> dict: - merged = {} - for k, v in d1.items(): - if k in d2: - merged[f"example_{k}"] = v - else: - merged[k] = v - - for k, v in d2.items(): - if k in d1: - merged[f"pred_{k}"] = v - else: - merged[k] = v - - return merged - - def truncate_cell(content) -> str: """Truncate content of a cell to 25 words.""" words = str(content).split() diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py index a8552c7aa6..5062e259d8 100644 --- a/tests/evaluate/test_evaluate.py +++ b/tests/evaluate/test_evaluate.py @@ -120,13 +120,22 @@ def test_evaluate_call_bad(): assert score == 0.0 +@pytest.mark.parametrize( + "program_with_example", + [ + (Predict("question -> answer"), new_example("What is 1+1?", "2")), + ( + Predict("text -> entities"), + dspy.Example(text="United States", entities=["United States"]).with_inputs("text"), + ), + ], +) @pytest.mark.parametrize("display_table", [True, False, 1]) @pytest.mark.parametrize("is_in_ipython_notebook_environment", [True, False]) -def test_evaluate_display_table(display_table, is_in_ipython_notebook_environment, capfd): - devset = [new_example("What is 1+1?", "2")] - program = Predict("question -> answer") +def test_evaluate_display_table(program_with_example, display_table, is_in_ipython_notebook_environment, capfd): + program, example = program_with_example ev = Evaluate( - devset=devset, + devset=[example], metric=answer_exact_match, display_table=display_table, ) @@ -140,4 +149,5 @@ def test_evaluate_display_table(display_table, is_in_ipython_notebook_environmen if not is_in_ipython_notebook_environment and display_table: # In console environments where IPython is not available, the table should be printed # to the console - assert "What is 1+1?" in out + example_input = next(iter(example.inputs().values())) + assert example_input in out From 11aca74d72cb23cf954a43cc7a515cc64f6ca96d Mon Sep 17 00:00:00 2001 From: dbczumar Date: Wed, 23 Oct 2024 12:49:49 -0700 Subject: [PATCH 2/6] fix Signed-off-by: dbczumar --- tests/evaluate/test_evaluate.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py index 5062e259d8..94713221a9 100644 --- a/tests/evaluate/test_evaluate.py +++ b/tests/evaluate/test_evaluate.py @@ -8,6 +8,7 @@ import dspy from dspy.evaluate.evaluate import Evaluate from dspy.evaluate.metrics import answer_exact_match +from dspy.functional import TypedPredictor from dspy.predict import Predict from dspy.utils.dummies import DummyLM @@ -123,9 +124,8 @@ def test_evaluate_call_bad(): @pytest.mark.parametrize( "program_with_example", [ - (Predict("question -> answer"), new_example("What is 1+1?", "2")), ( - Predict("text -> entities"), + lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities, dspy.Example(text="United States", entities=["United States"]).with_inputs("text"), ), ], @@ -134,9 +134,20 @@ def test_evaluate_call_bad(): @pytest.mark.parametrize("is_in_ipython_notebook_environment", [True, False]) def test_evaluate_display_table(program_with_example, display_table, is_in_ipython_notebook_environment, capfd): program, example = program_with_example + example_input = next(iter(example.inputs().values())) + example_output = {key: value for key, value in example.toDict().items() if key not in example.inputs()} + + dspy.settings.configure( + lm=DummyLM( + { + example_input: example_output, + } + ) + ) + ev = Evaluate( devset=[example], - metric=answer_exact_match, + metric=lambda example, pred, **kwargs: example == pred, display_table=display_table, ) assert ev.display_table == display_table From 4a1bc0bb20926f32e3ca42a982f340877a30a424 Mon Sep 17 00:00:00 2001 From: dbczumar Date: Wed, 23 Oct 2024 12:54:14 -0700 Subject: [PATCH 3/6] fix Signed-off-by: dbczumar --- tests/evaluate/test_evaluate.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py index 94713221a9..6393da7686 100644 --- a/tests/evaluate/test_evaluate.py +++ b/tests/evaluate/test_evaluate.py @@ -125,6 +125,10 @@ def test_evaluate_call_bad(): "program_with_example", [ ( + # Create a program that extracts entities from text and returns them as a list, + # rather than returning a Predictor() wrapper. This is done intentionally to test + # the case where the program does not output a dictionary-like object because + # Evaluate() has failed for this case in the past lambda text: TypedPredictor("text: str -> entities: List[str]")(text=text).entities, dspy.Example(text="United States", entities=["United States"]).with_inputs("text"), ), From 9053c54a1e4e1cbe7891e18299e5b7b8ba7a8208 Mon Sep 17 00:00:00 2001 From: dbczumar Date: Wed, 23 Oct 2024 12:55:46 -0700 Subject: [PATCH 4/6] fix Signed-off-by: dbczumar --- tests/evaluate/test_evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py index 6393da7686..7615c70b32 100644 --- a/tests/evaluate/test_evaluate.py +++ b/tests/evaluate/test_evaluate.py @@ -124,6 +124,7 @@ def test_evaluate_call_bad(): @pytest.mark.parametrize( "program_with_example", [ + (Predict("question -> answer"), new_example("What is 1+1?", "2")), ( # Create a program that extracts entities from text and returns them as a list, # rather than returning a Predictor() wrapper. This is done intentionally to test From 6c0c17778125283862dcc495ba7e500987f6e0ad Mon Sep 17 00:00:00 2001 From: dbczumar Date: Wed, 23 Oct 2024 17:22:49 -0700 Subject: [PATCH 5/6] fix Signed-off-by: dbczumar --- dspy/evaluate/evaluate.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index b0baed5a73..6b2c9ccf91 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -227,7 +227,11 @@ def wrapped_program(example_idx, example): results = [(example, prediction, score) for _, example, prediction, score in predicted_devset] data = [ - dict(example) | {"prediction": prediction, "correct": score} + ( + merge_dicts(example, prediction) | {"correct": score} + if isinstance(prediction, dict) + else dict(example) | {"prediction": prediction, "correct": score} + ) for _, example, prediction, score in predicted_devset ] @@ -276,6 +280,23 @@ def wrapped_program(example_idx, example): return round(100 * ncorrect / ntotal, 2) +def merge_dicts(d1, d2) -> dict: + merged = {} + for k, v in d1.items(): + if k in d2: + merged[f"example_{k}"] = v + else: + merged[k] = v + + for k, v in d2.items(): + if k in d1: + merged[f"pred_{k}"] = v + else: + merged[k] = v + + return merged + + def truncate_cell(content) -> str: """Truncate content of a cell to 25 words.""" words = str(content).split() From ced68ca007e7c762ae87de549835454f0fca6fd1 Mon Sep 17 00:00:00 2001 From: dbczumar Date: Wed, 23 Oct 2024 17:29:36 -0700 Subject: [PATCH 6/6] fix Signed-off-by: dbczumar --- dspy/evaluate/evaluate.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 6b2c9ccf91..4ca43ad39b 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -226,10 +226,17 @@ def wrapped_program(example_idx, example): if return_outputs: # Handle the return_outputs logic results = [(example, prediction, score) for _, example, prediction, score in predicted_devset] + def prediction_is_dictlike(prediction): + try: + dict(prediction) + return True + except Exception: + return False + data = [ ( merge_dicts(example, prediction) | {"correct": score} - if isinstance(prediction, dict) + if prediction_is_dictlike(prediction) else dict(example) | {"prediction": prediction, "correct": score} ) for _, example, prediction, score in predicted_devset