diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 603b552aa0..8c0e609e0b 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -166,9 +166,6 @@ def wrapped_program(example_idx, example): num_threads, display_progress, ) - if return_outputs: # Handle the return_outputs logic - results = [(example, prediction, score) - for _, example, prediction, score in reordered_devset] if display: print( @@ -176,6 +173,10 @@ def wrapped_program(example_idx, example): predicted_devset = sorted(reordered_devset) + if return_outputs: # Handle the return_outputs logic + results = [(example, prediction, score) + for _, example, prediction, score in predicted_devset] + # data = [{**example, **prediction, 'correct': score} for example, prediction, score in zip(reordered_devset, preds, scores)] data = [ merge_dicts(example, prediction) | {"correct": score} for _, example, prediction, score in predicted_devset @@ -222,9 +223,9 @@ def wrapped_program(example_idx, example): ipython_display(HTML(message)) if return_all_scores and return_outputs: - return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in reordered_devset] + return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in predicted_devset] elif return_all_scores: - return round(100 * ncorrect / ntotal, 2), [score for *_, score in reordered_devset] + return round(100 * ncorrect / ntotal, 2), [score for *_, score in predicted_devset] elif return_outputs: return round(100 * ncorrect / ntotal, 2), results