diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index b9c7098971..50be80a1e7 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -97,6 +97,7 @@ def __call__(
         display_table: Optional[bool] = None,
         return_all_scores: Optional[bool] = None,
         return_outputs: Optional[bool] = None,
+        callback_metadata: Optional[dict[str, Any]] = None,
     ):
         """
         Args:
@@ -113,6 +114,7 @@ def __call__(
                 use `self.return_all_scores`.
             return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`. if not provided,
                 use `self.return_outputs`.
+            callback_metadata (dict): Metadata to be used for evaluate callback handlers.
 
         Returns:
             The evaluation results are returned in different formats based on the flags:
@@ -139,6 +141,9 @@ def __call__(
         return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
         return_outputs = return_outputs if return_outputs is not None else self.return_outputs
 
+        if callback_metadata:
+            logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}")
+
         tqdm.tqdm._instances.clear()
 
         executor = ParallelExecutor(
diff --git a/dspy/teleprompt/utils.py b/dspy/teleprompt/utils.py
index 9414c7bd7e..bbf9ac5035 100644
--- a/dspy/teleprompt/utils.py
+++ b/dspy/teleprompt/utils.py
@@ -49,13 +49,14 @@ def eval_candidate_program(batch_size, trainset, candidate_program, evaluate, rng
     try:
         # Evaluate on the full trainset
         if batch_size >= len(trainset):
-            return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores)
+            return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
         # Or evaluate on a minibatch
         else:
             return evaluate(
                 candidate_program,
                 devset=create_minibatch(trainset, batch_size, rng),
-                return_all_scores=return_all_scores
+                return_all_scores=return_all_scores,
+                callback_metadata={"metric_key": "eval_minibatch"}
             )
     except Exception:
         logger.error("An exception occurred during evaluation", exc_info=True)
diff --git a/tests/teleprompt/test_utils.py b/tests/teleprompt/test_utils.py
index f067e488dc..e6f2d8a398 100644
--- a/tests/teleprompt/test_utils.py
+++ b/tests/teleprompt/test_utils.py
@@ -23,6 +23,7 @@ def test_eval_candidate_program_full_trainset():
     evaluate.assert_called_once()
     _, called_kwargs = evaluate.call_args
     assert len(called_kwargs['devset']) == len(trainset)
+    assert called_kwargs['callback_metadata'] == {"metric_key": "eval_full"}
     assert result == 0
 
 def test_eval_candidate_program_minibatch():
@@ -36,6 +37,7 @@ def test_eval_candidate_program_minibatch():
     evaluate.assert_called_once()
     _, called_kwargs = evaluate.call_args
     assert len(called_kwargs['devset']) == batch_size
+    assert called_kwargs['callback_metadata'] == {"metric_key": "eval_minibatch"}
     assert result == 0
 
 @pytest.mark.parametrize("return_all_scores", [True, False])
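
Illustrative usage (not part of the patch): a minimal sketch of passing `callback_metadata` through `Evaluate.__call__`, assuming an LM is already configured. The metric, devset, and program below are made up for illustration; only the `callback_metadata` kwarg itself comes from this change.

import dspy
from dspy.evaluate import Evaluate

# Assumes a language model has been configured elsewhere, e.g.:
# dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

def exact_match(example, pred, trace=None):
    # Toy metric: exact string match on the `answer` field.
    return example.answer == pred.answer

devset = [
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
    dspy.Example(question="What color is the sky?", answer="blue").with_inputs("question"),
]

program = dspy.Predict("question -> answer")
evaluate = Evaluate(devset=devset, metric=exact_match, num_threads=1, display_progress=False)

# The new kwarg is forwarded to evaluation callback handlers; here it tags the
# run the same way eval_candidate_program tags a full-trainset evaluation.
score = evaluate(program, callback_metadata={"metric_key": "eval_full"})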