From be79732a1d2ba8b3be57c09597b4fda2a9fa787b Mon Sep 17 00:00:00 2001
From: klopsahlong
Date: Tue, 19 Nov 2024 15:46:12 -0800
Subject: [PATCH 1/2] fixing up trial logging

---
 dspy/teleprompt/mipro_optimizer_v2.py | 96 ++++++++++++++++-----------
 1 file changed, 57 insertions(+), 39 deletions(-)

diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py
index 6603a259b2..a2f7f38545 100644
--- a/dspy/teleprompt/mipro_optimizer_v2.py
+++ b/dspy/teleprompt/mipro_optimizer_v2.py
@@ -477,22 +477,26 @@ def _optimize_prompt_parameters(
         default_score = eval_candidate_program(len(valset), valset, program, evaluate, self.rng)
         print(f"Default program score: {default_score}\n")

+        trial_logs = {}
+        trial_logs[-1] = {}
+        trial_logs[-1]["full_eval_program_path"] = save_candidate_program(
+            program, self.log_dir, -1
+        )
+        trial_logs[-1]["full_eval_score"] = default_score
+        trial_logs[-1]["total_eval_calls_so_far"] = len(valset)
+        trial_logs[-1]["full_eval_program"] = program.deepcopy()
+
         # Initialize optimization variables
         best_score = default_score
         best_program = program.deepcopy()
-        trial_logs = {}
-        total_eval_calls = 0
-        if minibatch:
-            scores = []
-        else:
-            scores = [default_score]
-        full_eval_scores = [default_score]
+        total_eval_calls = len(valset)
+        score_data = [(best_score, program.deepcopy(), True)]
         param_score_dict = defaultdict(list)
         fully_evaled_param_combos = {}

         # Define the objective function
         def objective(trial):
-            nonlocal program, best_program, best_score, trial_logs, total_eval_calls, scores, full_eval_scores
+            nonlocal program, best_program, best_score, trial_logs, total_eval_calls, score_data

             trial_num = trial.number + 1
             if minibatch:
@@ -520,16 +524,12 @@ def objective(trial):
             print("Evaluating the following candidate program...\n")
             print_full_program(candidate_program)

-            # Save the candidate program
-            trial_logs[trial_num]["program_path"] = save_candidate_program(
-                candidate_program, self.log_dir, trial_num
-            )
-
-            # Evaluate the candidate program
+            # Evaluate the candidate program (on minibatch if minibatch=True)
             batch_size = minibatch_size if minibatch else len(valset)
             score = eval_candidate_program(
                 batch_size, valset, candidate_program, evaluate, self.rng
             )
+            total_eval_calls += batch_size

             # Update best score and program
             if not minibatch and score > best_score:
@@ -538,39 +538,36 @@ def objective(trial):
                 print(f"{GREEN}Best full score so far!{ENDC} Score: {score}")

             # Log evaluation results
-            scores.append(score)
+            score_data.append((score, program, batch_size >= len(valset)))  # score, prog, full_eval
             if minibatch:
                 self._log_minibatch_eval(
                     score,
                     best_score,
                     batch_size,
                     chosen_params,
-                    scores,
-                    full_eval_scores,
+                    score_data,
                     trial,
                     num_trials,
+                    trial_logs,
+                    trial_num,
+                    candidate_program,
+                    total_eval_calls,
                 )
             else:
                 self._log_normal_eval(
-                    score, best_score, chosen_params, scores, trial, num_trials
+                    score, best_score, chosen_params, score_data, trial, num_trials, trial_logs, trial_num, valset, batch_size, candidate_program, total_eval_calls
                 )
             categorical_key = ",".join(map(str, chosen_params))
             param_score_dict[categorical_key].append(
                 (score, candidate_program),
             )
-            trial_logs[trial_num]["num_eval_calls"] = batch_size
-            trial_logs[trial_num]["full_eval"] = batch_size >= len(valset)
-            trial_logs[trial_num]["score"] = score
-            total_eval_calls += batch_size
-            trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls
-            trial_logs[trial_num]["program"] = candidate_program.deepcopy()

             # If minibatch, perform full evaluation at intervals
             if minibatch and (
                 (trial_num % minibatch_full_eval_steps == 0)
                 or (trial_num == num_trials)
             ):
-                best_score, best_program = self._perform_full_evaluation(
+                best_score, best_program, total_eval_calls = self._perform_full_evaluation(
                     trial_num,
                     param_score_dict,
                     fully_evaled_param_combos,
@@ -578,7 +575,7 @@ def objective(trial):
                     valset,
                     trial_logs,
                     total_eval_calls,
-                    full_eval_scores,
+                    score_data,
                     best_score,
                     best_program,
                 )
@@ -602,6 +599,10 @@ def objective(trial):
         best_program.score = best_score
         best_program.prompt_model_total_calls = self.prompt_model_total_calls
         best_program.total_calls = self.total_calls
+        # Attach all evaluated candidate programs (full + minibatch eval if applicable, with full first and minibatch to follow)
+        best_program.candidate_programs = sorted(score_data, key=lambda x: x[0], reverse=True)
+        # Attach all programs that were evaluated on the full valset, in descending order of score
+        best_program.full_eval_candidate_programs = [score_data for score_data in best_program.candidate_programs if score_data[2]]

         print(f"Returning best identified program with score {best_score}!")

@@ -613,16 +614,26 @@ def _log_minibatch_eval(
         best_score,
         batch_size,
         chosen_params,
-        scores,
-        full_eval_scores,
+        score_data,
         trial,
         num_trials,
+        trial_logs,
+        trial_num,
+        candidate_program,
+        total_eval_calls,
     ):
+        trial_logs[trial_num]["mb_program_path"] = save_candidate_program(
+            candidate_program, self.log_dir, trial_num
+        )
+        trial_logs[trial_num]["mb_score"] = score
+        trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls
+        trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy()
+
         print(
             f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}."
         )
-        print(f"Minibatch scores so far: {'['+', '.join([f'{s}' for s in scores])+']'}")
-        trajectory = "[" + ", ".join([f"{s}" for s in full_eval_scores]) + "]"
+        print(f"Minibatch scores so far: {'['+', '.join([f'{s[0]}' for s in score_data if not s[2]])+']'}")
+        trajectory = "[" + ", ".join([f"{s[0]}" for s in score_data if s[2]]) + "]"
         print(f"Full eval scores so far: {trajectory}")
         print(f"Best full score so far: {best_score}")
         print(
@@ -630,10 +641,17 @@ def _log_minibatch_eval(
         )

     def _log_normal_eval(
-        self, score, best_score, chosen_params, scores, trial, num_trials
+        self, score, best_score, chosen_params, score_data, trial, num_trials, trial_logs, trial_num, valset, batch_size, candidate_program, total_eval_calls
     ):
+        trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program(
+            candidate_program, self.log_dir, trial_num
+        )
+        trial_logs[trial_num]["full_eval_score"] = score
+        trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls
+        trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy()
+
         print(f"Score: {score} with parameters {chosen_params}.")
-        print(f"Scores so far: {'['+', '.join([f'{s}' for s in scores])+']'}")
+        print(f"Scores so far: {'['+', '.join([f'{s[0]}' for s in score_data if s[2]])+']'}")
         print(f"Best score so far: {best_score}")
         print(f'{"="*len(f"===== Trial {trial.number+1} / {num_trials} =====")}\n\n')

@@ -681,7 +699,7 @@ def _perform_full_evaluation(
         valset: List,
         trial_logs: Dict,
         total_eval_calls: int,
-        full_eval_scores: List[int],
+        score_data,
         best_score: float,
         best_program: Any,
     ):
@@ -699,7 +717,7 @@ def _perform_full_evaluation(
         full_eval_score = eval_candidate_program(
             len(valset), valset, highest_mean_program, evaluate, self.rng
         )
-        full_eval_scores.append(full_eval_score)
+        score_data.append((full_eval_score, highest_mean_program, True))

         # Log full evaluation results
         fully_evaled_param_combos[combo_key] = {
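Note on the bookkeeping PATCH 1/2 introduces: every evaluation is recorded as a
(score, program, full_eval) triple in score_data, and both the printed trajectories
and the attributes attached at the end are built by filtering on the third field. A
minimal runnable sketch with stand-in values (the scores and string program names
below are illustrative placeholders, not output from any real run):

    # score_data mirrors the patch's layout: (score, program, full_eval),
    # where full_eval is True when the score came from the whole valset.
    score_data = [
        (0.62, "default_program", True),   # trial -1: default program, full eval
        (0.58, "candidate_t1", False),     # trial 1: minibatch eval
        (0.71, "candidate_t2", False),     # trial 2: minibatch eval
        (0.69, "candidate_full", True),    # periodic full eval
    ]

    # The trajectories are assembled the same way as in _log_minibatch_eval:
    minibatch_scores = [s for s, _, full in score_data if not full]  # [0.58, 0.71]
    full_eval_scores = [s for s, _, full in score_data if full]      # [0.62, 0.69]
    print(f"Minibatch scores so far: {minibatch_scores}")
    print(f"Full eval scores so far: {full_eval_scores}")

    # And _optimize_prompt_parameters attaches the score-sorted list at the end:
    candidate_programs = sorted(score_data, key=lambda x: x[0], reverse=True)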
@@ -708,24 +726,24 @@ def _perform_full_evaluation(
         }
         total_eval_calls += len(valset)
         trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls
-        trial_logs[trial_num]["full_eval"] = True
-        trial_logs[trial_num]["program_path"] = save_candidate_program(
+        trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program(
             program=highest_mean_program,
             log_dir=self.log_dir,
             trial_num=trial_num,
             note="full_eval",
         )
-        trial_logs[trial_num]["score"] = full_eval_score
+        trial_logs[trial_num]["full_eval_program"] = highest_mean_program
+        trial_logs[trial_num]["full_eval_score"] = full_eval_score

         # Update best score and program if necessary
         if full_eval_score > best_score:
             print(f"{GREEN}New best full eval score!{ENDC} Score: {full_eval_score}")
             best_score = full_eval_score
             best_program = highest_mean_program.deepcopy()
-        trajectory = "[" + ", ".join([f"{s}" for s in full_eval_scores]) + "]"
+        trajectory = "[" + ", ".join([f"{s[0]}" for s in score_data if s[2]]) + "]"
         print(f"Full eval scores so far: {trajectory}")
         print(f"Best full score so far: {best_score}")
         print(len(f"===== Full Eval {len(fully_evaled_param_combos)+1} =====") * "=")
         print("\n")

-        return best_score, best_program
+        return best_score, best_program, total_eval_calls

From 8e33792c91a2a2c4eed7108a411d11f931743429 Mon Sep 17 00:00:00 2001
From: klopsahlong
Date: Thu, 21 Nov 2024 06:31:27 -0800
Subject: [PATCH 2/2] fixed naming of candidate_programs; fixed bug where the
 default program was saved instead of the candidate_program in the
 minibatching case

---
 dspy/teleprompt/mipro_optimizer_v2.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py
index 4ad1d26f9f..348875abe5 100644
--- a/dspy/teleprompt/mipro_optimizer_v2.py
+++ b/dspy/teleprompt/mipro_optimizer_v2.py
@@ -540,7 +540,7 @@ def objective(trial):
                 logger.info(f"{GREEN}Best full score so far!{ENDC} Score: {score}")

             # Log evaluation results
-            score_data.append((score, program, batch_size >= len(valset)))  # score, prog, full_eval
+            score_data.append((score, candidate_program, batch_size >= len(valset)))  # score, prog, full_eval
             if minibatch:
                 self._log_minibatch_eval(
                     score,
@@ -601,10 +601,11 @@ def objective(trial):
         best_program.score = best_score
         best_program.prompt_model_total_calls = self.prompt_model_total_calls
         best_program.total_calls = self.total_calls
-        # Attach all evaluated candidate programs (full + minibatch eval if applicable, with full first and minibatch to follow)
-        best_program.candidate_programs = sorted(score_data, key=lambda x: x[0], reverse=True)
+        sorted_candidate_programs = sorted(score_data, key=lambda x: x[0], reverse=True)
+        # Attach all minibatch programs
+        best_program.mb_candidate_programs = [score_data for score_data in sorted_candidate_programs if not score_data[2]]
         # Attach all programs that were evaluated on the full valset, in descending order of score
-        best_program.full_eval_candidate_programs = [score_data for score_data in best_program.candidate_programs if score_data[2]]
+        best_program.candidate_programs = [score_data for score_data in sorted_candidate_programs if score_data[2]]

         logger.info(f"Returning best identified program with score {best_score}!")
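Taken together, the two patches leave the optimized program with two list attributes:
candidate_programs (full-valset evaluations only, best score first) and
mb_candidate_programs (minibatch evaluations). A hedged sketch of consuming them —
the SimpleNamespace stub below stands in for the program returned by MIPROv2
compilation, its values are fabricated for illustration, and exposing trial_logs on
the returned program is an assumption, not something this diff shows:

    from types import SimpleNamespace

    # Stand-in for the compiled program; attribute names match the patches,
    # data is made up.  trial_logs here is an assumed attribute.
    optimized = SimpleNamespace(
        candidate_programs=[(0.71, "prog_full_best", True), (0.62, "prog_default", True)],
        mb_candidate_programs=[(0.58, "prog_t1", False)],
        trial_logs={
            -1: {"full_eval_score": 0.62, "total_eval_calls_so_far": 100},
            1: {"mb_score": 0.58, "total_eval_calls_so_far": 135},
        },
    )

    # Entries are (score, program, full_eval) triples, best score first.
    best_score, best_prog, is_full = optimized.candidate_programs[0]
    assert is_full  # candidate_programs keeps only full-eval entries after PATCH 2/2

    # Per-trial logs use the key families from PATCH 1/2: full evals write
    # full_eval_* keys, minibatch trials write mb_* keys, and every entry
    # records total_eval_calls_so_far.
    for trial_num, log in sorted(optimized.trial_logs.items()):
        if "full_eval_score" in log:
            print(f"trial {trial_num}: full eval score {log['full_eval_score']}")
        elif "mb_score" in log:
            print(f"trial {trial_num}: minibatch score {log['mb_score']}")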