diff --git a/dspy/teleprompt/bootstrap.py b/dspy/teleprompt/bootstrap.py index 5a25c36e53..81ab80a869 100644 --- a/dspy/teleprompt/bootstrap.py +++ b/dspy/teleprompt/bootstrap.py @@ -33,37 +33,36 @@ logger = logging.getLogger(__name__) + class BootstrapFewShot(Teleprompter): def __init__( self, metric=None, metric_threshold=None, - teacher_settings: Optional[Dict]=None, + teacher_settings: Optional[Dict] = None, max_bootstrapped_demos=4, max_labeled_demos=16, max_rounds=1, max_errors=5, ): - """ - A Teleprompter class that composes a set of demos/examples to go into a predictor's prompt. + """A Teleprompter class that composes a set of demos/examples to go into a predictor's prompt. These demos come from a combination of labeled examples in the training set, and bootstrapped demos. Args: - metric: Callable - A function that compares an expected value and predicted value, outputting the result of that comparison. - metric_threshold: optional float, default `None` - If the metric yields a numerical value, then check it against this threshold when - deciding whether or not to accept a bootstrap example. - teacher_settings: dict, optional - Settings for the `teacher` model. - max_bootstrapped_demos: int, default 4 - Maximum number of bootstrapped demonstrations to include - max_labeled_demos: int, default 16 - Maximum number of labeled demonstrations to include. - max_rounds: int, default 1 - Number of iterations to attempt generating the required bootstrap examples. If unsuccessful after `max_rounds`, the program ends. - max_errors: int, default 5 - Maximum number of errors until program ends. + metric (Callable): A function that compares an expected value and predicted value, + outputting the result of that comparison. + metric_threshold (float, optional): If the metric yields a numerical value, then check it + against this threshold when deciding whether or not to accept a bootstrap example. + Defaults to None. + teacher_settings (dict, optional): Settings for the `teacher` model. + Defaults to None. + max_bootstrapped_demos (int): Maximum number of bootstrapped demonstrations to include. + Defaults to 4. + max_labeled_demos (int): Maximum number of labeled demonstrations to include. + Defaults to 16. + max_rounds (int): Number of iterations to attempt generating the required bootstrap + examples. If unsuccessful after `max_rounds`, the program ends. Defaults to 1. + max_errors (int): Maximum number of errors until program ends. Defaults to 5. """ self.metric = metric self.metric_threshold = metric_threshold @@ -117,9 +116,10 @@ def _prepare_predictor_mappings(self): if hasattr(predictor1.signature, "equals"): assert predictor1.signature.equals( predictor2.signature, - ), (f"Student and teacher must have the same signatures. " + ), ( + f"Student and teacher must have the same signatures. " f"{type(predictor1.signature)} != {type(predictor2.signature)}" - ) + ) else: # fallback in case if .equals is not implemented (e.g. 
dsp.Prompt) assert predictor1.signature == predictor2.signature, ( @@ -149,7 +149,8 @@ def _bootstrap(self, *, max_bootstraps=None): self.name2traces = {name: [] for name in self.name2predictor} for example_idx, example in enumerate(tqdm.tqdm(self.trainset)): - if len(bootstrapped) >= max_bootstraps: break + if len(bootstrapped) >= max_bootstraps: + break for round_idx in range(self.max_rounds): bootstrap_attempts += 1 @@ -175,8 +176,8 @@ def _bootstrap(self, *, max_bootstraps=None): # score = evaluate(self.metric, display_table=False, display_progress=True) def _bootstrap_one_example(self, example, round_idx=0): - name2traces = {} #self.name2traces - teacher = self.teacher # .deepcopy() + name2traces = {} + teacher = self.teacher predictor_cache = {} try: @@ -235,10 +236,11 @@ def _bootstrap_one_example(self, example, round_idx=0): name2traces[predictor_name] = name2traces.get(predictor_name, []) name2traces[predictor_name].append(demo) - + # Update the traces for name, demos in name2traces.items(): from datasets.fingerprint import Hasher + # If there are multiple traces for the same predictor in the sample example, # sample 50/50 from the first N-1 traces or the last trace. if len(demos) > 1: diff --git a/dspy/teleprompt/bootstrap_finetune.py b/dspy/teleprompt/bootstrap_finetune.py index aa5169e900..6c670a38c8 100644 --- a/dspy/teleprompt/bootstrap_finetune.py +++ b/dspy/teleprompt/bootstrap_finetune.py @@ -1,5 +1,5 @@ -from collections import defaultdict import logging +from collections import defaultdict from typing import Any, Callable, Dict, List, Optional, Union import dspy @@ -12,12 +12,10 @@ from dspy.primitives.program import Program from dspy.teleprompt.teleprompt import Teleprompter - logger = logging.getLogger(__name__) class FinetuneTeleprompter(Teleprompter): - def __init__( self, train_kwargs: Optional[Union[Dict[str, Any], Dict[LM, Dict[str, Any]]]] = None, @@ -41,23 +39,25 @@ def __init__( train_kwargs: Optional[Union[Dict[str, Any], Dict[LM, Dict[str, Any]]]] = None, adapter: Optional[Union[Adapter, Dict[LM, Adapter]]] = None, exclude_demos: bool = False, - num_threads: int = 6 + num_threads: int = 6, ): # TODO(feature): Inputs train_kwargs (a dict with string keys) and # adapter (Adapter) can depend on the LM they are used with. We are - # takingthese as parameters for the time being. However, they can be + # takingthese as parameters for the time being. However, they can be # attached to LMs themselves -- an LM could know which adapter it should # be used with along with the train_kwargs. This will lead the only # required argument for LM.finetune() to be the train dataset. - + super().__init__(train_kwargs=train_kwargs) self.metric = metric self.multitask = multitask self.adapter: Dict[LM, Adapter] = self.convert_to_lm_dict(adapter) self.exclude_demos = exclude_demos self.num_threads = num_threads - - def compile(self, student: Program, trainset: List[Example], teacher: Optional[Union[Program, List[Program]]] = None) -> Program: + + def compile( + self, student: Program, trainset: List[Example], teacher: Optional[Union[Program, List[Program]]] = None + ) -> Program: # TODO: Print statements can be converted to logger.info if we ensure # that the default DSPy logger logs info level messages in notebook # environments. 
@@ -71,7 +71,9 @@ def compile(self, student: Program, trainset: List[Example], teacher: Optional[U teachers = [prepare_teacher(student, t) for t in teachers] for t in teachers: set_missing_predictor_lms(t) - trace_data += bootstrap_trace_data(program=t, dataset=trainset, metric=self.metric, num_threads=self.num_threads) + trace_data += bootstrap_trace_data( + program=t, dataset=trainset, metric=self.metric, num_threads=self.num_threads + ) logger.info("Preparing the train data...") key_to_data = {} @@ -79,16 +81,31 @@ def compile(self, student: Program, trainset: List[Example], teacher: Optional[U data_pred_ind = None if self.multitask else pred_ind training_key = (pred.lm, data_pred_ind) if training_key not in key_to_data: - train_data, data_format = self._prepare_finetune_data(trace_data=trace_data, lm=pred.lm, pred_ind=data_pred_ind) + train_data, data_format = self._prepare_finetune_data( + trace_data=trace_data, lm=pred.lm, pred_ind=data_pred_ind + ) logger.info(f"Using {len(train_data)} data points for fine-tuning the model: {pred.lm.model}") - finetune_kwargs = dict(lm=pred.lm, train_data=train_data, train_data_format=data_format, train_kwargs=self.train_kwargs[pred.lm]) + finetune_kwargs = dict( + lm=pred.lm, + train_data=train_data, + train_data_format=data_format, + train_kwargs=self.train_kwargs[pred.lm], + ) key_to_data[training_key] = finetune_kwargs - + logger.info("Starting LM fine-tuning...") # TODO(feature): We could run batches of fine-tuning jobs in sequence # to avoid exceeding the number of threads. - err = f"BootstrapFinetune requires `num_threads` to be bigger than or equal to the number of fine-tuning jobs. There are {len(key_to_data)} fine-tuning jobs to start, but the number of threads is: {self.num_threads}! If the `multitask` flag is set to False, the number of fine-tuning jobs will be equal to the number of predictors in the student program. If the `multitask` flag is set to True, the number of fine-tuning jobs will be equal to: 1 if there is only a context LM, or the number of unique LMs attached to the predictors in the student program. In any case, the number of fine-tuning jobs will be less than or equal to the number of predictors." - assert len(key_to_data) <= self.num_threads, err + if len(key_to_data) > self.num_threads: + raise ValueError( + "BootstrapFinetune requires `num_threads` to be bigger than or equal to the number of fine-tuning " + f"jobs. There are {len(key_to_data)} fine-tuning jobs to start, but the number of threads is: " + f"{self.num_threads}! If the `multitask` flag is set to False, the number of fine-tuning jobs will " + "be equal to the number of predictors in the student program. If the `multitask` flag is set to True, " + "the number of fine-tuning jobs will be equal to: 1 if there is only a context LM, or the number of " + "unique LMs attached to the predictors in the student program. In any case, the number of fine-tuning " + "jobs will be less than or equal to the number of predictors." + ) logger.info(f"{len(key_to_data)} fine-tuning job(s) to start") key_to_lm = self.finetune_lms(key_to_data) @@ -98,10 +115,10 @@ def compile(self, student: Program, trainset: List[Example], teacher: Optional[U training_key = (pred.lm, data_pred_ind) pred.lm = key_to_lm[training_key] # TODO: What should the correct behavior be here? Should - # BootstrapFinetune modify the prompt demos according to the + # BootstrapFinetune modify the prompt demos according to the # train data? 
pred.demos = [] if self.exclude_demos else pred.demos - + logger.info("BootstrapFinetune has finished compiling the student program") student._compiled = True return student @@ -120,10 +137,13 @@ def finetune_lms(finetune_dict) -> Dict[Any, LM]: # up resources for fine-tuning. This might mean introducing a new # provider method (e.g. prepare_for_finetune) that can be called # before fine-tuning is started. - logger.info("Calling lm.kill() on the LM to be fine-tuned to free up resources. This won't have any effect if the LM is not running.") + logger.info( + "Calling lm.kill() on the LM to be fine-tuned to free up resources. This won't have any effect if the " + "LM is not running." + ) lm.kill() key_to_job[key] = lm.finetune(**finetune_kwargs) - + key_to_lm = {} for ind, (key, job) in enumerate(key_to_job.items()): key_to_lm[key] = job.result() @@ -143,13 +163,16 @@ def _prepare_finetune_data(self, trace_data: List[Dict[str, Any]], lm: LM, pred_ adapter = self.adapter[lm] or lm.infer_adapter() data_format = infer_data_format(adapter) for item in trace_data: - for pred_ind, _ in enumerate(item['trace']): + for pred_ind, _ in enumerate(item["trace"]): include_data = pred_ind is None or pred_ind == pred_ind if include_data: - call_data = build_call_data_from_trace(trace=item['trace'], pred_ind=pred_ind, adapter=adapter, exclude_demos=self.exclude_demos) + call_data = build_call_data_from_trace( + trace=item["trace"], pred_ind=pred_ind, adapter=adapter, exclude_demos=self.exclude_demos + ) data.append(call_data) import random + random.Random(0).shuffle(data) return data, data_format @@ -189,8 +212,11 @@ def bootstrap_trace_data( # Return a list of dicts with the following keys: # example_ind, example, prediction, trace, and score (if metric != None) evaluator = Evaluate( - devset=dataset, num_threads=num_threads, display_progress=True, return_outputs=True, - provide_traceback=True # TODO(check with team) + devset=dataset, + num_threads=num_threads, + display_progress=True, + return_outputs=True, + provide_traceback=True, # TODO(check with team) ) def wrapped_metric(example, prediction, trace=None): @@ -286,11 +312,10 @@ def assert_structural_equivalency(program1: object, program2: object): pzip = zip(program1.named_predictors(), program2.named_predictors()) for ind, ((name1, pred1), (name2, pred2)) in enumerate(pzip): - err = f"Program predictor names must match at corresponding indices for structural equivalency. The predictor names for the programs do not match at index {ind}: '{name1}' != '{name2}'" + err = f"Program predictor names must match at corresponding indices for structural equivalency. 
The predictor names for the programs do not match at index {ind}: '{name1}' != '{name2}'" assert name1 == name2, err assert isinstance(pred1, Predict) assert isinstance(pred2, Predict) - # assert pred1.signature.equals(pred2.signature) def assert_no_shared_predictor(program1: Program, program2: Program): @@ -303,17 +328,18 @@ def assert_no_shared_predictor(program1: Program, program2: Program): assert not shared_ids, err -def get_unique_lms(program: Program) -> List[LM]: - lms = [pred.lm for pred in program.predictors()] - lms = list(set(lms)) - return lms +def get_unique_lms(program: Program) -> List[LM]: + lms = [pred.lm for pred in program.predictors()] + return list(set(lms)) + def launch_lms(program: Program): lms = get_unique_lms(program) for lm in lms: lm.launch() -def kill_lms(program: Program): - lms = get_unique_lms(program) - for lm in lms: + +def kill_lms(program: Program): + lms = get_unique_lms(program) + for lm in lms: lm.kill() diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index 6231265eea..246f0005d2 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -67,9 +67,7 @@ def __init__( # Validate 'auto' parameter allowed_modes = {None, "light", "medium", "heavy"} if auto not in allowed_modes: - raise ValueError( - f"Invalid value for auto: {auto}. Must be one of {allowed_modes}." - ) + raise ValueError(f"Invalid value for auto: {auto}. Must be one of {allowed_modes}.") self.auto = auto self.num_candidates = num_candidates @@ -126,9 +124,7 @@ def compile( trainset, valset = self._set_and_validate_datasets(trainset, valset) # Set hyperparameters based on run mode (if set) - zeroshot_opt = (self.max_bootstrapped_demos == 0) and ( - self.max_labeled_demos == 0 - ) + zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0) num_trials, valset, minibatch = self._set_hyperparams_from_run_mode( student, num_trials, minibatch, zeroshot_opt, valset ) @@ -137,9 +133,7 @@ def compile( self._print_auto_run_settings(num_trials, minibatch, valset) if minibatch and minibatch_size > len(valset): - raise ValueError( - f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}." - ) + raise ValueError(f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}.") # Estimate LM calls and get user confirmation if requires_permission_to_run: @@ -200,10 +194,8 @@ def compile( ) return best_program - - def _set_random_seeds(self, - seed - ): + + def _set_random_seeds(self, seed): self.rng = random.Random(seed) np.random.seed(seed) @@ -226,9 +218,7 @@ def _set_hyperparams_from_run_mode( num_trials = auto_settings["num_trials"] valset = create_minibatch(valset, batch_size=auto_settings["val_size"], rng=self.rng) minibatch = len(valset) > MIN_MINIBATCH_SIZE - self.num_candidates = int( - np.round(np.min([num_trials * num_vars, (1.5 * num_trials) / num_vars])) - ) + self.num_candidates = int(np.round(np.min([num_trials * num_vars, (1.5 * num_trials) / num_vars]))) return num_trials, valset, minibatch @@ -238,9 +228,7 @@ def _set_and_validate_datasets(self, trainset: List, valset: Optional[List]): if valset is None: if len(trainset) < 2: - raise ValueError( - "Trainset must have at least 2 examples if no valset specified." 
- ) + raise ValueError("Trainset must have at least 2 examples if no valset specified.") valset_size = min(1000, max(1, int(len(trainset) * 0.80))) cutoff = len(trainset) - valset_size valset = trainset[cutoff:] @@ -276,9 +264,7 @@ def _estimate_lm_calls( estimated_prompt_model_calls = ( 10 # Data summarizer calls + self.num_candidates * num_predictors # Candidate generation - + ( - num_predictors + 1 if program_aware_proposer else 0 - ) # Program-aware proposer + + (num_predictors + 1 if program_aware_proposer else 0) # Program-aware proposer ) prompt_model_line = ( f"{YELLOW}- Prompt Generation: {BLUE}{BOLD}10{ENDC}{YELLOW} data summarizer calls + " @@ -298,9 +284,7 @@ def _estimate_lm_calls( ) else: full_eval_steps = num_trials // minibatch_full_eval_steps + 1 - estimated_task_model_calls = ( - minibatch_size * num_trials + len(valset) * full_eval_steps - ) + estimated_task_model_calls = minibatch_size * num_trials + len(valset) * full_eval_steps task_model_line = ( f"{YELLOW}- Program Evaluation: {BLUE}{BOLD}{minibatch_size}{ENDC}{YELLOW} examples in minibatch * " f"{BLUE}{BOLD}{num_trials}{ENDC}{YELLOW} batches + " @@ -363,15 +347,12 @@ def _get_user_confirmation( """ ) - user_input = input( - f"{user_message}\n{user_confirmation_message}\n" - "Do you wish to continue? (y/n): " - ).strip().lower() + user_input = ( + input(f"{user_message}\n{user_confirmation_message}\nDo you wish to continue? (y/n): ").strip().lower() + ) return user_input == "y" - def _bootstrap_fewshot_examples( - self, program: Any, trainset: List, seed: int, teacher: Any - ) -> Optional[List]: + def _bootstrap_fewshot_examples(self, program: Any, trainset: List, seed: int, teacher: Any) -> Optional[List]: logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==") if self.max_bootstrapped_demos > 0: logger.info( @@ -389,15 +370,9 @@ def _bootstrap_fewshot_examples( student=program, num_candidate_sets=self.num_candidates, trainset=trainset, - max_labeled_demos=( - LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT - if zeroshot - else self.max_labeled_demos - ), + max_labeled_demos=(LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT if zeroshot else self.max_labeled_demos), max_bootstrapped_demos=( - BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT - if zeroshot - else self.max_bootstrapped_demos + BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT if zeroshot else self.max_bootstrapped_demos ), metric=self.metric, max_errors=self.max_errors, @@ -438,13 +413,13 @@ def _propose_instructions( program_aware=program_aware_proposer, use_dataset_summary=data_aware_proposer, use_task_demos=fewshot_aware_proposer, - num_demos_in_context = BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT, + num_demos_in_context=BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT, use_tip=tip_aware_proposer, set_tip_randomly=tip_aware_proposer, use_instruct_history=False, set_history_randomly=False, verbose=self.verbose, - rng=self.rng + rng=self.rng, ) logger.info("\nProposing instructions...\n") @@ -479,7 +454,6 @@ def _optimize_prompt_parameters( minibatch_full_eval_steps: int, seed: int, ) -> Optional[Any]: - # Run optimization optuna.logging.set_verbosity(optuna.logging.WARNING) logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==") @@ -491,14 +465,14 @@ def _optimize_prompt_parameters( adjusted_num_trials = (num_trials + num_trials // minibatch_full_eval_steps + 1) if minibatch else num_trials logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==") - default_score, baseline_results = eval_candidate_program(len(valset), valset, program, evaluate, self.rng, 
return_all_scores=True) + default_score, baseline_results = eval_candidate_program( + len(valset), valset, program, evaluate, self.rng, return_all_scores=True + ) logger.info(f"Default program score: {default_score}\n") trial_logs = {} trial_logs[-1] = {} - trial_logs[-1]["full_eval_program_path"] = save_candidate_program( - program, self.log_dir, -1 - ) + trial_logs[-1]["full_eval_program_path"] = save_candidate_program(program, self.log_dir, -1) trial_logs[-1]["full_eval_score"] = default_score trial_logs[-1]["total_eval_calls_so_far"] = len(valset) trial_logs[-1]["full_eval_program"] = program.deepcopy() @@ -543,9 +517,7 @@ def objective(trial): # Evaluate the candidate program (on minibatch if minibatch=True) batch_size = minibatch_size if minibatch else len(valset) - score = eval_candidate_program( - batch_size, valset, candidate_program, evaluate, self.rng - ) + score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng) total_eval_calls += batch_size # Update best score and program @@ -555,7 +527,9 @@ def objective(trial): logger.info(f"{GREEN}Best full score so far!{ENDC} Score: {score}") # Log evaluation results - score_data.append({"score": score, "program": candidate_program, "full_eval": batch_size >= len(valset)}) # score, prog, full_eval + score_data.append( + {"score": score, "program": candidate_program, "full_eval": batch_size >= len(valset)} + ) # score, prog, full_eval if minibatch: self._log_minibatch_eval( score, @@ -572,7 +546,18 @@ def objective(trial): ) else: self._log_normal_eval( - score, best_score, chosen_params, score_data, trial, num_trials, trial_logs, trial_num, valset, batch_size, candidate_program, total_eval_calls + score, + best_score, + chosen_params, + score_data, + trial, + num_trials, + trial_logs, + trial_num, + valset, + batch_size, + candidate_program, + total_eval_calls, ) categorical_key = ",".join(map(str, chosen_params)) param_score_dict[categorical_key].append( @@ -580,10 +565,7 @@ def objective(trial): ) # If minibatch, perform full evaluation at intervals (and at the very end) - if minibatch and ( - (trial_num % minibatch_full_eval_steps == 0) - or (trial_num == (adjusted_num_trials -1)) - ): + if minibatch and ((trial_num % minibatch_full_eval_steps == 0) or (trial_num == (adjusted_num_trials - 1))): best_score, best_program, total_eval_calls = self._perform_full_evaluation( trial_num, adjusted_num_trials, @@ -598,9 +580,9 @@ def objective(trial): best_program, study, instruction_candidates, - demo_candidates + demo_candidates, ) - + return score sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True) @@ -613,11 +595,11 @@ def objective(trial): # Add default run as a baseline in optuna (TODO: figure out how to weight this by # of samples evaluated on) trial = optuna.trial.create_trial( params=default_params, - distributions= self._get_param_distributions(program, instruction_candidates, demo_candidates), + distributions=self._get_param_distributions(program, instruction_candidates, demo_candidates), value=default_score, ) study.add_trial(trial) - study.optimize(objective, n_trials=num_trials-1) + study.optimize(objective, n_trials=num_trials - 1) # Attach logs to best program if best_program is not None and self.track_stats: @@ -627,9 +609,13 @@ def objective(trial): best_program.total_calls = self.total_calls sorted_candidate_programs = sorted(score_data, key=lambda x: x["score"], reverse=True) # Attach all minibatch programs - best_program.mb_candidate_programs = [score_data for score_data in 
sorted_candidate_programs if not score_data["full_eval"]] + best_program.mb_candidate_programs = [ + score_data for score_data in sorted_candidate_programs if not score_data["full_eval"] + ] # Attach all programs that were evaluated on the full trainset, in descending order of score - best_program.candidate_programs = [score_data for score_data in sorted_candidate_programs if score_data["full_eval"]] + best_program.candidate_programs = [ + score_data for score_data in sorted_candidate_programs if score_data["full_eval"] + ] logger.info(f"Returning best identified program with score {best_score}!") @@ -649,28 +635,36 @@ def _log_minibatch_eval( candidate_program, total_eval_calls, ): - trial_logs[trial_num]["mb_program_path"] = save_candidate_program( - candidate_program, self.log_dir, trial_num - ) + trial_logs[trial_num]["mb_program_path"] = save_candidate_program(candidate_program, self.log_dir, trial_num) trial_logs[trial_num]["mb_score"] = score trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy() - - logger.info( - f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}." - ) - minibatch_scores = ', '.join([f'{s["score"]}' for s in score_data if not s["full_eval"]]) - logger.info(f"Minibatch scores so far: {'['+ minibatch_scores +']'}") - full_eval_scores = ', '.join([f'{s["score"]}' for s in score_data if s["full_eval"]]) + + logger.info(f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}.") + minibatch_scores = ", ".join([f"{s['score']}" for s in score_data if not s["full_eval"]]) + logger.info(f"Minibatch scores so far: {'[' + minibatch_scores + ']'}") + full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]]) trajectory = "[" + full_eval_scores + "]" logger.info(f"Full eval scores so far: {trajectory}") logger.info(f"Best full score so far: {best_score}") logger.info( - f'{"="*len(f"== Trial {trial.number+1} / {adjusted_num_trials} - Minibatch Evaluation ==")}\n\n' + f"{'=' * len(f'== Trial {trial.number + 1} / {adjusted_num_trials} - Minibatch Evaluation ==')}\n\n" ) def _log_normal_eval( - self, score, best_score, chosen_params, score_data, trial, num_trials, trial_logs, trial_num, valset, batch_size, candidate_program, total_eval_calls + self, + score, + best_score, + chosen_params, + score_data, + trial, + num_trials, + trial_logs, + trial_num, + valset, + batch_size, + candidate_program, + total_eval_calls, ): trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program( candidate_program, self.log_dir, trial_num @@ -680,10 +674,10 @@ def _log_normal_eval( trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy() logger.info(f"Score: {score} with parameters {chosen_params}.") - full_eval_scores = ', '.join([f'{s["score"]}' for s in score_data if s["full_eval"]]) - logger.info(f"Scores so far: {'['+full_eval_scores+']'}") + full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]]) + logger.info(f"Scores so far: {'[' + full_eval_scores + ']'}") logger.info(f"Best score so far: {best_score}") - logger.info(f'{"="*len(f"===== Trial {trial.number+1} / {num_trials} =====")}\n\n') + logger.info(f"{'=' * len(f'===== Trial {trial.number + 1} / {num_trials} =====')}\n\n") def _select_and_insert_instructions_and_demos( self, @@ -703,18 +697,14 @@ def _select_and_insert_instructions_and_demos( f"{i}_predictor_instruction", range(len(instruction_candidates[i])) ) 
selected_instruction = instruction_candidates[i][instruction_idx] - updated_signature = get_signature(predictor).with_instructions( - selected_instruction - ) + updated_signature = get_signature(predictor).with_instructions(selected_instruction) set_signature(predictor, updated_signature) trial_logs[trial_num][f"{i}_predictor_instruction"] = instruction_idx chosen_params.append(f"Predictor {i}: Instruction {instruction_idx}") raw_chosen_params[f"{i}_predictor_instruction"] = instruction_idx # Select demos if available if demo_candidates: - demos_idx = trial.suggest_categorical( - f"{i}_predictor_demos", range(len(demo_candidates[i])) - ) + demos_idx = trial.suggest_categorical(f"{i}_predictor_demos", range(len(demo_candidates[i]))) predictor.demos = demo_candidates[i][demos_idx] trial_logs[trial_num][f"{i}_predictor_demos"] = demos_idx chosen_params.append(f"Predictor {i}: Few-Shot Set {demos_idx}") @@ -726,7 +716,9 @@ def _get_param_distributions(self, program, instruction_candidates, demo_candida param_distributions = {} for i in range(len(instruction_candidates)): - param_distributions[f"{i}_predictor_instruction"] = CategoricalDistribution(range(len(instruction_candidates[i]))) + param_distributions[f"{i}_predictor_instruction"] = CategoricalDistribution( + range(len(instruction_candidates[i])) + ) if demo_candidates: param_distributions[f"{i}_predictor_demos"] = CategoricalDistribution(range(len(demo_candidates[i]))) @@ -749,26 +741,20 @@ def _perform_full_evaluation( instruction_candidates: List, demo_candidates: List, ): - logger.info(f"===== Trial {trial_num+1} / {adjusted_num_trials} - Full Evaluation =====") + logger.info(f"===== Trial {trial_num + 1} / {adjusted_num_trials} - Full Evaluation =====") # Identify best program to evaluate fully - highest_mean_program, mean_score, combo_key, params = ( - get_program_with_highest_avg_score( - param_score_dict, fully_evaled_param_combos - ) - ) - logger.info( - f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials..." 
-        )
-        full_eval_score = eval_candidate_program(
-            len(valset), valset, highest_mean_program, evaluate, self.rng
+        highest_mean_program, mean_score, combo_key, params = get_program_with_highest_avg_score(
+            param_score_dict, fully_evaled_param_combos
         )
+        logger.info(f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials...")
+        full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng)
         score_data.append({"score": full_eval_score, "program": highest_mean_program, "full_eval": True})
 
         # Log full eval as a trial so that optuna can learn from the new results
         trial = optuna.trial.create_trial(
             params=params,
-            distributions= self._get_param_distributions(best_program, instruction_candidates, demo_candidates),
+            distributions=self._get_param_distributions(best_program, instruction_candidates, demo_candidates),
             value=full_eval_score,
         )
         study.add_trial(trial)
@@ -794,11 +780,11 @@
             logger.info(f"{GREEN}New best full eval score!{ENDC} Score: {full_eval_score}")
             best_score = full_eval_score
             best_program = highest_mean_program.deepcopy()
-        full_eval_scores = ', '.join([f'{s["score"]}' for s in score_data if s["full_eval"]])
+        full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]])
         trajectory = "[" + full_eval_scores + "]"
         logger.info(f"Full eval scores so far: {trajectory}")
         logger.info(f"Best full score so far: {best_score}")
-        logger.info(len(f"===== Full Eval {len(fully_evaled_param_combos)+1} =====") * "=")
+        logger.info(len(f"===== Full Eval {len(fully_evaled_param_combos) + 1} =====") * "=")
         logger.info("\n")
 
         return best_score, best_program, total_eval_calls
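
As a quick sanity check on the BootstrapFewShot constructor documented in the reformatted docstring above, here is a minimal usage sketch. It assumes the public dspy API (dspy.LM, dspy.configure, dspy.Example, dspy.Predict, and BootstrapFewShot.compile); the model name, toy trainset, and exact_match metric are illustrative placeholders, not part of this diff.

# Minimal sketch: compiling a one-step program with BootstrapFewShot,
# passing the constructor arguments described in the updated docstring.
import dspy
from dspy.teleprompt import BootstrapFewShot

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model name

# Toy training examples; with_inputs() marks which fields are inputs.
trainset = [
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
    dspy.Example(question="What color is the sky?", answer="blue").with_inputs("question"),
]

def exact_match(example, prediction, trace=None):
    # Standard DSPy metric signature: compare the gold answer to the prediction.
    return example.answer.lower() == prediction.answer.lower()

student = dspy.Predict("question -> answer")
teleprompter = BootstrapFewShot(
    metric=exact_match,
    metric_threshold=None,      # only consulted when the metric returns a number
    max_bootstrapped_demos=4,   # defaults listed in the docstring above
    max_labeled_demos=16,
    max_rounds=1,
    max_errors=5,
)
compiled = teleprompter.compile(student, trainset=trainset)

After compile returns, each predictor in the compiled program carries its selected bootstrapped and labeled demos in its demos list.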
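
The new ValueError raised in BootstrapFinetune.compile spells out how the number of fine-tuning jobs relates to num_threads: with multitask=False there is one job per predictor, and with multitask=True one job per unique LM (or a single job when only a context LM is used). The sketch below is illustrative only; it assumes the constructor keywords metric, multitask, and num_threads visible in this diff, plus a dspy.LM whose provider actually supports fine-tuning, which this patch does not add.

# Illustrative sketch: sizing num_threads so it covers every fine-tuning job.
import dspy
from dspy.teleprompt import BootstrapFinetune

lm = dspy.LM("openai/gpt-4o-mini-2024-07-18")  # placeholder finetune-capable model
dspy.configure(lm=lm)

student = dspy.ChainOfThought("question -> answer")
trainset = [
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
]

def exact_match(example, prediction, trace=None):
    return example.answer.lower() == prediction.answer.lower()

# One shared LM with multitask=True means exactly one fine-tuning job, so
# num_threads=1 satisfies the new check; multitask=False would instead require
# num_threads >= the number of predictors in `student`, or compile() raises ValueError.
optimizer = BootstrapFinetune(metric=exact_match, multitask=True, num_threads=1)
finetuned = optimizer.compile(student, trainset=trainset)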
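
Most of the MIPROv2 changes above are cosmetic reformatting, but the auto-mode validation and the minibatch/valset checks are constraints a caller hits directly. The sketch below is a hedged illustration, not part of this diff: it assumes the allowed auto values shown in the validation ({None, "light", "medium", "heavy"}) and the compile keywords visible in the hunks (trainset, requires_permission_to_run); the model, metric, and data are placeholders, and a real run needs a trainset large enough to carve out a valset.

# Illustrative sketch: running MIPROv2 in one of the validated auto modes.
import dspy
from dspy.teleprompt import MIPROv2

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model name

def exact_match(example, prediction, trace=None):
    return example.answer.lower() == prediction.answer.lower()

# Placeholder data; in practice supply enough examples for both trainset and valset.
trainset = [
    dspy.Example(question=f"What is {i} + {i}?", answer=str(i + i)).with_inputs("question")
    for i in range(20)
]

student = dspy.Predict("question -> answer")
optimizer = MIPROv2(metric=exact_match, auto="light")  # auto must be None, "light", "medium", or "heavy"
optimized = optimizer.compile(
    student,
    trainset=trainset,
    requires_permission_to_run=False,  # skip the interactive LM-call estimate confirmation
)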