diff --git a/dspy/teleprompt/bootstrap.py b/dspy/teleprompt/bootstrap.py index 5a25c36e53..81ab80a869 100644 --- a/dspy/teleprompt/bootstrap.py +++ b/dspy/teleprompt/bootstrap.py @@ -33,37 +33,36 @@ logger = logging.getLogger(__name__) + class BootstrapFewShot(Teleprompter): def __init__( self, metric=None, metric_threshold=None, - teacher_settings: Optional[Dict]=None, + teacher_settings: Optional[Dict] = None, max_bootstrapped_demos=4, max_labeled_demos=16, max_rounds=1, max_errors=5, ): - """ - A Teleprompter class that composes a set of demos/examples to go into a predictor's prompt. + """A Teleprompter class that composes a set of demos/examples to go into a predictor's prompt. These demos come from a combination of labeled examples in the training set, and bootstrapped demos. Args: - metric: Callable - A function that compares an expected value and predicted value, outputting the result of that comparison. - metric_threshold: optional float, default `None` - If the metric yields a numerical value, then check it against this threshold when - deciding whether or not to accept a bootstrap example. - teacher_settings: dict, optional - Settings for the `teacher` model. - max_bootstrapped_demos: int, default 4 - Maximum number of bootstrapped demonstrations to include - max_labeled_demos: int, default 16 - Maximum number of labeled demonstrations to include. - max_rounds: int, default 1 - Number of iterations to attempt generating the required bootstrap examples. If unsuccessful after `max_rounds`, the program ends. - max_errors: int, default 5 - Maximum number of errors until program ends. + metric (Callable): A function that compares an expected value and predicted value, + outputting the result of that comparison. + metric_threshold (float, optional): If the metric yields a numerical value, then check it + against this threshold when deciding whether or not to accept a bootstrap example. + Defaults to None. + teacher_settings (dict, optional): Settings for the `teacher` model. + Defaults to None. + max_bootstrapped_demos (int): Maximum number of bootstrapped demonstrations to include. + Defaults to 4. + max_labeled_demos (int): Maximum number of labeled demonstrations to include. + Defaults to 16. + max_rounds (int): Number of iterations to attempt generating the required bootstrap + examples. If unsuccessful after `max_rounds`, the program ends. Defaults to 1. + max_errors (int): Maximum number of errors until program ends. Defaults to 5. """ self.metric = metric self.metric_threshold = metric_threshold @@ -117,9 +116,10 @@ def _prepare_predictor_mappings(self): if hasattr(predictor1.signature, "equals"): assert predictor1.signature.equals( predictor2.signature, - ), (f"Student and teacher must have the same signatures. " + ), ( + f"Student and teacher must have the same signatures. " f"{type(predictor1.signature)} != {type(predictor2.signature)}" - ) + ) else: # fallback in case if .equals is not implemented (e.g. 
dsp.Prompt) assert predictor1.signature == predictor2.signature, ( @@ -149,7 +149,8 @@ def _bootstrap(self, *, max_bootstraps=None): self.name2traces = {name: [] for name in self.name2predictor} for example_idx, example in enumerate(tqdm.tqdm(self.trainset)): - if len(bootstrapped) >= max_bootstraps: break + if len(bootstrapped) >= max_bootstraps: + break for round_idx in range(self.max_rounds): bootstrap_attempts += 1 @@ -175,8 +176,8 @@ def _bootstrap(self, *, max_bootstraps=None): # score = evaluate(self.metric, display_table=False, display_progress=True) def _bootstrap_one_example(self, example, round_idx=0): - name2traces = {} #self.name2traces - teacher = self.teacher # .deepcopy() + name2traces = {} + teacher = self.teacher predictor_cache = {} try: @@ -235,10 +236,11 @@ def _bootstrap_one_example(self, example, round_idx=0): name2traces[predictor_name] = name2traces.get(predictor_name, []) name2traces[predictor_name].append(demo) - + # Update the traces for name, demos in name2traces.items(): from datasets.fingerprint import Hasher + # If there are multiple traces for the same predictor in the sample example, # sample 50/50 from the first N-1 traces or the last trace. if len(demos) > 1: diff --git a/dspy/teleprompt/bootstrap_finetune.py b/dspy/teleprompt/bootstrap_finetune.py index aa5169e900..6c670a38c8 100644 --- a/dspy/teleprompt/bootstrap_finetune.py +++ b/dspy/teleprompt/bootstrap_finetune.py @@ -1,5 +1,5 @@ -from collections import defaultdict import logging +from collections import defaultdict from typing import Any, Callable, Dict, List, Optional, Union import dspy @@ -12,12 +12,10 @@ from dspy.primitives.program import Program from dspy.teleprompt.teleprompt import Teleprompter - logger = logging.getLogger(__name__) class FinetuneTeleprompter(Teleprompter): - def __init__( self, train_kwargs: Optional[Union[Dict[str, Any], Dict[LM, Dict[str, Any]]]] = None, @@ -41,23 +39,25 @@ def __init__( train_kwargs: Optional[Union[Dict[str, Any], Dict[LM, Dict[str, Any]]]] = None, adapter: Optional[Union[Adapter, Dict[LM, Adapter]]] = None, exclude_demos: bool = False, - num_threads: int = 6 + num_threads: int = 6, ): # TODO(feature): Inputs train_kwargs (a dict with string keys) and # adapter (Adapter) can depend on the LM they are used with. We are - # takingthese as parameters for the time being. However, they can be + # takingthese as parameters for the time being. However, they can be # attached to LMs themselves -- an LM could know which adapter it should # be used with along with the train_kwargs. This will lead the only # required argument for LM.finetune() to be the train dataset. - + super().__init__(train_kwargs=train_kwargs) self.metric = metric self.multitask = multitask self.adapter: Dict[LM, Adapter] = self.convert_to_lm_dict(adapter) self.exclude_demos = exclude_demos self.num_threads = num_threads - - def compile(self, student: Program, trainset: List[Example], teacher: Optional[Union[Program, List[Program]]] = None) -> Program: + + def compile( + self, student: Program, trainset: List[Example], teacher: Optional[Union[Program, List[Program]]] = None + ) -> Program: # TODO: Print statements can be converted to logger.info if we ensure # that the default DSPy logger logs info level messages in notebook # environments. 
@@ -71,7 +71,9 @@ def compile(self, student: Program, trainset: List[Example], teacher: Optional[U teachers = [prepare_teacher(student, t) for t in teachers] for t in teachers: set_missing_predictor_lms(t) - trace_data += bootstrap_trace_data(program=t, dataset=trainset, metric=self.metric, num_threads=self.num_threads) + trace_data += bootstrap_trace_data( + program=t, dataset=trainset, metric=self.metric, num_threads=self.num_threads + ) logger.info("Preparing the train data...") key_to_data = {} @@ -79,16 +81,31 @@ def compile(self, student: Program, trainset: List[Example], teacher: Optional[U data_pred_ind = None if self.multitask else pred_ind training_key = (pred.lm, data_pred_ind) if training_key not in key_to_data: - train_data, data_format = self._prepare_finetune_data(trace_data=trace_data, lm=pred.lm, pred_ind=data_pred_ind) + train_data, data_format = self._prepare_finetune_data( + trace_data=trace_data, lm=pred.lm, pred_ind=data_pred_ind + ) logger.info(f"Using {len(train_data)} data points for fine-tuning the model: {pred.lm.model}") - finetune_kwargs = dict(lm=pred.lm, train_data=train_data, train_data_format=data_format, train_kwargs=self.train_kwargs[pred.lm]) + finetune_kwargs = dict( + lm=pred.lm, + train_data=train_data, + train_data_format=data_format, + train_kwargs=self.train_kwargs[pred.lm], + ) key_to_data[training_key] = finetune_kwargs - + logger.info("Starting LM fine-tuning...") # TODO(feature): We could run batches of fine-tuning jobs in sequence # to avoid exceeding the number of threads. - err = f"BootstrapFinetune requires `num_threads` to be bigger than or equal to the number of fine-tuning jobs. There are {len(key_to_data)} fine-tuning jobs to start, but the number of threads is: {self.num_threads}! If the `multitask` flag is set to False, the number of fine-tuning jobs will be equal to the number of predictors in the student program. If the `multitask` flag is set to True, the number of fine-tuning jobs will be equal to: 1 if there is only a context LM, or the number of unique LMs attached to the predictors in the student program. In any case, the number of fine-tuning jobs will be less than or equal to the number of predictors." - assert len(key_to_data) <= self.num_threads, err + if len(key_to_data) > self.num_threads: + raise ValueError( + "BootstrapFinetune requires `num_threads` to be bigger than or equal to the number of fine-tuning " + f"jobs. There are {len(key_to_data)} fine-tuning jobs to start, but the number of threads is: " + f"{self.num_threads}! If the `multitask` flag is set to False, the number of fine-tuning jobs will " + "be equal to the number of predictors in the student program. If the `multitask` flag is set to True, " + "the number of fine-tuning jobs will be equal to: 1 if there is only a context LM, or the number of " + "unique LMs attached to the predictors in the student program. In any case, the number of fine-tuning " + "jobs will be less than or equal to the number of predictors." + ) logger.info(f"{len(key_to_data)} fine-tuning job(s) to start") key_to_lm = self.finetune_lms(key_to_data) @@ -98,10 +115,10 @@ def compile(self, student: Program, trainset: List[Example], teacher: Optional[U training_key = (pred.lm, data_pred_ind) pred.lm = key_to_lm[training_key] # TODO: What should the correct behavior be here? Should - # BootstrapFinetune modify the prompt demos according to the + # BootstrapFinetune modify the prompt demos according to the # train data? 
pred.demos = [] if self.exclude_demos else pred.demos - + logger.info("BootstrapFinetune has finished compiling the student program") student._compiled = True return student @@ -120,10 +137,13 @@ def finetune_lms(finetune_dict) -> Dict[Any, LM]: # up resources for fine-tuning. This might mean introducing a new # provider method (e.g. prepare_for_finetune) that can be called # before fine-tuning is started. - logger.info("Calling lm.kill() on the LM to be fine-tuned to free up resources. This won't have any effect if the LM is not running.") + logger.info( + "Calling lm.kill() on the LM to be fine-tuned to free up resources. This won't have any effect if the " + "LM is not running." + ) lm.kill() key_to_job[key] = lm.finetune(**finetune_kwargs) - + key_to_lm = {} for ind, (key, job) in enumerate(key_to_job.items()): key_to_lm[key] = job.result() @@ -143,13 +163,16 @@ def _prepare_finetune_data(self, trace_data: List[Dict[str, Any]], lm: LM, pred_ adapter = self.adapter[lm] or lm.infer_adapter() data_format = infer_data_format(adapter) for item in trace_data: - for pred_ind, _ in enumerate(item['trace']): + for pred_ind, _ in enumerate(item["trace"]): include_data = pred_ind is None or pred_ind == pred_ind if include_data: - call_data = build_call_data_from_trace(trace=item['trace'], pred_ind=pred_ind, adapter=adapter, exclude_demos=self.exclude_demos) + call_data = build_call_data_from_trace( + trace=item["trace"], pred_ind=pred_ind, adapter=adapter, exclude_demos=self.exclude_demos + ) data.append(call_data) import random + random.Random(0).shuffle(data) return data, data_format @@ -189,8 +212,11 @@ def bootstrap_trace_data( # Return a list of dicts with the following keys: # example_ind, example, prediction, trace, and score (if metric != None) evaluator = Evaluate( - devset=dataset, num_threads=num_threads, display_progress=True, return_outputs=True, - provide_traceback=True # TODO(check with team) + devset=dataset, + num_threads=num_threads, + display_progress=True, + return_outputs=True, + provide_traceback=True, # TODO(check with team) ) def wrapped_metric(example, prediction, trace=None): @@ -286,11 +312,10 @@ def assert_structural_equivalency(program1: object, program2: object): pzip = zip(program1.named_predictors(), program2.named_predictors()) for ind, ((name1, pred1), (name2, pred2)) in enumerate(pzip): - err = f"Program predictor names must match at corresponding indices for structural equivalency. The predictor names for the programs do not match at index {ind}: '{name1}' != '{name2}'" + err = f"Program predictor names must match at corresponding indices for structural equivalency. 
The predictor names for the programs do not match at index {ind}: '{name1}' != '{name2}'" assert name1 == name2, err assert isinstance(pred1, Predict) assert isinstance(pred2, Predict) - # assert pred1.signature.equals(pred2.signature) def assert_no_shared_predictor(program1: Program, program2: Program): @@ -303,17 +328,18 @@ def assert_no_shared_predictor(program1: Program, program2: Program): assert not shared_ids, err -def get_unique_lms(program: Program) -> List[LM]: - lms = [pred.lm for pred in program.predictors()] - lms = list(set(lms)) - return lms +def get_unique_lms(program: Program) -> List[LM]: + lms = [pred.lm for pred in program.predictors()] + return list(set(lms)) + def launch_lms(program: Program): lms = get_unique_lms(program) for lm in lms: lm.launch() -def kill_lms(program: Program): - lms = get_unique_lms(program) - for lm in lms: + +def kill_lms(program: Program): + lms = get_unique_lms(program) + for lm in lms: lm.kill() diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index 6231265eea..246f0005d2 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -67,9 +67,7 @@ def __init__( # Validate 'auto' parameter allowed_modes = {None, "light", "medium", "heavy"} if auto not in allowed_modes: - raise ValueError( - f"Invalid value for auto: {auto}. Must be one of {allowed_modes}." - ) + raise ValueError(f"Invalid value for auto: {auto}. Must be one of {allowed_modes}.") self.auto = auto self.num_candidates = num_candidates @@ -126,9 +124,7 @@ def compile( trainset, valset = self._set_and_validate_datasets(trainset, valset) # Set hyperparameters based on run mode (if set) - zeroshot_opt = (self.max_bootstrapped_demos == 0) and ( - self.max_labeled_demos == 0 - ) + zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0) num_trials, valset, minibatch = self._set_hyperparams_from_run_mode( student, num_trials, minibatch, zeroshot_opt, valset ) @@ -137,9 +133,7 @@ def compile( self._print_auto_run_settings(num_trials, minibatch, valset) if minibatch and minibatch_size > len(valset): - raise ValueError( - f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}." - ) + raise ValueError(f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}.") # Estimate LM calls and get user confirmation if requires_permission_to_run: @@ -200,10 +194,8 @@ def compile( ) return best_program - - def _set_random_seeds(self, - seed - ): + + def _set_random_seeds(self, seed): self.rng = random.Random(seed) np.random.seed(seed) @@ -226,9 +218,7 @@ def _set_hyperparams_from_run_mode( num_trials = auto_settings["num_trials"] valset = create_minibatch(valset, batch_size=auto_settings["val_size"], rng=self.rng) minibatch = len(valset) > MIN_MINIBATCH_SIZE - self.num_candidates = int( - np.round(np.min([num_trials * num_vars, (1.5 * num_trials) / num_vars])) - ) + self.num_candidates = int(np.round(np.min([num_trials * num_vars, (1.5 * num_trials) / num_vars]))) return num_trials, valset, minibatch @@ -238,9 +228,7 @@ def _set_and_validate_datasets(self, trainset: List, valset: Optional[List]): if valset is None: if len(trainset) < 2: - raise ValueError( - "Trainset must have at least 2 examples if no valset specified." 
- ) + raise ValueError("Trainset must have at least 2 examples if no valset specified.") valset_size = min(1000, max(1, int(len(trainset) * 0.80))) cutoff = len(trainset) - valset_size valset = trainset[cutoff:] @@ -276,9 +264,7 @@ def _estimate_lm_calls( estimated_prompt_model_calls = ( 10 # Data summarizer calls + self.num_candidates * num_predictors # Candidate generation - + ( - num_predictors + 1 if program_aware_proposer else 0 - ) # Program-aware proposer + + (num_predictors + 1 if program_aware_proposer else 0) # Program-aware proposer ) prompt_model_line = ( f"{YELLOW}- Prompt Generation: {BLUE}{BOLD}10{ENDC}{YELLOW} data summarizer calls + " @@ -298,9 +284,7 @@ def _estimate_lm_calls( ) else: full_eval_steps = num_trials // minibatch_full_eval_steps + 1 - estimated_task_model_calls = ( - minibatch_size * num_trials + len(valset) * full_eval_steps - ) + estimated_task_model_calls = minibatch_size * num_trials + len(valset) * full_eval_steps task_model_line = ( f"{YELLOW}- Program Evaluation: {BLUE}{BOLD}{minibatch_size}{ENDC}{YELLOW} examples in minibatch * " f"{BLUE}{BOLD}{num_trials}{ENDC}{YELLOW} batches + " @@ -363,15 +347,12 @@ def _get_user_confirmation( """ ) - user_input = input( - f"{user_message}\n{user_confirmation_message}\n" - "Do you wish to continue? (y/n): " - ).strip().lower() + user_input = ( + input(f"{user_message}\n{user_confirmation_message}\nDo you wish to continue? (y/n): ").strip().lower() + ) return user_input == "y" - def _bootstrap_fewshot_examples( - self, program: Any, trainset: List, seed: int, teacher: Any - ) -> Optional[List]: + def _bootstrap_fewshot_examples(self, program: Any, trainset: List, seed: int, teacher: Any) -> Optional[List]: logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==") if self.max_bootstrapped_demos > 0: logger.info( @@ -389,15 +370,9 @@ def _bootstrap_fewshot_examples( student=program, num_candidate_sets=self.num_candidates, trainset=trainset, - max_labeled_demos=( - LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT - if zeroshot - else self.max_labeled_demos - ), + max_labeled_demos=(LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT if zeroshot else self.max_labeled_demos), max_bootstrapped_demos=( - BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT - if zeroshot - else self.max_bootstrapped_demos + BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT if zeroshot else self.max_bootstrapped_demos ), metric=self.metric, max_errors=self.max_errors, @@ -438,13 +413,13 @@ def _propose_instructions( program_aware=program_aware_proposer, use_dataset_summary=data_aware_proposer, use_task_demos=fewshot_aware_proposer, - num_demos_in_context = BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT, + num_demos_in_context=BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT, use_tip=tip_aware_proposer, set_tip_randomly=tip_aware_proposer, use_instruct_history=False, set_history_randomly=False, verbose=self.verbose, - rng=self.rng + rng=self.rng, ) logger.info("\nProposing instructions...\n") @@ -479,7 +454,6 @@ def _optimize_prompt_parameters( minibatch_full_eval_steps: int, seed: int, ) -> Optional[Any]: - # Run optimization optuna.logging.set_verbosity(optuna.logging.WARNING) logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==") @@ -491,14 +465,14 @@ def _optimize_prompt_parameters( adjusted_num_trials = (num_trials + num_trials // minibatch_full_eval_steps + 1) if minibatch else num_trials logger.info(f"== Trial {1} / {adjusted_num_trials} - Full Evaluation of Default Program ==") - default_score, baseline_results = eval_candidate_program(len(valset), valset, program, evaluate, self.rng, 
return_all_scores=True) + default_score, baseline_results = eval_candidate_program( + len(valset), valset, program, evaluate, self.rng, return_all_scores=True + ) logger.info(f"Default program score: {default_score}\n") trial_logs = {} trial_logs[-1] = {} - trial_logs[-1]["full_eval_program_path"] = save_candidate_program( - program, self.log_dir, -1 - ) + trial_logs[-1]["full_eval_program_path"] = save_candidate_program(program, self.log_dir, -1) trial_logs[-1]["full_eval_score"] = default_score trial_logs[-1]["total_eval_calls_so_far"] = len(valset) trial_logs[-1]["full_eval_program"] = program.deepcopy() @@ -543,9 +517,7 @@ def objective(trial): # Evaluate the candidate program (on minibatch if minibatch=True) batch_size = minibatch_size if minibatch else len(valset) - score = eval_candidate_program( - batch_size, valset, candidate_program, evaluate, self.rng - ) + score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng) total_eval_calls += batch_size # Update best score and program @@ -555,7 +527,9 @@ def objective(trial): logger.info(f"{GREEN}Best full score so far!{ENDC} Score: {score}") # Log evaluation results - score_data.append({"score": score, "program": candidate_program, "full_eval": batch_size >= len(valset)}) # score, prog, full_eval + score_data.append( + {"score": score, "program": candidate_program, "full_eval": batch_size >= len(valset)} + ) # score, prog, full_eval if minibatch: self._log_minibatch_eval( score, @@ -572,7 +546,18 @@ def objective(trial): ) else: self._log_normal_eval( - score, best_score, chosen_params, score_data, trial, num_trials, trial_logs, trial_num, valset, batch_size, candidate_program, total_eval_calls + score, + best_score, + chosen_params, + score_data, + trial, + num_trials, + trial_logs, + trial_num, + valset, + batch_size, + candidate_program, + total_eval_calls, ) categorical_key = ",".join(map(str, chosen_params)) param_score_dict[categorical_key].append( @@ -580,10 +565,7 @@ def objective(trial): ) # If minibatch, perform full evaluation at intervals (and at the very end) - if minibatch and ( - (trial_num % minibatch_full_eval_steps == 0) - or (trial_num == (adjusted_num_trials -1)) - ): + if minibatch and ((trial_num % minibatch_full_eval_steps == 0) or (trial_num == (adjusted_num_trials - 1))): best_score, best_program, total_eval_calls = self._perform_full_evaluation( trial_num, adjusted_num_trials, @@ -598,9 +580,9 @@ def objective(trial): best_program, study, instruction_candidates, - demo_candidates + demo_candidates, ) - + return score sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True) @@ -613,11 +595,11 @@ def objective(trial): # Add default run as a baseline in optuna (TODO: figure out how to weight this by # of samples evaluated on) trial = optuna.trial.create_trial( params=default_params, - distributions= self._get_param_distributions(program, instruction_candidates, demo_candidates), + distributions=self._get_param_distributions(program, instruction_candidates, demo_candidates), value=default_score, ) study.add_trial(trial) - study.optimize(objective, n_trials=num_trials-1) + study.optimize(objective, n_trials=num_trials - 1) # Attach logs to best program if best_program is not None and self.track_stats: @@ -627,9 +609,13 @@ def objective(trial): best_program.total_calls = self.total_calls sorted_candidate_programs = sorted(score_data, key=lambda x: x["score"], reverse=True) # Attach all minibatch programs - best_program.mb_candidate_programs = [score_data for score_data in 
sorted_candidate_programs if not score_data["full_eval"]] + best_program.mb_candidate_programs = [ + score_data for score_data in sorted_candidate_programs if not score_data["full_eval"] + ] # Attach all programs that were evaluated on the full trainset, in descending order of score - best_program.candidate_programs = [score_data for score_data in sorted_candidate_programs if score_data["full_eval"]] + best_program.candidate_programs = [ + score_data for score_data in sorted_candidate_programs if score_data["full_eval"] + ] logger.info(f"Returning best identified program with score {best_score}!") @@ -649,28 +635,36 @@ def _log_minibatch_eval( candidate_program, total_eval_calls, ): - trial_logs[trial_num]["mb_program_path"] = save_candidate_program( - candidate_program, self.log_dir, trial_num - ) + trial_logs[trial_num]["mb_program_path"] = save_candidate_program(candidate_program, self.log_dir, trial_num) trial_logs[trial_num]["mb_score"] = score trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy() - - logger.info( - f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}." - ) - minibatch_scores = ', '.join([f'{s["score"]}' for s in score_data if not s["full_eval"]]) - logger.info(f"Minibatch scores so far: {'['+ minibatch_scores +']'}") - full_eval_scores = ', '.join([f'{s["score"]}' for s in score_data if s["full_eval"]]) + + logger.info(f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}.") + minibatch_scores = ", ".join([f"{s['score']}" for s in score_data if not s["full_eval"]]) + logger.info(f"Minibatch scores so far: {'[' + minibatch_scores + ']'}") + full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]]) trajectory = "[" + full_eval_scores + "]" logger.info(f"Full eval scores so far: {trajectory}") logger.info(f"Best full score so far: {best_score}") logger.info( - f'{"="*len(f"== Trial {trial.number+1} / {adjusted_num_trials} - Minibatch Evaluation ==")}\n\n' + f"{'=' * len(f'== Trial {trial.number + 1} / {adjusted_num_trials} - Minibatch Evaluation ==')}\n\n" ) def _log_normal_eval( - self, score, best_score, chosen_params, score_data, trial, num_trials, trial_logs, trial_num, valset, batch_size, candidate_program, total_eval_calls + self, + score, + best_score, + chosen_params, + score_data, + trial, + num_trials, + trial_logs, + trial_num, + valset, + batch_size, + candidate_program, + total_eval_calls, ): trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program( candidate_program, self.log_dir, trial_num @@ -680,10 +674,10 @@ def _log_normal_eval( trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy() logger.info(f"Score: {score} with parameters {chosen_params}.") - full_eval_scores = ', '.join([f'{s["score"]}' for s in score_data if s["full_eval"]]) - logger.info(f"Scores so far: {'['+full_eval_scores+']'}") + full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]]) + logger.info(f"Scores so far: {'[' + full_eval_scores + ']'}") logger.info(f"Best score so far: {best_score}") - logger.info(f'{"="*len(f"===== Trial {trial.number+1} / {num_trials} =====")}\n\n') + logger.info(f"{'=' * len(f'===== Trial {trial.number + 1} / {num_trials} =====')}\n\n") def _select_and_insert_instructions_and_demos( self, @@ -703,18 +697,14 @@ def _select_and_insert_instructions_and_demos( f"{i}_predictor_instruction", range(len(instruction_candidates[i])) ) 
selected_instruction = instruction_candidates[i][instruction_idx] - updated_signature = get_signature(predictor).with_instructions( - selected_instruction - ) + updated_signature = get_signature(predictor).with_instructions(selected_instruction) set_signature(predictor, updated_signature) trial_logs[trial_num][f"{i}_predictor_instruction"] = instruction_idx chosen_params.append(f"Predictor {i}: Instruction {instruction_idx}") raw_chosen_params[f"{i}_predictor_instruction"] = instruction_idx # Select demos if available if demo_candidates: - demos_idx = trial.suggest_categorical( - f"{i}_predictor_demos", range(len(demo_candidates[i])) - ) + demos_idx = trial.suggest_categorical(f"{i}_predictor_demos", range(len(demo_candidates[i]))) predictor.demos = demo_candidates[i][demos_idx] trial_logs[trial_num][f"{i}_predictor_demos"] = demos_idx chosen_params.append(f"Predictor {i}: Few-Shot Set {demos_idx}") @@ -726,7 +716,9 @@ def _get_param_distributions(self, program, instruction_candidates, demo_candida param_distributions = {} for i in range(len(instruction_candidates)): - param_distributions[f"{i}_predictor_instruction"] = CategoricalDistribution(range(len(instruction_candidates[i]))) + param_distributions[f"{i}_predictor_instruction"] = CategoricalDistribution( + range(len(instruction_candidates[i])) + ) if demo_candidates: param_distributions[f"{i}_predictor_demos"] = CategoricalDistribution(range(len(demo_candidates[i]))) @@ -749,26 +741,20 @@ def _perform_full_evaluation( instruction_candidates: List, demo_candidates: List, ): - logger.info(f"===== Trial {trial_num+1} / {adjusted_num_trials} - Full Evaluation =====") + logger.info(f"===== Trial {trial_num + 1} / {adjusted_num_trials} - Full Evaluation =====") # Identify best program to evaluate fully - highest_mean_program, mean_score, combo_key, params = ( - get_program_with_highest_avg_score( - param_score_dict, fully_evaled_param_combos - ) - ) - logger.info( - f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials..." 
-        )
-        full_eval_score = eval_candidate_program(
-            len(valset), valset, highest_mean_program, evaluate, self.rng
+        highest_mean_program, mean_score, combo_key, params = get_program_with_highest_avg_score(
+            param_score_dict, fully_evaled_param_combos
         )
+        logger.info(f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials...")
+        full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng)
         score_data.append({"score": full_eval_score, "program": highest_mean_program, "full_eval": True})
 
         # Log full eval as a trial so that optuna can learn from the new results
         trial = optuna.trial.create_trial(
             params=params,
-            distributions= self._get_param_distributions(best_program, instruction_candidates, demo_candidates),
+            distributions=self._get_param_distributions(best_program, instruction_candidates, demo_candidates),
             value=full_eval_score,
         )
         study.add_trial(trial)
@@ -794,11 +780,11 @@
             logger.info(f"{GREEN}New best full eval score!{ENDC} Score: {full_eval_score}")
             best_score = full_eval_score
             best_program = highest_mean_program.deepcopy()
-        full_eval_scores = ', '.join([f'{s["score"]}' for s in score_data if s["full_eval"]])
+        full_eval_scores = ", ".join([f"{s['score']}" for s in score_data if s["full_eval"]])
         trajectory = "[" + full_eval_scores + "]"
         logger.info(f"Full eval scores so far: {trajectory}")
         logger.info(f"Best full score so far: {best_score}")
-        logger.info(len(f"===== Full Eval {len(fully_evaled_param_combos)+1} =====") * "=")
+        logger.info(len(f"===== Full Eval {len(fully_evaled_param_combos) + 1} =====") * "=")
         logger.info("\n")
 
         return best_score, best_program, total_eval_calls
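
As a quick sanity check on the BootstrapFewShot constructor documented in the reformatted docstring above, here is a minimal usage sketch. It assumes the public dspy API (dspy.LM, dspy.configure, dspy.Example, dspy.Predict, and BootstrapFewShot.compile); the model name, toy trainset, and exact_match metric are illustrative placeholders, not part of this diff.

# Minimal sketch: compiling a one-step program with BootstrapFewShot,
# passing the constructor arguments described in the updated docstring.
import dspy
from dspy.teleprompt import BootstrapFewShot

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model name

# Toy training examples; with_inputs() marks which fields are inputs.
trainset = [
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
    dspy.Example(question="What color is the sky?", answer="blue").with_inputs("question"),
]

def exact_match(example, prediction, trace=None):
    # Standard DSPy metric signature: compare the gold answer to the prediction.
    return example.answer.lower() == prediction.answer.lower()

student = dspy.Predict("question -> answer")
teleprompter = BootstrapFewShot(
    metric=exact_match,
    metric_threshold=None,      # only consulted when the metric returns a number
    max_bootstrapped_demos=4,   # defaults listed in the docstring above
    max_labeled_demos=16,
    max_rounds=1,
    max_errors=5,
)
compiled = teleprompter.compile(student, trainset=trainset)

After compile returns, each predictor in the compiled program carries its selected bootstrapped and labeled demos in its demos list.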
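
The new ValueError raised in BootstrapFinetune.compile spells out how the number of fine-tuning jobs relates to num_threads: with multitask=False there is one job per predictor, and with multitask=True one job per unique LM (or a single job when only a context LM is used). The sketch below is illustrative only; it assumes the constructor keywords metric, multitask, and num_threads visible in this diff, plus a dspy.LM whose provider actually supports fine-tuning, which this patch does not add.

# Illustrative sketch: sizing num_threads so it covers every fine-tuning job.
import dspy
from dspy.teleprompt import BootstrapFinetune

lm = dspy.LM("openai/gpt-4o-mini-2024-07-18")  # placeholder finetune-capable model
dspy.configure(lm=lm)

student = dspy.ChainOfThought("question -> answer")
trainset = [
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
]

def exact_match(example, prediction, trace=None):
    return example.answer.lower() == prediction.answer.lower()

# One shared LM with multitask=True means exactly one fine-tuning job, so
# num_threads=1 satisfies the new check; multitask=False would instead require
# num_threads >= the number of predictors in `student`, or compile() raises ValueError.
optimizer = BootstrapFinetune(metric=exact_match, multitask=True, num_threads=1)
finetuned = optimizer.compile(student, trainset=trainset)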
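
Most of the MIPROv2 changes above are cosmetic reformatting, but the auto-mode validation and the minibatch/valset checks are constraints a caller hits directly. The sketch below is a hedged illustration, not part of this diff: it assumes the allowed auto values shown in the validation ({None, "light", "medium", "heavy"}) and the compile keywords visible in the hunks (trainset, requires_permission_to_run); the model, metric, and data are placeholders, and a real run needs a trainset large enough to carve out a valset.

# Illustrative sketch: running MIPROv2 in one of the validated auto modes.
import dspy
from dspy.teleprompt import MIPROv2

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model name

def exact_match(example, prediction, trace=None):
    return example.answer.lower() == prediction.answer.lower()

# Placeholder data; in practice supply enough examples for both trainset and valset.
trainset = [
    dspy.Example(question=f"What is {i} + {i}?", answer=str(i + i)).with_inputs("question")
    for i in range(20)
]

student = dspy.Predict("question -> answer")
optimizer = MIPROv2(metric=exact_match, auto="light")  # auto must be None, "light", "medium", or "heavy"
optimized = optimizer.compile(
    student,
    trainset=trainset,
    requires_permission_to_run=False,  # skip the interactive LM-call estimate confirmation
)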