From 7a9be5ed1d8969853b8a657a6c412f9b31bd98ff Mon Sep 17 00:00:00 2001
From: DanielUH2019 <danielcardenascabrera2016@gmail.com>
Date: Sat, 28 Oct 2023 22:36:16 -0400
Subject: [PATCH 1/3] Apply ruff check and formatter

---
 dsp/__init__.py                            |   2 +-
 dsp/evaluation/utils.py                    |  59 +++--
 dsp/modules/__init__.py                    |   2 +-
 dsp/modules/azurecognitivesearch.py        |  62 +++--
 dsp/modules/cache_utils.py                 |   6 +-
 dsp/modules/cohere.py                      |   3 +-
 dsp/modules/finetuning/__init__.py         |   2 +-
 dsp/modules/finetuning/finetune_hf.py      | 274 ++++++++++++++-------
 dsp/modules/gpt3.py                        |  17 +-
 dsp/modules/hf.py                          |  57 +++--
 dsp/modules/hf_client.py                   | 126 ++++++----
 dsp/modules/lm.py                          |   4 +-
 dsp/modules/pyserini.py                    |  68 +++--
 dsp/modules/sbert.py                       |   4 +-
 dsp/modules/sentence_vectorizer.py         |  66 ++---
 dsp/primitives/compiler.py                 |  87 ++++---
 dsp/primitives/demonstrate.py              |   3 +-
 dsp/primitives/inspect.py                  | 140 +++++------
 dsp/primitives/predict.py                  |   1 -
 dsp/primitives/primitives.py               |  18 +-
 dsp/primitives/search.py                   |  10 +-
 dsp/templates/__init__.py                  |   1 -
 dsp/templates/template_v2.py               |  51 ++--
 dsp/templates/template_v3.py               |  11 +-
 dsp/templates/utils.py                     |   2 +-
 dsp/utils/ann_utils.py                     |  42 ++--
 dsp/utils/dpr.py                           |  85 ++++---
 dsp/utils/metrics.py                       | 183 ++++++++++++--
 dsp/utils/settings.py                      |   5 +-
 dsp/utils/settings_v2.py                   |  41 ++-
 dsp/utils/utils.py                         |  15 +-
 dspy/datasets/__init__.py                  |   2 +-
 dspy/datasets/colors.py                    | 149 ++++++++++-
 dspy/datasets/dataset.py                   |  88 ++++---
 dspy/datasets/gsm8k.py                     |  63 ++---
 dspy/datasets/hotpotqa.py                  |  70 +++---
 dspy/evaluate/__init__.py                  |   2 +-
 dspy/evaluate/auto_evaluation.py           |  25 +-
 dspy/evaluate/evaluate.py                  | 142 +++++++----
 dspy/evaluate/metrics.py                   |  20 +-
 dspy/predict/__init__.py                   |   4 +-
 dspy/predict/aggregation.py                |  36 +--
 dspy/predict/chain_of_thought.py           |  28 ++-
 dspy/predict/chain_of_thought_with_hint.py |  44 ++--
 dspy/predict/knn.py                        |  22 +-
 dspy/predict/multi_chain_comparison.py     |  36 ++-
 dspy/predict/parameter.py                  |   1 +
 dspy/predict/predict.py                    |  36 +--
 dspy/predict/program_of_thought.py         | 162 ++++++++----
 dspy/predict/react.py                      | 108 +++++---
 dspy/primitives/__init__.py                |   2 +-
 dspy/primitives/box.py                     |  71 ++++--
 dspy/primitives/example.py                 |  56 +++--
 dspy/primitives/module.py                  |  16 +-
 dspy/primitives/prediction.py              |  34 ++-
 dspy/primitives/program.py                 |  24 +-
 dspy/primitives/python_interpreter.py      | 148 ++++++-----
 dspy/retrieve/__init__.py                  |   2 +-
 dspy/retrieve/retrieve.py                  |  21 +-
 dspy/retrieve/you_rm.py                    |  74 +++---
 dspy/signatures/field.py                   |  16 +-
 dspy/signatures/signature.py               |  76 +++---
 dspy/teleprompt/bootstrap.py               | 116 ++++++---
 dspy/teleprompt/ensemble.py                |  14 +-
 dspy/teleprompt/finetune.py                | 125 ++++++----
 dspy/teleprompt/knn_fewshot.py             |  11 +-
 dspy/teleprompt/random_search.py           | 113 ++++++---
 dspy/teleprompt/teleprompt.py              |   7 -
 dspy/teleprompt/teleprompt_optuna.py       |  69 ++++--
 dspy/teleprompt/vanilla.py                 |  10 +-
 inspect-app/app.py                         |  30 +--
 setup.py                                   |  14 +-
 72 files changed, 2336 insertions(+), 1198 deletions(-)

diff --git a/dsp/__init__.py b/dsp/__init__.py
index fc8a742415..443c7ca1aa 100644
--- a/dsp/__init__.py
+++ b/dsp/__init__.py
@@ -40,4 +40,4 @@ def __getattr__(self, name):
 import sys
 sys.modules[__name__] = DspModule()
 
-"""
\ No newline at end of file
+"""
diff --git a/dsp/evaluation/utils.py b/dsp/evaluation/utils.py
index 5ce9a4b1df..0779c2f5d0 100644
--- a/dsp/evaluation/utils.py
+++ b/dsp/evaluation/utils.py
@@ -1,12 +1,9 @@
-from openai import InvalidRequestError
-from openai.error import APIError
-
 import dsp
 import tqdm
 import pandas as pd
 
 from IPython.display import display
-from dsp.utils import EM, F1, HotPotF1
+from dsp.utils import EM
 
 
 def evaluateRetrieval(fn, dev, metric=None):
@@ -19,17 +16,24 @@ def evaluateRetrieval(fn, dev, metric=None):
         d = dict(example)
 
         # d['prediction'] = prediction.answer
-        d['correct'] =  dsp.passage_match(prediction.context, example.answer)
+        d["correct"] = dsp.passage_match(prediction.context, example.answer)
         data.append(d)
 
     df = pd.DataFrame(data)
 
-    percentage = round(100.0 * df['correct'].sum() / len(dev), 1)
+    percentage = round(100.0 * df["correct"].sum() / len(dev), 1)
     print(f"Answered {df['correct'].sum()} / {len(dev)} ({percentage}%) correctly.")
-    df['correct'] = df['correct'].apply(lambda x: '✔️' if x else '❌')
+    df["correct"] = df["correct"].apply(lambda x: "✔️" if x else "❌")
 
     pd.options.display.max_colwidth = None
-    display(df.style.set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}, {'selector': 'td', 'props': [('text-align', 'left')]}]))
+    display(
+        df.style.set_table_styles(
+            [
+                {"selector": "th", "props": [("text-align", "left")]},
+                {"selector": "td", "props": [("text-align", "left")]},
+            ]
+        )
+    )
 
 
 def evaluateAnswer(fn, dev, metric=EM):
@@ -43,19 +47,25 @@ def evaluateAnswer(fn, dev, metric=EM):
 
         pred = prediction.answer
 
-        d['prediction'] = pred
-        d['correct'] = metric(pred, example.answer)
+        d["prediction"] = pred
+        d["correct"] = metric(pred, example.answer)
         data.append(d)
 
     df = pd.DataFrame(data)
 
-    percentage = round(100.0 * df['correct'].sum() / len(dev), 1)
+    percentage = round(100.0 * df["correct"].sum() / len(dev), 1)
     print(f"Answered {df['correct'].sum()} / {len(dev)} ({percentage}%) correctly.")
-    df['correct'] = df['correct'].apply(lambda x: '✔️' if x else '❌')
+    df["correct"] = df["correct"].apply(lambda x: "✔️" if x else "❌")
 
     pd.options.display.max_colwidth = None
-    display(df.style.set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}, {'selector': 'td', 'props': [('text-align', 'left')]}]))
-
+    display(
+        df.style.set_table_styles(
+            [
+                {"selector": "th", "props": [("text-align", "left")]},
+                {"selector": "td", "props": [("text-align", "left")]},
+            ]
+        )
+    )
 
 
 def evaluate(fn, dev, metric=EM):
@@ -67,21 +77,26 @@ def evaluate(fn, dev, metric=EM):
 
         d = dict(example)
 
-        pred = prediction#.answer
+        pred = prediction  # .answer
 
-        d['prediction'] = pred
-        d['correct'] = metric(pred, example.answer)
+        d["prediction"] = pred
+        d["correct"] = metric(pred, example.answer)
         data.append(d)
 
     df = pd.DataFrame(data)
 
-    percentage = round(100.0 * df['correct'].sum() / len(dev), 1)
+    percentage = round(100.0 * df["correct"].sum() / len(dev), 1)
     print(f"Answered {df['correct'].sum()} / {len(dev)} ({percentage}%) correctly.")
-    df['correct'] = df['correct'].apply(lambda x: '✔️' if x else '❌')
+    df["correct"] = df["correct"].apply(lambda x: "✔️" if x else "❌")
 
     pd.options.display.max_colwidth = None
-    display(df.style.set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}, {'selector': 'td', 'props': [('text-align', 'left')]}]))
+    display(
+        df.style.set_table_styles(
+            [
+                {"selector": "th", "props": [("text-align", "left")]},
+                {"selector": "td", "props": [("text-align", "left")]},
+            ]
+        )
+    )
 
     return percentage
-
-
diff --git a/dsp/modules/__init__.py b/dsp/modules/__init__.py
index c07bc8fb02..dc226b144e 100644
--- a/dsp/modules/__init__.py
+++ b/dsp/modules/__init__.py
@@ -8,4 +8,4 @@
 from .pyserini import *
 
 from .hf_client import HFClientTGI
-from .hf_client import Anyscale
\ No newline at end of file
+from .hf_client import Anyscale
diff --git a/dsp/modules/azurecognitivesearch.py b/dsp/modules/azurecognitivesearch.py
index 0d68c3cbee..ee5e11c5ac 100644
--- a/dsp/modules/azurecognitivesearch.py
+++ b/dsp/modules/azurecognitivesearch.py
@@ -1,16 +1,18 @@
-from typing import Optional, Union, Any
+from typing import Union, Any
 
 from dsp.utils import dotdict
+
 try:
     from azure.core.credentials import AzureKeyCredential
     from azure.search.documents import SearchClient
     from azure.search.documents._paging import SearchItemPaged
-except ImportError as e:
+except ImportError:
     raise ImportError(
         "You need to install azure-search-documents library"
         "Please use the command: pip install azure-search-documents"
     )
 
+
 class AzureCognitiveSearch:
     """Wrapper for the Azure Congitive Search Retrieval."""
 
@@ -19,51 +21,59 @@ def __init__(
         search_service_name: str,
         search_api_key: str,
         search_index_name: str,
-        field_text: str, # required field to map with "content" field in dsp framework
-        field_score: str, # required field to map with "score" field in dsp framework
-
+        field_text: str,  # required field to map with "content" field in dsp framework
+        field_score: str,  # required field to map with "score" field in dsp framework
     ):
         self.search_service_name = search_service_name
         self.search_api_key = search_api_key
         self.search_index_name = search_index_name
-        self.endpoint=f"https://{self.search_service_name}.search.windows.net"
-        self.field_text = field_text # field name of the text content
-        self.field_score = field_score # field name of the search score
+        self.endpoint = f"https://{self.search_service_name}.search.windows.net"
+        self.field_text = field_text  # field name of the text content
+        self.field_score = field_score  # field name of the search score
         # Create a client
         self.credential = AzureKeyCredential(self.search_api_key)
-        self.client = SearchClient(endpoint=self.endpoint,
-                        index_name=self.search_index_name,
-                        credential=self.credential)
+        self.client = SearchClient(
+            endpoint=self.endpoint,
+            index_name=self.search_index_name,
+            credential=self.credential,
+        )
 
     def __call__(self, query: str, k: int = 10) -> Union[list[str], list[dotdict]]:
-        
-        topk: list[dict[str, Any]] = azure_search_request(self.field_text, self.field_score, self.client, query, k)
-        topk = [{**d, "long_text": d["text"]} for d in topk]            
+        topk: list[dict[str, Any]] = azure_search_request(
+            self.field_text, self.field_score, self.client, query, k
+        )
+        topk = [{**d, "long_text": d["text"]} for d in topk]
 
         return [dotdict(psg) for psg in topk]
 
-def azure_search_request(key_content: str, key_score: str,  client: SearchClient, query: str, top: int =1):
-    '''
+
+def azure_search_request(
+    key_content: str, key_score: str, client: SearchClient, query: str, top: int = 1
+):
+    """
     Search in Azure Conginitve Search Index
-    '''
-    results = client.search(search_text=query,top=top)
+    """
+    results = client.search(search_text=query, top=top)
-    results = process_azure_result(results, key_content, key_content)
+    results = process_azure_result(results, key_content, key_score)
 
     return results
 
-def process_azure_result(results:SearchItemPaged, content_key:str, content_score: str):
-    '''
+
+def process_azure_result(
+    results: SearchItemPaged, content_key: str, content_score: str
+):
+    """
     process received result from Azure cognitive search as dictionary array and map content and score to correct format
-    '''
+    """
     res = []
     for result in results:
         tmp = {}
         for key, value in result.items():
-            if(key == content_key):
-                tmp["text"] = value # assign content
-            elif(key == content_score):
+            if key == content_key:
+                tmp["text"] = value  # assign content
+            elif key == content_score:
                 tmp["score"] = value
             else:
-                tmp[key] = value            
+                tmp[key] = value
         res.append(tmp)
-    return res 
+    return res
diff --git a/dsp/modules/cache_utils.py b/dsp/modules/cache_utils.py
index 78270c879a..8a7387417a 100644
--- a/dsp/modules/cache_utils.py
+++ b/dsp/modules/cache_utils.py
@@ -24,10 +24,12 @@ def wrapper(*args, **kwargs):
         return decorator
 
 
-cachedir = os.environ.get('DSP_CACHEDIR') or os.path.join(Path.home(), 'cachedir_joblib')
+cachedir = os.environ.get("DSP_CACHEDIR") or os.path.join(
+    Path.home(), "cachedir_joblib"
+)
 CacheMemory = Memory(location=cachedir, verbose=0)
 
-cachedir2 = os.environ.get('DSP_NOTEBOOK_CACHEDIR')
+cachedir2 = os.environ.get("DSP_NOTEBOOK_CACHEDIR")
 NotebookCacheMemory = dotdict()
 NotebookCacheMemory.cache = noop_decorator
 
diff --git a/dsp/modules/cohere.py b/dsp/modules/cohere.py
index 789704cac8..fc6e0a04fc 100644
--- a/dsp/modules/cohere.py
+++ b/dsp/modules/cohere.py
@@ -6,6 +6,7 @@
 
 try:
     import cohere
+
     cohere_api_error = cohere.CohereAPIError
 except ImportError:
     cohere_api_error = Exception
@@ -106,7 +107,7 @@ def __call__(
         prompt: str,
         only_completed: bool = True,
         return_sorted: bool = False,
-        **kwargs
+        **kwargs,
     ):
         assert only_completed, "for now"
         assert return_sorted is False, "for now"
diff --git a/dsp/modules/finetuning/__init__.py b/dsp/modules/finetuning/__init__.py
index f0ef12e8de..9be81e1ee5 100644
--- a/dsp/modules/finetuning/__init__.py
+++ b/dsp/modules/finetuning/__init__.py
@@ -1 +1 @@
-from .finetune_hf import *
\ No newline at end of file
+from .finetune_hf import *
diff --git a/dsp/modules/finetuning/finetune_hf.py b/dsp/modules/finetuning/finetune_hf.py
index e899c4c292..76fb1ec3f9 100644
--- a/dsp/modules/finetuning/finetune_hf.py
+++ b/dsp/modules/finetuning/finetune_hf.py
@@ -5,7 +5,6 @@
 import copy
 import glob
 import torch
-import random
 import warnings
 import evaluate
 import numpy as np
@@ -24,7 +23,10 @@
     Seq2SeqTrainingArguments,
     DataCollatorForSeq2Seq,
 )
-# from peft import get_peft_model, LoraConfig, TaskType
+
+from peft.utils.peft_types import TaskType
+from peft.mapping import get_peft_model
+from peft.tuners.lora import LoraConfig
 from transformers.trainer_callback import TrainerCallback
 
 # from dsp.modules.finetuning.fid import *
@@ -57,12 +59,12 @@ def _freeze_model_layers(model, unfreeze_last_n):
     for i, m in enumerate(model.transformer.h):
         if i >= NUM_DECODER_LAYERS - unfreeze_last_n:
             for parameter in m.parameters():
-                parameter.requires_grad = True 
+                parameter.requires_grad = True
 
     # Unfreeze parameters after decoder block
     for parameter in model.transformer.ln_f.parameters():
         parameter.requires_grad = True
-    for parameter in model.lm_head.parameters():        
+    for parameter in model.lm_head.parameters():
         parameter.requires_grad = True
     return model
 
@@ -71,6 +73,7 @@ def _load_data(path):
     # dataset = Dataset.from_json(path)
     L = []
     import ujson
+
     with open(path) as f:
         for line in f:
             L.append(ujson.loads(line))
@@ -79,23 +82,43 @@ def _load_data(path):
     return dataset
 
 
-def preprocess_prompt(text, tokenizer, encoder_decoder_model, decoder_only_model, rationale):
-    text = f'{text} ' if encoder_decoder_model else f'{text} {tokenizer.sep_token}'
+def preprocess_prompt(
+    text, tokenizer, encoder_decoder_model, decoder_only_model, rationale
+):
+    text = f"{text} " if encoder_decoder_model else f"{text} {tokenizer.sep_token}"
     return text
 
 
-def preprocess_completion(text, tokenizer, encoder_decoder_model, decoder_only_model, rationale):
-    text = text if encoder_decoder_model else f'{text}{tokenizer.eos_token}'
+def preprocess_completion(
+    text, tokenizer, encoder_decoder_model, decoder_only_model, rationale
+):
+    text = text if encoder_decoder_model else f"{text}{tokenizer.eos_token}"
     return text.lstrip()
 
 
-def _preprocess_data(dataset, tokenizer, encoder_decoder_model, decoder_only_model, config):
-    dataset = dataset.map(lambda x: {
-        "prompt": preprocess_prompt(x["prompt"], tokenizer, encoder_decoder_model, decoder_only_model, config['rationale']),
-        "completion": preprocess_completion(x["completion"], tokenizer, encoder_decoder_model, decoder_only_model, config['rationale']),
-    })
+def _preprocess_data(
+    dataset, tokenizer, encoder_decoder_model, decoder_only_model, config
+):
+    dataset = dataset.map(
+        lambda x: {
+            "prompt": preprocess_prompt(
+                x["prompt"],
+                tokenizer,
+                encoder_decoder_model,
+                decoder_only_model,
+                config["rationale"],
+            ),
+            "completion": preprocess_completion(
+                x["completion"],
+                tokenizer,
+                encoder_decoder_model,
+                decoder_only_model,
+                config["rationale"],
+            ),
+        }
+    )
     skipped = [x for x in dataset if x["completion"] is None]
-    print(f'# examples skipped due to parsing error: {len(skipped)} / {len(dataset)}')
+    print(f"# examples skipped due to parsing error: {len(skipped)} / {len(dataset)}")
     dataset = dataset.filter(lambda x: x["completion"])
     return dataset
 
@@ -106,24 +129,41 @@ def get_dataset_stats(dataset, tokenizer, column):
         max_length = max([len(x) for x in tokenized_inputs["input_ids"]])
         return max_length
 
-    def get_tokens_seq2seq(sample, max_source_length, max_target_length, padding="max_length"):
+    def get_tokens_seq2seq(
+        sample, max_source_length, max_target_length, padding="max_length"
+    ):
         # Tokenize inputs
-        model_inputs = tokenizer(sample["prompt"], max_length=max_source_length, padding=padding, truncation=True)
+        model_inputs = tokenizer(
+            sample["prompt"],
+            max_length=max_source_length,
+            padding=padding,
+            truncation=True,
+        )
 
         # Tokenize targets
-        labels = tokenizer(text_target=sample["completion"], max_length=max_target_length, padding=padding, truncation=True)
+        labels = tokenizer(
+            text_target=sample["completion"],
+            max_length=max_target_length,
+            padding=padding,
+            truncation=True,
+        )
         labels = labels["input_ids"]
 
         # Replace all tokenizer.pad_token_id in the labels by IGNORE_INDEX when we want to ignore padding in the loss.
         if padding == "max_length":
-            labels = [[(l if l != tokenizer.pad_token_id else IGNORE_INDEX) for l in label] for label in labels]
+            labels = [
+                [(L if L != tokenizer.pad_token_id else IGNORE_INDEX) for L in label]
+                for label in labels
+            ]
 
         model_inputs["labels"] = labels
         return model_inputs
 
     def get_tokens_causal(sample, max_length, padding="max_length"):
         # Tokenize inputs
-        model_inputs = tokenizer(sample["combined"], max_length=max_length, padding=padding, truncation=True)
+        model_inputs = tokenizer(
+            sample["combined"], max_length=max_length, padding=padding, truncation=True
+        )
 
         # Create targets
         labels = copy.deepcopy(model_inputs["input_ids"])
@@ -134,7 +174,10 @@ def get_tokens_causal(sample, max_length, padding="max_length"):
 
         # Replace all tokenizer.pad_token_id in the labels by IGNORE_INDEX when we want to ignore padding in the loss.
         if padding == "max_length":
-            labels = [[(l if l != tokenizer.pad_token_id else IGNORE_INDEX) for l in label] for label in labels]
+            labels = [
+                [(L if L != tokenizer.pad_token_id else IGNORE_INDEX) for L in label]
+                for label in labels
+            ]
 
         model_inputs["labels"] = labels
         return model_inputs
@@ -142,15 +185,29 @@ def get_tokens_causal(sample, max_length, padding="max_length"):
     if encoder_decoder_model:
         max_source_length = get_dataset_stats(dataset, tokenizer, "prompt")
         max_target_length = get_dataset_stats(dataset, tokenizer, "completion")
-        kwargs = {"max_source_length" : max_source_length, "max_target_length" : max_target_length}
-        tokenized_dataset = dataset.map(get_tokens_seq2seq, batched=True, fn_kwargs=kwargs)
+        kwargs = {
+            "max_source_length": max_source_length,
+            "max_target_length": max_target_length,
+        }
+        tokenized_dataset = dataset.map(
+            get_tokens_seq2seq, batched=True, fn_kwargs=kwargs
+        )
 
     elif decoder_only_model:
-        dataset = dataset.map(lambda example: {"combined": example["prompt"] + " " + example["completion"]})
-        dataset = dataset.filter(lambda x: len(tokenizer(x["combined"])["input_ids"]) <= tokenizer.model_max_length)
+        dataset = dataset.map(
+            lambda example: {
+                "combined": example["prompt"] + " " + example["completion"]
+            }
+        )
+        dataset = dataset.filter(
+            lambda x: len(tokenizer(x["combined"])["input_ids"])
+            <= tokenizer.model_max_length
+        )
         max_length = get_dataset_stats(dataset, tokenizer, "combined")
-        kwargs = {"max_length" : max_length}
-        tokenized_dataset = dataset.map(get_tokens_causal, batched=True, fn_kwargs=kwargs)
+        kwargs = {"max_length": max_length}
+        tokenized_dataset = dataset.map(
+            get_tokens_causal, batched=True, fn_kwargs=kwargs
+        )
 
     print(f"Dataset statistics: {kwargs}")
     print(f"Keys of tokenized dataset: {list(tokenized_dataset.features)}")
@@ -167,9 +224,13 @@ def _compute_metrics(metric, eval_preds, tokenizer):
     labels = np.where(labels != IGNORE_INDEX, labels, tokenizer.pad_token_id)
     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 
-    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+    result = metric.compute(
+        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
+    )
     result = {k: round(v * 100, 4) for k, v in result.items()}
-    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
+    prediction_lens = [
+        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
+    ]
     result["gen_len"] = np.mean(prediction_lens)
     return result
 
@@ -179,7 +240,9 @@ def on_train_end(self, args, state, control, **kwargs):
         peft_model_path = state.best_model_checkpoint
         kwargs["model"].save_pretrained(peft_model_path)
 
-        pytorch_model_path = os.path.join(state.best_model_checkpoint, "pytorch_model.bin")
+        pytorch_model_path = os.path.join(
+            state.best_model_checkpoint, "pytorch_model.bin"
+        )
         os.remove(pytorch_model_path) if os.path.exists(pytorch_model_path) else None
 
 
@@ -189,13 +252,13 @@ def _train_seq2seq(model, tokenizer, tokenized_dataset, metric, config):
 
     # Define training args
     training_args = Seq2SeqTrainingArguments(
-        output_dir=config['output_dir'],
-        per_device_train_batch_size=config['batch_size'],
-        gradient_accumulation_steps=config['gradient_accumulation_steps'],
-        per_device_eval_batch_size=config['batch_size'],
+        output_dir=config["output_dir"],
+        per_device_train_batch_size=config["batch_size"],
+        gradient_accumulation_steps=config["gradient_accumulation_steps"],
+        per_device_eval_batch_size=config["batch_size"],
         predict_with_generate=True,
-        learning_rate=config['lr'], #1e-4, # 5e-5
-        num_train_epochs=config['epochs'],
+        learning_rate=config["lr"],  # 1e-4, # 5e-5
+        num_train_epochs=config["epochs"],
         # logging & evaluation strategies
         log_level="error",
         logging_dir=f"{config['output_dir']}/logs",
@@ -203,11 +266,11 @@ def _train_seq2seq(model, tokenizer, tokenized_dataset, metric, config):
         logging_steps=500,
         evaluation_strategy="epoch",
         save_strategy="epoch",
-        save_total_limit=config['epochs'],
+        save_total_limit=config["epochs"],
         load_best_model_at_end=True,
         report_to="tensorboard",
-        fp16=config['fp16'],
-        bf16=config['bf16'],
+        fp16=config["fp16"],
+        bf16=config["bf16"],
     )
 
     # Create trainer instance
@@ -219,7 +282,7 @@ def _train_seq2seq(model, tokenizer, tokenized_dataset, metric, config):
         eval_dataset=tokenized_dataset["test"],
         data_collator=data_collator,
         compute_metrics=lambda x: _compute_metrics(metric, x, tokenizer),
-        callbacks=[PeftSavingCallback] if config['peft'] else None,
+        callbacks=[PeftSavingCallback] if config["peft"] else None,
     )
 
     trainer.train()
@@ -239,8 +302,12 @@ def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model):
         input_embeddings = model.get_input_embeddings().weight.data
         output_embeddings = model.get_output_embeddings().weight.data
 
-        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
-        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
+            dim=0, keepdim=True
+        )
+        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
+            dim=0, keepdim=True
+        )
 
         input_embeddings[-num_new_tokens:] = input_embeddings_avg
         output_embeddings[-num_new_tokens:] = output_embeddings_avg
@@ -251,18 +318,29 @@ class DataCollatorForSupervisedDataset(object):
     """
     Collate examples for supervised fine-tuning.
     """
+
     tokenizer: PreTrainedTokenizer
 
     def __call__(self, instances):
         pad_token_id = self.tokenizer.pad_token_id
 
-        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
+        input_ids, labels = tuple(
+            [instance[key] for instance in instances] for key in ("input_ids", "labels")
+        )
         input_ids, labels = torch.tensor(input_ids), torch.tensor(labels)
 
-        input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            input_ids, batch_first=True, padding_value=pad_token_id
+        )
 
-        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
-        return dict(input_ids=input_ids, labels=labels, attention_mask=input_ids.ne(pad_token_id))
+        labels = torch.nn.utils.rnn.pad_sequence(
+            labels, batch_first=True, padding_value=IGNORE_INDEX
+        )
+        return dict(
+            input_ids=input_ids,
+            labels=labels,
+            attention_mask=input_ids.ne(pad_token_id),
+        )
 
 
 def _train_causal(model, tokenizer, tokenized_dataset, metric, config):
@@ -271,12 +349,12 @@ def _train_causal(model, tokenizer, tokenized_dataset, metric, config):
 
     # Define training args
     training_args = TrainingArguments(
-        output_dir=config['output_dir'],
-        per_device_train_batch_size=config['batch_size'],
-        gradient_accumulation_steps=config['gradient_accumulation_steps'],
-        per_device_eval_batch_size=config['batch_size'],
-        learning_rate=config['lr'], #1e-4,# 5e-5
-        num_train_epochs=config['epochs'],
+        output_dir=config["output_dir"],
+        per_device_train_batch_size=config["batch_size"],
+        gradient_accumulation_steps=config["gradient_accumulation_steps"],
+        per_device_eval_batch_size=config["batch_size"],
+        learning_rate=config["lr"],  # 1e-4,# 5e-5
+        num_train_epochs=config["epochs"],
         # logging & evaluation strategies
         log_level="error",
         logging_dir=f"{config['output_dir']}/logs",
@@ -284,11 +362,11 @@ def _train_causal(model, tokenizer, tokenized_dataset, metric, config):
         logging_steps=500,
         evaluation_strategy="epoch",
         save_strategy="epoch",
-        save_total_limit=config['epochs'],
+        save_total_limit=config["epochs"],
         load_best_model_at_end=True,
         report_to="tensorboard",
-        fp16=config['fp16'],
-        bf16=config['bf16'],
+        fp16=config["fp16"],
+        bf16=config["bf16"],
     )
 
     # Create trainer instance
@@ -299,7 +377,7 @@ def _train_causal(model, tokenizer, tokenized_dataset, metric, config):
         train_dataset=tokenized_dataset["train"],
         eval_dataset=tokenized_dataset["test"],
         data_collator=data_collator,
-        callbacks=[PeftSavingCallback] if config['peft'] else None,
+        callbacks=[PeftSavingCallback] if config["peft"] else None,
     )
 
     trainer.train()
@@ -310,43 +388,65 @@ def _train_causal(model, tokenizer, tokenized_dataset, metric, config):
 def finetune_hf(data_path, target, config):
     set_seed(42)
 
-    output_dir = os.path.join('../finetuning_ckpts', config['save'])
+    output_dir = os.path.join("../finetuning_ckpts", config["save"])
 
     if os.path.exists(output_dir):
         # training completed, load best model
-        ckpts = glob.glob(f'{output_dir}/checkpoint*')
-        final_ckpt = sorted(ckpts, key=lambda x: int(x.split('-')[-1]))[-1]
-        with open(os.path.join(final_ckpt, 'trainer_state.json'), 'r') as f:
+        ckpts = glob.glob(f"{output_dir}/checkpoint*")
+        final_ckpt = sorted(ckpts, key=lambda x: int(x.split("-")[-1]))[-1]
+        with open(os.path.join(final_ckpt, "trainer_state.json"), "r") as f:
             state = json.load(f)
-        best_model_checkpoint = state['best_model_checkpoint']
+        best_model_checkpoint = state["best_model_checkpoint"]
 
     else:
         os.makedirs(output_dir, exist_ok=True)
-        config['target'] = target
-        config['output_dir'] = output_dir
-        with open(os.path.join(config['output_dir'], 'compiler_config.json'), 'w') as f:
+        config["target"] = target
+        config["output_dir"] = output_dir
+        with open(os.path.join(config["output_dir"], "compiler_config.json"), "w") as f:
             json.dump(config, f)
 
         architecture = AutoConfig.from_pretrained(target).__dict__["architectures"][0]
-        encoder_decoder_model = ("ConditionalGeneration" in architecture) or ("T5WithLMHeadModel" in architecture)
-        decoder_only_model = ("CausalLM" in architecture) or ("GPT2LMHeadModel" in architecture)
-        assert encoder_decoder_model or decoder_only_model, f"Unknown HuggingFace model class: {target}"
-        assert not config['fid'] or encoder_decoder_model, f"Model must be encoder-decoder for Fusion in Decoder"
-        assert not config['fid'] or not config['peft'], f"FiD and PEFT can't be trained together"
+        encoder_decoder_model = ("ConditionalGeneration" in architecture) or (
+            "T5WithLMHeadModel" in architecture
+        )
+        decoder_only_model = ("CausalLM" in architecture) or (
+            "GPT2LMHeadModel" in architecture
+        )
+        assert (
+            encoder_decoder_model or decoder_only_model
+        ), f"Unknown HuggingFace model class: {target}"
+        assert (
+            not config["fid"] or encoder_decoder_model
+        ), "Model must be encoder-decoder for Fusion in Decoder"
+        assert (
+            not config["fid"] or not config["peft"]
+        ), "FiD and PEFT can't be trained together"
 
         # load model
-        AutoModelClass = AutoModelForSeq2SeqLM if encoder_decoder_model else AutoModelForCausalLM
-        if config['peft']:
-            model = AutoModelClass.from_pretrained(target, device_map='auto')
-            task_type = TaskType.SEQ_2_SEQ_LM if encoder_decoder_model else TaskType.CAUSAL_LM
-            peft_config = LoraConfig(task_type=task_type, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
+        AutoModelClass = (
+            AutoModelForSeq2SeqLM if encoder_decoder_model else AutoModelForCausalLM
+        )
+        if config["peft"]:
+            model = AutoModelClass.from_pretrained(target, device_map="auto")
+            task_type = (
+                TaskType.SEQ_2_SEQ_LM if encoder_decoder_model else TaskType.CAUSAL_LM
+            )
+            peft_config = LoraConfig(
+                task_type=task_type,
+                inference_mode=False,
+                r=8,
+                lora_alpha=32,
+                lora_dropout=0.1,
+            )
             model = get_peft_model(model, peft_config)
             model.print_trainable_parameters()
         else:
-            if config['fid']:
-                t5 = AutoModelClass.from_pretrained(target)
-                model = FiDT5(t5.config)
-                model.load_t5(t5.state_dict())
+            if config["fid"]:
+                # dsp.modules.finetuning.fid no longer exists
+                raise NotImplementedError()
+                # t5 = AutoModelClass.from_pretrained(target)
+                # model = FiDT5(t5.config)
+                # model.load_t5(t5.state_dict())
             else:
                 model = AutoModelClass.from_pretrained(target)
                 # model = _freeze_model_layers(model, unfreeze_last_n=2)
@@ -358,17 +458,25 @@ def finetune_hf(data_path, target, config):
 
         # load data
         dataset = _load_data(data_path)
-        dataset = _preprocess_data(dataset, tokenizer, encoder_decoder_model, decoder_only_model, config)
-        tokenized_dataset = _tokenize_dataset(dataset, tokenizer, encoder_decoder_model, decoder_only_model)
+        dataset = _preprocess_data(
+            dataset, tokenizer, encoder_decoder_model, decoder_only_model, config
+        )
+        tokenized_dataset = _tokenize_dataset(
+            dataset, tokenizer, encoder_decoder_model, decoder_only_model
+        )
         tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
-        print(f'Finetuning dataset: {tokenized_dataset}')
+        print(f"Finetuning dataset: {tokenized_dataset}")
 
         # start training
         metric = evaluate.load("rouge")
         if encoder_decoder_model:
-            best_model_checkpoint = _train_seq2seq(model, tokenizer, tokenized_dataset, metric, config)
+            best_model_checkpoint = _train_seq2seq(
+                model, tokenizer, tokenized_dataset, metric, config
+            )
         elif decoder_only_model:
-            best_model_checkpoint = _train_causal(model, tokenizer, tokenized_dataset, metric, config)
+            best_model_checkpoint = _train_causal(
+                model, tokenizer, tokenized_dataset, metric, config
+            )
 
-    print(f'Best checkpoint of model: {best_model_checkpoint}')
+    print(f"Best checkpoint of model: {best_model_checkpoint}")
     return best_model_checkpoint
diff --git a/dsp/modules/gpt3.py b/dsp/modules/gpt3.py
index f0c7b6679f..732f48a871 100644
--- a/dsp/modules/gpt3.py
+++ b/dsp/modules/gpt3.py
@@ -42,7 +42,12 @@ def __init__(
         super().__init__(model)
         self.provider = "openai"
 
-        default_model_type = "chat" if ('gpt-3.5' in model or 'turbo' in model or 'gpt-4' in model) and ('instruct' not in model) else "text"
+        default_model_type = (
+            "chat"
+            if ("gpt-3.5" in model or "turbo" in model or "gpt-4" in model)
+            and ("instruct" not in model)
+            else "text"
+        )
         self.model_type = model_type if model_type else default_model_type
 
         if api_provider == "azure":
@@ -70,7 +75,7 @@ def __init__(
             "n": 1,
             **kwargs,
         }  # TODO: add kwargs above for </s>
-        
+
         if api_provider != "azure":
             self.kwargs["model"] = model
         self.history: list[dict[str, Any]] = []
@@ -85,11 +90,9 @@ def basic_request(self, prompt: str, **kwargs) -> OpenAIObject:
         if self.model_type == "chat":
             # caching mechanism requires hashable kwargs
             kwargs["messages"] = [{"role": "user", "content": prompt}]
-            kwargs = {
-                "stringify_request": json.dumps(kwargs)
-            }
+            kwargs = {"stringify_request": json.dumps(kwargs)}
             response = cached_gpt3_turbo_request(**kwargs)
-            
+
         else:
             kwargs["prompt"] = prompt
             response = cached_gpt3_request(**kwargs)
@@ -114,7 +117,7 @@ def request(self, prompt: str, **kwargs) -> OpenAIObject:
         """Handles retreival of GPT-3 completions whilst handling rate limiting and caching."""
         if "model_type" in kwargs:
             del kwargs["model_type"]
-        
+
         return self.basic_request(prompt, **kwargs)
 
     def _get_choice_text(self, choice: dict[str, Any]) -> str:
diff --git a/dsp/modules/hf.py b/dsp/modules/hf.py
index 79cdffeb1c..6353764bbd 100644
--- a/dsp/modules/hf.py
+++ b/dsp/modules/hf.py
@@ -1,13 +1,10 @@
-import os
-import json
 # from peft import PeftConfig, PeftModel
 # from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, AutoConfig
 from typing import Optional, Literal
 
 from dsp.modules.lm import LM
 # from dsp.modules.finetuning.finetune_hf import preprocess_prompt
-from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on
-import functools
+
 
 def openai_to_hf(**kwargs):
     hf_kwargs = {}
@@ -29,19 +26,31 @@ def openai_to_hf(**kwargs):
 
 
 class HFModel(LM):
-    def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool = False,
-                 hf_device_map: Literal["auto", "balanced", "balanced_low_0", "sequential"] = "auto"):
+    def __init__(
+        self,
+        model: str,
+        checkpoint: Optional[str] = None,
+        is_client: bool = False,
+        hf_device_map: Literal[
+            "auto", "balanced", "balanced_low_0", "sequential"
+        ] = "auto",
+    ):
         """wrapper for Hugging Face models
 
         Args:
             model (str): HF model identifier to load and use
             checkpoint (str, optional): load specific checkpoints of the model. Defaults to None.
             is_client (bool, optional): whether to access models via client. Defaults to False.
-            hf_device_map (str, optional): HF config strategy to load the model. 
+            hf_device_map (str, optional): HF config strategy to load the model.
                 Recommeded to use "auto", which will help loading large models using accelerate. Defaults to "auto".
         """
         try:
-            from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, AutoConfig
+            from transformers import (
+                AutoModelForSeq2SeqLM,
+                AutoModelForCausalLM,
+                AutoTokenizer,
+                AutoConfig,
+            )
             import torch
         except ImportError as exc:
             raise ModuleNotFoundError(
@@ -54,18 +63,32 @@ def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         if not self.is_client:
             try:
-                architecture = AutoConfig.from_pretrained(model).__dict__["architectures"][0]
-                self.encoder_decoder_model = ("ConditionalGeneration" in architecture) or ("T5WithLMHeadModel" in architecture)
-                self.decoder_only_model = ("CausalLM" in architecture) or ("GPT2LMHeadModel" in architecture)
-                assert self.encoder_decoder_model or self.decoder_only_model, f"Unknown HuggingFace model class: {model}"
-                self.tokenizer = AutoTokenizer.from_pretrained(model if checkpoint is None else checkpoint)
+                architecture = AutoConfig.from_pretrained(model).__dict__[
+                    "architectures"
+                ][0]
+                self.encoder_decoder_model = (
+                    "ConditionalGeneration" in architecture
+                ) or ("T5WithLMHeadModel" in architecture)
+                self.decoder_only_model = ("CausalLM" in architecture) or (
+                    "GPT2LMHeadModel" in architecture
+                )
+                assert (
+                    self.encoder_decoder_model or self.decoder_only_model
+                ), f"Unknown HuggingFace model class: {model}"
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    model if checkpoint is None else checkpoint
+                )
 
                 self.rationale = True
-                AutoModelClass = AutoModelForSeq2SeqLM if self.encoder_decoder_model else AutoModelForCausalLM
+                AutoModelClass = (
+                    AutoModelForSeq2SeqLM
+                    if self.encoder_decoder_model
+                    else AutoModelForCausalLM
+                )
                 if checkpoint:
                     # with open(os.path.join(checkpoint, '..', 'compiler_config.json'), 'r') as f:
                     #     config = json.load(f)
-                    self.rationale = False #config['rationale']
+                    self.rationale = False  # config['rationale']
                     # if config['peft']:
                     #     peft_config = PeftConfig.from_pretrained(checkpoint)
                     #     self.model = AutoModelClass.from_pretrained(peft_config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map=hf_device_map)
@@ -78,7 +101,7 @@ def __init__(self, model: str, checkpoint: Optional[str] = None, is_client: bool
             except ValueError:
                 self.model = AutoModelForCausalLM.from_pretrained(
                     model if checkpoint is None else checkpoint,
-                    device_map=hf_device_map
+                    device_map=hf_device_map,
                 )
                 self.drop_prompt_from_output = True
                 self.tokenizer = AutoTokenizer.from_pretrained(model)
@@ -107,7 +130,7 @@ def _generate(self, prompt, **kwargs):
         # print(prompt)
         if isinstance(prompt, dict):
             try:
-                prompt = prompt['messages'][0]['content']
+                prompt = prompt["messages"][0]["content"]
             except (KeyError, IndexError, TypeError):
                 print("Failed to extract 'content' from the prompt.")
         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
diff --git a/dsp/modules/hf_client.py b/dsp/modules/hf_client.py
index 003771f106..ad9a8d8229 100644
--- a/dsp/modules/hf_client.py
+++ b/dsp/modules/hf_client.py
@@ -1,20 +1,19 @@
-import functools
 import os
 import random
 import requests
 from dsp.modules.hf import HFModel, openai_to_hf
-from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory, cache_turn_on
-import os
+from dsp.modules.cache_utils import CacheMemory, NotebookCacheMemory
 import subprocess
 import re
 import shutil
-import time
 
 # from dsp.modules.adapter import TurboAdapter, DavinciAdapter, LlamaAdapter
 
 
 class HFClientTGI(HFModel):
-    def __init__(self, model, port, url="http://future-hgx-1", http_request_kwargs=None, **kwargs):
+    def __init__(
+        self, model, port, url="http://future-hgx-1", http_request_kwargs=None, **kwargs
+    ):
         super().__init__(model=model, is_client=True)
 
         self.url = url
@@ -38,15 +37,15 @@ def _generate(self, prompt, **kwargs):
         kwargs = {**self.kwargs, **kwargs}
 
         payload = {
-        "inputs": prompt,
-        "parameters": {
-            "do_sample": kwargs["n"] > 1,
-            "best_of": kwargs["n"],
-            "details": kwargs["n"] > 1,
-            # "max_new_tokens": kwargs.get('max_tokens', kwargs.get('max_new_tokens', 75)),
-            # "stop": ["\n", "\n\n"],
-            **kwargs,
-            }
+            "inputs": prompt,
+            "parameters": {
+                "do_sample": kwargs["n"] > 1,
+                "best_of": kwargs["n"],
+                "details": kwargs["n"] > 1,
+                # "max_new_tokens": kwargs.get('max_tokens', kwargs.get('max_new_tokens', 75)),
+                # "stop": ["\n", "\n\n"],
+                **kwargs,
+            },
         }
 
         payload["parameters"] = openai_to_hf(**payload["parameters"])
@@ -85,17 +84,18 @@ def _generate(self, prompt, **kwargs):
 
             response = {"prompt": prompt, "choices": [{"text": c} for c in completions]}
             return response
-        except Exception as e:
+        except Exception:
             print("Failed to parse JSON response:", response.text)
             raise Exception("Received invalid JSON response from server")
 
 
-@CacheMemory.cache(ignore=['arg'])
+@CacheMemory.cache(ignore=["arg"])
 def send_hftgi_request_v01(arg, url, ports, **kwargs):
     return requests.post(arg, **kwargs)
 
+
 # @functools.lru_cache(maxsize=None if cache_turn_on else 0)
-@NotebookCacheMemory.cache(ignore=['arg'])
+@NotebookCacheMemory.cache(ignore=["arg"])
 def send_hftgi_request_v01_wrapped(arg, url, ports, **kwargs):
     return send_hftgi_request_v01(arg, url, ports, **kwargs)
 
@@ -104,37 +104,68 @@ def send_hftgi_request_v01_wrapped(arg, url, ports, **kwargs):
 def send_hftgi_request_v00(arg, **kwargs):
     return requests.post(arg, **kwargs)
 
+
 class HFServerTGI:
     def __init__(self, user_dir):
-        self.model_weights_dir = os.path.abspath(os.path.join(os.getcwd(), "text-generation-inference", user_dir))
+        self.model_weights_dir = os.path.abspath(
+            os.path.join(os.getcwd(), "text-generation-inference", user_dir)
+        )
         if not os.path.exists(self.model_weights_dir):
             os.makedirs(self.model_weights_dir)
 
     def close_server(self, port):
-        process = subprocess.Popen(['docker', 'ps'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        process = subprocess.Popen(
+            ["docker", "ps"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
         stdout, _ = process.communicate()
         print(stdout)
         if stdout:
-            container_ids = stdout.decode().strip().split('\n')
+            container_ids = stdout.decode().strip().split("\n")
             container_ids = container_ids[1:]
             for container_id in container_ids:
-                match = re.search(r'^([a-zA-Z0-9]+)', container_id)
+                match = re.search(r"^([a-zA-Z0-9]+)", container_id)
                 if match:
                     container_id = match.group(1)
-                    port_mapping = subprocess.check_output(['docker', 'port', container_id]).decode().strip()
-                    if f'0.0.0.0:{port}' in port_mapping:
-                        subprocess.run(['docker', 'stop', container_id])
-
-    def run_server(self, port, model_name=None, model_path=None, env_variable=None, gpus="all", num_shard=1, max_input_length=4000, max_total_tokens=4096, max_best_of=100):        
+                    port_mapping = (
+                        subprocess.check_output(["docker", "port", container_id])
+                        .decode()
+                        .strip()
+                    )
+                    if f"0.0.0.0:{port}" in port_mapping:
+                        subprocess.run(["docker", "stop", container_id])
+
+    def run_server(
+        self,
+        port,
+        model_name=None,
+        model_path=None,
+        env_variable=None,
+        gpus="all",
+        num_shard=1,
+        max_input_length=4000,
+        max_total_tokens=4096,
+        max_best_of=100,
+    ):
         self.close_server(port)
         if model_path:
             model_file_name = os.path.basename(model_path)
             link_path = os.path.join(self.model_weights_dir, model_file_name)
             shutil.copytree(model_path, link_path)
-            model_name = os.path.sep + os.path.basename(self.model_weights_dir) + os.path.sep + os.path.basename(model_path)
-        docker_command = f'docker run --gpus {gpus} --shm-size 1g -p {port}:80 -v {self.model_weights_dir}:{os.path.sep + os.path.basename(self.model_weights_dir)} -e {env_variable} ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id {model_name} --num-shard {num_shard} --max-input-length {max_input_length} --max-total-tokens {max_total_tokens} --max-best-of {max_best_of}'
+            model_name = (
+                os.path.sep
+                + os.path.basename(self.model_weights_dir)
+                + os.path.sep
+                + os.path.basename(model_path)
+            )
+        docker_command = f"docker run --gpus {gpus} --shm-size 1g -p {port}:80 -v {self.model_weights_dir}:{os.path.sep + os.path.basename(self.model_weights_dir)} -e {env_variable} ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id {model_name} --num-shard {num_shard} --max-input-length {max_input_length} --max-total-tokens {max_total_tokens} --max-best-of {max_best_of}"
         print(f"Connect Command: {docker_command}")
-        docker_process = subprocess.Popen(docker_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+        docker_process = subprocess.Popen(
+            docker_command,
+            shell=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
         connected = False
         output = []
         while True:
@@ -142,7 +173,7 @@ def run_server(self, port, model_name=None, model_path=None, env_variable=None,
             if not line:
                 break
             output.append(line.strip())
-            if 'Connected' in line:
+            if "Connected" in line:
                 connected = True
                 break
         if not connected:
@@ -152,6 +183,7 @@ def run_server(self, port, model_name=None, model_path=None, env_variable=None,
             docker_process.terminate()
         docker_process.wait()
 
+
 class Anyscale(HFModel):
     def __init__(self, model, **kwargs):
         super().__init__(model=model, is_client=True)
@@ -159,36 +191,35 @@ def __init__(self, model, **kwargs):
         self.api_base = os.getenv("OPENAI_API_BASE")
         self.token = os.getenv("OPENAI_API_KEY")
         self.model = model
-        self.kwargs = {
-            "temperature": 0.0,
-            "n": 1,
-            **kwargs
-        }
+        self.kwargs = {"temperature": 0.0, "n": 1, **kwargs}
 
     def _generate(self, prompt, use_chat_api=True, **kwargs):
         url = f"{self.api_base}/chat/completions"
         kwargs = {**self.kwargs, **kwargs}
 
         temperature = kwargs.get("temperature")
-        max_tokens = kwargs.get("max_tokens", 150) 
+        max_tokens = kwargs.get("max_tokens", 150)
 
         if use_chat_api:
             messages = [
-                {"role": "system", "content": "You are a helpful assistant. You must continue the user text directly without *any* additional interjections."},
-                {"role": "user", "content": prompt}
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant. You must continue the user text directly without *any* additional interjections.",
+                },
+                {"role": "user", "content": prompt},
             ]
             body = {
                 "model": self.model,
                 "messages": messages,
                 "temperature": temperature,
-                "max_tokens": max_tokens
+                "max_tokens": max_tokens,
             }
         else:
             body = {
                 "model": self.model,
                 "prompt": f"[INST]{prompt}[/INST]",
                 "temperature": temperature,
-                "max_tokens": max_tokens
+                "max_tokens": max_tokens,
             }
 
         headers = {"Authorization": f"Bearer {self.token}"}
@@ -197,10 +228,17 @@ def _generate(self, prompt, use_chat_api=True, **kwargs):
             with self.session.post(url, headers=headers, json=body) as resp:
                 resp_json = resp.json()
                 if use_chat_api:
-                    completions = [resp_json.get('choices', [])[0].get('message', {}).get('content', "")]
+                    completions = [
+                        resp_json.get("choices", [])[0]
+                        .get("message", {})
+                        .get("content", "")
+                    ]
                 else:
-                    completions = [resp_json.get('choices', [])[0].get('text', "")]
-                response = {"prompt": prompt, "choices": [{"text": c} for c in completions]}
+                    completions = [resp_json.get("choices", [])[0].get("text", "")]
+                response = {
+                    "prompt": prompt,
+                    "choices": [{"text": c} for c in completions],
+                }
                 return response
         except Exception as e:
             print(f"Failed to parse JSON response: {e}")
@@ -226,6 +264,6 @@ def _generate(self, prompt, **kwargs):
             completions = [{"text": output}]
             response = {"prompt": prompt, "choices": completions}
             return response
-        except Exception as e:
+        except Exception:
             print("Failed to parse output:", response.text)
             raise Exception("Received invalid output")
diff --git a/dsp/modules/lm.py b/dsp/modules/lm.py
index 468c350ab0..c51d72b5c9 100644
--- a/dsp/modules/lm.py
+++ b/dsp/modules/lm.py
@@ -70,7 +70,7 @@ def inspect_history(self, n: int = 1, skip: int = 0):
             if provider == "cohere":
                 text = choices[0].text
             elif provider == "openai":
-                text = ' ' + self._get_choice_text(choices[0]).strip()
+                text = " " + self._get_choice_text(choices[0]).strip()
             else:
                 text = choices[0]["text"]
             self.print_green(text, end="")
@@ -86,6 +86,6 @@ def __call__(self, prompt, only_completed=True, return_sorted=False, **kwargs):
     def copy(self, **kwargs):
         """Returns a copy of the language model with the same parameters."""
         kwargs = {**self.kwargs, **kwargs}
-        model = kwargs.pop('model')
+        model = kwargs.pop("model")
 
         return self.__class__(model, **kwargs)
diff --git a/dsp/modules/pyserini.py b/dsp/modules/pyserini.py
index 94ce970ea2..1fe031a4bc 100644
--- a/dsp/modules/pyserini.py
+++ b/dsp/modules/pyserini.py
@@ -8,15 +8,17 @@
 class PyseriniRetriever:
     """Wrapper for retrieval with Pyserini. Supports using either pyserini prebuilt faiss indexes or your own faiss index."""
 
-    def __init__(self, 
-                 query_encoder: str = 'castorini/dkrr-dpr-nq-retriever', 
-                 index: str = 'wikipedia-dpr-dkrr-nq', 
-                 dataset: Dataset = None,
-                 id_field: str = '_id',
-                 text_fields: list[str] = ['text']) -> None:
+    def __init__(
+        self,
+        query_encoder: str = "castorini/dkrr-dpr-nq-retriever",
+        index: str = "wikipedia-dpr-dkrr-nq",
+        dataset: Dataset = None,
+        id_field: str = "_id",
+        text_fields: list[str] = ["text"],
+    ) -> None:
         """
         Args:
-        
+
             query_encoder (`str`):
                 Huggingface model to encode queries
             index (`str`):
@@ -28,17 +30,25 @@ def __init__(self,
             text_fields (`list[str]`):
                 A list of the names of the text fields for the dataset used for retrieval.
         """
-        
+
         # Keep pyserini as an optional dependency
         from pyserini.search import FaissSearcher
-        from pyserini.prebuilt_index_info import TF_INDEX_INFO, FAISS_INDEX_INFO, IMPACT_INDEX_INFO
-        
+        from pyserini.prebuilt_index_info import (
+            TF_INDEX_INFO,
+            FAISS_INDEX_INFO,
+            IMPACT_INDEX_INFO,
+        )
+
         self.encoder = FaissSearcher._init_encoder_from_str(query_encoder)
         self.dataset = dataset
         self.id_field = id_field
         self.text_fields = text_fields
-        
-        if index in TF_INDEX_INFO or index in FAISS_INDEX_INFO or index in IMPACT_INDEX_INFO:
+
+        if (
+            index in TF_INDEX_INFO
+            or index in FAISS_INDEX_INFO
+            or index in IMPACT_INDEX_INFO
+        ):
             self.searcher = FaissSearcher.from_prebuilt_index(index, self.encoder)
         else:
             self.searcher = FaissSearcher(index_dir=index, query_encoder=self.encoder)
@@ -46,31 +56,35 @@ def __init__(self,
             self.dataset_id_to_index = {}
             for i, docid in enumerate(self.dataset[self.id_field]):
                 self.dataset_id_to_index[docid] = i
-                
 
     def __call__(
-        self, query: str, k: int = 10, threads: int = 16,
+        self,
+        query: str,
+        k: int = 10,
+        threads: int = 16,
     ) -> Union[list[str], list[dotdict]]:
         hits = self.searcher.search(query, k=k, threads=threads)
-        
+
         topk = []
         for rank, hit in enumerate(hits, start=1):
             if self.dataset is not None:
                 row = self.dataset_id_to_index[hit.docid]
-                text = ' '.join(self.dataset[field][row] for field in self.text_fields)
+                text = " ".join(self.dataset[field][row] for field in self.text_fields)
                 pid = self.dataset[self.id_field][row]
             else:
                 # Pyserini prebuilt faiss indexes can perform docid lookup
                 psg = json.loads(self.searcher.doc(hit.docid).raw())
-                text = ' '.join(psg[field] for field in self.text_fields)
+                text = " ".join(psg[field] for field in self.text_fields)
                 pid = psg[self.id_field]
-            
-            topk.append({
-                'text': text,
-                'long_text': text,
-                'pid': pid,
-                'score': hit.score,
-                'rank': rank,
-            })
-        
-        return [dotdict(psg) for psg in topk]
\ No newline at end of file
+
+            topk.append(
+                {
+                    "text": text,
+                    "long_text": text,
+                    "pid": pid,
+                    "score": hit.score,
+                    "rank": rank,
+                }
+            )
+
+        return [dotdict(psg) for psg in topk]
diff --git a/dsp/modules/sbert.py b/dsp/modules/sbert.py
index f8ac3f80c0..87f70eb89a 100644
--- a/dsp/modules/sbert.py
+++ b/dsp/modules/sbert.py
@@ -1,6 +1,6 @@
 class SentenceTransformersCrossEncoder:
-    """Wrapper for sentence-transformers cross-encoder model.
-    """
+    """Wrapper for sentence-transformers cross-encoder model."""
+
     def __init__(
         self, model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"
     ):
diff --git a/dsp/modules/sentence_vectorizer.py b/dsp/modules/sentence_vectorizer.py
index 0c18902d4c..d50a1a9085 100644
--- a/dsp/modules/sentence_vectorizer.py
+++ b/dsp/modules/sentence_vectorizer.py
@@ -3,17 +3,22 @@
 
 import numpy as np
 import openai
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from dsp.primitives import Example
 
 
 class BaseSentenceVectorizer(abc.ABC):
-    '''
+    """
     Base Class for Vectorizers. The main purpose is to vectorize text (doc/query)
     for ANN/KNN indexes. `__call__` method takes `List[Example]` as a single input, then extracts
     `field_to_vectorize` from every Example and convert them into embeddings.
     You can customize extraction logic in the `_extract_text_from_examples` method.
-    '''
+    """
+
     # embeddings will be computed based on the string in this attribute of Example object
-    field_to_vectorize = 'text_to_vectorize'
+    field_to_vectorize = "text_to_vectorize"
 
     def __init__(self) -> None:
         pass
@@ -24,42 +29,42 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
 
     def _extract_text_from_examples(self, inp_examples: List["Example"]) -> List[str]:
         text_to_vectorize = [
-            getattr(example, self.field_to_vectorize)
-            for example in inp_examples
+            getattr(example, self.field_to_vectorize) for example in inp_examples
         ]
         return text_to_vectorize
 
 
 class SentenceTransformersVectorizer(BaseSentenceVectorizer):
-    '''
+    """
     Vectorizer bsaed on `SentenceTransformers` models. You can pick any model from this link:
     https://huggingface.co/sentence-transformers
     More details about models:
     https://www.sbert.net/docs/pretrained_models.html
-    '''
+    """
+
     def __init__(
         self,
-        model_name_or_path: str = 'all-MiniLM-L6-v2',
+        model_name_or_path: str = "all-MiniLM-L6-v2",
         vectorize_bs: int = 256,
         max_gpu_devices: int = 1,
-        normalize_embeddings: bool = False
+        normalize_embeddings: bool = False,
     ):
         # this isn't a good practice, but with top-level import the whole DSP
         # module import will be slow (>5 sec), because SentenceTransformer is doing
         # it's directory/file-related magic under the hood :(
-        
+
         try:
             from sentence_transformers import SentenceTransformer
-        except ImportError as e:
+        except ImportError:
             raise ImportError(
                 "You need to install sentence_transformers library to use pretrained embedders. "
                 "Please check the official doc https://www.sbert.net/ "
                 "or simply run `pip install sentence-transformers"
             )
         from dsp.utils.ann_utils import determine_devices
-        
+
         self.num_devices, self.is_gpu = determine_devices(max_gpu_devices)
-        self.proxy_device = 'cuda' if self.is_gpu else 'cpu'
+        self.proxy_device = "cuda" if self.is_gpu else "cpu"
 
         self.model = SentenceTransformer(model_name_or_path, device=self.proxy_device)
 
@@ -75,9 +80,7 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
             pool = self.model.start_multi_process_pool(target_devices=target_devices)
             # Compute the embeddings using the multi-process pool
             emb = self.model.encode_multi_process(
-                sentences=text_to_vectorize,
-                pool=pool,
-                batch_size=self.vectorize_bs
+                sentences=text_to_vectorize, pool=pool, batch_size=self.vectorize_bs
             )
             self.model.stop_multi_process_pool(pool)
             # for some reason, multi-GPU setup doesn't accept normalize_embeddings parameter
@@ -89,17 +92,18 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
             emb = self.model.encode(
                 sentences=text_to_vectorize,
                 batch_size=self.vectorize_bs,
-                normalize_embeddings=self.normalize_embeddings
+                normalize_embeddings=self.normalize_embeddings,
             )
             return emb
 
 
 class NaiveGetFieldVectorizer(BaseSentenceVectorizer):
-    '''
-    If embeddings were precomputed, then we could just extract them from the proper field 
+    """
+    If embeddings were precomputed, then we could just extract them from the proper field
     (set by `field_with_embedding`) from each `Example`.
-    '''
-    def __init__(self, field_with_embedding: str = 'vectorized'):
+    """
+
+    def __init__(self, field_with_embedding: str = "vectorized"):
         self.field_with_embedding = field_with_embedding
 
     def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
@@ -112,16 +116,17 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
 
 
 class OpenAIVectorizer(BaseSentenceVectorizer):
-    '''
+    """
     This vectorizer uses OpenAI API to convert texts to embeddings. Changing `model` is not
     recommended. More about the model: https://openai.com/blog/new-and-improved-embedding-model/
     `api_key` should be passed as an argument or as env variable (`OPENAI_API_KEY`).
-    '''
+    """
+
     def __init__(
         self,
-        model: str = 'text-embedding-ada-002',
+        model: str = "text-embedding-ada-002",
         embed_batch_size: int = 1024,
-        api_key: Optional[str] = None
+        api_key: Optional[str] = None,
     ):
         self.model = model
         self.embed_batch_size = embed_batch_size
@@ -138,14 +143,13 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
         for cur_batch_idx in range(n_batches):  # tqdm.tqdm?
             start_idx = cur_batch_idx * self.embed_batch_size
             end_idx = (cur_batch_idx + 1) * self.embed_batch_size
-            cur_batch = text_to_vectorize[start_idx: end_idx]
+            cur_batch = text_to_vectorize[start_idx:end_idx]
             # OpenAI API call:
-            response = openai.Embedding.create(
-                model=self.model,
-                input=cur_batch
-            )
+            response = openai.Embedding.create(model=self.model, input=cur_batch)
 
-            cur_batch_embeddings = [cur_obj['embedding'] for cur_obj in response['data']]
+            cur_batch_embeddings = [
+                cur_obj["embedding"] for cur_obj in response["data"]
+            ]
             embeddings_list.extend(cur_batch_embeddings)
 
         embeddings = np.array(embeddings_list, dtype=np.float32)
diff --git a/dsp/primitives/compiler.py b/dsp/primitives/compiler.py
index 996919e856..da602ec842 100644
--- a/dsp/primitives/compiler.py
+++ b/dsp/primitives/compiler.py
@@ -7,18 +7,28 @@
 
 import dsp
 from datasets.fingerprint import Hasher
+import logging
 
-if os.environ.get('DSP_NOTEBOOK_CACHEDIR'):
-    training_data_directory = os.path.join(os.environ.get('DSP_NOTEBOOK_CACHEDIR'), 'compiler')
+if os.environ.get("DSP_NOTEBOOK_CACHEDIR"):
+    training_data_directory = os.path.join(
+        os.environ.get("DSP_NOTEBOOK_CACHEDIR"), "compiler"
+    )
 else:
-    training_data_directory = 'cache/compiler'
+    training_data_directory = "cache/compiler"
 
 
-compilations_assumed_to_exist={'ft-zvEdzQVQ5xwlxvNPrxl6kpnw': 'ada:ft-stanfordpraglab-2023-02-09-19-50-49'}
+logger = logging.getLogger(__name__)
+
+compilations_assumed_to_exist = {
+    "ft-zvEdzQVQ5xwlxvNPrxl6kpnw": "ada:ft-stanfordpraglab-2023-02-09-19-50-49"
+}
 
 
 def openai_check_finetune(jobname):
-    if dsp.settings.force_reuse_cached_compilation and jobname in compilations_assumed_to_exist:
+    if (
+        dsp.settings.force_reuse_cached_compilation
+        and jobname in compilations_assumed_to_exist
+    ):
         return compilations_assumed_to_exist[jobname]
 
     command = f"""openai api fine_tunes.get -i {jobname}"""
@@ -29,14 +39,17 @@ def openai_check_finetune(jobname):
 
     try:
         output = ujson.loads(output)
-        if output['status'] == 'succeeded':
-            return output['fine_tuned_model']
+        if output["status"] == "succeeded":
+            return output["fine_tuned_model"]
 
-        if output['status'] in ['pending', 'running']:
-            print(f'Compiling, run ```openai api fine_tunes.follow -i {jobname}``` for details...')
+        if output["status"] in ["pending", "running"]:
+            print(
+                f"Compiling, run ```openai api fine_tunes.follow -i {jobname}``` for details..."
+            )
             time.sleep(60)
             return openai_check_finetune(jobname)
-    except:
+    except Exception:
+        logger.exception("Failed to check fine-tune status for job %s", jobname)
         pass
 
     return False
@@ -49,13 +62,13 @@ def convert_to_training_point2(y, inputs, outputs, template):
     prompt = template(y_, show_guidelines=False)
 
     completion = y[outputs[0]]
-    output_fields = template.fields[len(inputs):]
+    output_fields = template.fields[len(inputs) :]
 
     for field in output_fields[1:]:
         completion += f"\n\n{field.name} " + y[field.output_variable]
-    
+
     completion = " " + completion + " </s>"
-    return {'prompt': prompt, 'completion': completion}
+    return {"prompt": prompt, "completion": completion}
 
 
 def simulate(program, input_examples):
@@ -67,9 +80,18 @@ def simulate(program, input_examples):
         if prediction is not None:
             # assert len(prediction.compiling_stages) == 2, "TMP"
             for stage in prediction.compiling_stages:
-                name, template, inputs, outputs = stage['name'], stage['template'], stage['inputs'], stage['outputs']
-                training_data.append(convert_to_training_point2(prediction.get(name), inputs, outputs, template))
-    
+                name, template, inputs, outputs = (
+                    stage["name"],
+                    stage["template"],
+                    stage["inputs"],
+                    stage["outputs"],
+                )
+                training_data.append(
+                    convert_to_training_point2(
+                        prediction.get(name), inputs, outputs, template
+                    )
+                )
+
     r = random.Random(0)
     r.shuffle(training_data)
 
@@ -84,19 +106,21 @@ def openai_finetune_(name, target):
     print(command)
 
     # command = """python script.py"""
-    process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    process = subprocess.Popen(
+        command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE
+    )
 
     while line := process.stdout.readline().decode().strip():
-        if 'created fine-tune:' in line.lower():
+        if "created fine-tune:" in line.lower():
             jobname = line.split()[-1]
             break
-        
+
     #     if 'costs $' in line.lower():
     #         cost = line.split()[-1]
     #         break
 
     # assert cost[0] == '$'
-    
+
     # if float(cost[1:]) > 300:
     #     print(f'Got cost {cost} -- you may wanna cancel the job: openai api fine_tunes.cancel -i {jobname}')
 
@@ -115,7 +139,7 @@ def openai_finetune_(name, target):
 def openai_finetune(name, target):
     print(name)
     training_data_path = name_to_path(name)
-    training_data_path += '.model'
+    training_data_path += ".model"
 
     # if path + stuff exists, load the tuple from it
     try:
@@ -124,14 +148,15 @@ def openai_finetune(name, target):
 
         if openai_check_finetune(jobname):
             return jobname, ft
-    except:
+    except Exception:
+        logger.exception("No reusable fine-tune at %s", training_data_path)
         pass
-    
+
     jobname, ft = openai_finetune_(name, target)
 
-    with open(training_data_path, 'w') as f:
-        f.write(ujson.dumps((jobname, ft)) + '\n')
-    
+    with open(training_data_path, "w") as f:
+        f.write(ujson.dumps((jobname, ft)) + "\n")
+
     return jobname, ft
 
 
@@ -139,7 +164,7 @@ def name_to_path(name):
     if not os.path.exists(training_data_directory):
         os.makedirs(training_data_directory)
 
-    training_data_path = os.path.join(training_data_directory, f'{name}.jsonl')
+    training_data_path = os.path.join(training_data_directory, f"{name}.jsonl")
     return training_data_path
 
 
@@ -148,9 +173,9 @@ def finetune(training_data, target):
     name = Hasher.hash(training_data)
     training_data_path = name_to_path(name)
 
-    with open(training_data_path, 'w') as f:
+    with open(training_data_path, "w") as f:
         for line in training_data:
-            f.write(ujson.dumps(line) + '\n')
+            f.write(ujson.dumps(line) + "\n")
 
     jobname, ft = openai_finetune(name, target)
     print(ft)
@@ -158,8 +183,9 @@ def finetune(training_data, target):
     ft = dsp.GPT3(model=ft, stop=" </s>")
     return ft
 
+
 # 4. Return updated program.
-def compile(program, examples, target='ada'):
+def compile(program, examples, target="ada"):
     training_data = simulate(program, examples)
     compiled_lm = finetune(training_data, target=target)
 
@@ -169,4 +195,3 @@ def compiled_program(*args, **kwargs):
 
     compiled_program.lm = compiled_lm
     return compiled_program
-
diff --git a/dsp/primitives/demonstrate.py b/dsp/primitives/demonstrate.py
index 525314105f..13a5f7882f 100644
--- a/dsp/primitives/demonstrate.py
+++ b/dsp/primitives/demonstrate.py
@@ -4,6 +4,7 @@
 import numpy as np
 
 import dsp
+from dsp.modules.sentence_vectorizer import BaseSentenceVectorizer
 from dsp.utils import EM, F1, DPR_normalize, dotdict, has_answer, normalize_text
 
 
@@ -148,7 +149,7 @@ def cast_naive_get_question_and_answer(inp_example: Example) -> Example:
 def knn(
     train: list[Example],
     cast: Callable[[Example], Example] = cast_naive_get_only_question_text,
-    **knn_args
+    **knn_args,
 ) -> Callable[[Example, int], list[Example]]:
     """
     A function that vectorizes train data using `dsm.settings.vectorizer`, then build an ANN/KNN
diff --git a/dsp/primitives/inspect.py b/dsp/primitives/inspect.py
index 2b6202bbb3..5f607d69e7 100644
--- a/dsp/primitives/inspect.py
+++ b/dsp/primitives/inspect.py
@@ -6,87 +6,83 @@
 
 
 class FuncInspector:
-  def __init__(self):
-    self.calls = []
+    def __init__(self):
+        self.calls = []
 
+    def inspect_inner(self, func, function_calls):
+        def wrapper(*args, **kwargs):
+            result = func(*args, **kwargs)
+            self.merge_result(result, function_calls)
+            return result
 
-  def inspect_inner(self, func, function_calls):
-    def wrapper(*args, **kwargs):
-      result = func(*args, **kwargs)
-      self.merge_result(result, function_calls)
-      return result
-    return wrapper
+        return wrapper
 
+    def inspect_func(self, func):
+        def wrapper(*args, **kwargs):
+            result = func(*args, **kwargs)
+            stack = inspect.stack()
+            function_calls = []
+            for i in range(len(stack)):
+                if stack[i][3] == "<module>":
+                    break
+                if stack[i][3] != "wrapper":
+                    function_calls.append(stack[i][3])
+            function_calls.reverse()
+            result = self.inspect_inner(result, function_calls)
+            return result
 
-  def inspect_func(self, func):
-    def wrapper(*args, **kwargs):
-      result = func(*args, **kwargs)
-      stack = inspect.stack()
-      function_calls = []
-      for i in range(len(stack)):
-        if stack[i][3] == "<module>":
-          break
-        if stack[i][3] != "wrapper":
-          function_calls.append(stack[i][3])
-      function_calls.reverse()
-      result = self.inspect_inner(result, function_calls)
-      return result
-    return wrapper
-  
-  
-  def parse(self, obj, delete_empty=False):
-    if isinstance(obj, list):
-      for elem in obj:
-        self.parse(elem, delete_empty)
-    if isinstance(obj, dict):
-      to_delete = []
-      for key in obj:
-        if delete_empty and not obj[key]:
-          to_delete.append(key)
-        elif key == "completions":
-          to_delete.append(key)
-        else:
-          self.parse(obj[key], delete_empty)
-      for key in to_delete:
-        obj.pop(key)
-
+        return wrapper
 
-  def merge_result(self, result, function_calls):
-    prev_list = self.calls
-    prev_call = {} if not prev_list else prev_list[-1]
-    for call in function_calls[:-1]:
-      if call not in prev_call:
-        prev_call = {call: []}
-        prev_list.append(prev_call)
-      prev_list = prev_call[call]
-      prev_call = {} if not prev_list else prev_list[-1]
+    def parse(self, obj, delete_empty=False):
+        if isinstance(obj, list):
+            for elem in obj:
+                self.parse(elem, delete_empty)
+        if isinstance(obj, dict):
+            to_delete = []
+            for key in obj:
+                if delete_empty and not obj[key]:
+                    to_delete.append(key)
+                elif key == "completions":
+                    to_delete.append(key)
+                else:
+                    self.parse(obj[key], delete_empty)
+            for key in to_delete:
+                obj.pop(key)
 
-    example_obj = result[0]
-    self.parse(example_obj)
-    prev_list.append({ function_calls[-1]: example_obj })
+    def merge_result(self, result, function_calls):
+        prev_list = self.calls
+        prev_call = {} if not prev_list else prev_list[-1]
+        for call in function_calls[:-1]:
+            if call not in prev_call:
+                prev_call = {call: []}
+                prev_list.append(prev_call)
+            prev_list = prev_call[call]
+            prev_call = {} if not prev_list else prev_list[-1]
 
+        example_obj = result[0]
+        self.parse(example_obj)
+        prev_list.append({function_calls[-1]: example_obj})
 
-  def view_data(self):
-    chars = string.digits + string.ascii_lowercase
-    id = ''.join(random.choices(chars, k=8))
+    def view_data(self):
+        chars = string.digits + string.ascii_lowercase
+        id = "".join(random.choices(chars, k=8))
 
-    post_url = 'http://127.0.0.1:5000/log-item'
-    parsed_calls = self.calls.copy()
-    self.parse(parsed_calls, delete_empty=True)
-    data = {'id': id, 'content': parsed_calls}
-    response = requests.post(post_url, json=data)
-    
-    if response.status_code == 201:
-      print('Data created successfully')
-    else:
-      print(f'Error sending data to server: {response.status_code}')
-      return
+        post_url = "http://127.0.0.1:5000/log-item"
+        parsed_calls = self.calls.copy()
+        self.parse(parsed_calls, delete_empty=True)
+        data = {"id": id, "content": parsed_calls}
+        response = requests.post(post_url, json=data)
 
-    frontend_url = f"http://localhost:3000?id={id}"
-    print(f"View the data here, {frontend_url}")
+        if response.status_code == 201:
+            print("Data created successfully")
+        else:
+            print(f"Error sending data to server: {response.status_code}")
+            return
 
+        frontend_url = f"http://localhost:3000?id={id}"
+        print(f"View the data here, {frontend_url}")
 
-  def output_json(self, out_path):
-    f = open(out_path, "w")
-    json_object = json.dumps(self.calls, indent=2)
-    f.write(json_object)
+    def output_json(self, out_path):
+        """Write the recorded calls (`self.calls`) to `out_path` as indented JSON."""
+        with open(out_path, "w") as f:
+            f.write(json.dumps(self.calls, indent=2))
diff --git a/dsp/primitives/predict.py b/dsp/primitives/predict.py
index 6f442f6437..138ebdb912 100644
--- a/dsp/primitives/predict.py
+++ b/dsp/primitives/predict.py
@@ -3,7 +3,6 @@
 
 import dsp
 from dsp.utils import zipstar, normalize_text
-from dsp.primitives.inspect import FuncInspector
 from dsp.utils.utils import dotdict
 from dsp.templates.template_v3 import Template
 from dsp.primitives.demonstrate import Example
diff --git a/dsp/primitives/primitives.py b/dsp/primitives/primitives.py
index fd839dea25..72a0751dbc 100644
--- a/dsp/primitives/primitives.py
+++ b/dsp/primitives/primitives.py
@@ -1,22 +1,29 @@
 import dsp
-import copy
 from functools import wraps
 
+
 # applied right to left (innermost first, like function calls)
 def compose_decorators(*decorators):
     def decorator(func):
         for decorator in decorators[::-1]:
             func = decorator(func)
         return func
+
     return decorator
 
 
 def shallow_copy_example_args(func):
     @wraps(func)
     def wrapper(*args, **kwargs):
-        args = [dsp.Example(arg) if isinstance(arg, dsp.Example) else arg for arg in args]
-        kwargs = {key: dsp.Example(value) if isinstance(value, dsp.Example) else value for key, value in kwargs.items()}
+        args = [
+            dsp.Example(arg) if isinstance(arg, dsp.Example) else arg for arg in args
+        ]
+        kwargs = {
+            key: dsp.Example(value) if isinstance(value, dsp.Example) else value
+            for key, value in kwargs.items()
+        }
         return func(*args, **kwargs)
+
     return wrapper
 
 
@@ -24,10 +31,9 @@ def wrapper(*args, **kwargs):
 # transformation = compose_decorators(handle_compilation, shallow_copy_example_args)
 
 
-
 def compiled(func):
     def wrapper(*args, **kwargs):
-        is_to_be_compiled = True #decorator_kwargs.get('compile', False)
+        is_to_be_compiled = True  # decorator_kwargs.get('compile', False)
         compiled_lm = dsp.settings.compiled_lm
 
         if is_to_be_compiled and compiled_lm:
@@ -38,7 +44,7 @@ def wrapper(*args, **kwargs):
                 old_demos = list(example.demos)
                 example = func(example.copy(demos=[]), **kwargs)
                 return example.copy(demos=old_demos)
-        
+
         with dsp.settings.context(compiling=True):
             return func(*args, **kwargs)
 
diff --git a/dsp/primitives/search.py b/dsp/primitives/search.py
index 833c914947..6802f9fa71 100644
--- a/dsp/primitives/search.py
+++ b/dsp/primitives/search.py
@@ -8,7 +8,7 @@ def retrieve(query: str, k: int, **kwargs) -> list[str]:
         raise AssertionError("No RM is loaded.")
     passages = dsp.settings.rm(query, k=k, **kwargs)
     passages = [psg.long_text for psg in passages]
-    
+
     if dsp.settings.reranker:
         passages_cs_scores = dsp.settings.reranker(query, passages)
         passages_cs_scores_sorted = np.argsort(passages_cs_scores)[::-1]
@@ -23,8 +23,10 @@ def retrieveRerankEnsemble(queries: list[str], k: int) -> list[str]:
     queries = [q for q in queries if q]
     passages = {}
     for query in queries:
-        retrieved_passages = dsp.settings.rm(query, k=k*3)
-        passages_cs_scores = dsp.settings.reranker(query, [psg.long_text for psg in retrieved_passages])
+        retrieved_passages = dsp.settings.rm(query, k=k * 3)
+        passages_cs_scores = dsp.settings.reranker(
+            query, [psg.long_text for psg in retrieved_passages]
+        )
         for idx in np.argsort(passages_cs_scores)[::-1]:
             psg = retrieved_passages[idx]
             passages[psg.long_text] = passages.get(psg.long_text, []) + [
@@ -43,7 +45,7 @@ def retrieveEnsemble(queries: list[str], k: int, by_prob: bool = True) -> list[s
         raise AssertionError("No RM is loaded.")
     if dsp.settings.reranker:
         return retrieveRerankEnsemble(queries, k)
-    
+
     queries = [q for q in queries if q]
 
     if len(queries) == 1:
diff --git a/dsp/templates/__init__.py b/dsp/templates/__init__.py
index b4aed76438..a0978d0672 100644
--- a/dsp/templates/__init__.py
+++ b/dsp/templates/__init__.py
@@ -1,4 +1,3 @@
 from .utils import *
 from .template_v2 import *
 from .template_v3 import *
-
diff --git a/dsp/templates/template_v2.py b/dsp/templates/template_v2.py
index 5bd751eee7..9af1f2c46b 100644
--- a/dsp/templates/template_v2.py
+++ b/dsp/templates/template_v2.py
@@ -44,7 +44,7 @@ def __init__(
                     variable = match.group(3)
                     description = None
                 else:
-                    raise ValueError(f"Could not parse template")
+                    raise ValueError("Could not parse template")
 
             var_match = re.match("(.*) -> (.*)", variable)
             if var_match is not None:
@@ -94,15 +94,19 @@ def query(self, example: Example, is_demo: bool = False) -> str:
 
                     def format_handler(x):
                         return " ".join(x.split())
-                
-                formatted_value = format_handler(example[field.input_variable])
-                separator = '\n' if field.separator == ' ' and '\n' in formatted_value else field.separator
 
-                result.append(
-                    f"{field.name}{separator}{formatted_value}"
+                formatted_value = format_handler(example[field.input_variable])
+                separator = (
+                    "\n"
+                    if field.separator == " " and "\n" in formatted_value
+                    else field.separator
                 )
 
-        if self._has_augmented_guidelines() and ("augmented" in example and example.augmented):
+                result.append(f"{field.name}{separator}{formatted_value}")
+
+        if self._has_augmented_guidelines() and (
+            "augmented" in example and example.augmented
+        ):
             return "\n\n".join([r for r in result if r])
         return "\n".join([r for r in result if r])
 
@@ -126,7 +130,8 @@ def guidelines(self, show_guidelines=True) -> str:
 
     def _has_augmented_guidelines(self):
         return len(self.fields) > 3 or any(
-            ("\n" in field.separator) or ('\n' in field.description) for field in self.fields
+            ("\n" in field.separator) or ("\n" in field.description)
+            for field in self.fields
         )
 
     def extract(
@@ -164,16 +169,27 @@ def extract(
 
                 if offset >= 0:
                     if dspy.settings.release >= 20231003:
-                        example[self.fields[idx].output_variable] = raw_pred[:offset].strip().rstrip('---').strip()
-                        raw_pred = raw_pred[offset + len(next_field_name) :].strip().rstrip('---').strip()
+                        example[self.fields[idx].output_variable] = (
+                            raw_pred[:offset].strip().rstrip("---").strip()
+                        )
+                        raw_pred = (
+                            raw_pred[offset + len(next_field_name) :]
+                            .strip()
+                            .rstrip("---")
+                            .strip()
+                        )
                     else:
-                        example[self.fields[idx].output_variable] = raw_pred[:offset].strip()
+                        example[self.fields[idx].output_variable] = raw_pred[
+                            :offset
+                        ].strip()
                         raw_pred = raw_pred[offset + len(next_field_name) :].strip()
 
                     idx += 1
                 else:
                     if dspy.settings.release >= 20231003:
-                        example[self.fields[idx].output_variable] = raw_pred.strip().rstrip('---').strip()
+                        example[self.fields[idx].output_variable] = (
+                            raw_pred.strip().rstrip("---").strip()
+                        )
                     else:
                         example[self.fields[idx].output_variable] = raw_pred.strip()
 
@@ -185,7 +201,9 @@ def extract(
                 assert idx == len(self.fields) - 1, (idx, len(self.fields))
 
                 if dspy.settings.release >= 20231003:
-                    example[self.fields[idx].output_variable] = raw_pred.strip().rstrip('---').strip()
+                    example[self.fields[idx].output_variable] = (
+                        raw_pred.strip().rstrip("---").strip()
+                    )
                 else:
                     example[self.fields[idx].output_variable] = raw_pred.strip()
 
@@ -196,7 +214,7 @@ def extract(
     def __call__(self, example, show_guidelines=True) -> str:
         example = dsp.Example(example)
 
-        if hasattr(dsp.settings, 'query_only') and dsp.settings.query_only:
+        if hasattr(dsp.settings, "query_only") and dsp.settings.query_only:
             return self.query(example)
 
         # The training data should not contain the output variable
@@ -238,11 +256,10 @@ def __call__(self, example, show_guidelines=True) -> str:
                     ademos.append(rdemo)
             else:
                 rdemos_.append(rdemo)
-        
+
         ademos = new_ademos + ademos
         rdemos = rdemos_
 
-
         long_query = self._has_augmented_guidelines()
 
         if long_query:
@@ -251,7 +268,7 @@ def __call__(self, example, show_guidelines=True) -> str:
         query = self.query(example)
 
         # if it has more lines than fields
-        if len(query.split('\n')) > len(self.fields):
+        if len(query.split("\n")) > len(self.fields):
             long_query = True
 
             if "augmented" not in example or not example.augmented:
diff --git a/dsp/templates/template_v3.py b/dsp/templates/template_v3.py
index fb097f8770..72317a4451 100644
--- a/dsp/templates/template_v3.py
+++ b/dsp/templates/template_v3.py
@@ -35,7 +35,9 @@ def __init__(self, instructions: str, **kwargs):
         for key, value in kwargs.items():
             prefix: str = value.prefix
             separator: str = (
-                " " if prefix.rstrip() == prefix and len(prefix) > 0 else prefix[len(prefix.rstrip()) :]
+                " "
+                if prefix.rstrip() == prefix and len(prefix) > 0
+                else prefix[len(prefix.rstrip()) :]
             )
             field = Field(
                 name=prefix.strip(),
@@ -48,12 +50,11 @@ def __init__(self, instructions: str, **kwargs):
 
             if value.format:
                 self.format_handlers[key] = value.format
-        
-    
+
     # equality
     def __eq__(self, other):
         if set(self.kwargs.keys()) != set(other.kwargs.keys()):
-            print('here2')
+            print("here2")
             return False
 
         for k in self.kwargs.keys():
@@ -61,7 +62,6 @@ def __eq__(self, other):
             if not v1 == v2:
                 print(k, v1, v2)
 
-            
         # print("here?", self.instructions == other.instructions, self.kwargs == other.kwargs)
         return self.instructions == other.instructions and self.kwargs == other.kwargs
 
@@ -70,4 +70,3 @@ def __str__(self) -> str:
         field_names = [field.name for field in self.fields]
 
         return f"Template({self.instructions}, {field_names})"
-    
diff --git a/dsp/templates/utils.py b/dsp/templates/utils.py
index 0b0d4891f3..c47101e22c 100644
--- a/dsp/templates/utils.py
+++ b/dsp/templates/utils.py
@@ -30,7 +30,7 @@ def psg2text(psg):
             return f"Title: {title.strip()} | Snippet: «{snippet.strip()}»"
         except Exception:
             pass
-        
+
         return f"«{psg}»"
 
     if len(passages) == 0:
diff --git a/dsp/utils/ann_utils.py b/dsp/utils/ann_utils.py
index 106db06bff..a2c42652a7 100644
--- a/dsp/utils/ann_utils.py
+++ b/dsp/utils/ann_utils.py
@@ -3,7 +3,7 @@
 try:
     import faiss
     from faiss import Index
-except ImportError as e:
+except ImportError:
     raise ImportError(
         "You need to install FAISS library to perform ANN/KNN. Please check the official doc: "
         "https://github.com/facebookresearch/faiss/blob/main/INSTALL.md"
@@ -33,12 +33,12 @@ def determine_devices(max_gpu_devices: int = 0) -> Tuple[int, bool]:
 
 
 def _get_brute_index(emb_dim: int, dist_type: str) -> Index:
-    if dist_type.lower() == 'ip':
+    if dist_type.lower() == "ip":
         index = faiss.IndexFlatIP(emb_dim)
-    elif dist_type.lower() == 'l2':
+    elif dist_type.lower() == "l2":
         index = faiss.IndexFlatL2(emb_dim)
     else:
-        raise ValueError(f'Wrong distance type for FAISS Flat Index: {dist_type}')
+        raise ValueError(f"Wrong distance type for FAISS Flat Index: {dist_type}")
 
     return index
 
@@ -48,24 +48,26 @@ def _get_ivf_index(
     n_objects: int,
     in_list_dist_type: str,
     centroid_dist_type: str,
-    encode_residuals: bool
+    encode_residuals: bool,
 ) -> Index:
     # according to the FAISS doc, this should be OK
-    n_list = int(4 * (n_objects ** 0.5))
+    n_list = int(4 * (n_objects**0.5))
 
-    if in_list_dist_type.lower() == 'ip':
+    if in_list_dist_type.lower() == "ip":
         quannizer = faiss.IndexFlatIP(emb_dim)
-    elif in_list_dist_type.lower() == 'l2':
+    elif in_list_dist_type.lower() == "l2":
         quannizer = faiss.IndexFlatL2(emb_dim)
     else:
-        raise ValueError(f'Wrong distance type for FAISS quantizer: {in_list_dist_type}')
+        raise ValueError(
+            f"Wrong distance type for FAISS quantizer: {in_list_dist_type}"
+        )
 
-    if centroid_dist_type.lower() == 'ip':
+    if centroid_dist_type.lower() == "ip":
         centroid_metric = faiss.METRIC_INNER_PRODUCT
-    elif centroid_dist_type.lower() == 'l2':
+    elif centroid_dist_type.lower() == "l2":
         centroid_metric = faiss.METRIC_L2
     else:
-        raise ValueError(f'Wrong distance type for FAISS index: {centroid_dist_type}')
+        raise ValueError(f"Wrong distance type for FAISS index: {centroid_dist_type}")
 
     index = faiss.IndexIVFScalarQuantizer(
         quannizer,
@@ -73,7 +75,7 @@ def _get_ivf_index(
         n_list,
         faiss.ScalarQuantizer.QT_fp16,  # TODO: should be optional?
         centroid_metric,
-        encode_residuals
+        encode_residuals,
     )
     return index
 
@@ -84,8 +86,8 @@ def create_faiss_index(
     n_probe: int = 10,
     max_gpu_devices: int = 0,
     encode_residuals: bool = True,
-    in_list_dist_type: str = 'L2',
-    centroid_dist_type: str = 'L2'
+    in_list_dist_type: str = "L2",
+    centroid_dist_type: str = "L2",
 ) -> Index:
     """
     Create IVF index (with IP or L2 dist), without adding data and training
@@ -96,14 +98,14 @@ def create_faiss_index(
         n_probe: number of closest IVF-clusters to check for neighbours.
             Doesn't affect bruteforce-based search.
         max_gpu_devices: maximum amount of GPUs to use for ANN-index. 0 if run on CPU.
-        encode_residuals: whether or not compute residuals. The residual vector is 
+        encode_residuals: whether or not compute residuals. The residual vector is
             the difference between a vector and the reconstruction that can be
             decoded from its representation in the index.
         in_list_dist_type: type of distance to calculate simmilarities within one IVF.
             Can be `IP` (for inner product) or `L2` distance. Case insensetive.
             If the index type is bruteforce (`n_objects` < 20_000), this variable will define
             the distane type for that bruteforce index. `centroid_dist_type` will be ignored.
-        centroid_dist_type: type of distance to calculate simmilarities between a query 
+        centroid_dist_type: type of distance to calculate simmilarities between a query
             and cluster centroids. Can be `IP` (for inner product) or `L2` distance.
             Case insensetive.
     Returns: untrained FAISS-index
@@ -118,7 +120,7 @@ def create_faiss_index(
             n_objects=n_objects,
             in_list_dist_type=in_list_dist_type,
             centroid_dist_type=centroid_dist_type,
-            encode_residuals=encode_residuals
+            encode_residuals=encode_residuals,
         )
 
     index.nprobe = n_probe
@@ -127,6 +129,8 @@ def create_faiss_index(
     if is_gpu:
         cloner_options = faiss.GpuMultipleClonerOptions()
         cloner_options.shard = True  # split (not replicate) one index between GPUs
-        index = faiss.index_cpu_to_gpus_list(index, cloner_options, list(range(num_devices)))
+        index = faiss.index_cpu_to_gpus_list(
+            index, cloner_options, list(range(num_devices))
+        )
 
     return index
diff --git a/dsp/utils/dpr.py b/dsp/utils/dpr.py
index 9ee24dd5aa..904cf403d1 100644
--- a/dsp/utils/dpr.py
+++ b/dsp/utils/dpr.py
@@ -4,14 +4,17 @@
     Original license: https://github.com/facebookresearch/DPR/blob/main/LICENSE
 """
 
-import string
-import spacy
+import copy
 import regex
 import unicodedata
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class Tokens(object):
     """A class to represent a list of tokenized text."""
+
     TEXT = 0
     TEXT_WS = 1
     SPAN = 2
@@ -31,12 +34,12 @@ def __len__(self):
     def slice(self, i=None, j=None):
         """Return a view of the list of tokens from [i, j)."""
         new_tokens = copy.copy(self)
-        new_tokens.data = self.data[i: j]
+        new_tokens.data = self.data[i:j]
         return new_tokens
 
     def untokenize(self):
         """Returns the original text (with whitespace reinserted)."""
-        return ''.join([t[self.TEXT_WS] for t in self.data]).strip()
+        return "".join([t[self.TEXT_WS] for t in self.data]).strip()
 
     def words(self, uncased=False):
         """Returns a list of the text of each token
@@ -57,7 +60,7 @@ def pos(self):
         """Returns a list of part-of-speech tags of each token.
         Returns None if this annotation was not included.
         """
-        if 'pos' not in self.annotators:
+        if "pos" not in self.annotators:
             return None
         return [t[self.POS] for t in self.data]
 
@@ -65,7 +68,7 @@ def lemmas(self):
         """Returns a list of the lemmatized text of each token.
         Returns None if this annotation was not included.
         """
-        if 'lemma' not in self.annotators:
+        if "lemma" not in self.annotators:
             return None
         return [t[self.LEMMA] for t in self.data]
 
@@ -73,7 +76,7 @@ def entities(self):
         """Returns a list of named-entity-recognition tags of each token.
         Returns None if this annotation was not included.
         """
-        if 'ner' not in self.annotators:
+        if "ner" not in self.annotators:
             return None
         return [t[self.NER] for t in self.data]
 
@@ -94,14 +97,16 @@ def _skip(gram):
             return filter_fn(gram)
 
         words = self.words(uncased)
-        ngrams = [(s, e + 1)
-                  for s in range(len(words))
-                  for e in range(s, min(s + n, len(words)))
-                  if not _skip(words[s:e + 1])]
+        ngrams = [
+            (s, e + 1)
+            for s in range(len(words))
+            for e in range(s, min(s + n, len(words)))
+            if not _skip(words[s : e + 1])
+        ]
 
         # Concatenate into strings
         if as_strings:
-            ngrams = ['{}'.format(' '.join(words[s:e])) for (s, e) in ngrams]
+            ngrams = ["{}".format(" ".join(words[s:e])) for (s, e) in ngrams]
 
         return ngrams
 
@@ -110,7 +115,7 @@ def entity_groups(self):
         entities = self.entities()
         if not entities:
             return None
-        non_ent = self.opts.get('non_ent', 'O')
+        non_ent = self.opts.get("non_ent", "O")
         groups = []
         idx = 0
         while idx < len(entities):
@@ -119,7 +124,7 @@ def entity_groups(self):
             if ner_tag != non_ent:
                 # Chomp the sequence
                 start = idx
-                while (idx < len(entities) and entities[idx] == ner_tag):
+                while idx < len(entities) and entities[idx] == ner_tag:
                     idx += 1
                 groups.append((self.slice(start, idx).untokenize(), ner_tag))
             else:
@@ -143,8 +148,8 @@ def __del__(self):
 
 
 class SimpleTokenizer(Tokenizer):
-    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
-    NON_WS = r'[^\p{Z}\p{C}]'
+    ALPHA_NUM = r"[\p{L}\p{N}\p{M}]+"
+    NON_WS = r"[^\p{Z}\p{C}]"
 
     def __init__(self, **kwargs):
         """
@@ -152,12 +157,14 @@ def __init__(self, **kwargs):
             annotators: None or empty set (only tokenizes).
         """
         self._regexp = regex.compile(
-            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
-            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
+            "(%s)|(%s)" % (self.ALPHA_NUM, self.NON_WS),
+            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
         )
-        if len(kwargs.get('annotators', {})) > 0:
-            logger.warning('%s only tokenizes! Skipping annotators: %s' %
-                           (type(self).__name__, kwargs.get('annotators')))
+        if len(kwargs.get("annotators", {})) > 0:
+            logger.warning(
+                "%s only tokenizes! Skipping annotators: %s"
+                % (type(self).__name__, kwargs.get("annotators"))
+            )
         self.annotators = set()
 
     def tokenize(self, text):
@@ -176,11 +183,13 @@ def tokenize(self, text):
                 end_ws = span[1]
 
             # Format data
-            data.append((
-                token,
-                text[start_ws: end_ws],
-                span,
-            ))
+            data.append(
+                (
+                    token,
+                    text[start_ws:end_ws],
+                    span,
+                )
+            )
         return Tokens(data, self.annotators)
 
 
@@ -189,7 +198,7 @@ def has_answer(tokenized_answers, text):
 
     for single_answer in tokenized_answers:
         for i in range(0, len(text) - len(single_answer) + 1):
-            if single_answer == text[i: i + len(single_answer)]:
+            if single_answer == text[i : i + len(single_answer)]:
                 return True
 
     return False
@@ -202,13 +211,19 @@ def locate_answers(tokenized_answers, text):
     tokenized_text = DPR_tokenize(text)
     occurrences = []
 
-    text_words, text_word_positions = tokenized_text.words(uncased=True), tokenized_text.offsets()
+    text_words, text_word_positions = (
+        tokenized_text.words(uncased=True),
+        tokenized_text.offsets(),
+    )
     answers_words = [ans.words(uncased=True) for ans in tokenized_answers]
 
     for single_answer in answers_words:
         for i in range(0, len(text_words) - len(single_answer) + 1):
-            if single_answer == text_words[i: i + len(single_answer)]:
-                (offset, _), (_, endpos) = text_word_positions[i], text_word_positions[i+len(single_answer)-1]
+            if single_answer == text_words[i : i + len(single_answer)]:
+                (offset, _), (_, endpos) = (
+                    text_word_positions[i],
+                    text_word_positions[i + len(single_answer) - 1],
+                )
                 occurrences.append((offset, endpos))
 
     return occurrences
@@ -218,7 +233,7 @@ def locate_answers(tokenized_answers, text):
 
 
 def DPR_tokenize(text):
-    return STokenizer.tokenize(unicodedata.normalize('NFD', text))
+    return STokenizer.tokenize(unicodedata.normalize("NFD", text))
 
 
 def DPR_normalize(text):
@@ -231,8 +246,8 @@ def strip_accents(text):
     text = unicodedata.normalize("NFD", text)
     output = []
     for char in text:
-      cat = unicodedata.category(char)
-      if cat == "Mn":
-        continue
-      output.append(char)
+        cat = unicodedata.category(char)
+        if cat == "Mn":
+            continue
+        output.append(char)
     return "".join(output)
diff --git a/dsp/utils/metrics.py b/dsp/utils/metrics.py
index f0d69f00f6..d9e0fa1315 100644
--- a/dsp/utils/metrics.py
+++ b/dsp/utils/metrics.py
@@ -7,41 +7,44 @@
 
 
 def EM(prediction, answers_list):
-    assert type(answers_list) == list
+    assert isinstance(answers_list, list)
 
     return max(em_score(prediction, ans) for ans in answers_list)
 
 
 def F1(prediction, answers_list):
-    assert type(answers_list) == list
+    assert isinstance(answers_list, list)
 
     return max(f1_score(prediction, ans) for ans in answers_list)
 
 
 def HotPotF1(prediction, answers_list):
-    assert type(answers_list) == list
+    assert isinstance(answers_list, list)
 
     return max(hotpot_f1_score(prediction, ans) for ans in answers_list)
 
 
 def nF1(history, prediction, answers_list, return_recall=False):
-    assert type(answers_list) == list
+    assert isinstance(answers_list, list)
 
-    return max(novel_f1_score(history, prediction, ans, return_recall=return_recall) for ans in answers_list)
+    return max(
+        novel_f1_score(history, prediction, ans, return_recall=return_recall)
+        for ans in answers_list
+    )
 
 
 def normalize_text(s):
-    s = unicodedata.normalize('NFD', s)
+    s = unicodedata.normalize("NFD", s)
 
     def remove_articles(text):
-        return re.sub(r'\b(a|an|the)\b', ' ', text)
+        return re.sub(r"\b(a|an|the)\b", " ", text)
 
     def white_space_fix(text):
-        return ' '.join(text.split())
+        return " ".join(text.split())
 
     def remove_punc(text):
         exclude = set(string.punctuation)
-        return ''.join(ch for ch in text if ch not in exclude)
+        return "".join(ch for ch in text if ch not in exclude)
 
     def lower(text):
         return text.lower()
@@ -57,6 +60,7 @@ def em_score(prediction, ground_truth):
 # See: https://rajpurkar.github.io/SQuAD-explorer/ under Evaluation Script
 # See: QReCC's
 
+
 def f1_score(prediction, ground_truth):
     prediction_tokens = normalize_text(prediction).split()
     ground_truth_tokens = normalize_text(ground_truth).split()
@@ -67,7 +71,8 @@ def f1_score(prediction, ground_truth):
     if len(prediction_tokens) == len(ground_truth_tokens) == 0:
         # Unlike most tasks, QReCC and SQuAD-2.0 assign 1.0 in this edge case. We don't for uniformity.
         print_message(
-            "\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n")
+            "\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n"
+        )
 
     if num_same == 0:
         return 0
@@ -83,9 +88,15 @@ def hotpot_f1_score(prediction, ground_truth):
     normalized_prediction = normalize_text(prediction)
     normalized_ground_truth = normalize_text(ground_truth)
 
-    if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
+    if (
+        normalized_prediction in ["yes", "no", "noanswer"]
+        and normalized_prediction != normalized_ground_truth
+    ):
         return 0
-    if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
+    if (
+        normalized_ground_truth in ["yes", "no", "noanswer"]
+        and normalized_prediction != normalized_ground_truth
+    ):
         return 0
 
     prediction_tokens = normalized_prediction.split()
@@ -110,7 +121,8 @@ def precision_score(prediction, ground_truth):
     if len(prediction_tokens) == len(ground_truth_tokens) == 0:
         # Unlike most tasks, QReCC and SQuAD-2.0 assign 1.0 in this edge case. We don't for uniformity.
         print_message(
-            "\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n")
+            "\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n"
+        )
 
     if num_same == 0:
         return 0
@@ -121,16 +133,135 @@ def precision_score(prediction, ground_truth):
 
 
 # Source: https://gist.github.com/sebleier/554280
-stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
-             "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
-             "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these",
-             "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do",
-             "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
-             "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
-             "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
-             "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
-             "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
-             "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
+stopwords = [
+    "i",
+    "me",
+    "my",
+    "myself",
+    "we",
+    "our",
+    "ours",
+    "ourselves",
+    "you",
+    "your",
+    "yours",
+    "yourself",
+    "yourselves",
+    "he",
+    "him",
+    "his",
+    "himself",
+    "she",
+    "her",
+    "hers",
+    "herself",
+    "it",
+    "its",
+    "itself",
+    "they",
+    "them",
+    "their",
+    "theirs",
+    "themselves",
+    "what",
+    "which",
+    "who",
+    "whom",
+    "this",
+    "that",
+    "these",
+    "those",
+    "am",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+    "been",
+    "being",
+    "have",
+    "has",
+    "had",
+    "having",
+    "do",
+    "does",
+    "did",
+    "doing",
+    "a",
+    "an",
+    "the",
+    "and",
+    "but",
+    "if",
+    "or",
+    "because",
+    "as",
+    "until",
+    "while",
+    "of",
+    "at",
+    "by",
+    "for",
+    "with",
+    "about",
+    "against",
+    "between",
+    "into",
+    "through",
+    "during",
+    "before",
+    "after",
+    "above",
+    "below",
+    "to",
+    "from",
+    "up",
+    "down",
+    "in",
+    "out",
+    "on",
+    "off",
+    "over",
+    "under",
+    "again",
+    "further",
+    "then",
+    "once",
+    "here",
+    "there",
+    "when",
+    "where",
+    "why",
+    "how",
+    "all",
+    "any",
+    "both",
+    "each",
+    "few",
+    "more",
+    "most",
+    "other",
+    "some",
+    "such",
+    "no",
+    "nor",
+    "not",
+    "only",
+    "own",
+    "same",
+    "so",
+    "than",
+    "too",
+    "very",
+    "s",
+    "t",
+    "can",
+    "will",
+    "just",
+    "don",
+    "should",
+    "now",
+]
 
 
 def novel_f1_score(history, prediction, ground_truth, return_recall=False):
@@ -140,10 +271,8 @@ def novel_f1_score(history, prediction, ground_truth, return_recall=False):
 
     history_tokens = set(history_tokens + stopwords)
 
-    prediction_tokens = [
-        t for t in prediction_tokens if t not in history_tokens]
-    ground_truth_tokens = [
-        t for t in ground_truth_tokens if t not in history_tokens]
+    prediction_tokens = [t for t in prediction_tokens if t not in history_tokens]
+    ground_truth_tokens = [t for t in ground_truth_tokens if t not in history_tokens]
 
     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
     num_same = sum(common.values())
diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py
index b7f001e45d..c94cd1f8e3 100644
--- a/dsp/utils/settings.py
+++ b/dsp/utils/settings.py
@@ -2,6 +2,7 @@
 from dsp.utils.utils import dotdict
 import threading
 
+
 class Settings(object):
     """DSP configuration settings."""
 
@@ -17,7 +18,9 @@ def __new__(cls):
             cls._instance.main_tid = threading.get_ident()
             cls._instance.main_stack = []
             cls._instance.stack_by_thread = {}
-            cls._instance.stack_by_thread[threading.get_ident()] = cls._instance.main_stack
+            cls._instance.stack_by_thread[
+                threading.get_ident()
+            ] = cls._instance.main_stack
 
             #  TODO: remove first-class support for re-ranker and potentially combine with RM to form a pipeline of sorts
             #  eg: RetrieveThenRerankPipeline(RetrievalModel, Reranker)
diff --git a/dsp/utils/settings_v2.py b/dsp/utils/settings_v2.py
index 2998ca7cd4..557ed9f79c 100644
--- a/dsp/utils/settings_v2.py
+++ b/dsp/utils/settings_v2.py
@@ -3,19 +3,24 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import copy
 
+
 class Settings:
     def __init__(self):
         # A lock for ensuring thread-safety when accessing _parent_configs
         self._lock = threading.Lock()
-        
+
         # Dictionary to hold parent thread configurations
         self._parent_configs = {}
-        
+
         # Using thread-local storage to ensure that each thread has its own configuration stack
         self._local = threading.local()
 
     def _get_current_config(self):
-        return self._local.config_stack[-1] if hasattr(self._local, 'config_stack') and self._local.config_stack else {}
+        return (
+            self._local.config_stack[-1]
+            if hasattr(self._local, "config_stack") and self._local.config_stack
+            else {}
+        )
 
     def initialize_for_thread(self, parent_tid):
         """Initialize thread-local data for a new thread using its parent's config."""
@@ -28,17 +33,21 @@ def initialize_for_thread(self, parent_tid):
 
     @contextmanager
     def context(self, **kwargs):
-        current_config = copy.deepcopy(self._get_current_config())  # Deep copy the current configuration
+        current_config = copy.deepcopy(
+            self._get_current_config()
+        )  # Deep copy the current configuration
         current_config.update(kwargs)
-        
-        if not hasattr(self._local, 'config_stack'):
+
+        if not hasattr(self._local, "config_stack"):
             self._local.config_stack = []
-        
+
         self._local.config_stack.append(current_config)
 
         # Register the modified config as the potential parent config
         with self._lock:
-            self._parent_configs[threading.get_ident()] = copy.deepcopy(current_config)  # Deep copy to ensure immutability
+            self._parent_configs[threading.get_ident()] = copy.deepcopy(
+                current_config
+            )  # Deep copy to ensure immutability
 
         try:
             yield
@@ -49,6 +58,7 @@ def context(self, **kwargs):
             with self._lock:
                 self._parent_configs.pop(threading.get_ident(), None)
 
+
 # Singleton instance
 dsp_settings = Settings()
 
@@ -61,7 +71,9 @@ def thread_wrapper(program, parent_tid, *args, **kwargs):
 
 # Example test
 def sample_program(arg):
-    print(f"Thread {threading.get_ident()} with arg={arg} has config: {dsp_settings._get_current_config()}")
+    print(
+        f"Thread {threading.get_ident()} with arg={arg} has config: {dsp_settings._get_current_config()}"
+    )
 
 
 def main():
@@ -69,12 +81,17 @@ def main():
 
     with dsp_settings.context(a=10, b=20):  # Setting main thread's context
         with ThreadPoolExecutor(max_workers=2) as executor:
-            futures = {executor.submit(thread_wrapper, sample_program, parent_tid, arg) for arg in range(3)}
+            futures = {
+                executor.submit(thread_wrapper, sample_program, parent_tid, arg)
+                for arg in range(3)
+            }
 
             for future in as_completed(futures):
-                res = future.result()
+                future.result()
 
-        print(f"Main thread {parent_tid} config after threads: {dsp_settings._get_current_config()}")
+        print(
+            f"Main thread {parent_tid} config after threads: {dsp_settings._get_current_config()}"
+        )
 
 
 if __name__ == "__main__":
diff --git a/dsp/utils/utils.py b/dsp/utils/utils.py
index 7e93933e44..458a4ab452 100644
--- a/dsp/utils/utils.py
+++ b/dsp/utils/utils.py
@@ -4,6 +4,7 @@
 import itertools
 
 from collections import defaultdict
+import copy
 
 
 def print_message(*s, condition=True, pad=False, sep=None):
@@ -75,25 +76,25 @@ def batch(group, bsize, provide_offset=False):
 #     __delattr__ = dict.__delitem__
 
 
-import copy
-
 class dotdict(dict):
     def __getattr__(self, key):
-        if key.startswith('__') and key.endswith('__'):
+        if key.startswith("__") and key.endswith("__"):
             return super().__getattr__(key)
         try:
             return self[key]
         except KeyError:
-            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{key}'")
+            raise AttributeError(
+                f"'{type(self).__name__}' object has no attribute '{key}'"
+            )
 
     def __setattr__(self, key, value):
-        if key.startswith('__') and key.endswith('__'):
+        if key.startswith("__") and key.endswith("__"):
             super().__setattr__(key, value)
         else:
             self[key] = value
 
     def __delattr__(self, key):
-        if key.startswith('__') and key.endswith('__'):
+        if key.startswith("__") and key.endswith("__"):
             super().__delattr__(key)
         else:
             del self[key]
@@ -234,7 +235,7 @@ def load_batch_backgrounds(args, qids):
     for qid in qids:
         back = args.qid2backgrounds[qid]
 
-        if len(back) and type(back[0]) == int:
+        if len(back) and isinstance(back[0], int):
             x = [args.collection[pid] for pid in back]
         else:
             x = [args.collectionX.get(pid, "") for pid in back]
diff --git a/dspy/datasets/__init__.py b/dspy/datasets/__init__.py
index de40c11566..bcb093b839 100644
--- a/dspy/datasets/__init__.py
+++ b/dspy/datasets/__init__.py
@@ -1,3 +1,3 @@
 from .dataset import Dataset
 from .hotpotqa import HotPotQA
-from .colors import Colors
\ No newline at end of file
+from .colors import Colors
diff --git a/dspy/datasets/colors.py b/dspy/datasets/colors.py
index 265badf270..844ba148c3 100644
--- a/dspy/datasets/colors.py
+++ b/dspy/datasets/colors.py
@@ -2,7 +2,146 @@
 from dspy.datasets.dataset import Dataset
 
 ### A bunch of colors, originally from matplotlib
-all_colors = ['alice blue', 'dodger blue', 'light sky blue', 'deep sky blue', 'sky blue', 'steel blue', 'light steel blue', 'medium blue', 'navy blue', 'blue', 'royal blue', 'cadet blue', 'cornflower blue', 'medium slate blue', 'slate blue', 'dark slate blue', 'powder blue', 'turquoise', 'dark turquoise', 'medium turquoise', 'pale turquoise', 'light sea green', 'medium sea green', 'sea green', 'forest green', 'green yellow', 'lime green', 'dark green', 'green', 'lime', 'chartreuse', 'lawn green', 'yellow green', 'olive green', 'dark olive green', 'medium spring green', 'spring green', 'medium aquamarine', 'aquamarine', 'aqua', 'cyan', 'dark cyan', 'teal', 'medium orchid', 'dark orchid', 'orchid', 'blue violet', 'violet', 'dark violet', 'plum', 'thistle', 'magenta', 'fuchsia', 'dark magenta', 'medium purple', 'purple', 'rebecca purple', 'dark red', 'fire brick', 'indian red', 'light coral', 'dark salmon', 'light salmon', 'salmon', 'red', 'crimson', 'tomato', 'coral', 'orange red', 'dark orange', 'orange', 'yellow', 'gold', 'light goldenrod yellow', 'pale goldenrod', 'goldenrod', 'dark goldenrod', 'beige', 'moccasin', 'blanched almond', 'navajo white', 'antique white', 'bisque', 'burlywood', 'dark khaki', 'khaki', 'tan', 'wheat', 'snow', 'floral white', 'old lace', 'ivory', 'linen', 'seashell', 'honeydew', 'mint cream', 'azure', 'lavender', 'ghost white', 'white smoke', 'gainsboro', 'light gray', 'silver', 'dark gray', 'gray', 'dim gray', 'slate gray', 'light slate gray', 'dark slate gray', 'black', 'medium violet red', 'pale violet red', 'deep pink', 'hot pink', 'light pink', 'pink', 'peach puff', 'rosy brown', 'saddle brown', 'sandy brown', 'chocolate', 'peru', 'sienna', 'brown', 'maroon', 'white', 'misty rose', 'lavender blush', 'papaya whip', 'lemon chiffon', 'light yellow', 'corn silk', 'pale green', 'light green', 'olive drab', 'olive', 'dark sea green']
+all_colors = [
+    "alice blue",
+    "dodger blue",
+    "light sky blue",
+    "deep sky blue",
+    "sky blue",
+    "steel blue",
+    "light steel blue",
+    "medium blue",
+    "navy blue",
+    "blue",
+    "royal blue",
+    "cadet blue",
+    "cornflower blue",
+    "medium slate blue",
+    "slate blue",
+    "dark slate blue",
+    "powder blue",
+    "turquoise",
+    "dark turquoise",
+    "medium turquoise",
+    "pale turquoise",
+    "light sea green",
+    "medium sea green",
+    "sea green",
+    "forest green",
+    "green yellow",
+    "lime green",
+    "dark green",
+    "green",
+    "lime",
+    "chartreuse",
+    "lawn green",
+    "yellow green",
+    "olive green",
+    "dark olive green",
+    "medium spring green",
+    "spring green",
+    "medium aquamarine",
+    "aquamarine",
+    "aqua",
+    "cyan",
+    "dark cyan",
+    "teal",
+    "medium orchid",
+    "dark orchid",
+    "orchid",
+    "blue violet",
+    "violet",
+    "dark violet",
+    "plum",
+    "thistle",
+    "magenta",
+    "fuchsia",
+    "dark magenta",
+    "medium purple",
+    "purple",
+    "rebecca purple",
+    "dark red",
+    "fire brick",
+    "indian red",
+    "light coral",
+    "dark salmon",
+    "light salmon",
+    "salmon",
+    "red",
+    "crimson",
+    "tomato",
+    "coral",
+    "orange red",
+    "dark orange",
+    "orange",
+    "yellow",
+    "gold",
+    "light goldenrod yellow",
+    "pale goldenrod",
+    "goldenrod",
+    "dark goldenrod",
+    "beige",
+    "moccasin",
+    "blanched almond",
+    "navajo white",
+    "antique white",
+    "bisque",
+    "burlywood",
+    "dark khaki",
+    "khaki",
+    "tan",
+    "wheat",
+    "snow",
+    "floral white",
+    "old lace",
+    "ivory",
+    "linen",
+    "seashell",
+    "honeydew",
+    "mint cream",
+    "azure",
+    "lavender",
+    "ghost white",
+    "white smoke",
+    "gainsboro",
+    "light gray",
+    "silver",
+    "dark gray",
+    "gray",
+    "dim gray",
+    "slate gray",
+    "light slate gray",
+    "dark slate gray",
+    "black",
+    "medium violet red",
+    "pale violet red",
+    "deep pink",
+    "hot pink",
+    "light pink",
+    "pink",
+    "peach puff",
+    "rosy brown",
+    "saddle brown",
+    "sandy brown",
+    "chocolate",
+    "peru",
+    "sienna",
+    "brown",
+    "maroon",
+    "white",
+    "misty rose",
+    "lavender blush",
+    "papaya whip",
+    "lemon chiffon",
+    "light yellow",
+    "corn silk",
+    "pale green",
+    "light green",
+    "olive drab",
+    "olive",
+    "dark sea green",
+]
+
 
 class Colors(Dataset):
     def __init__(self, sort_by_suffix=True, *args, **kwargs) -> None:
@@ -11,7 +150,9 @@ def __init__(self, sort_by_suffix=True, *args, **kwargs) -> None:
         self.sort_by_suffix = sort_by_suffix
         colors = self.sorted_by_suffix(all_colors)
 
-        train_size = int(len(colors) * 0.6) # chosen to ensure that similar colors aren't repeated between train and dev
+        train_size = int(
+            len(colors) * 0.6
+        )  # chosen to ensure that similar colors aren't repeated between train and dev
         train_colors, dev_colors = colors[:train_size], colors[train_size:]
 
         self._train = [dict(color=color) for color in train_colors]
@@ -19,7 +160,7 @@ def __init__(self, sort_by_suffix=True, *args, **kwargs) -> None:
 
         random.Random(0).shuffle(self._train)
         random.Random(0).shuffle(self._dev)
-    
+
     def sorted_by_suffix(self, colors):
         if not self.sort_by_suffix:
             return colors
@@ -27,6 +168,6 @@ def sorted_by_suffix(self, colors):
         if isinstance(colors[0], str):
             sorted_colors = sorted(colors, key=lambda x: x[::-1])
         else:
-            sorted_colors = sorted(colors, key=lambda x: x['color'][::-1])
+            sorted_colors = sorted(colors, key=lambda x: x["color"][::-1])
 
         return sorted_colors
diff --git a/dspy/datasets/dataset.py b/dspy/datasets/dataset.py
index 9cd279199f..6ed1ca065c 100644
--- a/dspy/datasets/dataset.py
+++ b/dspy/datasets/dataset.py
@@ -4,8 +4,11 @@
 from dspy import Example
 from dsp.utils import dotdict
 
+
 class Dataset:
-    def __init__(self, train_seed=0, train_size=None, eval_seed=0, dev_size=None, test_size=None):
+    def __init__(
+        self, train_seed=0, train_size=None, eval_seed=0, dev_size=None, test_size=None
+    ):
         self.train_size = train_size
         self.train_seed = train_seed
         self.dev_size = dev_size
@@ -16,7 +19,14 @@ def __init__(self, train_seed=0, train_size=None, eval_seed=0, dev_size=None, te
 
         self.name = self.__class__.__name__
 
-    def reset_seeds(self, train_seed=None, train_size=None, eval_seed=None, dev_size=None, test_size=None):
+    def reset_seeds(
+        self,
+        train_seed=None,
+        train_size=None,
+        eval_seed=None,
+        dev_size=None,
+        test_size=None,
+    ):
         self.train_size = train_size if train_size is not None else self.train_size
         self.train_seed = train_seed if train_seed is not None else self.train_seed
         self.dev_size = dev_size if dev_size is not None else self.dev_size
@@ -24,41 +34,47 @@ def reset_seeds(self, train_seed=None, train_size=None, eval_seed=None, dev_size
         self.test_size = test_size if test_size is not None else self.test_size
         self.test_seed = eval_seed if eval_seed is not None else self.test_seed
 
-        if hasattr(self, '_train_'):
+        if hasattr(self, "_train_"):
             del self._train_
-        
-        if hasattr(self, '_dev_'):
+
+        if hasattr(self, "_dev_"):
             del self._dev_
-        
-        if hasattr(self, '_test_'):
+
+        if hasattr(self, "_test_"):
             del self._test_
 
     @property
     def train(self):
-        if not hasattr(self, '_train_'):
-            self._train_ = self._shuffle_and_sample('train', self._train, self.train_size, self.train_seed)
+        if not hasattr(self, "_train_"):
+            self._train_ = self._shuffle_and_sample(
+                "train", self._train, self.train_size, self.train_seed
+            )
 
         return self._train_
 
     @property
     def dev(self):
-        if not hasattr(self, '_dev_'):
-            self._dev_ = self._shuffle_and_sample('dev', self._dev, self.dev_size, self.dev_seed)
+        if not hasattr(self, "_dev_"):
+            self._dev_ = self._shuffle_and_sample(
+                "dev", self._dev, self.dev_size, self.dev_seed
+            )
 
         return self._dev_
-    
+
     @property
     def test(self):
-        if not hasattr(self, '_test_'):
-            self._test_ = self._shuffle_and_sample('test', self._test, self.test_size, self.test_seed)
+        if not hasattr(self, "_test_"):
+            self._test_ = self._shuffle_and_sample(
+                "test", self._test, self.test_size, self.test_seed
+            )
 
         return self._test_
 
     def _shuffle_and_sample(self, split, data, size, seed=0):
-        '''
-            The setting (seed=s, size=N) is always a subset
-            of the setting (seed=s, size=M) for N < M.
-        '''
+        """
+        The setting (seed=s, size=N) is always a subset
+        of the setting (seed=s, size=M) for N < M.
+        """
 
         data = list(data)
 
@@ -72,8 +88,10 @@ def _shuffle_and_sample(self, split, data, size, seed=0):
         output = []
 
         for example in data:
-            output.append(Example(**example, dspy_uuid=str(uuid.uuid4()), dspy_split=split))
-        
+            output.append(
+                Example(**example, dspy_uuid=str(uuid.uuid4()), dspy_split=split)
+            )
+
         # TODO: NOTE: Ideally we use these uuids for dedup internally, for demos and internal train/val splits.
         # Now, some tasks (like convQA and Colors) have overlapping examples. Here, we should allow the user to give us
         # a uuid field that would respect this in some way. This means that we need a more refined concept that
@@ -83,30 +101,44 @@ def _shuffle_and_sample(self, split, data, size, seed=0):
         # rng.shuffle(data)
 
         return output
-    
+
     @classmethod
-    def prepare_by_seed(cls, train_seeds=[1,2,3,4,5], train_size=16, dev_size=1000,
-                        divide_eval_per_seed=True, eval_seed=2023, **kwargs):
-        
-        data_args = dotdict(train_size=train_size, eval_seed=eval_seed, dev_size=dev_size, test_size=0, **kwargs)
+    def prepare_by_seed(
+        cls,
+        train_seeds=[1, 2, 3, 4, 5],
+        train_size=16,
+        dev_size=1000,
+        divide_eval_per_seed=True,
+        eval_seed=2023,
+        **kwargs,
+    ):
+        data_args = dotdict(
+            train_size=train_size,
+            eval_seed=eval_seed,
+            dev_size=dev_size,
+            test_size=0,
+            **kwargs,
+        )
         dataset = cls(**data_args)
 
         eval_set = dataset.dev
         eval_sets, train_sets = [], []
 
-        examples_per_seed = dev_size // len(train_seeds) if divide_eval_per_seed else dev_size
+        examples_per_seed = (
+            dev_size // len(train_seeds) if divide_eval_per_seed else dev_size
+        )
         eval_offset = 0
 
         for train_seed in train_seeds:
             data_args.train_seed = train_seed
             dataset.reset_seeds(**data_args)
 
-            eval_sets.append(eval_set[eval_offset:eval_offset+examples_per_seed])
+            eval_sets.append(eval_set[eval_offset : eval_offset + examples_per_seed])
             train_sets.append(dataset.train)
 
             assert len(eval_sets[-1]) == examples_per_seed, len(eval_sets[-1])
             assert len(train_sets[-1]) == train_size, len(train_sets[-1])
-            
+
             if divide_eval_per_seed:
                 eval_offset += examples_per_seed
 
diff --git a/dspy/datasets/gsm8k.py b/dspy/datasets/gsm8k.py
index 0795518e6d..e1e796d115 100644
--- a/dspy/datasets/gsm8k.py
+++ b/dspy/datasets/gsm8k.py
@@ -2,41 +2,45 @@
 import random
 
 from datasets import load_dataset
-from dspy.datasets.dataset import Dataset
+
 
 class GSM8K:
     def __init__(self) -> None:
         super().__init__()
         self.do_shuffle = False
 
-        dataset = load_dataset("gsm8k", 'main')
+        dataset = load_dataset("gsm8k", "main")
 
-        hf_official_train = dataset['train']
-        hf_official_test = dataset['test']
+        hf_official_train = dataset["train"]
+        hf_official_test = dataset["test"]
         official_train = []
         official_test = []
 
         for example in tqdm.tqdm(hf_official_train):
-            question = example['question']
+            question = example["question"]
+
+            answer = example["answer"].strip().split()
+            assert answer[-2] == "####"
 
-            answer = example['answer'].strip().split()
-            assert answer[-2] == '####'
-            
-            gold_reasoning = ' '.join(answer[:-2])
-            answer = str(int(answer[-1].replace(',', '')))
+            gold_reasoning = " ".join(answer[:-2])
+            answer = str(int(answer[-1].replace(",", "")))
 
-            official_train.append(dict(question=question, gold_reasoning=gold_reasoning, answer=answer))
+            official_train.append(
+                dict(question=question, gold_reasoning=gold_reasoning, answer=answer)
+            )
 
         for example in tqdm.tqdm(hf_official_test):
-            question = example['question']
+            question = example["question"]
+
+            answer = example["answer"].strip().split()
+            assert answer[-2] == "####"
 
-            answer = example['answer'].strip().split()
-            assert answer[-2] == '####'
-            
-            gold_reasoning = ' '.join(answer[:-2])
-            answer = str(int(answer[-1].replace(',', '')))
+            gold_reasoning = " ".join(answer[:-2])
+            answer = str(int(answer[-1].replace(",", "")))
 
-            official_test.append(dict(question=question, gold_reasoning=gold_reasoning, answer=answer))
+            official_test.append(
+                dict(question=question, gold_reasoning=gold_reasoning, answer=answer)
+            )
 
         rng = random.Random(0)
         rng.shuffle(official_train)
@@ -50,9 +54,9 @@ def __init__(self) -> None:
 
         import dspy
 
-        trainset = [dspy.Example(**x).with_inputs('question') for x in trainset]
-        devset = [dspy.Example(**x).with_inputs('question') for x in devset]
-        testset = [dspy.Example(**x).with_inputs('question') for x in testset]
+        trainset = [dspy.Example(**x).with_inputs("question") for x in trainset]
+        devset = [dspy.Example(**x).with_inputs("question") for x in devset]
+        testset = [dspy.Example(**x).with_inputs("question") for x in testset]
 
         # print(f"Trainset size: {len(trainset)}")
         # print(f"Devset size: {len(devset)}")
@@ -63,24 +67,27 @@ def __init__(self) -> None:
         self.test = testset
 
 
-
 def parse_integer_answer(answer, only_first_line=True):
     try:
         if only_first_line:
-            answer = answer.strip().split('\n')[0]
+            answer = answer.strip().split("\n")[0]
 
         # find the last token that has a number in it
-        answer = [token for token in answer.split() if any(c.isdigit() for c in token)][-1]
-        answer = answer.split('.')[0]
-        answer = ''.join([c for c in answer if c.isdigit()])
+        answer = [token for token in answer.split() if any(c.isdigit() for c in token)][
+            -1
+        ]
+        answer = answer.split(".")[0]
+        answer = "".join([c for c in answer if c.isdigit()])
         answer = int(answer)
 
     except (ValueError, IndexError):
         # print(answer)
         answer = 0
-    
+
     return answer
 
 
 def gsm8k_metric(gold, pred, trace=None):
-    return int(parse_integer_answer(str(gold.answer))) == int(parse_integer_answer(str(pred.answer)))
+    return int(parse_integer_answer(str(gold.answer))) == int(
+        parse_integer_answer(str(pred.answer))
+    )
diff --git a/dspy/datasets/hotpotqa.py b/dspy/datasets/hotpotqa.py
index f1cc734824..6b1315a169 100644
--- a/dspy/datasets/hotpotqa.py
+++ b/dspy/datasets/hotpotqa.py
@@ -5,62 +5,76 @@
 
 
 class HotPotQA(Dataset):
-    def __init__(self, *args, only_hard_examples=True, keep_details='dev_titles', unofficial_dev=True, **kwargs) -> None:
+    def __init__(
+        self,
+        *args,
+        only_hard_examples=True,
+        keep_details="dev_titles",
+        unofficial_dev=True,
+        **kwargs,
+    ) -> None:
         super().__init__(*args, **kwargs)
-        assert only_hard_examples, "Care must be taken when adding support for easy examples." \
-                                   "Dev must be all hard to match official dev, but training can be flexible."
-        
-        hf_official_train = load_dataset("hotpot_qa", 'fullwiki', split='train')
-        hf_official_dev = load_dataset("hotpot_qa", 'fullwiki', split='validation')
+        assert only_hard_examples, (
+            "Care must be taken when adding support for easy examples."
+            "Dev must be all hard to match official dev, but training can be flexible."
+        )
+
+        hf_official_train = load_dataset("hotpot_qa", "fullwiki", split="train")
+        hf_official_dev = load_dataset("hotpot_qa", "fullwiki", split="validation")
 
         official_train = []
         for raw_example in hf_official_train:
-            if raw_example['level'] == 'hard':
+            if raw_example["level"] == "hard":
                 if keep_details is True:
-                    keys = ['id', 'question', 'answer', 'type', 'supporting_facts']
-                elif keep_details == 'dev_titles':
-                    keys = ['question', 'answer', 'supporting_facts']
+                    keys = ["id", "question", "answer", "type", "supporting_facts"]
+                elif keep_details == "dev_titles":
+                    keys = ["question", "answer", "supporting_facts"]
                 else:
-                    keys = ['question', 'answer']
+                    keys = ["question", "answer"]
 
                 example = {k: raw_example[k] for k in keys}
-                
-                if 'supporting_facts' in example:
-                    example['gold_titles'] = set(example['supporting_facts']['title'])
-                    del example['supporting_facts']
+
+                if "supporting_facts" in example:
+                    example["gold_titles"] = set(example["supporting_facts"]["title"])
+                    del example["supporting_facts"]
 
                 official_train.append(example)
 
         rng = random.Random(0)
         rng.shuffle(official_train)
 
-        self._train = official_train[:len(official_train)*75//100]
+        self._train = official_train[: len(official_train) * 75 // 100]
 
         if unofficial_dev:
-            self._dev = official_train[len(official_train)*75//100:]
+            self._dev = official_train[len(official_train) * 75 // 100 :]
         else:
             self._dev = None
 
         for example in self._train:
-            if keep_details == 'dev_titles':
-                del example['gold_titles']
-        
+            if keep_details == "dev_titles":
+                del example["gold_titles"]
+
         test = []
         for raw_example in hf_official_dev:
-            assert raw_example['level'] == 'hard'
-            example = {k: raw_example[k] for k in ['id', 'question', 'answer', 'type', 'supporting_facts']}
-            if 'supporting_facts' in example:
-                example['gold_titles'] = set(example['supporting_facts']['title'])
-                del example['supporting_facts']
+            assert raw_example["level"] == "hard"
+            example = {
+                k: raw_example[k]
+                for k in ["id", "question", "answer", "type", "supporting_facts"]
+            }
+            if "supporting_facts" in example:
+                example["gold_titles"] = set(example["supporting_facts"]["title"])
+                del example["supporting_facts"]
             test.append(example)
 
         self._test = test
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     from dsp.utils import dotdict
 
-    data_args = dotdict(train_seed=1, train_size=16, eval_seed=2023, dev_size=200*5, test_size=0)
+    data_args = dotdict(
+        train_seed=1, train_size=16, eval_seed=2023, dev_size=200 * 5, test_size=0
+    )
     dataset = HotPotQA(**data_args)
 
     print(dataset)
@@ -80,4 +94,4 @@ def __init__(self, *args, only_hard_examples=True, keep_details='dev_titles', un
 Both London and German have seen attacks during war, there was one specific type of attack that Germany called the blitz, what did London call a similar attack?
 Pre-Madonna was a collection of demos by the singer who was a leading presence during the emergence of what network?
 Alan Mills composed the classic folk song that tells the story of what? 
-"""
\ No newline at end of file
+"""
diff --git a/dspy/evaluate/__init__.py b/dspy/evaluate/__init__.py
index 0b71ccfac7..9692adff24 100644
--- a/dspy/evaluate/__init__.py
+++ b/dspy/evaluate/__init__.py
@@ -1,3 +1,3 @@
 from .evaluate import Evaluate
 from .metrics import *
-from .auto_evaluation import *
\ No newline at end of file
+from .auto_evaluation import *
diff --git a/dspy/evaluate/auto_evaluation.py b/dspy/evaluate/auto_evaluation.py
index 1ec659540c..d49e25058b 100644
--- a/dspy/evaluate/auto_evaluation.py
+++ b/dspy/evaluate/auto_evaluation.py
@@ -1,4 +1,5 @@
-import dspy 
+import dspy
+
 
 class AnswerCorrectnessSignature(dspy.Signature):
     """Verify that the predicted answer matches the gold answer."""
@@ -6,29 +7,37 @@ class AnswerCorrectnessSignature(dspy.Signature):
     question = dspy.InputField()
     gold_answer = dspy.InputField(desc="correct answer for question")
     predicted_answer = dspy.InputField(desc="predicted answer for question")
-    is_correct = dspy.OutputField(desc='True or False')
+    is_correct = dspy.OutputField(desc="True or False")
+
 
 class AnswerCorrectness(dspy.Module):
     def __init__(self):
         super().__init__()
         self.evaluate_correctness = dspy.ChainOfThought(AnswerCorrectnessSignature)
-    
+
     def forward(self, question, gold_answer, predicted_answer):
-        return self.evaluate_correctness(question=question, gold_answer=gold_answer, predicted_answer=predicted_answer)
+        return self.evaluate_correctness(
+            question=question,
+            gold_answer=gold_answer,
+            predicted_answer=predicted_answer,
+        )
 
 
 class AnswerFaithfulnessSignature(dspy.Signature):
     """Verify that the predicted answer is based on the provided context."""
-    
+
     context = dspy.InputField(desc="relevant facts for producing answer")
     question = dspy.InputField()
     answer = dspy.InputField(desc="often between 1 and 5 words")
-    is_faithful = dspy.OutputField(desc='True or False')
+    is_faithful = dspy.OutputField(desc="True or False")
+
 
 class AnswerFaithfulness(dspy.Module):
     def __init__(self):
         super().__init__()
         self.evaluate_faithfulness = dspy.ChainOfThought(AnswerFaithfulnessSignature)
-    
+
     def forward(self, context, question, answer):
-        return self.evaluate_faithfulness(context=context, question=question, answer=answer)
+        return self.evaluate_faithfulness(
+            context=context, question=question, answer=answer
+        )
diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index ee9a8b2aca..ce1c78be32 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -1,6 +1,3 @@
-from openai import InvalidRequestError
-from openai.error import APIError
-
 import dsp
 import tqdm
 import threading
@@ -9,8 +6,7 @@
 from IPython.display import display as ipython_display, HTML
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-from dsp.utils import EM, F1, HotPotF1
-from dsp.evaluation.utils import *
+# from dsp.evaluation.utils import *
 
 """
 TODO: Counting failures and having a max_failure count. When that is exceeded (also just at the end),
@@ -19,8 +15,17 @@
 
 
 class Evaluate:
-    def __init__(self, *, devset, metric=None, num_threads=1, display_progress=False,
-                 display_table=False, display=True, max_errors=5):
+    def __init__(
+        self,
+        *,
+        devset,
+        metric=None,
+        num_threads=1,
+        display_progress=False,
+        display_table=False,
+        display=True,
+        max_errors=5,
+    ):
         self.devset = devset
         self.metric = metric
         self.num_threads = num_threads
@@ -35,8 +40,10 @@ def _execute_single_thread(self, wrapped_program, devset, display_progress):
         ncorrect = 0
         ntotal = 0
         reordered_devset = []
-        
-        pbar = tqdm.tqdm(total=len(devset), dynamic_ncols=True, disable=not display_progress)
+
+        pbar = tqdm.tqdm(
+            total=len(devset), dynamic_ncols=True, disable=not display_progress
+        )
         for idx, arg in devset:
             example_idx, example, prediction, score = wrapped_program(idx, arg)
             reordered_devset.append((example_idx, example, prediction, score))
@@ -44,17 +51,23 @@ def _execute_single_thread(self, wrapped_program, devset, display_progress):
             ntotal += 1
             self._update_progress(pbar, ncorrect, ntotal)
         pbar.close()
-        
+
         return reordered_devset, ncorrect, ntotal
 
-    def _execute_multi_thread(self, wrapped_program, devset, num_threads, display_progress):
+    def _execute_multi_thread(
+        self, wrapped_program, devset, num_threads, display_progress
+    ):
         ncorrect = 0
         ntotal = 0
         reordered_devset = []
-        
+
         with ThreadPoolExecutor(max_workers=num_threads) as executor:
-            futures = {executor.submit(wrapped_program, idx, arg) for idx, arg in devset}
-            pbar = tqdm.tqdm(total=len(devset), dynamic_ncols=True, disable=not display_progress)
+            futures = {
+                executor.submit(wrapped_program, idx, arg) for idx, arg in devset
+            }
+            pbar = tqdm.tqdm(
+                total=len(devset), dynamic_ncols=True, disable=not display_progress
+            )
 
             for future in as_completed(futures):
                 example_idx, example, prediction, score = future.result()
@@ -67,17 +80,31 @@ def _execute_multi_thread(self, wrapped_program, devset, num_threads, display_pr
         return reordered_devset, ncorrect, ntotal
 
     def _update_progress(self, pbar, ncorrect, ntotal):
-        pbar.set_description(f"Average Metric: {ncorrect} / {ntotal}  ({round(100 * ncorrect / ntotal, 1)})")
+        pbar.set_description(
+            f"Average Metric: {ncorrect} / {ntotal}  ({round(100 * ncorrect / ntotal, 1)})"
+        )
         pbar.update()
 
-    def __call__(self, program, metric=None, devset=None, num_threads=None,
-                 display_progress=None, display_table=None, display=None,
-                 return_all_scores=False):
+    def __call__(
+        self,
+        program,
+        metric=None,
+        devset=None,
+        num_threads=None,
+        display_progress=None,
+        display_table=None,
+        display=None,
+        return_all_scores=False,
+    ):
         metric = metric if metric is not None else self.metric
         devset = devset if devset is not None else self.devset
         num_threads = num_threads if num_threads is not None else self.num_threads
-        display_progress = display_progress if display_progress is not None else self.display_progress
-        display_table = display_table if display_table is not None else self.display_table
+        display_progress = (
+            display_progress if display_progress is not None else self.display_progress
+        )
+        display_table = (
+            display_table if display_table is not None else self.display_table
+        )
 
         display = self.display if display is None else display
         display_progress = display_progress and display
@@ -85,16 +112,22 @@ def __call__(self, program, metric=None, devset=None, num_threads=None,
 
         def wrapped_program(example_idx, example):
             # NOTE: TODO: Won't work if threads create threads!
-            creating_new_thread = threading.get_ident() not in dsp.settings.stack_by_thread
+            creating_new_thread = (
+                threading.get_ident() not in dsp.settings.stack_by_thread
+            )
             if creating_new_thread:
-                dsp.settings.stack_by_thread[threading.get_ident()] = list(dsp.settings.main_stack)
+                dsp.settings.stack_by_thread[threading.get_ident()] = list(
+                    dsp.settings.main_stack
+                )
                 # print(threading.get_ident(), dsp.settings.stack_by_thread[threading.get_ident()])
 
             # print(type(example), example)
 
             try:
                 prediction = program(**example.inputs())
-                score = metric(example, prediction)  # FIXME: TODO: What's the right order? Maybe force name-based kwargs!
+                score = metric(
+                    example, prediction
+                )  # FIXME: TODO: What's the right order? Maybe force name-based kwargs!
                 return example_idx, example, prediction, score
             except Exception as e:
                 with self.error_lock:
@@ -111,17 +144,26 @@ def wrapped_program(example_idx, example):
         devset = list(enumerate(devset))
 
         if num_threads == 1:
-            reordered_devset, ncorrect, ntotal = self._execute_single_thread(wrapped_program, devset, display_progress)
+            reordered_devset, ncorrect, ntotal = self._execute_single_thread(
+                wrapped_program, devset, display_progress
+            )
         else:
-            reordered_devset, ncorrect, ntotal = self._execute_multi_thread(wrapped_program, devset, num_threads, display_progress)
+            reordered_devset, ncorrect, ntotal = self._execute_multi_thread(
+                wrapped_program, devset, num_threads, display_progress
+            )
 
         if display:
-            print(f"Average Metric: {ncorrect} / {ntotal}  ({round(100 * ncorrect / ntotal, 1)}%)")
+            print(
+                f"Average Metric: {ncorrect} / {ntotal}  ({round(100 * ncorrect / ntotal, 1)}%)"
+            )
 
         predicted_devset = sorted(reordered_devset)
 
         # data = [{**example, **prediction, 'correct': score} for example, prediction, score in zip(reordered_devset, preds, scores)]
-        data = [merge_dicts(example, prediction) | {'correct': score} for _, example, prediction, score in predicted_devset]
+        data = [
+            merge_dicts(example, prediction) | {"correct": score}
+            for _, example, prediction, score in predicted_devset
+        ]
 
         df = pd.DataFrame(data)
 
@@ -130,7 +172,7 @@ def wrapped_program(example_idx, example):
 
         # Rename the 'correct' column to the name of the metric function
         metric_name = metric.__name__
-        df.rename(columns={'correct': metric_name}, inplace=True)
+        df.rename(columns={"correct": metric_name}, inplace=True)
 
         if display_table:
             if isinstance(display_table, int):
@@ -141,7 +183,7 @@ def wrapped_program(example_idx, example):
                 truncated_rows = 0
 
             styled_df = configure_dataframe_display(df_to_display, metric_name)
-            
+
             ipython_display(styled_df)
 
             if truncated_rows > 0:
@@ -157,9 +199,11 @@ def wrapped_program(example_idx, example):
                 </div>
                 """
                 ipython_display(HTML(message))
-                
+
         if return_all_scores:
-            return round(100 * ncorrect / ntotal, 2), [score for *_, score in predicted_devset]
+            return round(100 * ncorrect / ntotal, 2), [
+                score for *_, score in predicted_devset
+            ]
 
         return round(100 * ncorrect / ntotal, 2)
 
@@ -185,28 +229,36 @@ def truncate_cell(content):
     """Truncate content of a cell to 25 words."""
     words = str(content).split()
     if len(words) > 25:
-        return ' '.join(words[:25]) + '...'
+        return " ".join(words[:25]) + "..."
     return content
 
+
 def configure_dataframe_display(df, metric_name):
     """Set various pandas display options for DataFrame."""
     pd.options.display.max_colwidth = None
-    pd.set_option('display.max_colwidth', 15)  # Adjust the number as needed
-    pd.set_option('display.width', 400)  # Adjust
+    pd.set_option("display.max_colwidth", 15)  # Adjust the number as needed
+    pd.set_option("display.width", 400)  # Adjust
 
     # df[metric_name] = df[metric_name].apply(lambda x: f'✔️ [{x}]' if x is True else f'❌ [{x}]')
-    df.loc[:, metric_name] = df[metric_name].apply(lambda x: f'✔️ [{x}]' if x is True else f'❌ [{x}]')
+    df.loc[:, metric_name] = df[metric_name].apply(
+        lambda x: f"✔️ [{x}]" if x is True else f"❌ [{x}]"
+    )
 
     # Return styled DataFrame
-    return df.style.set_table_styles([
-        {'selector': 'th', 'props': [('text-align', 'left')]},
-        {'selector': 'td', 'props': [('text-align', 'left')]}
-    ]).set_properties(**{
-        'text-align': 'left',
-        'white-space': 'pre-wrap',
-        'word-wrap': 'break-word',
-        'max-width': '400px'
-    })
+    return df.style.set_table_styles(
+        [
+            {"selector": "th", "props": [("text-align", "left")]},
+            {"selector": "td", "props": [("text-align", "left")]},
+        ]
+    ).set_properties(
+        **{
+            "text-align": "left",
+            "white-space": "pre-wrap",
+            "word-wrap": "break-word",
+            "max-width": "400px",
+        }
+    )
+
 
 # FIXME: TODO: The merge_dicts stuff above is way too quick and dirty.
-# TODO: the display_table can't handle False but can handle 0! Not sure how it works with True exactly, probably fails too.
\ No newline at end of file
+# TODO: the display_table can't handle False but can handle 0! Not sure how it works with True exactly, probably fails too.
diff --git a/dspy/evaluate/metrics.py b/dspy/evaluate/metrics.py
index b965b33504..716f0ad4c7 100644
--- a/dspy/evaluate/metrics.py
+++ b/dspy/evaluate/metrics.py
@@ -1,22 +1,24 @@
 # TODO: This should move internally. Same for passage_match. dspy.metrics.answer_exact_match, dspy.metrics.answer_passage_match
 
 import dsp
-from dsp.utils import EM, normalize_text
+
 
 def answer_exact_match(example, pred, trace=None, frac=1.0):
-    assert(type(example.answer) is str or type(example.answer) is list)
-    
-    if type(example.answer) is str:
+    assert isinstance(example.answer, str) or isinstance(example.answer, list)
+
+    if isinstance(example.answer, str):
         return dsp.answer_match(pred.answer, [example.answer], frac=frac)
-    else: # type(example.answer) is list
+    else:  # type(example.answer) is list
         return dsp.answer_match(pred.answer, example.answer, frac=frac)
 
+
 answer_exact_match_str = dsp.answer_match
 
+
 def answer_passage_match(example, pred, trace=None):
-    assert(type(example.answer) is str or type(example.answer) is list)
-    
-    if type(example.answer) is str:
+    assert isinstance(example.answer, str) or isinstance(example.answer, list)
+
+    if isinstance(example.answer, str):
         return dsp.passage_match(pred.context, [example.answer])
-    else: # type(example.answer) is list
+    else:  # type(example.answer) is list
         return dsp.passage_match(pred.context, example.answer)
diff --git a/dspy/predict/__init__.py b/dspy/predict/__init__.py
index 2dec22fe54..41c60c508c 100644
--- a/dspy/predict/__init__.py
+++ b/dspy/predict/__init__.py
@@ -3,5 +3,5 @@
 from .multi_chain_comparison import MultiChainComparison
 from .chain_of_thought_with_hint import ChainOfThoughtWithHint
 from .react import ReAct
-from .aggregation import majority 
-from .program_of_thought import ProgramOfThought
\ No newline at end of file
+from .aggregation import majority
+from .program_of_thought import ProgramOfThought
diff --git a/dspy/predict/aggregation.py b/dspy/predict/aggregation.py
index 2212900c2d..b48f6ceba7 100644
--- a/dspy/predict/aggregation.py
+++ b/dspy/predict/aggregation.py
@@ -1,44 +1,53 @@
 from dspy.primitives.prediction import Prediction, Completions
 from dsp.utils import normalize_text
+import logging
 
+logger = logging.getLogger(__name__)
 
-default_normalize = lambda s: normalize_text(s) or None
+
+def default_normalize(s):
+    return normalize_text(s) or None
 
 
 def majority(prediction_or_completions, normalize=default_normalize, field=None):
     """
-        Returns the most common completion for the target field (or the last field) in the signature.
-        When normalize returns None, that completion is ignored.
-        In case of a tie, earlier completion are prioritized.
+    Returns the most common completion for the target field (or the last field) in the signature.
+    When normalize returns None, that completion is ignored.
+    In case of a tie, earlier completion are prioritized.
     """
 
-    assert any(isinstance(prediction_or_completions, t) for t in [Prediction, Completions, list])
-    input_type = type(prediction_or_completions)
+    assert any(
+        isinstance(prediction_or_completions, t)
+        for t in [Prediction, Completions, list]
+    )
+    # input_type = type(prediction_or_completions)
 
     # Get the completions
     if isinstance(prediction_or_completions, Prediction):
         completions = prediction_or_completions.completions
     else:
         completions = prediction_or_completions
-    
+
     try:
         signature = completions.signature
-    except:
+    except Exception as e:
+        logger.exception(e)
         signature = None
-    
+
     try:
         field = field if field else signature.fields[-1].output_variable
-    except:
+    except Exception as e:
+        logger.exception(e)
         field = field if field else list(completions[0].keys())[-1]
 
     # Normalize
     normalize = normalize if normalize else lambda x: x
     normalized_values = [normalize(completion[field]) for completion in completions]
     normalized_values_ = [x for x in normalized_values if x is not None]
-    
+
     # Count
     value_counts = {}
-    for value in (normalized_values_ or normalized_values):
+    for value in normalized_values_ or normalized_values:
         value_counts[value] = value_counts.get(value, 0) + 1
 
     majority_value = max(value_counts, key=value_counts.get)
@@ -47,9 +56,8 @@ def majority(prediction_or_completions, normalize=default_normalize, field=None)
     for completion in completions:
         if normalize(completion[field]) == majority_value:
             break
-    
+
     # if input_type == Prediction:
     return Prediction.from_completions([completion], signature=signature)
 
     return Completions([completion], signature=signature)
-
diff --git a/dspy/predict/chain_of_thought.py b/dspy/predict/chain_of_thought.py
index 4943856b96..330a1026de 100644
--- a/dspy/predict/chain_of_thought.py
+++ b/dspy/predict/chain_of_thought.py
@@ -3,7 +3,6 @@
 from .predict import Predict
 
 
-
 # TODO: FIXME: Insert this right before the *first* output field. Also rewrite this to use the new signature system.
 
 # TODO: This shouldn't inherit from Predict. It should be a module that has one or two predictors.
@@ -27,6 +26,7 @@ def forward(self, **kwargs):
 # How this should look like. But with also passing signature=simpler_signature to the predict module *if* deactivated.
 """
 
+
 class ChainOfThought(Predict):
     def __init__(self, signature, rationale_type=None, activated=True, **config):
         super().__init__(signature, **config)
@@ -36,22 +36,30 @@ def __init__(self, signature, rationale_type=None, activated=True, **config):
         signature = self.signature
         *keys, last_key = signature.kwargs.keys()
 
-        DEFAULT_RATIONALE_TYPE = dsp.Type(prefix="Reasoning: Let's think step by step in order to",
-                                          desc="${produce the " + last_key + "}. We ...")
+        DEFAULT_RATIONALE_TYPE = dsp.Type(
+            prefix="Reasoning: Let's think step by step in order to",
+            desc="${produce the " + last_key + "}. We ...",
+        )
 
         rationale_type = rationale_type or DEFAULT_RATIONALE_TYPE
-        
+
         extended_kwargs = {key: signature.kwargs[key] for key in keys}
-        extended_kwargs.update({'rationale': rationale_type, last_key: signature.kwargs[last_key]})
-        
-        self.extended_signature = dsp.Template(signature.instructions, **extended_kwargs)
-    
+        extended_kwargs.update(
+            {"rationale": rationale_type, last_key: signature.kwargs[last_key]}
+        )
+
+        self.extended_signature = dsp.Template(
+            signature.instructions, **extended_kwargs
+        )
+
     def forward(self, **kwargs):
         signature = self.signature
 
-        if self.activated is True or (self.activated is None and isinstance(dsp.settings.lm, dsp.GPT3)):
+        if self.activated is True or (
+            self.activated is None and isinstance(dsp.settings.lm, dsp.GPT3)
+        ):
             signature = self.extended_signature
-        
+
         return super().forward(signature=signature, **kwargs)
 
 
diff --git a/dspy/predict/chain_of_thought_with_hint.py b/dspy/predict/chain_of_thought_with_hint.py
index b968d0bd95..369824c8a1 100644
--- a/dspy/predict/chain_of_thought_with_hint.py
+++ b/dspy/predict/chain_of_thought_with_hint.py
@@ -3,9 +3,9 @@
 from .predict import Predict
 
 
-
 # TODO: FIXME: Insert this right before the *first* output field. Also rewrite this to use the new signature system.
 
+
 class ChainOfThoughtWithHint(Predict):
     def __init__(self, signature, rationale_type=None, activated=True, **config):
         super().__init__(signature, **config)
@@ -17,29 +17,45 @@ def __init__(self, signature, rationale_type=None, activated=True, **config):
 
         DEFAULT_HINT_TYPE = dsp.Type(prefix="Hint:", desc="${hint}")
 
-        DEFAULT_RATIONALE_TYPE = dsp.Type(prefix="Reasoning: Let's think step by step in order to",
-                                          desc="${produce the " + last_key + "}. We ...")
+        DEFAULT_RATIONALE_TYPE = dsp.Type(
+            prefix="Reasoning: Let's think step by step in order to",
+            desc="${produce the " + last_key + "}. We ...",
+        )
 
         rationale_type = rationale_type or DEFAULT_RATIONALE_TYPE
-        
+
         extended_kwargs1 = {key: signature.kwargs[key] for key in keys}
-        extended_kwargs1.update({'rationale': rationale_type, last_key: signature.kwargs[last_key]})
+        extended_kwargs1.update(
+            {"rationale": rationale_type, last_key: signature.kwargs[last_key]}
+        )
 
         extended_kwargs2 = {key: signature.kwargs[key] for key in keys}
-        extended_kwargs2.update({'hint': DEFAULT_HINT_TYPE, 'rationale': rationale_type, last_key: signature.kwargs[last_key]})
-        
-        self.extended_signature1 = dsp.Template(signature.instructions, **extended_kwargs1)
-        self.extended_signature2 = dsp.Template(signature.instructions, **extended_kwargs2)
-    
+        extended_kwargs2.update(
+            {
+                "hint": DEFAULT_HINT_TYPE,
+                "rationale": rationale_type,
+                last_key: signature.kwargs[last_key],
+            }
+        )
+
+        self.extended_signature1 = dsp.Template(
+            signature.instructions, **extended_kwargs1
+        )
+        self.extended_signature2 = dsp.Template(
+            signature.instructions, **extended_kwargs2
+        )
+
     def forward(self, **kwargs):
         signature = self.signature
 
-        if self.activated is True or (self.activated is None and isinstance(dsp.settings.lm, dsp.GPT3)):
-            if 'hint' in kwargs and kwargs['hint']:
+        if self.activated is True or (
+            self.activated is None and isinstance(dsp.settings.lm, dsp.GPT3)
+        ):
+            if "hint" in kwargs and kwargs["hint"]:
                 signature = self.extended_signature2
             else:
                 signature = self.extended_signature1
-        
+
         return super().forward(signature=signature, **kwargs)
 
 
@@ -47,4 +63,4 @@ def forward(self, **kwargs):
 TODO: In principle, we can update the field's prefix during forward too to fill any thing based on the input args.
 
 IF the user didn't overwrite our default rationale_type.
-"""
\ No newline at end of file
+"""
diff --git a/dspy/predict/knn.py b/dspy/predict/knn.py
index e0e49fe2b4..6d46e876b5 100644
--- a/dspy/predict/knn.py
+++ b/dspy/predict/knn.py
@@ -2,18 +2,32 @@
 import numpy as np
 import dsp
 
+
 class KNN:
     def __init__(self, k: int, trainset: List[dsp.Example]):
         self.k = k
         self.trainset = trainset
         self.vectorizer = dsp.SentenceTransformersVectorizer()
-        trainset_casted_to_vectorize = [" | ".join([f"{key}: {value}" for key, value in example.items() if key in example._input_keys]) for example in self.trainset]
-        self.trainset_vectors = self.vectorizer(trainset_casted_to_vectorize).astype(np.float32)
+        trainset_casted_to_vectorize = [
+            " | ".join(
+                [
+                    f"{key}: {value}"
+                    for key, value in example.items()
+                    if key in example._input_keys
+                ]
+            )
+            for example in self.trainset
+        ]
+        self.trainset_vectors = self.vectorizer(trainset_casted_to_vectorize).astype(
+            np.float32
+        )
 
     def __call__(self, **kwargs) -> List[dsp.Example]:
         with dsp.settings.context(vectorizer=self.vectorizer):
-            input_example_vector = self.vectorizer([" | ".join([f"{key}: {val}" for key, val in kwargs.items()])])
+            input_example_vector = self.vectorizer(
+                [" | ".join([f"{key}: {val}" for key, val in kwargs.items()])]
+            )
             scores = np.dot(self.trainset_vectors, input_example_vector.T).squeeze()
-            nearest_samples_idxs = scores.argsort()[-self.k:][::-1]
+            nearest_samples_idxs = scores.argsort()[-self.k :][::-1]
             train_sampled = [self.trainset[cur_idx] for cur_idx in nearest_samples_idxs]
             return train_sampled
diff --git a/dspy/predict/multi_chain_comparison.py b/dspy/predict/multi_chain_comparison.py
index 99c2b43c5a..8c59363e54 100644
--- a/dspy/predict/multi_chain_comparison.py
+++ b/dspy/predict/multi_chain_comparison.py
@@ -3,6 +3,7 @@
 
 import dsp
 
+
 class MultiChainComparison(Module):
     def __init__(self, signature, M=3, temperature=0.7, **config):
         super().__init__()
@@ -14,25 +15,40 @@ def __init__(self, signature, M=3, temperature=0.7, **config):
         extended_kwargs = {key: signature.kwargs[key] for key in keys}
 
         for idx in range(M):
-            candidate_type = dsp.Type(prefix=f"Student Attempt #{idx+1}:", desc="${reasoning attempt}")
-            extended_kwargs.update({f'reasoning_attempt_{idx+1}': candidate_type})
-        
-        rationale_type = dsp.Type(prefix="Accurate Reasoning: Thank you everyone. Let's now holistically", desc="${corrected reasoning}")
-        extended_kwargs.update({'rationale': rationale_type, last_key: signature.kwargs[last_key]})
+            candidate_type = dsp.Type(
+                prefix=f"Student Attempt #{idx+1}:", desc="${reasoning attempt}"
+            )
+            extended_kwargs.update({f"reasoning_attempt_{idx+1}": candidate_type})
+
+        rationale_type = dsp.Type(
+            prefix="Accurate Reasoning: Thank you everyone. Let's now holistically",
+            desc="${corrected reasoning}",
+        )
+        extended_kwargs.update(
+            {"rationale": rationale_type, last_key: signature.kwargs[last_key]}
+        )
 
         signature = dsp.Template(signature.instructions, **extended_kwargs)
         self.predict = Predict(signature, temperature=temperature, **config)
         self.last_key = last_key
-    
+
     def forward(self, completions, **kwargs):
         attempts = []
 
         for c in completions:
-            rationale = c.rationale.strip().split('\n')[0].strip()
-            answer = c[self.last_key].strip().split('\n')[0].strip()
-            attempts.append(f"«I'm trying to {rationale} I'm not sure but my prediction is {answer}»")
+            rationale = c.rationale.strip().split("\n")[0].strip()
+            answer = c[self.last_key].strip().split("\n")[0].strip()
+            attempts.append(
+                f"«I'm trying to {rationale} I'm not sure but my prediction is {answer}»"
+            )
 
         assert len(attempts) == self.M, len(attempts)
 
-        kwargs = {**{f'reasoning_attempt_{idx+1}': attempt for idx, attempt in enumerate(attempts)}, **kwargs}
+        kwargs = {
+            **{
+                f"reasoning_attempt_{idx+1}": attempt
+                for idx, attempt in enumerate(attempts)
+            },
+            **kwargs,
+        }
         return self.predict(**kwargs)
diff --git a/dspy/predict/parameter.py b/dspy/predict/parameter.py
index 1c87a6919d..8ed0fc0204 100644
--- a/dspy/predict/parameter.py
+++ b/dspy/predict/parameter.py
@@ -1,5 +1,6 @@
 class Parameter:
     pass
 
+
 class Hyperparameter:
     pass
diff --git a/dspy/predict/predict.py b/dspy/predict/predict.py
index 6d987957a6..a7b1efd1dc 100644
--- a/dspy/predict/predict.py
+++ b/dspy/predict/predict.py
@@ -10,7 +10,7 @@
 class Predict(Parameter):
     def __init__(self, signature, **config):
         self.stage = random.randbytes(8).hex()
-        self.signature = signature #.signature
+        self.signature = signature  # .signature
         self.config = config
         self.reset()
 
@@ -18,27 +18,31 @@ def __init__(self, signature, **config):
         if isinstance(signature, str):
             inputs, outputs = signature.split("->")
             inputs, outputs = inputs.split(","), outputs.split(",")
-            inputs, outputs = [field.strip() for field in inputs], [field.strip() for field in outputs]
+            inputs, outputs = (
+                [field.strip() for field in inputs],
+                [field.strip() for field in outputs],
+            )
 
             assert all(len(field.split()) == 1 for field in (inputs + outputs))
 
-            inputs_ = ', '.join([f"`{field}`" for field in inputs])
-            outputs_ = ', '.join([f"`{field}`" for field in outputs])
+            inputs_ = ", ".join([f"`{field}`" for field in inputs])
+            outputs_ = ", ".join([f"`{field}`" for field in outputs])
 
-            instructions = f"""Given the fields {inputs_}, produce the fields {outputs_}."""
+            instructions = (
+                f"""Given the fields {inputs_}, produce the fields {outputs_}."""
+            )
 
             inputs = {k: InputField() for k in inputs}
             outputs = {k: OutputField() for k in outputs}
 
             for k, v in inputs.items():
                 v.finalize(k, infer_prefix(k))
-            
+
             for k, v in outputs.items():
                 v.finalize(k, infer_prefix(k))
 
             self.signature = dsp.Template(instructions, **inputs, **outputs)
 
-    
     def reset(self):
         self.lm = None
         self.traces = []
@@ -54,11 +58,12 @@ def load_state(self, state):
             setattr(self, name, value)
 
         import dspy
+
         self.demos = [dspy.Example(**x) for x in self.demos]
-    
+
     def __call__(self, **kwargs):
         return self.forward(**kwargs)
-    
+
     def forward(self, **kwargs):
         # Extract the three privileged keyword arguments.
         signature = kwargs.pop("signature", self.signature)
@@ -70,10 +75,12 @@ def forward(self, **kwargs):
 
         # If temperature is 0.0 but its n > 1, set temperature to 0.7.
         temperature = config.get("temperature", None)
-        temperature = lm.kwargs['temperature'] if temperature is None else temperature
+        temperature = lm.kwargs["temperature"] if temperature is None else temperature
 
         num_generations = config.get("n", None)
-        num_generations = lm.kwargs['n'] if num_generations is None else num_generations
+        num_generations = (
+            lm.kwargs.get("n", 1) if num_generations is None else num_generations
+        )
 
         if (temperature is None or temperature <= 0.15) and num_generations > 1:
             config["temperature"] = 0.7
@@ -96,10 +103,12 @@ def forward(self, **kwargs):
             completions.append({})
             for field in signature.fields:
                 if field.output_variable not in kwargs.keys():
-                    completions[-1][field.output_variable] = getattr(c, field.output_variable)
+                    completions[-1][field.output_variable] = getattr(
+                        c, field.output_variable
+                    )
 
         pred = Prediction.from_completions(completions, signature=signature)
-            
+
         if dsp.settings.trace is not None:
             trace = dsp.settings.trace
             trace.append((self, {**kwargs}, pred))
@@ -110,7 +119,6 @@ def __repr__(self):
         return f"{self.__class__.__name__}({self.signature})"
 
 
-
 # TODO: get some defaults during init from the context window?
 # # TODO: FIXME: Hmm, I guess expected behavior is that contexts can
 # affect exeuction. Well, we need to determine whether context dominates, __init__ demoninates, or forward dominates.
diff --git a/dspy/predict/program_of_thought.py b/dspy/predict/program_of_thought.py
index 65d1613b3d..c6f9817f7c 100644
--- a/dspy/predict/program_of_thought.py
+++ b/dspy/predict/program_of_thought.py
@@ -4,6 +4,7 @@
 from ..primitives.python_interpreter import CodePrompt, PythonInterpreter
 import re
 
+
 class ProgramOfThought(Module):
     def __init__(self, signature, max_iters=3):
         super().__init__()
@@ -13,82 +14,151 @@ def __init__(self, signature, max_iters=3):
         self.input_fields = signature.input_fields()
         self.output_fields = signature.output_fields()
 
-        inputs_ = ', '.join([f"`{field_name}`" for field_name in self.input_fields.keys()])
-        outputs_ = ', '.join([f"`{field_name}`" for field_name in self.output_fields.keys()])
+        inputs_ = ", ".join(
+            [f"`{field_name}`" for field_name in self.input_fields.keys()]
+        )
+        outputs_ = ", ".join(
+            [f"`{field_name}`" for field_name in self.output_fields.keys()]
+        )
 
         assert len(self.output_fields) == 1, "PoT only supports one output field."
-        
+
         instr = []
-        instr.append(f"You will be given {inputs_} and you will respond with {outputs_}.")
-        instr.append(f"Generating executable Python code that programmatically computes the correct {outputs_}.")
-        instr.append(f"After you're done with the computation, make sure the last line in your code evaluates to the correct value for {outputs_}.")
-        instr = '\n'.join(instr)
-        
-        self.code_generate = dspy.ChainOfThought(dsp.Template(self._generate_instruction('generate'), **self._generate_signature('generate')))
-        self.code_regenerate = dspy.ChainOfThought(dsp.Template(self._generate_instruction('regenerate'), **self._generate_signature('regenerate')))
-        self.generate_answer = dspy.ChainOfThought(dsp.Template(self._generate_instruction('answer'), **self._generate_signature('answer')))
+        instr.append(
+            f"You will be given {inputs_} and you will respond with {outputs_}."
+        )
+        instr.append(
+            f"Generating executable Python code that programmatically computes the correct {outputs_}."
+        )
+        instr.append(
+            f"After you're done with the computation, make sure the last line in your code evaluates to the correct value for {outputs_}."
+        )
+        instr = "\n".join(instr)
+
+        self.code_generate = dspy.ChainOfThought(
+            dsp.Template(
+                self._generate_instruction("generate"),
+                **self._generate_signature("generate"),
+            )
+        )
+        self.code_regenerate = dspy.ChainOfThought(
+            dsp.Template(
+                self._generate_instruction("regenerate"),
+                **self._generate_signature("regenerate"),
+            )
+        )
+        self.generate_answer = dspy.ChainOfThought(
+            dsp.Template(
+                self._generate_instruction("answer"),
+                **self._generate_signature("answer"),
+            )
+        )
 
     def _generate_signature(self, mode):
         signature_dict = dict(self.input_fields)
         fields_for_mode = {
-            'generate': {
-                'generated_code': dspy.OutputField(prefix="Code:", desc="python code that answers the question", format=str)
+            "generate": {
+                "generated_code": dspy.OutputField(
+                    prefix="Code:",
+                    desc="python code that answers the question",
+                    format=str,
+                )
+            },
+            "regenerate": {
+                "previous_code": dspy.InputField(
+                    prefix="Previous Code:",
+                    desc="previously-generated python code that errored",
+                    format=str,
+                ),
+                "error": dspy.InputField(
+                    prefix="Error:",
+                    desc="error message from previously-generated python code",
+                ),
+                "generated_code": dspy.OutputField(
+                    prefix="Code:",
+                    desc="python code that answers the question",
+                    format=str,
+                ),
             },
-            'regenerate': {
-                'previous_code': dspy.InputField(prefix="Previous Code:", desc="previously-generated python code that errored", format=str),
-                'error': dspy.InputField(prefix="Error:", desc="error message from previously-generated python code"),
-                'generated_code': dspy.OutputField(prefix="Code:", desc="python code that answers the question", format=str)
+            "answer": {
+                "final_generated_code": dspy.InputField(
+                    prefix="Code:",
+                    desc="python code that answers the question",
+                    format=str,
+                ),
+                "code_output": dspy.InputField(
+                    prefix="Code Output:",
+                    desc="output of previously-generated python code",
+                ),
+                "answer": self.signature.kwargs["answer"],
             },
-            'answer': {
-                'final_generated_code': dspy.InputField(prefix="Code:", desc="python code that answers the question", format=str),
-                'code_output': dspy.InputField(prefix="Code Output:", desc="output of previously-generated python code"),
-                'answer': self.signature.kwargs["answer"]
-            }
         }
         signature_dict.update(fields_for_mode[mode])
         return signature_dict
 
     def _generate_instruction(self, mode):
-        mode_inputs = ', '.join([f"`{field_name}`" for field_name in self._generate_signature(mode).keys() if isinstance(self._generate_signature(mode)[field_name], dspy.InputField)])
-        mode_outputs = ', '.join([f"`{field_name}`" for field_name in self._generate_signature(mode).keys() if isinstance(self._generate_signature(mode)[field_name], dspy.OutputField)])
-        if mode == 'generate':
+        mode_inputs = ", ".join(
+            [
+                f"`{field_name}`"
+                for field_name in self._generate_signature(mode).keys()
+                if isinstance(
+                    self._generate_signature(mode)[field_name], dspy.InputField
+                )
+            ]
+        )
+        mode_outputs = ", ".join(
+            [
+                f"`{field_name}`"
+                for field_name in self._generate_signature(mode).keys()
+                if isinstance(
+                    self._generate_signature(mode)[field_name], dspy.OutputField
+                )
+            ]
+        )
+        if mode == "generate":
             instr = [
                 f"You will be given {mode_inputs} and you will respond with {mode_outputs}.",
                 f"Generating executable Python code that programmatically computes the correct {mode_outputs}.",
-                f"After you're done with the computation, make sure the last line in your code evaluates to the correct value for {mode_outputs}."
+                f"After you're done with the computation, make sure the last line in your code evaluates to the correct value for {mode_outputs}.",
             ]
-        elif mode == 'regenerate':
+        elif mode == "regenerate":
             instr = [
                 f"You are given {mode_inputs} due to an error in previous code.",
-                f"Your task is to correct the error and provide the new {mode_outputs}."
+                f"Your task is to correct the error and provide the new {mode_outputs}.",
             ]
         else:  # mode == 'answer'
             instr = [
                 f"Given the final code {mode_inputs}, provide the final {mode_outputs}."
             ]
 
-        return '\n'.join(instr)
+        return "\n".join(instr)
 
     def parse_code(self, code_data):
-        code = code_data.get('generated_code', '').split('---', 1)[0].split('\n\n\n', 1)[0]
-        code_match = re.search(r'```python[ \n](.*?)[ \n]```?', code, re.DOTALL)
-        code_block = (code_match.group(1) if code_match else code).replace('\\n', '\n')
+        code = (
+            code_data.get("generated_code", "").split("---", 1)[0].split("\n\n\n", 1)[0]
+        )
+        code_match = re.search(r"```python[ \n](.*?)[ \n]```?", code, re.DOTALL)
+        code_block = (code_match.group(1) if code_match else code).replace("\\n", "\n")
         if not code_block:
             return code, "Error: Empty code after parsing."
-        if "\n" not in code_block and code_block.count('=') > 1:
+        if "\n" not in code_block and code_block.count("=") > 1:
             return code, "Error: Code format is not correct."
-        lines = code_block.split('\n')
-        last_line_match = re.match(r'^(\w+)\s*=', lines[-1].strip())
+        lines = code_block.split("\n")
+        last_line_match = re.match(r"^(\w+)\s*=", lines[-1].strip())
         if last_line_match and len(lines) > 1:
-            code_block += '\n' + last_line_match.group(1)
+            code_block += "\n" + last_line_match.group(1)
         else:
-            code_block = re.sub(r'([a-zA-Z_]\w* *=.*?)(?=[a-zA-Z_]\w* *=)', r'\1\n', code_block)
-            code_block = re.sub(r'([a-zA-Z_]\w* *=.*?)([a-zA-Z_]\w*)$', r'\1\n\2', code_block)
+            code_block = re.sub(
+                r"([a-zA-Z_]\w* *=.*?)(?=[a-zA-Z_]\w* *=)", r"\1\n", code_block
+            )
+            code_block = re.sub(
+                r"([a-zA-Z_]\w* *=.*?)([a-zA-Z_]\w*)$", r"\1\n\2", code_block
+            )
         return code_block, None
 
     def execute_code(self, code):
         if not code:
-            return code, None, 'Error: Empty code before execution.'
+            return code, None, "Error: Empty code before execution."
         code_prompt = CodePrompt(code, code_type="python")
         interpreter = PythonInterpreter(action_space={"print": print})
         try:
@@ -96,19 +166,23 @@ def execute_code(self, code):
             return code, output, None
         except Exception as e:
             return code, None, str(e)
-            
+
     def forward(self, **kwargs):
         code_data = self.code_generate(question=kwargs["question"])
         parsed_code, error = self.parse_code(code_data)
         code, output, error = self.execute_code(parsed_code)
         hop = 0
         while hop < self.max_iters and error:
-            print('Error in code execution')
-            code_data = self.code_regenerate(question=kwargs["question"], previous_code=code, error=error)
+            print("Error in code execution")
+            code_data = self.code_regenerate(
+                question=kwargs["question"], previous_code=code, error=error
+            )
             parsed_code, error = self.parse_code(code_data)
             hop += 1
             if hop == self.max_iters:
-                print('Max hops reached. Error persists.')
+                print("Max hops reached. Error persists.")
                 return None
-        answer_gen_result = self.generate_answer(question=kwargs["question"], final_generated_code=code, code_output=output)
+        answer_gen_result = self.generate_answer(
+            question=kwargs["question"], final_generated_code=code, code_output=output
+        )
         return answer_gen_result
diff --git a/dspy/predict/react.py b/dspy/predict/react.py
index d79b9b52fb..5d049693d7 100644
--- a/dspy/predict/react.py
+++ b/dspy/predict/react.py
@@ -14,31 +14,56 @@ def __init__(self, signature, max_iters=5, num_results=3, tools=None):
         self.max_iters = max_iters
 
         self.tools = tools or [dspy.Retrieve(k=num_results)]
-        self.tools = {tool.name: tool for tool in self.tools} #if isinstance(self.tools, list) else self.tools
-
-        self.input_fields = {k: v for k, v in self.signature.kwargs.items() if isinstance(v, dspy.InputField)}
-        self.output_fields = {k: v for k, v in self.signature.kwargs.items() if isinstance(v, dspy.OutputField)}
+        self.tools = {
+            tool.name: tool for tool in self.tools
+        }  # if isinstance(self.tools, list) else self.tools
+
+        self.input_fields = {
+            k: v
+            for k, v in self.signature.kwargs.items()
+            if isinstance(v, dspy.InputField)
+        }
+        self.output_fields = {
+            k: v
+            for k, v in self.signature.kwargs.items()
+            if isinstance(v, dspy.OutputField)
+        }
 
         inputs, outputs = signature.fields[:-1], signature.fields[-1:]
 
-        inputs_ = ', '.join([f"`{field.input_variable}`" for field in inputs])
-        outputs_ = ', '.join([f"`{field.output_variable}`" for field in outputs])
+        inputs_ = ", ".join([f"`{field.input_variable}`" for field in inputs])
+        outputs_ = ", ".join([f"`{field.output_variable}`" for field in outputs])
 
         assert len(outputs) == 1, "ReAct only supports one output field."
 
         instr = []
-        instr.append(f"You will be given {inputs_} and you will respond with {outputs_}.\n")
-        instr.append("To do this, you will interleave Thought, Action, and Observation steps.\n")
-        instr.append("Thought can reason about the current situation, and Action can be the following types:\n")
-
-        self.tools['Finish'] = dspy.Example(name="Finish", input_variable=outputs_.strip('`'), desc=f"returns the final {outputs_} and finishes the task")
+        instr.append(
+            f"You will be given {inputs_} and you will respond with {outputs_}.\n"
+        )
+        instr.append(
+            "To do this, you will interleave Thought, Action, and Observation steps.\n"
+        )
+        instr.append(
+            "Thought can reason about the current situation, and Action can be the following types:\n"
+        )
+
+        self.tools["Finish"] = dspy.Example(
+            name="Finish",
+            input_variable=outputs_.strip("`"),
+            desc=f"returns the final {outputs_} and finishes the task",
+        )
 
         for idx, tool in enumerate(self.tools):
             tool = self.tools[tool]
-            instr.append(f"({idx+1}) {tool.name}[{tool.input_variable}], which {tool.desc}")
-        
-        instr = '\n'.join(instr)
-        self.react = [Predict(dsp.Template(instr, **self._generate_signature(i))) for i in range(1, max_iters + 1)]
+            instr.append(
+                f"({idx+1}) {tool.name}[{tool.input_variable}], which {tool.desc}"
+            )
+
+        instr = "\n".join(instr)
+        self.react = [
+            Predict(dsp.Template(instr, **self._generate_signature(i)))
+            for i in range(1, max_iters + 1)
+        ]
 
     def _generate_signature(self, iters):
         signature_dict = {}
@@ -46,29 +71,49 @@ def _generate_signature(self, iters):
             signature_dict[key] = val
 
         for j in range(1, iters + 1):
-            signature_dict[f"Thought_{j}"] = dspy.OutputField(prefix=f"Thought {j}:", desc="next steps to take based on last observation")
-
-            tool_list = ' or '.join([f"{tool.name}[{tool.input_variable}]" for tool in self.tools.values() if tool.name != 'Finish'])
-            signature_dict[f"Action_{j}"] = dspy.OutputField(prefix=f"Action {j}:", desc=f"always either {tool_list} or, when done, Finish[answer]")
+            signature_dict[f"Thought_{j}"] = dspy.OutputField(
+                prefix=f"Thought {j}:",
+                desc="next steps to take based on last observation",
+            )
+
+            tool_list = " or ".join(
+                [
+                    f"{tool.name}[{tool.input_variable}]"
+                    for tool in self.tools.values()
+                    if tool.name != "Finish"
+                ]
+            )
+            signature_dict[f"Action_{j}"] = dspy.OutputField(
+                prefix=f"Action {j}:",
+                desc=f"always either {tool_list} or, when done, Finish[answer]",
+            )
 
             if j < iters:
-                signature_dict[f"Observation_{j}"] = dspy.OutputField(prefix=f"Observation {j}:", desc="observations based on action", format=dsp.passages2text)
+                signature_dict[f"Observation_{j}"] = dspy.OutputField(
+                    prefix=f"Observation {j}:",
+                    desc="observations based on action",
+                    format=dsp.passages2text,
+                )
 
         return signature_dict
-    
+
     def act(self, output, hop):
         try:
             action = output[f"Action_{hop+1}"]
-            action_name, action_val = action.strip().split('\n')[0].split('[', 1)
-            action_val = action_val.rsplit(']', 1)[0]
+            action_name, action_val = action.strip().split("\n")[0].split("[", 1)
+            action_val = action_val.rsplit("]", 1)[0]
 
-            if action_name == 'Finish': return action_val
+            if action_name == "Finish":
+                return action_val
 
-            output[f"Observation_{hop+1}"] = self.tools[action_name](action_val).passages
+            output[f"Observation_{hop+1}"] = self.tools[action_name](
+                action_val
+            ).passages
 
-        except Exception as e:
-            output[f"Observation_{hop+1}"] = "Failed to parse action. Bad formatting or incorrect action name."
-        
+        except Exception:
+            output[
+                f"Observation_{hop+1}"
+            ] = "Failed to parse action. Bad formatting or incorrect action name."
 
     def forward(self, **kwargs):
         args = {key: kwargs[key] for key in self.input_fields.keys() if key in kwargs}
@@ -76,9 +121,10 @@ def forward(self, **kwargs):
         for hop in range(self.max_iters):
             # with dspy.settings.context(show_guidelines=(i <= 2)):
             output = self.react[hop](**args)
-            
-            if action_val := self.act(output, hop): break
+
+            if action_val := self.act(output, hop):
+                break
             args.update(output)
 
         # assumes only 1 output field for now - TODO: handling for multiple output fields
-        return dspy.Prediction(**{list(self.output_fields.keys())[0]: action_val or ''}) 
+        return dspy.Prediction(**{list(self.output_fields.keys())[0]: action_val or ""})
diff --git a/dspy/primitives/__init__.py b/dspy/primitives/__init__.py
index 3db8c7d029..21e59fd3a9 100644
--- a/dspy/primitives/__init__.py
+++ b/dspy/primitives/__init__.py
@@ -1,4 +1,4 @@
 from .example import *
 from .program import *
 from .prediction import *
-from .python_interpreter import *
\ No newline at end of file
+from .python_interpreter import *
diff --git a/dspy/primitives/box.py b/dspy/primitives/box.py
index db1b51d8f5..097fb3fb81 100644
--- a/dspy/primitives/box.py
+++ b/dspy/primitives/box.py
@@ -93,37 +93,78 @@ class BoxType(type):
     # List of operations to override
     ops = [
         # Arithmetic operations
-        'add', 'sub', 'mul', 'truediv', 'floordiv', 'mod', 'pow', 
-        'lshift', 'rshift', 'and', 'or', 'xor',
+        "add",
+        "sub",
+        "mul",
+        "truediv",
+        "floordiv",
+        "mod",
+        "pow",
+        "lshift",
+        "rshift",
+        "and",
+        "or",
+        "xor",
         # 'r'-prefixed versions of arithmetic operations
-        'radd', 'rsub', 'rmul', 'rtruediv', 'rfloordiv', 'rmod', 
-        'rpow', 'rlshift', 'rrshift', 'rand', 'ror', 'rxor',
+        "radd",
+        "rsub",
+        "rmul",
+        "rtruediv",
+        "rfloordiv",
+        "rmod",
+        "rpow",
+        "rlshift",
+        "rrshift",
+        "rand",
+        "ror",
+        "rxor",
         # Sequence operations
-        'getitem', 'setitem', 'delitem', 'contains',
+        "getitem",
+        "setitem",
+        "delitem",
+        "contains",
         # Unary and other operations
-        'neg', 'pos', 'abs', 'invert', 'round', 'len', 
-        'getitem', 'setitem', 'delitem', 'contains', 'iter',
+        "neg",
+        "pos",
+        "abs",
+        "invert",
+        "round",
+        "len",
+        "getitem",
+        "setitem",
+        "delitem",
+        "contains",
+        "iter",
         # Mappings operations (for dicts)
-        'get', 'keys', 'values', 'items',
+        "get",
+        "keys",
+        "values",
+        "items",
         # Comparison
-        'eq', 'ne', 'lt', 'le', 'gt', 'ge',
+        "eq",
+        "ne",
+        "lt",
+        "le",
+        "gt",
+        "ge",
     ]
 
     def __init__(cls, name, bases, attrs):
         def create_method(op):
             def method(self, other=None):
-                if op in ['len', 'keys', 'values', 'items']:
+                if op in ["len", "keys", "values", "items"]:
                     return getattr(self._value, op)()
                 elif isinstance(other, Box):
-                    return Box(getattr(self._value, f'__{op}__')(other._value))
+                    return Box(getattr(self._value, f"__{op}__")(other._value))
                 elif other is not None:
-                    return Box(getattr(self._value, f'__{op}__')(other))
+                    return Box(getattr(self._value, f"__{op}__")(other))
                 else:
                     return NotImplemented
+
             return method
 
         for op in BoxType.ops:
-            setattr(cls, f'__{op}__', create_method(op))
+            setattr(cls, f"__{op}__", create_method(op))
 
         super().__init__(name, bases, attrs)
 
@@ -138,10 +179,10 @@ def __repr__(self):
 
     def __str__(self):
         return str(self._value)
-    
+
     def __bool__(self):
         return bool(self._value)
-    
+
     # if method is missing just call it on the _value
     def __getattr__(self, name):
         return Box(getattr(self._value, name))
diff --git a/dspy/primitives/example.py b/dspy/primitives/example.py
index dbc16c1e1c..d3476fdcef 100644
--- a/dspy/primitives/example.py
+++ b/dspy/primitives/example.py
@@ -1,5 +1,3 @@
-import copy
-
 class Example:
     def __init__(self, base=None, **kwargs):
         # Internal storage and other attributes
@@ -17,20 +15,20 @@ def __init__(self, base=None, **kwargs):
 
         # Update with provided kwargs
         self._store.update(kwargs)
-    
+
     def __getattr__(self, key):
-        if key.startswith('__') and key.endswith('__'):
+        if key.startswith("__") and key.endswith("__"):
             raise AttributeError
         if key in self._store:
             return self._store[key]
         raise AttributeError(f"'{type(self).__name__}' object has no attribute '{key}'")
 
     def __setattr__(self, key, value):
-        if key.startswith('_') or key in dir(self.__class__):  
+        if key.startswith("_") or key in dir(self.__class__):
             super().__setattr__(key, value)
         else:
             self._store[key] = value
-    
+
     def __getitem__(self, key):
         return self._store[key]
 
@@ -42,55 +40,67 @@ def __delitem__(self, key):
 
     def __contains__(self, key):
         return key in self._store
-    
+
     def __len__(self):
-        return len([k for k in self._store if not k.startswith('dspy_')])
-    
+        return len([k for k in self._store if not k.startswith("dspy_")])
+
     def __repr__(self):
         # return f"Example({self._store})" + f" (input_keys={self._input_keys}, demos={self._demos})"
-        d = {k: v for k, v in self._store.items() if not k.startswith('dspy_')}
+        d = {k: v for k, v in self._store.items() if not k.startswith("dspy_")}
         return f"Example({d})" + f" (input_keys={self._input_keys})"
-    
+
     def __str__(self):
         return self.__repr__()
-    
+
     def __eq__(self, other):
         return self._store == other._store
-    
+
     def __hash__(self):
         return hash(tuple(self._store.items()))
 
     def keys(self, include_dspy=False):
-        return [k for k in self._store.keys() if not k.startswith('dspy_') or include_dspy]
-    
+        return [
+            k for k in self._store.keys() if not k.startswith("dspy_") or include_dspy
+        ]
+
     def values(self, include_dspy=False):
-        return [v for k, v in self._store.items() if not k.startswith('dspy_') or include_dspy]
+        return [
+            v
+            for k, v in self._store.items()
+            if not k.startswith("dspy_") or include_dspy
+        ]
 
     def items(self, include_dspy=False):
-        return [(k, v) for k, v in self._store.items() if not k.startswith('dspy_') or include_dspy]
+        return [
+            (k, v)
+            for k, v in self._store.items()
+            if not k.startswith("dspy_") or include_dspy
+        ]
 
     def get(self, key, default=None):
         return self._store.get(key, default)
-    
+
     def with_inputs(self, *keys):
         copied = self.copy()
         copied._input_keys = set(keys)
         return copied
-    
+
     def inputs(self):
         if self._input_keys is None:
-            raise ValueError("Inputs have not been set for this example. Use `example.with_inputs()` to set them.")
+            raise ValueError(
+                "Inputs have not been set for this example. Use `example.with_inputs()` to set them."
+            )
 
         # return items that are in input_keys
         d = {key: self._store[key] for key in self._store if key in self._input_keys}
         return type(self)(d)
-    
+
     def labels(self):
         # return items that are NOT in input_keys
         input_keys = self.inputs().keys()
         d = {key: self._store[key] for key in self._store if key not in input_keys}
         return type(self)(d)
-    
+
     def __iter__(self):
         return iter(dict(self._store))
 
@@ -102,6 +112,6 @@ def without(self, *keys):
         for key in keys:
             del copied[key]
         return copied
-    
+
     def toDict(self):
         return self._store.copy()
diff --git a/dspy/primitives/module.py b/dspy/primitives/module.py
index 866fb2009c..77429f2203 100644
--- a/dspy/primitives/module.py
+++ b/dspy/primitives/module.py
@@ -8,7 +8,7 @@ def __init__(self):
 
     def named_parameters(self):
         """
-            Unlike PyTorch, handles (non-recursive) lists of parameters too.
+        Unlike PyTorch, handles (non-recursive) lists of parameters too.
         """
 
         from dspy.predict.parameter import Parameter
@@ -30,7 +30,7 @@ def add_parameter(param_name, param_value):
                 if not value._compiled:
                     for sub_name, param in value.named_parameters():
                         add_parameter(f"{name}.{sub_name}", param)
-            
+
             elif isinstance(value, (list, tuple)):
                 for idx, item in enumerate(value):
                     add_parameter(f"{name}[{idx}]", item)
@@ -49,23 +49,23 @@ def deepcopy(self):
 
     def reset_copy(self):
         obj = copy.deepcopy(self)
-        
+
         for param in obj.parameters():
             param.reset()
-        
+
         return obj
-    
+
     def dump_state(self):
         return {name: param.dump_state() for name, param in self.named_parameters()}
-    
+
     def load_state(self, state):
         for name, param in self.named_parameters():
             param.load_state(state[name])
-    
+
     def save(self, path):
         with open(path, "w") as f:
             f.write(ujson.dumps(self.dump_state(), indent=2))
-    
+
     def load(self, path):
         with open(path, "r") as f:
             self.load_state(ujson.loads(f.read()))
diff --git a/dspy/primitives/prediction.py b/dspy/primitives/prediction.py
index df653c1c4a..c00d860d9f 100644
--- a/dspy/primitives/prediction.py
+++ b/dspy/primitives/prediction.py
@@ -4,12 +4,12 @@
 class Prediction(Example):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        
+
         del self._demos
         del self._input_keys
 
         self._completions = None
-    
+
     @classmethod
     def from_completions(cls, list_or_dict, signature=None):
         obj = cls()
@@ -17,16 +17,16 @@ def from_completions(cls, list_or_dict, signature=None):
         obj._store = {k: v[0] for k, v in obj._completions.items()}
 
         return obj
-    
+
     def __repr__(self):
-        store_repr = ',\n    '.join(f"{k}={repr(v)}" for k, v in self._store.items())
+        store_repr = ",\n    ".join(f"{k}={repr(v)}" for k, v in self._store.items())
 
         if self._completions is None or len(self._completions) == 1:
             return f"Prediction(\n    {store_repr}\n)"
-        
+
         num_completions = len(self._completions)
         return f"Prediction(\n    {store_repr},\n    completions=Completions(...)\n) ({num_completions-1} completions omitted)"
-        
+
     def __str__(self):
         return self.__repr__()
 
@@ -47,11 +47,15 @@ def __init__(self, list_or_dict, signature=None):
         else:
             kwargs = list_or_dict
 
-        assert all(isinstance(v, list) for v in kwargs.values()), "All values must be lists"
+        assert all(
+            isinstance(v, list) for v in kwargs.values()
+        ), "All values must be lists"
 
         if kwargs:
             length = len(next(iter(kwargs.values())))
-            assert all(len(v) == length for v in kwargs.values()), "All lists must have the same length"
+            assert all(
+                len(v) == length for v in kwargs.values()
+            ), "All lists must have the same length"
 
         self._completions = kwargs
 
@@ -62,16 +66,18 @@ def __getitem__(self, key):
         if isinstance(key, int):
             if key < 0 or key >= len(self):
                 raise IndexError("Index out of range")
-            
+
             return Prediction(**{k: v[key] for k, v in self._completions.items()})
-        
+
         return self._completions[key]
 
     def __getattr__(self, name):
         if name in self._completions:
             return self._completions[name]
-        
-        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
+
+        raise AttributeError(
+            f"'{type(self).__name__}' object has no attribute '{name}'"
+        )
 
     def __len__(self):
         # Return the length of the list for one of the keys
@@ -82,7 +88,9 @@ def __contains__(self, key):
         return key in self._completions
 
     def __repr__(self):
-        items_repr = ',\n    '.join(f"{k}={repr(v)}" for k, v in self._completions.items())
+        items_repr = ",\n    ".join(
+            f"{k}={repr(v)}" for k, v in self._completions.items()
+        )
         return f"Completions(\n    {items_repr}\n)"
 
     def __str__(self):
diff --git a/dspy/primitives/program.py b/dspy/primitives/program.py
index 73dbee6438..51c8b34b34 100644
--- a/dspy/primitives/program.py
+++ b/dspy/primitives/program.py
@@ -1,5 +1,3 @@
-import copy
-
 from dspy.primitives.module import BaseModule
 
 
@@ -7,7 +5,7 @@ class ProgramMeta(type):
     pass
     # def __call__(cls, *args, **kwargs):
     #     obj = super(ProgramMeta, cls).__call__(*args, **kwargs)
-        
+
     #     if issubclass(cls, Program) and not getattr(obj, "_program_init_called", False):
     #         obj._base_init()
     #         obj._program_init_called = True
@@ -23,30 +21,34 @@ def __init__(self):
 
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)
-    
+
     def named_predictors(self):
         from dspy.predict.predict import Predict
-        
+
         named_parameters = self.named_parameters()
-        return [(name, param) for name, param in named_parameters if isinstance(param, Predict)]
+        return [
+            (name, param)
+            for name, param in named_parameters
+            if isinstance(param, Predict)
+        ]
 
     def predictors(self):
         return [param for _, param in self.named_predictors()]
-    
+
     def __repr__(self):
         s = []
 
         for name, param in self.named_predictors():
             s.append(f"{name} = {param}")
-        
-        return '\n'.join(s)
+
+        return "\n".join(s)
 
     # def __deepcopy__(self, memo):
     #     # memo is a dict of id's to copies already made during the current call
     #     # Check if the object is already copied
     #     if id(self) in memo:
     #         return memo[id(self)]
-        
+
     #     print(f"Deep copying {self.__class__.__name__}...")
 
     #     new_copy = copy.copy(self)
@@ -60,4 +62,4 @@ def __repr__(self):
     #     return new_copy
 
 
-Program = Module
\ No newline at end of file
+Program = Module
diff --git a/dspy/primitives/python_interpreter.py b/dspy/primitives/python_interpreter.py
index c47da27f04..eff13d5ae3 100644
--- a/dspy/primitives/python_interpreter.py
+++ b/dspy/primitives/python_interpreter.py
@@ -15,18 +15,14 @@
 import difflib
 import importlib
 import typing
-import inspect
 from typing import (
     Any,
-    Callable,
     Dict,
     Mapping,
     List,
     Optional,
     Set,
     Tuple,
-    TypeVar,
-    Union,
 )
 import builtins
 
@@ -39,7 +35,7 @@ class InterpreterError(ValueError):
     pass
 
 
-class PythonInterpreter():
+class PythonInterpreter:
     r"""A customized python interpreter to control the execution of
     LLM-generated codes. The interpreter makes sure the code can only execute
     functions given in action space and import white list. It also supports
@@ -48,8 +44,8 @@ class PythonInterpreter():
     .. highlight:: none
 
     This class is adapted from the Camel adaptation https://github.com/camel-ai/
-    camel/blob/9a9d71874944e9736c55cdaed3df469a8becec05/camel/utils/python_interpreter.py 
-    which adapts from the hugging face implementation `python_interpreter.py 
+    camel/blob/9a9d71874944e9736c55cdaed3df469a8becec05/camel/utils/python_interpreter.py
+    which adapts from the hugging face implementation `python_interpreter.py
     <https://github.com/huggingface/transformers/blob/8f093fb799246f7dd9104ff44728da0c53a9f67a
     /src/transformers/tools/python_interpreter.py>`_. The original license applies::
 
@@ -77,10 +73,10 @@ class PythonInterpreter():
     :obj:`fuzz_state` for fuzzy matching."
 
     DSPy's modifications:
-    "We expanded upon the Camel libraries modifications by adding additional 
+    "We expanded upon the Camel library's modifications by adding additional
     support for "Mapping" statements, "conditional" operators, and including
     the "CodePrompt" and "TextPrompt" classes for code execution.
-    
+
 
     Modifications copyright (C) 2023 CAMEL-AI.org
 
@@ -100,17 +96,24 @@ class PythonInterpreter():
             (:obj:`.`). (default: :obj:`None`)
     """
 
-    def __init__(self, action_space: Dict[str, Any],
-                 import_white_list: Optional[List[str]] = None) -> None:
+    def __init__(
+        self,
+        action_space: Dict[str, Any],
+        import_white_list: Optional[List[str]] = None,
+    ) -> None:
         self.action_space = action_space
         self.state = self.action_space.copy()
         self.fuzz_state: Dict[str, Any] = {}
         self.import_white_list = import_white_list or []
 
-    def execute(self, code: str, state: Optional[Dict[str, Any]] = None,
-                fuzz_state: Optional[Dict[str, Any]] = None,
-                keep_state: bool = True) -> Any:
-        r""" Execute the input python codes in a security environment.
+    def execute(
+        self,
+        code: str,
+        state: Optional[Dict[str, Any]] = None,
+        fuzz_state: Optional[Dict[str, Any]] = None,
+        keep_state: bool = True,
+    ) -> Any:
+        r"""Execute the input Python code in a secure environment.
 
         Args:
             code (str): Generated python code to be executed.
@@ -142,7 +145,9 @@ def execute(self, code: str, state: Optional[Dict[str, Any]] = None,
             expression = ast.parse(code)
         except SyntaxError as e:
             error_line = code.splitlines()[e.lineno - 1]
-            raise InterpreterError(f"Syntax error in code at line {e.lineno}: {error_line}\nError: {e}")
+            raise InterpreterError(
+                f"Syntax error in code at line {e.lineno}: {error_line}\nError: {e}"
+            )
 
         result = None
         for idx, node in enumerate(expression.body):
@@ -151,8 +156,7 @@ def execute(self, code: str, state: Optional[Dict[str, Any]] = None,
             except InterpreterError as e:
                 if not keep_state:
                     self.clear_state()
-                msg = (f"Evaluation of the code stopped at node {idx}. "
-                       f"See:\n{e}")
+                msg = f"Evaluation of the code stopped at node {idx}. " f"See:\n{e}"
                 # More information can be provided by `ast.unparse()`,
                 # which is new in python 3.9.
                 raise InterpreterError(msg)
@@ -163,12 +167,12 @@ def execute(self, code: str, state: Optional[Dict[str, Any]] = None,
             self.clear_state()
 
         return result
-    
+
     def clear_state(self) -> None:
         r"""Initialize :obj:`state` and :obj:`fuzz_state`"""
         self.state = self.action_space.copy()
         self.fuzz_state = {}
-    
+
     # ast.Index is deprecated after python 3.9, which cannot pass type check,
     # but is still necessary for older versions.
     @typing.no_type_check
@@ -229,8 +233,7 @@ def _execute_ast(self, expression: ast.AST) -> Any:
             # cannot pass type check
             return self._execute_ast(expression.value)
         elif isinstance(expression, ast.JoinedStr):
-            return "".join(
-                [str(self._execute_ast(v)) for v in expression.values])
+            return "".join([str(self._execute_ast(v)) for v in expression.values])
         elif isinstance(expression, ast.List):
             # List -> evaluate all elements
             return [self._execute_ast(elt) for elt in expression.elts]
@@ -250,8 +253,7 @@ def _execute_ast(self, expression: ast.AST) -> Any:
         else:
             # For now we refuse anything else. Let's add things as we need
             # them.
-            raise InterpreterError(
-                f"{expression.__class__.__name__} is not supported.")
+            raise InterpreterError(f"{expression.__class__.__name__} is not supported.")
 
     def _execute_assign(self, assign: ast.Assign) -> Any:
         targets = assign.targets
@@ -266,30 +268,35 @@ def _assign(self, target: ast.expr, value: Any):
             self.state[target.id] = value
         elif isinstance(target, ast.Tuple):
             if not isinstance(value, tuple):
-                raise InterpreterError(f"Expected type tuple, but got"
-                                       f"{value.__class__.__name__} instead.")
+                raise InterpreterError(
+                    f"Expected type tuple, but got"
+                    f"{value.__class__.__name__} instead."
+                )
             if len(target.elts) != len(value):
                 raise InterpreterError(
-                    f"Expected {len(target.elts)} values but got"
-                    f" {len(value)}.")
+                    f"Expected {len(target.elts)} values but got" f" {len(value)}."
+                )
             for t, v in zip(target.elts, value):
                 self.state[self._execute_ast(t)] = v
         else:
-            raise InterpreterError(f"Unsupported variable type. Expected "
-                                   f"ast.Name or ast.Tuple, got "
-                                   f"{target.__class__.__name__} instead.")
+            raise InterpreterError(
+                f"Unsupported variable type. Expected "
+                f"ast.Name or ast.Tuple, got "
+                f"{target.__class__.__name__} instead."
+            )
 
     def _execute_call(self, call: ast.Call) -> Any:
         callable_func = self._execute_ast(call.func)
 
         args = [self._execute_ast(arg) for arg in call.args]
         kwargs = {
-            keyword.arg: self._execute_ast(keyword.value)
-            for keyword in call.keywords
+            keyword.arg: self._execute_ast(keyword.value) for keyword in call.keywords
         }
         if isinstance(callable_func, ast.FunctionDef):
             old_state = self.state.copy()
-            for param_name, arg_value in zip([param.arg for param in callable_func.args.args], args):
+            for param_name, arg_value in zip(
+                [param.arg for param in callable_func.args.args], args
+            ):
                 self.state[param_name] = arg_value
             result = None
             for stmt in callable_func.body:
@@ -303,8 +310,13 @@ def _execute_call(self, call: ast.Call) -> Any:
     def _execute_augassign(self, augassign: ast.AugAssign):
         current_value = self.state[augassign.target.id]
         increment_value = self._execute_ast(augassign.value)
-        if not (isinstance(current_value, (int, float)) and isinstance(increment_value, (int, float))):
-            raise InterpreterError(f"Invalid types for augmented assignment: {type(current_value)}, {type(increment_value)}")
+        if not (
+            isinstance(current_value, (int, float))
+            and isinstance(increment_value, (int, float))
+        ):
+            raise InterpreterError(
+                f"Invalid types for augmented assignment: {type(current_value)}, {type(increment_value)}"
+            )
         if isinstance(augassign.op, ast.Add):
             new_value = current_value + increment_value
         elif isinstance(augassign.op, ast.Sub):
@@ -313,9 +325,11 @@ def _execute_augassign(self, augassign: ast.AugAssign):
             new_value = current_value * increment_value
         elif isinstance(augassign.op, ast.Div):
             new_value = current_value / increment_value
-        #TODO - any other augassign operators that are missing
+        # TODO - any other augassign operators that are missing
         else:
-            raise InterpreterError(f"Augmented assignment operator {augassign.op} is not supported")
+            raise InterpreterError(
+                f"Augmented assignment operator {augassign.op} is not supported"
+            )
         self._assign(augassign.target, new_value)
         return new_value
 
@@ -324,15 +338,14 @@ def _execute_subscript(self, subscript: ast.Subscript):
         value = self._execute_ast(subscript.value)
         if not isinstance(subscript.ctx, ast.Load):
             raise InterpreterError(
-                f"{subscript.ctx.__class__.__name__} is not supported for "
-                "subscript.")
+                f"{subscript.ctx.__class__.__name__} is not supported for " "subscript."
+            )
         if isinstance(value, (list, tuple)):
             return value[int(index)]
         if index in value:
             return value[index]
         if isinstance(index, str) and isinstance(value, Mapping):
-            close_matches = difflib.get_close_matches(index,
-                                                      list(value.keys()))
+            close_matches = difflib.get_close_matches(index, list(value.keys()))
             if len(close_matches) > 0:
                 return value[close_matches[0]]
 
@@ -340,7 +353,7 @@ def _execute_subscript(self, subscript: ast.Subscript):
 
     def _execute_name(self, name: ast.Name):
         if name.id in dir(builtins):
-          return getattr(builtins, name.id)
+            return getattr(builtins, name.id)
         if isinstance(name.ctx, ast.Store):
             return name.id
         elif isinstance(name.ctx, ast.Load):
@@ -356,14 +369,17 @@ def _execute_condition(self, condition):
             elif isinstance(condition.op, ast.Or):
                 results = [self._execute_ast(value) for value in condition.values]
                 return any(results)
-            else: #TODO - add any other BoolOps missing
-                raise InterpreterError(f"Boolean operator {condition.op} is not supported")
+            else:  # TODO - add any other BoolOps missing
+                raise InterpreterError(
+                    f"Boolean operator {condition.op} is not supported"
+                )
         elif isinstance(condition, ast.Compare):
             if len(condition.ops) > 1:
-                raise InterpreterError("Cannot evaluate conditions with multiple operators")
+                raise InterpreterError(
+                    "Cannot evaluate conditions with multiple operators"
+                )
         if len(condition.ops) > 1:
-            raise InterpreterError(
-                "Cannot evaluate conditions with multiple operators")
+            raise InterpreterError("Cannot evaluate conditions with multiple operators")
         left = self._execute_ast(condition.left)
         comparator = condition.ops[0]
         right = self._execute_ast(condition.comparators[0])
@@ -423,7 +439,7 @@ def _execute_import(self, import_module: ast.Import) -> None:
 
     def _execute_import_from(self, import_from: ast.ImportFrom):
         if import_from.module is None:
-            raise InterpreterError("\"from . import\" is not supported.")
+            raise InterpreterError('"from . import" is not supported.')
         for import_name in import_from.names:
             full_name = import_from.module + f".{import_name.name}"
             self._validate_import(full_name)
@@ -441,9 +457,11 @@ def _validate_import(self, full_name: str):
                 return
 
         if not found_name:
-            raise InterpreterError(f"It is not permitted to import modules "
-                                   f"than module white list (try to import "
-                                   f"{full_name}).")
+            raise InterpreterError(
+                f"It is not permitted to import modules "
+                f"than module white list (try to import "
+                f"{full_name})."
+            )
 
     def _execute_binop(self, binop: ast.BinOp):
         left = self._execute_ast(binop.left)
@@ -494,6 +512,7 @@ def _get_value_from_state(self, key: str) -> Any:
         else:
             raise InterpreterError(f"The variable `{key}` is not defined.")
 
+
 class TextPrompt(str):
     r"""A class that represents a text prompt. The :obj:`TextPrompt` class
     extends the built-in :obj:`str` class to provide a property for retrieving
@@ -506,12 +525,12 @@ class TextPrompt(str):
 
     @property
     def key_words(self) -> Set[str]:
-        r"""Returns a set of strings representing the keywords in the prompt.
-        """
+        r"""Returns a set of strings representing the keywords in the prompt."""
         from camel.utils import get_prompt_template_key_words
+
         return get_prompt_template_key_words(self)
 
-    def format(self, *args: Any, **kwargs: Any) -> 'TextPrompt':
+    def format(self, *args: Any, **kwargs: Any) -> "TextPrompt":
         r"""Overrides the built-in :obj:`str.format` method to allow for
         default values in the format string. This is used to allow formatting
         the partial string.
@@ -524,10 +543,11 @@ def format(self, *args: Any, **kwargs: Any) -> 'TextPrompt':
             TextPrompt: A new :obj:`TextPrompt` object with the format string
                 replaced with the formatted string.
         """
-        default_kwargs = {key: '{' + f'{key}' + '}' for key in self.key_words}
+        default_kwargs = {key: "{" + f"{key}" + "}" for key in self.key_words}
         default_kwargs.update(kwargs)
         return TextPrompt(super().format(*args, **default_kwargs))
 
+
 class CodePrompt(TextPrompt):
     r"""A class that represents a code prompt. It extends the :obj:`TextPrompt`
     class with a :obj:`code_type` property.
@@ -536,7 +556,7 @@ class with a :obj:`code_type` property.
         code_type (str, optional): The type of code. Defaults to None.
     """
 
-    def __new__(cls, *args: Any, **kwargs: Any) -> 'CodePrompt':
+    def __new__(cls, *args: Any, **kwargs: Any) -> "CodePrompt":
         r"""Creates a new instance of the :obj:`CodePrompt` class.
 
         Args:
@@ -546,7 +566,7 @@ def __new__(cls, *args: Any, **kwargs: Any) -> 'CodePrompt':
         Returns:
             CodePrompt: The created :obj:`CodePrompt` instance.
         """
-        code_type = kwargs.pop('code_type', None)
+        code_type = kwargs.pop("code_type", None)
         instance = super().__new__(cls, *args, **kwargs)
         instance._code_type = code_type
         return instance
@@ -569,8 +589,9 @@ def set_code_type(self, code_type: str) -> None:
         self._code_type = code_type
 
     def execute(
-        self, interpreter: Optional[PythonInterpreter] = None,
-        user_variable: Optional[Dict[str, Any]] = None
+        self,
+        interpreter: Optional[PythonInterpreter] = None,
+        user_variable: Optional[Dict[str, Any]] = None,
     ) -> Tuple[Any, PythonInterpreter]:
         r"""Executes the code string by a given python interpreter.
 
@@ -587,10 +608,11 @@ def execute(
                 represents the value of the last statement (excluding "import")
                 in the code. This value could potentially be the desired result
                 of the LLM-generated code.
-    """
+        """
         # NOTE: Only supports Python code for now.
         if not interpreter:
             interpreter = PythonInterpreter(action_space=globals())
-        execution_res = interpreter.execute(self, fuzz_state=user_variable,
-                                            keep_state=True)
+        execution_res = interpreter.execute(
+            self, fuzz_state=user_variable, keep_state=True
+        )
         return execution_res, interpreter
diff --git a/dspy/retrieve/__init__.py b/dspy/retrieve/__init__.py
index 1d1f9e8b7d..64cf276a6c 100644
--- a/dspy/retrieve/__init__.py
+++ b/dspy/retrieve/__init__.py
@@ -1 +1 @@
-from .retrieve import Retrieve
\ No newline at end of file
+from .retrieve import Retrieve
diff --git a/dspy/retrieve/retrieve.py b/dspy/retrieve/retrieve.py
index 4f61ba8aed..c324d1bf9a 100644
--- a/dspy/retrieve/retrieve.py
+++ b/dspy/retrieve/retrieve.py
@@ -13,10 +13,10 @@ class Retrieve(Parameter):
     def __init__(self, k=3):
         self.stage = random.randbytes(8).hex()
         self.k = k
-    
+
     def reset(self):
         pass
-    
+
     def dump_state(self):
         state_keys = ["k"]
         return {k: getattr(self, k) for k in state_keys}
@@ -24,20 +24,23 @@ def dump_state(self):
     def load_state(self, state):
         for name, value in state.items():
             setattr(self, name, value)
-    
+
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)
-    
-    def forward(self, query_or_queries):
-        queries = [query_or_queries] if isinstance(query_or_queries, str) else query_or_queries
-        queries = [query.strip().split('\n')[0].strip() for query in queries]
 
+    def forward(self, query_or_queries):
+        queries = (
+            [query_or_queries]
+            if isinstance(query_or_queries, str)
+            else query_or_queries
+        )
+        queries = [query.strip().split("\n")[0].strip() for query in queries]
 
         # print(queries)
         # TODO: Consider removing any quote-like markers that surround the query too.
 
         passages = dsp.retrieveEnsemble(queries, k=self.k)
         return Prediction(passages=passages)
-    
 
-# TODO: Consider doing Prediction.from_completions with the individual sets of passages (per query) too.
\ No newline at end of file
+
+# TODO: Consider doing Prediction.from_completions with the individual sets of passages (per query) too.
diff --git a/dspy/retrieve/you_rm.py b/dspy/retrieve/you_rm.py
index 4d5273cc4f..add26b971f 100644
--- a/dspy/retrieve/you_rm.py
+++ b/dspy/retrieve/you_rm.py
@@ -1,42 +1,44 @@
-import dspy
-import os
-import requests
+# import dspy
+# import os
+# import requests
 
-from typing import Union, List
+# from typing import Union, List
 
 
-class YouRM(dspy.Retrieve):
-    def __init__(self, ydc_api_key=None, k=3):
-        super().__init__(k=k)
-        if not ydc_api_key and not os.environ.get("YDC_API_KEY"):
-            raise RuntimeError("You must supply ydc_api_key or set environment variable YDC_API_KEY")
-        elif ydc_api_key:
-            self.ydc_api_key = ydc_api_key
-        else:
-            self.ydc_api_key = os.environ["YDC_API_KEY"]
+# class YouRM(dspy.Retrieve):
+#     def __init__(self, ydc_api_key=None, k=3):
+#         super().__init__(k=k)
+#         if not ydc_api_key and not os.environ.get("YDC_API_KEY"):
+#             raise RuntimeError(
+#                 "You must supply ydc_api_key or set environment variable YDC_API_KEY"
+#             )
+#         elif ydc_api_key:
+#             self.ydc_api_key = ydc_api_key
+#         else:
+#             self.ydc_api_key = os.environ["YDC_API_KEY"]
 
-    def forward(self, query_or_queries: Union[str, List[str]]) -> dspy.Prediction:
-        """Search with You.com for self.k top passages for query or queries
+#     def forward(self, query_or_queries: Union[str, List[str]]) -> dspy.Prediction:
+#         """Search with You.com for self.k top passages for query or queries
 
-        Args:
-            query_or_queries (Union[str, List[str]]): The query or queries to search for.
+#         Args:
+#             query_or_queries (Union[str, List[str]]): The query or queries to search for.
 
-        Returns:
-            dspy.Prediction: An object containing the retrieved passages.
-        """
-        queries = (
-            [query_or_queries]
-            if isinstance(query_or_queries, str)
-            else query_or_queries
-        )
-        docs = []
-        for query in queries:
-            headers = {"X-API-Key": self.ydc_api_key}
-            results = requests.get(
-                f"https://api.ydc-index.io/search?query={query}",
-                headers=headers,
-            ).json()
-            for hit in results["hits"][:self.k]:
-                for snippet in hit["snippets"]:
-                    docs.append(snippet)
-        return dspy.Prediction(passages=docs)
+#         Returns:
+#             dspy.Prediction: An object containing the retrieved passages.
+#         """
+#         queries = (
+#             [query_or_queries]
+#             if isinstance(query_or_queries, str)
+#             else query_or_queries
+#         )
+#         docs = []
+#         for query in queries:
+#             headers = {"X-API-Key": self.ydc_api_key}
+#             results = requests.get(
+#                 f"https://api.ydc-index.io/search?query={query}",
+#                 headers=headers,
+#             ).json()
+#             for hit in results["hits"][: self.k]:
+#                 for snippet in hit["snippets"]:
+#                     docs.append(snippet)
+#         return dspy.Prediction(passages=docs)
diff --git a/dspy/signatures/field.py b/dspy/signatures/field.py
index 848439b6d4..a5bbf79367 100644
--- a/dspy/signatures/field.py
+++ b/dspy/signatures/field.py
@@ -1,31 +1,31 @@
-import re
-import dsp
-
 class Field:
     """A more ergonomic datatype that infers prefix and desc if omitted."""
+
     def __init__(self, *, prefix=None, desc=None, input, format=None):
         self.prefix = prefix  # This can be None initially and set later
         self.desc = desc
         self.format = format
-        
+
     def finalize(self, key, inferred_prefix):
         """Set the prefix if it's not provided explicitly."""
         if self.prefix is None:
             self.prefix = inferred_prefix + ":"
-        
+
         if self.desc is None:
-            self.desc = f'${{{key}}}'
-        
+            self.desc = f"${{{key}}}"
+
     def __repr__(self):
         return f"{self.__class__.__name__}(prefix={self.prefix}, desc={self.desc})"
-    
+
     def __eq__(self, __value: object) -> bool:
         return self.__dict__ == __value.__dict__
 
+
 class InputField(Field):
     def __init__(self, *, prefix=None, desc=None, format=None):
         super().__init__(prefix=prefix, desc=desc, input=True, format=format)
 
+
 class OutputField(Field):
     def __init__(self, *, prefix=None, desc=None, format=None):
         super().__init__(prefix=prefix, desc=desc, input=False, format=format)
diff --git a/dspy/signatures/signature.py b/dspy/signatures/signature.py
index 5a92964aed..1e31696c93 100644
--- a/dspy/signatures/signature.py
+++ b/dspy/signatures/signature.py
@@ -4,6 +4,7 @@
 from .field import Field, InputField, OutputField
 import threading
 
+
 class SignatureMeta(type):
     _thread_local_storage = threading.local()
 
@@ -21,27 +22,31 @@ def __new__(cls, name, bases, class_dict):
                 type_attributes[k] = v
                 del class_dict[k]
 
-        instructions = class_dict.get('__doc__') or ""
+        instructions = class_dict.get("__doc__") or ""
 
         new_class = super().__new__(cls, name, bases, class_dict)
 
         # Attach the _SignatureNamespace directly to the class
-        setattr(new_class, 'signature', cls._SignatureNamespace(type_attributes))
+        setattr(new_class, "signature", cls._SignatureNamespace(type_attributes))
 
         # Create and attach the template directly to the class
-        setattr(new_class, '_template', dsp.Template(instructions=instructions, **type_attributes))
+        setattr(
+            new_class,
+            "_template",
+            dsp.Template(instructions=instructions, **type_attributes),
+        )
 
         return new_class
 
     @property
     def kwargs(cls):
         return cls.signature.fields
-    
+
     def __call__(cls, *args, **kwargs):
         if len(args) == 1 and isinstance(args[0], str):
             instance = super(SignatureMeta, cls).__call__(*args, **kwargs)
             return instance
-        #old 
+        # old
         return cls._template(*args, **kwargs)
 
     def __getattr__(cls, attr):
@@ -50,13 +55,14 @@ def __getattr__(cls, attr):
             return getattr(cls._template, attr)
         return super().__getattr__(attr)
 
+
 class Signature(metaclass=SignatureMeta):
     def __init__(self, signature: str = "", instructions: str = ""):
         self.signature = signature
         self.instructions = instructions
         self.fields = {}
         self.parse_structure()
-    
+
     def __getattr__(self, attr):
         if attr not in self.__dict__:
             return getattr(self.__class__, attr)
@@ -80,7 +86,7 @@ def attach(self, **kwargs):
                 raise ValueError(f"{key} does not exist in this signature")
             field_map = {
                 InputField: InputField(prefix=prefix, desc=desc),
-                OutputField: OutputField(prefix=prefix, desc=desc)
+                OutputField: OutputField(prefix=prefix, desc=desc),
             }
             self.fields[key] = field_map.get(type(field_type))
         return self
@@ -97,7 +103,11 @@ def add_field(self, field_name: str, field_type, position="append"):
             if input_fields:
                 last_input_key = list(input_fields.keys())[-1]
                 index = list(self.fields.keys()).index(last_input_key) + 1
-                self.fields = {**dict(list(self.fields.items())[:index]), field_name: field_instance, **dict(list(self.fields.items())[index:])}
+                self.fields = {
+                    **dict(list(self.fields.items())[:index]),
+                    field_name: field_instance,
+                    **dict(list(self.fields.items())[index:]),
+                }
             else:
                 self.fields[field_name] = field_instance
         elif isinstance(field_instance, OutputField) and position == "prepend":
@@ -105,7 +115,11 @@ def add_field(self, field_name: str, field_type, position="append"):
             if output_fields:
                 first_output_key = list(output_fields.keys())[0]
                 index = list(self.fields.keys()).index(first_output_key)
-                self.fields = {**dict(list(self.fields.items())[:index]), field_name: field_instance, **dict(list(self.fields.items())[index:])}
+                self.fields = {
+                    **dict(list(self.fields.items())[:index]),
+                    field_name: field_instance,
+                    **dict(list(self.fields.items())[index:]),
+                }
             else:
                 self.fields[field_name] = field_instance
         elif position == "prepend":
@@ -113,7 +127,9 @@ def add_field(self, field_name: str, field_type, position="append"):
         elif position == "append":
             self.fields[field_name] = field_instance
         else:
-            raise ValueError(f"invalid field addition. Please verify that your field name: {field_name}, field_type: {field_type}, and expected position: {position} are correct.")
+            raise ValueError(
+                f"invalid field addition. Please verify that your field name: {field_name}, field_type: {field_type}, and expected position: {position} are correct."
+            )
 
     def input_fields(self):
         return {k: v for k, v in self.fields.items() if isinstance(v, InputField)}
@@ -129,41 +145,45 @@ def __repr__(self):
                 s.append(f"- {name} = {value}")
             else:
                 s.append(f"- {name} = [field not attached]")
-        return f'{self.__class__.__name__}\n' + '\n'.join(s)
+        return f"{self.__class__.__name__}\n" + "\n".join(s)
 
     def __eq__(self, __value: object) -> bool:
         return self._template == __value._template
 
 
-
 def infer_prefix(attribute_name: str) -> str:
     """Infers a prefix from an attribute name."""
-    
+
     # Convert camelCase to snake_case, but handle sequences of capital letters properly
-    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', attribute_name)
-    intermediate_name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1)
+    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", attribute_name)
+    intermediate_name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
 
     # Insert underscores around numbers to ensure spaces in the final output
-    with_underscores_around_numbers = re.sub('([a-zA-Z])(\d)', r'\1_\2', intermediate_name)
-    with_underscores_around_numbers = re.sub('(\d)([a-zA-Z])', r'\1_\2', with_underscores_around_numbers)
+    with_underscores_around_numbers = re.sub(
+        "([a-zA-Z])(\d)", r"\1_\2", intermediate_name
+    )
+    with_underscores_around_numbers = re.sub(
+        "(\d)([a-zA-Z])", r"\1_\2", with_underscores_around_numbers
+    )
 
     # Convert snake_case to 'Proper Title Case', but ensure acronyms are uppercased
-    words = with_underscores_around_numbers.split('_')
+    words = with_underscores_around_numbers.split("_")
     title_cased_words = []
     for word in words:
         if word.isupper():
             title_cased_words.append(word)
         else:
             title_cased_words.append(word.capitalize())
-    
-    return ' '.join(title_cased_words)
+
+    return " ".join(title_cased_words)
+
 
 ### Testing the function
-assert infer_prefix('someAttributeName42IsCool') == 'Some Attribute Name 42 Is Cool'
-assert infer_prefix('version2Update') == 'Version 2 Update'
-assert infer_prefix('modelT45Enhanced') == 'Model T 45 Enhanced'
-assert infer_prefix('someAttributeName') == 'Some Attribute Name'
-assert infer_prefix('some_attribute_name') == 'Some Attribute Name'
-assert infer_prefix('URLAddress') == 'URL Address'
-assert infer_prefix('isHTTPSecure') == 'Is HTTP Secure'
-assert infer_prefix('isHTTPSSecure123') == 'Is HTTPS Secure 123'
+assert infer_prefix("someAttributeName42IsCool") == "Some Attribute Name 42 Is Cool"
+assert infer_prefix("version2Update") == "Version 2 Update"
+assert infer_prefix("modelT45Enhanced") == "Model T 45 Enhanced"
+assert infer_prefix("someAttributeName") == "Some Attribute Name"
+assert infer_prefix("some_attribute_name") == "Some Attribute Name"
+assert infer_prefix("URLAddress") == "URL Address"
+assert infer_prefix("isHTTPSecure") == "Is HTTP Secure"
+assert infer_prefix("isHTTPSSecure123") == "Is HTTPS Secure 123"
diff --git a/dspy/teleprompt/bootstrap.py b/dspy/teleprompt/bootstrap.py
index 57be40e9a9..34f69f3472 100644
--- a/dspy/teleprompt/bootstrap.py
+++ b/dspy/teleprompt/bootstrap.py
@@ -8,7 +8,6 @@
 from .teleprompt import Teleprompter
 from .vanilla import LabeledFewShot
 
-from dspy.evaluate.evaluate import Evaluate
 
 # TODO: metrics should return an object with __bool__ basically, but fine if they're more complex.
 # They can also be sortable.
@@ -31,14 +30,22 @@
 
 
 class BootstrapFewShot(Teleprompter):
-    def __init__(self, metric=None, teacher_settings={}, max_bootstrapped_demos=4, max_labeled_demos=16, max_rounds=1, max_errors=5):
+    def __init__(
+        self,
+        metric=None,
+        teacher_settings={},
+        max_bootstrapped_demos=4,
+        max_labeled_demos=16,
+        max_rounds=1,
+        max_errors=5,
+    ):
         self.metric = metric
         self.teacher_settings = teacher_settings
 
         self.max_bootstrapped_demos = max_bootstrapped_demos
         self.max_labeled_demos = max_labeled_demos
         self.max_rounds = max_rounds
-        self.max_errors= max_errors
+        self.max_errors = max_errors
         self.error_count = 0
         self.error_lock = threading.Lock()
 
@@ -54,29 +61,43 @@ def compile(self, student, *, teacher=None, trainset, valset=None):
         self.student._compiled = True
 
         return self.student
-    
+
     def _prepare_student_and_teacher(self, student, teacher):
         self.student = student.reset_copy()
-        self.teacher = teacher.deepcopy() if teacher is not None else student.reset_copy()
+        self.teacher = (
+            teacher.deepcopy() if teacher is not None else student.reset_copy()
+        )
 
         assert self.student._compiled is False, "Student must be uncompiled."
 
         if self.max_labeled_demos and self.teacher._compiled is False:
             teleprompter = LabeledFewShot(k=self.max_labeled_demos)
-            self.teacher = teleprompter.compile(self.teacher.reset_copy(), trainset=self.trainset)
+            self.teacher = teleprompter.compile(
+                self.teacher.reset_copy(), trainset=self.trainset
+            )
 
     def _prepare_predictor_mappings(self):
         name2predictor, predictor2name = {}, {}
         student, teacher = self.student, self.teacher
 
-        assert len(student.predictors()) == len(teacher.predictors()), "Student and teacher must have the same number of predictors."
-
-        for (name1, predictor1), (name2, predictor2) in zip(student.named_predictors(), teacher.named_predictors()):
-            assert name1 == name2, "Student and teacher must have the same program structure."
-            assert predictor1.signature == predictor2.signature, f"Student and teacher must have the same signatures. {type(predictor1.signature)} != {type(predictor2.signature)}"
-            assert id(predictor1) != id(predictor2), "Student and teacher must be different objects."
-
-            name2predictor[name1] = None # dict(student=predictor1, teacher=predictor2)
+        assert len(student.predictors()) == len(
+            teacher.predictors()
+        ), "Student and teacher must have the same number of predictors."
+
+        for (name1, predictor1), (name2, predictor2) in zip(
+            student.named_predictors(), teacher.named_predictors()
+        ):
+            assert (
+                name1 == name2
+            ), "Student and teacher must have the same program structure."
+            assert (
+                predictor1.signature == predictor2.signature
+            ), f"Student and teacher must have the same signatures. {type(predictor1.signature)} != {type(predictor2.signature)}"
+            assert id(predictor1) != id(
+                predictor2
+            ), "Student and teacher must be different objects."
+
+            name2predictor[name1] = None  # dict(student=predictor1, teacher=predictor2)
             predictor2name[id(predictor1)] = name1
             predictor2name[id(predictor2)] = name2
 
@@ -99,12 +120,16 @@ def _bootstrap(self, *, max_bootsraps=None):
 
                     if success:
                         bootstrapped[example_idx] = True
-            
-        print(f'Bootstrapped {len(bootstrapped)} full traces after {example_idx+1} examples in round {round_idx}.')
-        
+
+        print(
+            f"Bootstrapped {len(bootstrapped)} full traces after {example_idx+1} examples in round {round_idx}."
+        )
+
         # Unbootstrapped training examples
 
-        self.validation = [x for idx, x in enumerate(self.trainset) if idx not in bootstrapped]
+        self.validation = [
+            x for idx, x in enumerate(self.trainset) if idx not in bootstrapped
+        ]
         random.Random(0).shuffle(self.validation)
 
         self.validation = self.valset or self.validation
@@ -112,16 +137,20 @@ def _bootstrap(self, *, max_bootsraps=None):
         # NOTE: Can't yet use evaluate because we need to trace *per example*
         # evaluate = Evaluate(program=self.teacher, metric=self.metric, num_threads=12)
         # score = evaluate(self.metric, display_table=False, display_progress=True)
-    
+
     def _bootstrap_one_example(self, example, round_idx=0):
         name2traces = self.name2traces
-        teacher = self.teacher #.deepcopy()
+        teacher = self.teacher  # .deepcopy()
         predictor_cache = {}
 
         try:
             with dsp.settings.context(trace=[], **self.teacher_settings):
                 lm = dsp.settings.lm
-                lm = lm.copy(temperature=0.7 + 0.001 * round_idx) if round_idx > 0 else lm
+                lm = (
+                    lm.copy(temperature=0.7 + 0.001 * round_idx)
+                    if round_idx > 0
+                    else lm
+                )
                 new_settings = dict(lm=lm) if round_idx > 0 else {}
 
                 with dsp.settings.context(**new_settings):
@@ -135,7 +164,9 @@ def _bootstrap_one_example(self, example, round_idx=0):
                     for name, predictor in teacher.named_predictors():
                         predictor.demos = predictor_cache[name]
 
-                success = (self.metric is None) or self.metric(example, prediction, trace)
+                success = (self.metric is None) or self.metric(
+                    example, prediction, trace
+                )
                 # print(success, example, prediction)
         except Exception as e:
             success = False
@@ -144,14 +175,18 @@ def _bootstrap_one_example(self, example, round_idx=0):
                 current_error_count = self.error_count
             if current_error_count >= self.max_errors:
                 raise e
-            print(f'Failed to run or to evaluate example {example} with {self.metric} due to {e}.')
-        
+            print(
+                f"Failed to run or to evaluate example {example} with {self.metric} due to {e}."
+            )
+
         if success:
             for step in trace:
                 predictor, inputs, outputs = step
 
-                if 'dspy_uuid' in example:
-                    demo = Example(augmented=True, dspy_uuid=example.dspy_uuid, **inputs, **outputs)
+                if "dspy_uuid" in example:
+                    demo = Example(
+                        augmented=True, dspy_uuid=example.dspy_uuid, **inputs, **outputs
+                    )
                 else:
                     # TODO: FIXME: This is a hack. RandomSearch will complain for now in this edge case.
                     demo = Example(augmented=True, **inputs, **outputs)
@@ -159,16 +194,22 @@ def _bootstrap_one_example(self, example, round_idx=0):
                 try:
                     predictor_name = self.predictor2name[id(predictor)]
                 except KeyError as e:
-                    continue # FIXME: !
+                    continue  # FIXME: !
 
                     # TODO: Look closer into this. It's a bit tricky to reproduce.
-                    print(f'Failed to find predictor {predictor} in {self.predictor2name}.')
-                    print('Are you doing this in a notebook (Jupyter)? This might be caused by redefining values by rerunning cells.')
-                    print('Try restarting the notebook, or open an issue.')
-                    raise KeyError(f'Failed to find predictor {id(predictor)} {predictor} in {self.predictor2name}.') from e
+                    print(
+                        f"Failed to find predictor {predictor} in {self.predictor2name}."
+                    )
+                    print(
+                        "Are you doing this in a notebook (Jupyter)? This might be caused by redefining values by rerunning cells."
+                    )
+                    print("Try restarting the notebook, or open an issue.")
+                    raise KeyError(
+                        f"Failed to find predictor {id(predictor)} {predictor} in {self.predictor2name}."
+                    ) from e
 
                 name2traces[predictor_name].append(demo)
-        
+
         return success
 
     def _train(self):
@@ -176,14 +217,17 @@ def _train(self):
         raw_demos = self.validation
 
         for name, predictor in self.student.named_predictors():
-            augmented_demos = self.name2traces[name][:self.max_bootstrapped_demos]
-            
-            sample_size = min(self.max_labeled_demos - len(augmented_demos), len(raw_demos))
+            augmented_demos = self.name2traces[name][: self.max_bootstrapped_demos]
+
+            sample_size = min(
+                self.max_labeled_demos - len(augmented_demos), len(raw_demos)
+            )
             sample_size = max(0, sample_size)
 
             raw_demos = rng.sample(raw_demos, sample_size)
-            
+
             import dspy
+
             if dspy.settings.release >= 20230928:
                 predictor.demos = raw_demos + augmented_demos
             else:
diff --git a/dspy/teleprompt/ensemble.py b/dspy/teleprompt/ensemble.py
index f52dc10214..4f727e9992 100644
--- a/dspy/teleprompt/ensemble.py
+++ b/dspy/teleprompt/ensemble.py
@@ -1,5 +1,3 @@
-import dsp
-import tqdm
 import random
 
 from dspy.teleprompt.teleprompt import Teleprompter
@@ -8,12 +6,15 @@
 TODO: The EnsembledProgram should actually imitate the structure of the individual programs (IF they are all compatible). This allows compiling with an ensemble program as a (singular) teacher. Basically the top majority-compatible trace will end up being used, if dspy.majority is the reduce_fn.
 """
 
+
 class Ensemble(Teleprompter):
     def __init__(self, *, reduce_fn=None, size=None, deterministic=False):
         """A common reduce_fn is dspy.majority."""
-        
-        assert deterministic is False, "TODO: Implement example hashing for deterministic ensemble."
-        
+
+        assert (
+            deterministic is False
+        ), "TODO: Implement example hashing for deterministic ensemble."
+
         self.reduce_fn = reduce_fn
         self.size = size
         self.deterministic = deterministic
@@ -23,11 +24,12 @@ def compile(self, programs):
         reduce_fn = self.reduce_fn
 
         import dspy
+
         class EnsembledProgram(dspy.Module):
             def __init__(self):
                 super().__init__()
                 self.programs = programs
-            
+
             def forward(self, *args, **kwargs):
                 programs = random.sample(self.programs, size) if size else self.programs
                 outputs = [prog(*args, **kwargs) for prog in programs]
diff --git a/dspy/teleprompt/finetune.py b/dspy/teleprompt/finetune.py
index 7541bd670c..54348ffde2 100644
--- a/dspy/teleprompt/finetune.py
+++ b/dspy/teleprompt/finetune.py
@@ -1,7 +1,6 @@
 import os
 import time
 import dsp
-import tqdm
 import random
 
 import ujson
@@ -16,11 +15,13 @@
 # from dspy.evaluate.evaluate import Evaluate
 
 
-if os.environ.get('DSP_NOTEBOOK_CACHEDIR'):
-    training_data_directory = os.path.join(os.environ.get('DSP_NOTEBOOK_CACHEDIR'), 'compiler')
+if os.environ.get("DSP_NOTEBOOK_CACHEDIR"):
+    training_data_directory = os.path.join(
+        os.environ.get("DSP_NOTEBOOK_CACHEDIR"), "compiler"
+    )
     print(training_data_directory)
 else:
-    training_data_directory = 'local_cache/compiler'
+    training_data_directory = "local_cache/compiler"
 
 if not os.path.exists(training_data_directory):
     os.makedirs(training_data_directory)
@@ -52,82 +53,110 @@ def __init__(self, metric=None, teacher_settings={}, multitask=True):
         self.multitask = multitask
 
         metric = metric or (lambda *args: True)
-        self.teleprompter = BootstrapFewShot(metric=metric,
-                                             max_bootstrapped_demos=999999,
-                                             max_labeled_demos=0,  # FIXME: TODO: Make this zero? or param, with default as 16 or 0?
-                                             teacher_settings=teacher_settings)
-        
-
-    def compile(self, student, *, teacher=None, trainset, valset=None,
-                target='t5-large', bsize=12, accumsteps=1, lr=5e-5, epochs=1, bf16=False, int8=False, peft=False):
-
+        self.teleprompter = BootstrapFewShot(
+            metric=metric,
+            max_bootstrapped_demos=999999,
+            max_labeled_demos=0,  # FIXME: TODO: Make this zero? or param, with default as 16 or 0?
+            teacher_settings=teacher_settings,
+        )
+
+    def compile(
+        self,
+        student,
+        *,
+        teacher=None,
+        trainset,
+        valset=None,
+        target="t5-large",
+        bsize=12,
+        accumsteps=1,
+        lr=5e-5,
+        epochs=1,
+        bf16=False,
+        int8=False,
+        peft=False,
+    ):
         # It's usually better to supply a few-shot teacher, rather than uncompiled module (the student).
         if teacher is None:
-            print("WARNING: Using a vanilla teacher. "
-                  "Are you sure you want to use BootstrapFinetune without a compiled teacher?")
-
+            print(
+                "WARNING: Using a vanilla teacher. "
+                "Are you sure you want to use BootstrapFinetune without a compiled teacher?"
+            )
 
         teachers = teacher if isinstance(teacher, list) else [teacher]
         finetune_data = {}
 
         for teacher in teachers:
             # Dummy compilation to get bootstraps.
-            compiled = self.teleprompter.compile(student, teacher=teacher, trainset=trainset)
+            compiled = self.teleprompter.compile(
+                student, teacher=teacher, trainset=trainset
+            )
             multitask = self.multitask
 
             # Prepare finetune <prompt, completion> pairs.
             for name, predictor in compiled.named_predictors():
-                name_ = 'all' if multitask else name
-                finetune_data[name_] = [] if name_ not in finetune_data else finetune_data[name_]
+                name_ = "all" if multitask else name
+                finetune_data[name_] = (
+                    [] if name_ not in finetune_data else finetune_data[name_]
+                )
 
                 for demo in predictor.demos:
                     demo = dict(demo)
 
                     # TODO: FIXME: generalize.
-                    completion = demo.pop(predictor.signature.fields[-1].output_variable)
-                    prompt = predictor.signature.query(dsp.Example(demos=[], **demo)).strip()
+                    completion = demo.pop(
+                        predictor.signature.fields[-1].output_variable
+                    )
+                    prompt = predictor.signature.query(
+                        dsp.Example(demos=[], **demo)
+                    ).strip()
 
-                    finetune_data[name_].append(dict(prompt=prompt, completion=completion))
+                    finetune_data[name_].append(
+                        dict(prompt=prompt, completion=completion)
+                    )
 
         for name_ in finetune_data:
             random.Random(0).shuffle(finetune_data[name_])
             print(name_, len(finetune_data[name_]))
 
-
         #
         # Dump as files.
-        # 
+        #
         finetune_paths = {}
 
         for name in finetune_data:
             data = finetune_data[name]
-            hashed_name = name + '.' + Hasher.hash(data)
-            output_path = os.path.join(training_data_directory, f'{hashed_name}.jsonl')
+            hashed_name = name + "." + Hasher.hash(data)
+            output_path = os.path.join(training_data_directory, f"{hashed_name}.jsonl")
             print(output_path)
 
-            with open(output_path, 'w') as f:
+            with open(output_path, "w") as f:
                 for line in data:
-                    f.write(ujson.dumps(line) + '\n')
-            
+                    f.write(ujson.dumps(line) + "\n")
+
             finetune_paths[name] = output_path
-        
 
         #
         # Train!
         #
         import string
+
         compiler_config = {
-            'save': ''.join(random.Random(time.time()).choices(string.ascii_uppercase + string.digits, k=13)), # https://stackoverflow.com/a/2257449/1493011
-            'peft': peft,
-            'fp16': False,
-            'bf16': bf16,
-            'int8': int8,
-            'fid': False,
-            'rationale': False,
-            'batch_size': bsize,
-            'epochs': epochs,
-            'gradient_accumulation_steps': accumsteps, # 2,
-            'lr': lr
+            "save": "".join(
+                random.Random(time.time()).choices(
+                    string.ascii_uppercase + string.digits, k=13
+                )
+            ),  # https://stackoverflow.com/a/2257449/1493011
+            "peft": peft,
+            "fp16": False,
+            "bf16": bf16,
+            "int8": int8,
+            "fid": False,
+            "rationale": False,
+            "batch_size": bsize,
+            "epochs": epochs,
+            "gradient_accumulation_steps": accumsteps,  # 2,
+            "lr": lr,
         }
 
         from dsp.modules.finetuning import finetune_hf
@@ -138,11 +167,13 @@ def compile(self, student, *, teacher=None, trainset, valset=None,
         for name in finetune_data:
             training_data_path = finetune_paths[name]
             compiler_config_ = dict(compiler_config)
-            compiler_config_['save'] = compiler_config['save'] + '.' + name
+            compiler_config_["save"] = compiler_config["save"] + "." + name
             best_ckpt_path = finetune_hf(training_data_path, target, compiler_config_)
 
             print(f"#> Best checkpoint path: {best_ckpt_path} for {name}")
-            finetune_models[name] = dsp.HFModel(model=target, checkpoint=best_ckpt_path) # best_ckpt_path
+            finetune_models[name] = dsp.HFModel(
+                model=target, checkpoint=best_ckpt_path
+            )  # best_ckpt_path
 
         #
         # Set the LMs to the finetuned ones, per module
@@ -151,9 +182,11 @@ def compile(self, student, *, teacher=None, trainset, valset=None,
 
         assert len(compiled.named_predictors()) == len(compiled2.named_predictors())
 
-        for (name, predictor), (name2, predictor2) in zip(compiled.named_predictors(), compiled2.named_predictors()):
+        for (name, predictor), (name2, predictor2) in zip(
+            compiled.named_predictors(), compiled2.named_predictors()
+        ):
             assert name == name2
-            name = 'all' if multitask else name
+            name = "all" if multitask else name
 
             # TODO: FIXME: When we assign .lm, the Predict.forward will also set only_query=True.
             # This is correct for here but we may want to make it more explicitly restricted to finetuned models.
@@ -161,5 +194,5 @@ def compile(self, student, *, teacher=None, trainset, valset=None,
 
             predictor2.lm = finetune_models[name]
             assert predictor2.demos == []
-        
+
         return compiled2
diff --git a/dspy/teleprompt/knn_fewshot.py b/dspy/teleprompt/knn_fewshot.py
index 2c4f78133e..947b80e229 100644
--- a/dspy/teleprompt/knn_fewshot.py
+++ b/dspy/teleprompt/knn_fewshot.py
@@ -1,6 +1,9 @@
 from typing import List
 import types
 import dsp
+from dspy.teleprompt.bootstrap import BootstrapFewShot
+from dspy.teleprompt.teleprompt import Teleprompter
+
 
 class KNNFewShot(Teleprompter):
     def __init__(self, KNN, k: int, trainset: List[dsp.Example]):
@@ -12,8 +15,10 @@ def compile(self, student, *, teacher=None, trainset, valset=None):
         def forward_pass(**kwargs):
             knn_trainset = self.KNN(**kwargs)
             few_shot_bootstrap = BootstrapFewShot()
-            compiled_program = few_shot_bootstrap.compile(student, teacher=teacher, trainset=knn_trainset, valset=valset)
+            compiled_program = few_shot_bootstrap.compile(
+                student, teacher=teacher, trainset=knn_trainset, valset=valset
+            )
             return compiled_program
-        
+
         student_copy.forward = types.MethodType(forward_pass, student_copy)
-        return student_copy
\ No newline at end of file
+        return student_copy
diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py
index f027f83fef..8635f8f8a5 100644
--- a/dspy/teleprompt/random_search.py
+++ b/dspy/teleprompt/random_search.py
@@ -1,5 +1,3 @@
-import dsp
-import tqdm
 import random
 
 from dspy.teleprompt.teleprompt import Teleprompter
@@ -28,7 +26,17 @@
 
 
 class BootstrapFewShotWithRandomSearch(Teleprompter):
-    def __init__(self, metric, teacher_settings={}, max_bootstrapped_demos=4, max_labeled_demos=16, max_rounds=1, num_candidate_programs=16, num_threads=6, stop_at_score=None):
+    def __init__(
+        self,
+        metric,
+        teacher_settings={},
+        max_bootstrapped_demos=4,
+        max_labeled_demos=16,
+        max_rounds=1,
+        num_candidate_programs=16,
+        num_threads=6,
+        stop_at_score=None,
+    ):
         self.metric = metric
         self.teacher_settings = teacher_settings
         self.max_rounds = max_rounds
@@ -44,7 +52,13 @@ def __init__(self, metric, teacher_settings={}, max_bootstrapped_demos=4, max_la
         # self.max_bootstrapped_demos = self.max_num_traces
         self.max_labeled_demos = max_labeled_demos
 
-        print("Going to sample between", self.min_num_samples, "and", self.max_num_samples, "traces per predictor.")
+        print(
+            "Going to sample between",
+            self.min_num_samples,
+            "and",
+            self.max_num_samples,
+            "traces per predictor.",
+        )
         # print("Going to sample", self.max_num_traces, "traces in total.")
         print("Will attempt to train", self.num_candidate_sets, "candidate sets.")
 
@@ -66,77 +80,110 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None
             if seed == -3:
                 # zero-shot
                 program2 = student.reset_copy()
-            
+
             elif seed == -2:
                 # labels only
                 teleprompter = LabeledFewShot(k=self.max_labeled_demos)
                 program2 = teleprompter.compile(student, trainset=trainset2)
-            
+
             elif seed == -1:
                 # unshuffled few-shot
-                program = BootstrapFewShot(metric=self.metric, max_bootstrapped_demos=self.max_num_samples,
-                                           max_labeled_demos=self.max_labeled_demos,
-                                           teacher_settings=self.teacher_settings, max_rounds=self.max_rounds)
+                program = BootstrapFewShot(
+                    metric=self.metric,
+                    max_bootstrapped_demos=self.max_num_samples,
+                    max_labeled_demos=self.max_labeled_demos,
+                    teacher_settings=self.teacher_settings,
+                    max_rounds=self.max_rounds,
+                )
                 program2 = program.compile(student, teacher=teacher, trainset=trainset2)
 
             else:
                 assert seed >= 0, seed
 
                 random.Random(seed).shuffle(trainset2)
-                size = random.Random(seed).randint(self.min_num_samples, self.max_num_samples)
-
-                teleprompter = BootstrapFewShot(metric=self.metric, max_bootstrapped_demos=size,
-                                                max_labeled_demos=self.max_labeled_demos,
-                                                teacher_settings=self.teacher_settings,
-                                                max_rounds=self.max_rounds)
-
-                program2 = teleprompter.compile(student, teacher=teacher, trainset=trainset2)
-
-            evaluate = Evaluate(devset=self.valset, metric=self.metric, num_threads=self.num_threads,
-                                display_table=False, display_progress=True)
+                size = random.Random(seed).randint(
+                    self.min_num_samples, self.max_num_samples
+                )
+
+                teleprompter = BootstrapFewShot(
+                    metric=self.metric,
+                    max_bootstrapped_demos=size,
+                    max_labeled_demos=self.max_labeled_demos,
+                    teacher_settings=self.teacher_settings,
+                    max_rounds=self.max_rounds,
+                )
+
+                program2 = teleprompter.compile(
+                    student, teacher=teacher, trainset=trainset2
+                )
+
+            evaluate = Evaluate(
+                devset=self.valset,
+                metric=self.metric,
+                num_threads=self.num_threads,
+                display_table=False,
+                display_progress=True,
+            )
 
             score, subscores = evaluate(program2, return_all_scores=True)
 
             all_subscores.append(subscores)
 
-            print('Score:', score, 'for set:', [len(predictor.demos) for predictor in program2.predictors()])
+            print(
+                "Score:",
+                score,
+                "for set:",
+                [len(predictor.demos) for predictor in program2.predictors()],
+            )
 
             if len(scores) == 0 or score > max(scores):
-                print('New best score:', score, 'for seed', seed)
+                print("New best score:", score, "for seed", seed)
                 best_program = program2
 
             scores.append(score)
             print(f"Scores so far: {scores}")
 
-            print('Best score:', max(scores))
+            print("Best score:", max(scores))
 
             score_data.append((score, subscores, seed, program2))
 
-            if len(score_data) > 2:  # We check if there are at least 3 scores to consider
+            if (
+                len(score_data) > 2
+            ):  # We check if there are at least 3 scores to consider
                 for k in [1, 2, 3, 5, 8, 9999]:
-                    top_3_scores = sorted(score_data, key=lambda x: x[0], reverse=True)[:k]
+                    top_3_scores = sorted(score_data, key=lambda x: x[0], reverse=True)[
+                        :k
+                    ]
 
                     # Transpose the subscores to get max per entry and then calculate their average
-                    transposed_subscores = zip(*[subscores for _, subscores, *_ in top_3_scores if subscores])
-                    avg_of_max_per_entry = sum(max(entry) for entry in transposed_subscores) / len(top_3_scores[0][1])
+                    transposed_subscores = zip(
+                        *[subscores for _, subscores, *_ in top_3_scores if subscores]
+                    )
+                    avg_of_max_per_entry = sum(
+                        max(entry) for entry in transposed_subscores
+                    ) / len(top_3_scores[0][1])
+
+                    print(
+                        f"Average of max per entry across top {k} scores: {avg_of_max_per_entry}"
+                    )
 
-                    print(f'Average of max per entry across top {k} scores: {avg_of_max_per_entry}')
-            
             if self.stop_at_score is not None and score >= self.stop_at_score:
-                print(f"Stopping early because score {score} is >= stop_at_score {self.stop_at_score}")
+                print(
+                    f"Stopping early because score {score} is >= stop_at_score {self.stop_at_score}"
+                )
                 break
 
         # To best program, attach all program candidates in decreasing average score
         best_program.candidate_programs = score_data
-        best_program.candidate_programs = sorted(best_program.candidate_programs, key=lambda x: x[0], reverse=True)
+        best_program.candidate_programs = sorted(
+            best_program.candidate_programs, key=lambda x: x[0], reverse=True
+        )
 
         print(len(best_program.candidate_programs), "candidate programs found.")
 
         return best_program
 
 
-
-
 # sample between 4 and 10 examples from traces
 # TODO: FIXME: The max number of demos should be determined in part by the LM's tokenizer + max_length.
 # This does require executing the program, or at least the predictor.
diff --git a/dspy/teleprompt/teleprompt.py b/dspy/teleprompt/teleprompt.py
index 949a6f89c4..fe5b0b9500 100644
--- a/dspy/teleprompt/teleprompt.py
+++ b/dspy/teleprompt/teleprompt.py
@@ -1,10 +1,3 @@
-import tqdm
-import random
-import dsp
-
-from dspy.evaluate.evaluate import Evaluate
-
-
 class Teleprompter:
     def __init__(self):
         pass
diff --git a/dspy/teleprompt/teleprompt_optuna.py b/dspy/teleprompt/teleprompt_optuna.py
index 5528e5fb0b..29f0248dfc 100644
--- a/dspy/teleprompt/teleprompt_optuna.py
+++ b/dspy/teleprompt/teleprompt_optuna.py
@@ -1,17 +1,23 @@
-import dsp
-import tqdm
-import random
 import optuna
 
 from dspy.teleprompt.teleprompt import Teleprompter
 
 from .bootstrap import BootstrapFewShot
-from .vanilla import LabeledFewShot
 
 from dspy.evaluate.evaluate import Evaluate
 
+
 class BootstrapFewShotWithOptuna(Teleprompter):
-    def __init__(self, metric, teacher_settings={}, max_bootstrapped_demos=4, max_labeled_demos=16, max_rounds=1, num_candidate_programs=16, num_threads=6):
+    def __init__(
+        self,
+        metric,
+        teacher_settings={},
+        max_bootstrapped_demos=4,
+        max_labeled_demos=16,
+        max_rounds=1,
+        num_candidate_programs=16,
+        num_threads=6,
+    ):
         self.metric = metric
         self.teacher_settings = teacher_settings
         self.max_rounds = max_rounds
@@ -27,37 +33,58 @@ def __init__(self, metric, teacher_settings={}, max_bootstrapped_demos=4, max_la
         # self.max_bootstrapped_demos = self.max_num_traces
         self.max_labeled_demos = max_labeled_demos
 
-        print("Going to sample between", self.min_num_samples, "and", self.max_num_samples, "traces per predictor.")
+        print(
+            "Going to sample between",
+            self.min_num_samples,
+            "and",
+            self.max_num_samples,
+            "traces per predictor.",
+        )
         # print("Going to sample", self.max_num_traces, "traces in total.")
         print("Will attempt to train", self.num_candidate_sets, "candidate sets.")
 
     def objective(self, trial):
         program2 = self.student.reset_copy()
-        for (name, compiled_predictor), (_, program2_predictor) in zip(self.compiled_teleprompter.named_predictors(), program2.named_predictors()):
+        for (name, compiled_predictor), (_, program2_predictor) in zip(
+            self.compiled_teleprompter.named_predictors(), program2.named_predictors()
+        ):
             all_demos = compiled_predictor.demos
-            demo_index = trial.suggest_int(f"demo_index_for_{name}", 0, len(all_demos) - 1)
+            demo_index = trial.suggest_int(
+                f"demo_index_for_{name}", 0, len(all_demos) - 1
+            )
             selected_demo = dict(all_demos[demo_index])
             program2_predictor.demos = [selected_demo]
-        evaluate = Evaluate(devset=self.valset, metric=self.metric, num_threads=self.num_threads,
-                            display_table=False, display_progress=True)
+        evaluate = Evaluate(
+            devset=self.valset,
+            metric=self.metric,
+            num_threads=self.num_threads,
+            display_table=False,
+            display_progress=True,
+        )
         score, _ = evaluate(program2, return_all_scores=True)
         trial.set_user_attr("program", program2)
         return score
 
-
     def compile(self, student, *, teacher=None, max_demos, trainset, valset=None):
         self.trainset = trainset
         self.valset = valset or trainset
         self.student = student.reset_copy()
-        self.teacher = teacher.deepcopy() if teacher is not None else student.reset_copy()
-        teleprompter_optimize = BootstrapFewShot(metric=self.metric, max_bootstrapped_demos=max_demos,
-                                        max_labeled_demos=self.max_labeled_demos,
-                                        teacher_settings=self.teacher_settings,
-                                        max_rounds=self.max_rounds)
-        self.compiled_teleprompter = teleprompter_optimize.compile(self.student, teacher=self.teacher, trainset=self.trainset)
-        study = optuna.create_study(direction='maximize')
+        self.teacher = (
+            teacher.deepcopy() if teacher is not None else student.reset_copy()
+        )
+        teleprompter_optimize = BootstrapFewShot(
+            metric=self.metric,
+            max_bootstrapped_demos=max_demos,
+            max_labeled_demos=self.max_labeled_demos,
+            teacher_settings=self.teacher_settings,
+            max_rounds=self.max_rounds,
+        )
+        self.compiled_teleprompter = teleprompter_optimize.compile(
+            self.student, teacher=self.teacher, trainset=self.trainset
+        )
+        study = optuna.create_study(direction="maximize")
         study.optimize(self.objective, n_trials=self.num_candidate_sets)
         best_program = study.trials[study.best_trial.number].user_attrs["program"]
-        print('Best score:', study.best_value)
-        print('Best program:', best_program)
-        return best_program
\ No newline at end of file
+        print("Best score:", study.best_value)
+        print("Best program:", best_program)
+        return best_program
diff --git a/dspy/teleprompt/vanilla.py b/dspy/teleprompt/vanilla.py
index 068343fa0d..b5ff3d3766 100644
--- a/dspy/teleprompt/vanilla.py
+++ b/dspy/teleprompt/vanilla.py
@@ -1,4 +1,3 @@
-import dsp
 import random
 
 from .teleprompt import Teleprompter
@@ -19,12 +18,15 @@ def compile(self, student, *, trainset, sample=True):
 
         for predictor in self.student.predictors():
             if sample:
-                predictor.demos = rng.sample(self.trainset, min(self.k, len(self.trainset)))
+                predictor.demos = rng.sample(
+                    self.trainset, min(self.k, len(self.trainset))
+                )
             else:
-                predictor.demos = self.trainset[:min(self.k, len(self.trainset))]
+                predictor.demos = self.trainset[: min(self.k, len(self.trainset))]
 
         return self.student
-    
+
+
 # NOTE: I believe templatev2 keeps rdemos as long as they have the last field.
 # This may change later, especially with the introduction of required vs optional fields.
 # NOTE: Since we're relying on downstream code to handle the demos, this sampling may be sub-sampled.
diff --git a/inspect-app/app.py b/inspect-app/app.py
index 5ea50dd92a..174c993ba9 100644
--- a/inspect-app/app.py
+++ b/inspect-app/app.py
@@ -7,41 +7,41 @@
 CORS(app)
 
 dynamo_resource = boto3.resource("dynamodb", region_name="us-west-1")
-table = dynamo_resource.Table('dsp-inspect-app')
+table = dynamo_resource.Table("dsp-inspect-app")
 
 
 @app.route("/")
 def index():
-  return "This is the main page."
+    return "This is the main page."
 
 
 @app.route("/data/<id>", methods=["GET"])
 def get_item(id):
-  response = table.get_item(Key={"id": id})
+    response = table.get_item(Key={"id": id})
 
-  if 'Item' in response:
-    return jsonify(response['Item'])
-  else:
-    return 'Item not found', 404
+    if "Item" in response:
+        return jsonify(response["Item"])
+    else:
+        return "Item not found", 404
 
 
 @app.route("/inspect-db", methods=["GET"])
 def inspect_db():
-  return table.scan()
+    return table.scan()
 
 
-@app.route('/log-item', methods=['POST'])
+@app.route("/log-item", methods=["POST"])
 def log_item():
     data = request.get_json()
 
-    if 'id' not in data or 'content' not in data:
-        return 'Missing required fields', 400
+    if "id" not in data or "content" not in data:
+        return "Missing required fields", 400
 
-    data['expiry_time'] = int(time.time() + 86400)
+    data["expiry_time"] = int(time.time() + 86400)
     table.put_item(Item=data)
 
-    return 'Data created successfully', 201
-    
+    return "Data created successfully", 201
+
 
 if __name__ == "__main__":
-  app.run()
\ No newline at end of file
+    app.run()
diff --git a/setup.py b/setup.py
index a9f5c2b7c3..a70103d83a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,11 @@
 from setuptools import setup, find_packages
 
 # Read the content of the README file
-with open('README.md', 'r', encoding='utf-8') as f:
+with open("README.md", "r", encoding="utf-8") as f:
     long_description = f.read()
 
 # Read the content of the requirements.txt file
-with open('requirements.txt', 'r', encoding='utf-8') as f:
+with open("requirements.txt", "r", encoding="utf-8") as f:
     requirements = f.read().splitlines()
 
 setup(
@@ -13,17 +13,15 @@
     version="2.0.3",
     description="DSPy",
     long_description=long_description,
-    long_description_content_type='text/markdown',
+    long_description_content_type="text/markdown",
     url="https://github.com/stanfordnlp/dsp",
     author="Omar Khattab",
     author_email="okhattab@stanford.edu",
     license="MIT License",
-    packages=find_packages(include=['dsp.*', 'dspy.*', 'dsp', 'dspy']),
-    python_requires='>=3.9',
+    packages=find_packages(include=["dsp.*", "dspy.*", "dsp", "dspy"]),
+    python_requires=">=3.9",
     install_requires=requirements,
-    extras_require={
-        "pinecone": ["pinecone-client~=2.2.4"]
-    },
+    extras_require={"pinecone": ["pinecone-client~=2.2.4"]},
     classifiers=[
         "Development Status :: 3 - Alpha",
         "Intended Audience :: Science/Research",

From fe48032300323133bdf0f9f1189f051e17527f5a Mon Sep 17 00:00:00 2001
From: DanielUH2019 <danielcardenascabrera2016@gmail.com>
Date: Sat, 28 Oct 2023 22:36:55 -0400
Subject: [PATCH 2/3] Create ruff config

---
 pyproject.toml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000..47c68b79f8
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,5 @@
+[tool.ruff]
+extend-exclude = ["cache"]
+
+[tool.ruff.per-file-ignores]
+"__init__.py" = ["F401","F403"]

From 70ca636c9fd7d3224185b4a86571227ad002c3ee Mon Sep 17 00:00:00 2001
From: DanielUH2019 <danielcardenascabrera2016@gmail.com>
Date: Sat, 28 Oct 2023 22:40:17 -0400
Subject: [PATCH 3/3] Create pre-commit-config with ci

---
 .pre-commit-config.yaml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000..96be8f5830
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+    -   id: check-toml
+    -   id: check-yaml
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+-   repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.3
+    hooks:
+    -   id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+    -   id: ruff-format
+ci:
+    autofix_commit_msg: '[pre-commit.ci] Auto format from pre-commit.com hooks'
+    autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'