In [1]:
# Enable reloading on code changes
%load_ext autoreload
%autoreload 2

# Setting the environment variables
import os # noqa

# os.environ["DSPY_CACHEDIR"] =
# os.environ["DSP_CACHEDIR"] =
# os.environ["OPENAI_API_KEY"] =

# Import the library
import dspy


  from .autonotebook import tqdm as notebook_tqdm


## I. Showcasing `LM.finetune()`

In [2]:
import time

In [3]:
# Example call to an LM before fine-tuning: create a client for the base model
# and query it, so the answer can be compared with the fine-tuned model's later.
lm = dspy.LM('gpt-4o-mini-2024-07-18')
# Calling the LM with a prompt string returns a list of completions (see output)
lm("How far is the Moon from Earth?")

["The average distance from the Earth to the Moon is about 238,855 miles (384,400 kilometers). However, this distance can vary slightly due to the Moon's elliptical orbit, ranging from approximately 225,623 miles (363,104 kilometers) at its closest (perigee) to about 252,088 miles (405,696 kilometers) at its farthest (apogee)."]

In [4]:
# Using LM.finetune(), BootstrapFinetune (BSFT), and BetterTogether requires
# this flag to be enabled
dspy.settings.experimental = True

In [5]:
# Build a dummy chat-format dataset: a single system/user/assistant exchange
# repeated 20 times (every entry refers to the same dict object)
message = dict(
    messages=[
        dict(role="system", content="Marv is a factual chatbot that is also sarcastic."),
        dict(role="user", content="How far is the Moon from Earth?"),
        dict(role="assistant", content="384,400 kilometers"),
    ]
)
training_data = [message for _ in range(20)]

# Training options for the fine-tuning run: a single epoch
train_kwargs = dict(n_epochs=1)

# Start fine-tuning; the call returns a job object whose type is displayed
# as the cell output
job = lm.finetune(
    train_data=training_data,
    train_kwargs=train_kwargs,
    data_format="chat",  # Could be left empty, inferred from "lm.model_type" as a default
)
type(job)

[Finetune] Validating the data format

dspy.clients.openai.TrainingJobOpenAI


[Finetune] Saving the data to a file


[Finetune] Uploading the data to the provider


[Finetune] Start remote training
[Finetune] Wait for training to complete
[Finetune] Get trained model if the run was a success



Running the cell below immediately after the cell above returns `False`, indicating that the job is not done.

In [6]:
# job.done() reports whether the fine-tuning job has finished; it will return
# False until the job is complete
job.done()

False

Once started, a `job` object can be polled for its status, provided that the provider implements status checking.
Note: `job.done()` may take a short while to return `True` after `job.status()` first reports `succeeded`.

In [7]:
# Poll the job: print its current status every 10 seconds until it reports done
while True:
    if job.done():
        break
    print(job.status())
    time.sleep(10)

TrainingStatus.pending
TrainingStatus.pending
TrainingStatus.pending
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.running
TrainingStatus.succeeded
TrainingStatus.succeeded
TrainingStatus.succeeded


In [8]:
# Once the job is complete, the fine-tuned LM can be obtained via job.result()
finetuned_lm = job.result()
# Printing the LM shows its object repr (see output)
print(finetuned_lm)

# We can look at the model IDs to ensure that the fine-tuned model is different
# from the base model; the fine-tuned ID in the output carries an "ft:" prefix
print(f"Base model: {lm.model}")
print(f"Fine-tuned model: {finetuned_lm.model}")

<dspy.clients.lm.LM object at 0x7f08be785730>
Base model: gpt-4o-mini-2024-07-18
Fine-tuned model: ft:gpt-4o-mini-2024-07-18:stanford::AOHSK6y9


In [9]:
# We can check how the fine-tuned LM responds to the query we used for
# fine-tuning. In the recorded output it reproduces the short assistant answer
# from the training data instead of the long answer given by the base model.
finetuned_lm("How far is the Moon from Earth?")

['384,400 kilometers']

## II. LM fine-tuning with a custom `Provider`

In [10]:
import time
from typing import Any, Dict, List, Optional
from dspy.clients.provider import Provider, TrainingJob, DataFormat

# Using LM.finetune(), BSFT, and BetterTogether requires this flag
dspy.settings.experimental = True

Here we define a custom provider with a dummy fine-tune method.

In [11]:
class CustomProvider(Provider):
    """Demo provider whose `finetune` only pretends to train a model."""

    def __init__(self):
        super().__init__()
        # Mark this provider as one that can be fine-tuned
        self.finetunable = True

    @staticmethod
    def finetune(
        job: TrainingJob,
        model: str,
        train_data: List[Dict[str, Any]],
        train_kwargs: Optional[Dict[str, Any]] = None,
        data_format: Optional[DataFormat] = None,
    ) -> str:
        """Simulate a fine-tuning run and return the name of the "new" model.

        None of the arguments are used: the method prints a start message,
        sleeps for 15 seconds, prints "Done", and returns a fixed model name.
        """
        print("Fake fine-tuning has started!!")
        time.sleep(15)  # Stands in for the time real training would take
        print("Done")

        # The returned model name is a hard-coded OpenAI model that serves as
        # a demo placeholder
        return "ft:gpt-4o-mini-2024-07-18:stanford::AMDsC653"

    # # The launch/kill methods could also be overridden if needed
    # def launch(model: str, launch_kwargs: dict):
    #     pass

    # def kill(model: str, launch_kwargs: dict):
    #     pass


# We could also create a custom TrainingJob class to implement
# .status() and .cancel() methods, but we don't have to.

In [12]:
# launch_kwargs are only relevant when the model needs to be launched before
# use, which assumes that the custom provider implements the launch and kill
# methods.
launch_kwargs = dict(gpu=1, max_prompt_length=1000)

# Create the LM we want to fine-tune, using a dummy model name
model = "openai/MyAmazingCustomModel"
provider = CustomProvider()
lm = dspy.LM(
    model,
    provider=provider,
    launch_kwargs=launch_kwargs,
)
lm.launch()

# Query the model -- commented out because the model is not real
# lm("How far is the Moon from Earth?")

# Kill the model once done
lm.kill()

`launch()` is called for the auto-launched model openai/MyAmazingCustomModel -- no action is taken!
`kill()` is called for the auto-launched model openai/MyAmazingCustomModel -- no action is taken!


In [13]:
# Try fine-tuning with the custom provider (requires the experimental flag)
dspy.settings.experimental = True

# Build a dummy chat-format dataset: a single system/user/assistant exchange
# repeated 20 times (every entry refers to the same dict object)
message = dict(
    messages=[
        dict(role="system", content="Marv is a factual chatbot that is also sarcastic."),
        dict(role="user", content="How far is the Moon from Earth?"),
        dict(role="assistant", content="384,400 kilometers"),
    ]
)
training_data = [message for _ in range(20)]

# Training options for the fine-tuning run: a single epoch
train_kwargs = dict(n_epochs=1)

# Start fine-tuning; the job's type is displayed as the cell output
job = lm.finetune(
    train_data=training_data,
    train_kwargs=train_kwargs,
    data_format="chat",
)
type(job)

Fake fine-tuning has started!!

dspy.clients.provider.TrainingJob




In [14]:
# Running the command below immediately after the cell above returns `False`, indicating that the job is not done.
print(f"[0s] job.done(): {job.done()}")

# Wait 20 seconds, which is longer than the 15-second sleep inside
# CustomProvider.finetune
time.sleep(20)

# Check again; the fake fine-tuning should have finished by now
print(f"[20s] job.done(): {job.done()}")

[0s] job.done(): False
[20s] job.done(): True


Done


As before, the fine-tuned model can be accessed via `job.result()`.

In [15]:
# Obtain the LM from the finished job; note that this rebinds `lm`, which
# previously referred to the dummy custom model.
lm = job.result()
# The query works because CustomProvider.finetune returns the name of a real
# OpenAI fine-tuned model as a placeholder.
lm("How far is the Moon from Earth?")

['384,400 kilometers']

## III. Showcasing `BootstrapFinetune`

### i. Task Setup

Example setup using HotPotQA

In [19]:
import dspy
from dspy.datasets import HotPotQA
from dspy.evaluate import Evaluate # noqa
from dsp.utils.utils import deduplicate # noqa


# We are setting the experimental flag to True to make use of the fine-tuning
# features that are still in development.
dspy.settings.configure(experimental=True)

# Define the program
class BasicMH(dspy.Module):
    """Basic multi-hop question-answering program.

    For each hop, a chain-of-thought predictor turns the context gathered so
    far plus the question into a search query, passages are retrieved for that
    query, and the de-duplicated passages are added to the context. A final
    chain-of-thought predictor then answers the question from the context.
    """

    def __init__(self, passages_per_hop=3, num_hops=2):
        """Set up the retriever and the predictors.

        Args:
            passages_per_hop: Value passed as `k` to `dspy.Retrieve`.
            num_hops: Number of query/retrieve rounds; one query generator is
                created per hop.
        """
        super().__init__()
        # Use the caller's value; this was hard-coded to 2, which silently
        # ignored the `num_hops` argument.
        self.num_hops = num_hops
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_query = [dspy.ChainOfThought("context, question -> search_query") for _ in range(self.num_hops)]
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question):
        """Answer `question`; the result is the answer prediction copied with `context` set."""
        context = []

        for hop in range(self.num_hops):
            search_query = self.generate_query[hop](context=context, question=question).search_query
            passages = self.retrieve(search_query).passages
            # Accumulate passages across hops without repeats
            context = deduplicate(context + passages)

        answer = self.generate_answer(context=context, question=question).copy(context=context)
        return answer

# Tunable constants for this demo
TRAIN_SIZE = 1000
DEV_SIZE = 500
NUM_THREADS = 12
COLBERT_V2_ENDPOINT = "http://20.102.90.50:2017/wiki17_abstracts"

# Load HotPotQA and mark `question` as the input field of every example,
# then keep the first TRAIN_SIZE / DEV_SIZE examples of each split
dataset = HotPotQA(
    train_seed=1,
    eval_seed=2023,
    test_size=0,
    only_hard_examples=True,
)
trainset = [example.with_inputs('question') for example in dataset.train][:TRAIN_SIZE]
devset = [example.with_inputs('question') for example in dataset.dev][:DEV_SIZE]

# Exact-match metric and an evaluator over the dev set
metric = dspy.evaluate.answer_exact_match
evaluate = Evaluate(
    devset=devset,
    metric=metric,
    num_threads=NUM_THREADS,
    display_progress=True,
)

# Retriever model backed by the ColBERTv2 endpoint
retriever = dspy.ColBERTv2(url=COLBERT_V2_ENDPOINT)

### ii. Demo of `BootstrapFinetune`

In [20]:
# Using LM.finetune(), BootstrapFinetune (BSFT), and BetterTogether requires
# this flag to be enabled
dspy.settings.experimental = True

The cell below shows the different ways `BootstrapFinetune` can be initialized.

In [21]:
# (1) BSFT can be initialized with no arguments!
weight_optimizer = dspy.BootstrapFinetune()

# (2) Better to initialize with a metric, which is used for filtering data
# before fine-tuning
weight_optimizer = dspy.BootstrapFinetune(metric=metric)

# (3) Bootstrap fine-tune accepts other parameters as well, as shown below
train_kwargs = dict(n_epochs=1)
adapter = dspy.ChatAdapter()

weight_optimizer = dspy.BootstrapFinetune(
    metric=metric,              # Can be left empty, leads to no filtering
    multitask=True,             # We can also handle False!
    train_kwargs=train_kwargs,  # Can be left empty
    adapter=adapter,            # Can be left empty, leads to adapters inferred from the LM
    exclude_demos=False,        # Can be left empty
    num_threads=1,              # Can be left empty
)


# (4) The adapter and train_kwargs arguments could be passed as dictionaries
# mapping LMs to their respective adapters/train_kwargs. This is useful when the
# predictors of the program point to different LMs.
lm = dspy.LM('gpt-4o-mini-2024-07-18')
adapter = dspy.ChatAdapter()

# Note: both names are rebound here from a single value to a per-LM mapping
train_kwargs = {
    lm: dict(n_epochs=1),
    # lm2: train_kwargs2,
}
adapter = {
    lm: adapter,
    # lm2: adapter2,
}

weight_optimizer = dspy.BootstrapFinetune(
    metric=metric,              # Can be left empty, leads to no filtering
    multitask=True,             # We can also handle False!
    train_kwargs=train_kwargs,  # Can be left empty
    adapter=adapter,            # Can be left empty, leads to adapters inferred from the LM
    exclude_demos=False,        # Can be left empty
    num_threads=1,              # Can be left empty
)

The cell below shows an example of running `BootstrapFinetune`.

In [26]:
# Using method (3) from above to create a weight-optimized program
train_kwargs = dict(n_epochs=1)
adapter = dspy.ChatAdapter()

weight_optimizer = dspy.BootstrapFinetune(
    metric=metric,              # Can be left empty, leads to no filtering
    multitask=True,             # We can also handle False!
    train_kwargs=train_kwargs,  # Can be left empty
    adapter=adapter,            # Can be left empty, leads to adapters inferred from the LM
    exclude_demos=False,        # Can be left empty
    num_threads=1,              # Can be left empty
)

lm = dspy.LM('gpt-4o-mini-2024-07-18')
small_trainset = trainset[:10]  # Use a small subset of the training data

# Compile with the base LM and the retriever active for the duration of the run
with dspy.context(lm=lm, rm=retriever):
    weight_optimized_program = weight_optimizer.compile(
        student=BasicMH(),
        trainset=small_trainset,
        teacher=None,  # Doesn't need to be set, student is used as the teacher by default
    )

Preparing the student and teacher programs...
Ensuring that the student is not compiled.
No teacher provided. Using a copy of the student program as the teacher.
Bootstrapping data...
Average Metric: 5 / 10  (50.0): 100%|██████████| 10/10 [00:00<00:00, 416.94it/s]
Preparing the train data...
Collected data for 10 examples
After filtering for score, 5 examples remain
Using 15 data points for fine-tuning the model: gpt-4o-mini-2024-07-18
Starting LM fine-tuning...
1 fine-tuning job(s) to start.
Starting 1 fine-tuning jobs...
[OpenAI Provider] Validating the data format
[OpenAI Provider] Saving the data to a file
[OpenAI Provider] Data saved to /scr-ssd/dilara/.cache/dspy-new/finetune/4fd944783dbb3639.jsonl
[OpenAI Provider] Uploading the data to the provider
[OpenAI Provider] Start remote training
[OpenAI Provider] Job started with the OpenAI Job ID ftjob-gUXOgSEqEyV3v9Q6JgAqic9G
[OpenAI Provider] Wait for training to complete
[OpenAI Provider] Attempting to retrieve the trained model
[O

In [27]:
# Print the model ID behind each predictor of the compiled program; in the
# recorded output every predictor points to the same fine-tuned ("ft:") model.
for p in weight_optimized_program.predictors():
  print(p.lm.model)

ft:gpt-4o-mini-2024-07-18:stanford::AOI4YHf2
ft:gpt-4o-mini-2024-07-18:stanford::AOI4YHf2
ft:gpt-4o-mini-2024-07-18:stanford::AOI4YHf2


## IV. Demo of `BetterTogether`

### i. Task Setup

Example setup using HotPotQA

In [4]:
import dspy
from dspy.datasets import HotPotQA
from dspy.evaluate import Evaluate # noqa
from dsp.utils.utils import deduplicate # noqa


# We are setting the experimental flag to True to make use of the fine-tuning
# features that are still in development.
dspy.settings.configure(experimental=True)

# Define the program
class BasicMH(dspy.Module):
    """Basic multi-hop question-answering program.

    For each hop, a chain-of-thought predictor turns the context gathered so
    far plus the question into a search query, passages are retrieved for that
    query, and the de-duplicated passages are added to the context. A final
    chain-of-thought predictor then answers the question from the context.
    """

    def __init__(self, passages_per_hop=3, num_hops=2):
        """Set up the retriever and the predictors.

        Args:
            passages_per_hop: Value passed as `k` to `dspy.Retrieve`.
            num_hops: Number of query/retrieve rounds; one query generator is
                created per hop.
        """
        super().__init__()
        # Use the caller's value; this was hard-coded to 2, which silently
        # ignored the `num_hops` argument.
        self.num_hops = num_hops
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_query = [dspy.ChainOfThought("context, question -> search_query") for _ in range(self.num_hops)]
        self.generate_answer = dspy.ChainOfThought("context, question -> answer")

    def forward(self, question):
        """Answer `question`; the result is the answer prediction copied with `context` set."""
        context = []

        for hop in range(self.num_hops):
            search_query = self.generate_query[hop](context=context, question=question).search_query
            passages = self.retrieve(search_query).passages
            # Accumulate passages across hops without repeats
            context = deduplicate(context + passages)

        answer = self.generate_answer(context=context, question=question).copy(context=context)
        return answer

# Tunable constants for this demo
TRAIN_SIZE = 1000
DEV_SIZE = 500
NUM_THREADS = 12
COLBERT_V2_ENDPOINT = "http://20.102.90.50:2017/wiki17_abstracts"

# Load HotPotQA and mark `question` as the input field of every example,
# then keep the first TRAIN_SIZE / DEV_SIZE examples of each split
dataset = HotPotQA(
    train_seed=1,
    eval_seed=2023,
    test_size=0,
    only_hard_examples=True,
)
trainset = [example.with_inputs('question') for example in dataset.train][:TRAIN_SIZE]
devset = [example.with_inputs('question') for example in dataset.dev][:DEV_SIZE]

# Exact-match metric and an evaluator over the dev set
metric = dspy.evaluate.answer_exact_match
evaluate = Evaluate(
    devset=devset,
    metric=metric,
    num_threads=NUM_THREADS,
    display_progress=True,
)

# Retriever model backed by the ColBERTv2 endpoint
retriever = dspy.ColBERTv2(url=COLBERT_V2_ENDPOINT)

### ii. Demo

In [5]:
# Using LM.finetune(), BootstrapFinetune (BSFT), and BetterTogether requires
# this flag to be enabled
dspy.settings.experimental = True


In [8]:
# (1) The only argument BetterTogether requires is the metric
better_together = dspy.BetterTogether(
  metric=metric        # This is the only argument we require!
                       # We could also consider not requiring it if BootstrapFewShotWithRandomSearch is modified.
)

Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


In [11]:
# (2) We can also pass the weight and prompt optimizers we initialized
train_kwargs = dict(n_epochs=1)
adapter = dspy.ChatAdapter()

# Weight optimizer: bootstrap fine-tuning
weight_optimizer = dspy.BootstrapFinetune(
    metric=metric,              # Can be left empty, leads to no filtering
    multitask=True,             # We can also handle False!
    train_kwargs=train_kwargs,  # Can be left empty
    adapter=adapter,            # Can be left empty, leads to adapters inferred from the LM
    exclude_demos=True,         # We are dropping the demos for fine-tuning
    num_threads=1,              # Can be left empty
)

# Prompt optimizer: bootstrapped few-shot demos with random search
prompt_optimizer = dspy.BootstrapFewShotWithRandomSearch(
    metric=metric,
    max_bootstrapped_demos=3,
    max_labeled_demos=3,
    num_candidate_programs=6,
    num_threads=6,
)

# Initialize BetterTogether
better_together = dspy.BetterTogether(
    metric=metric,
    weight_optimizer=weight_optimizer,  # Can be left empty
    prompt_optimizer=prompt_optimizer,  # Can be left empty
    seed=2023,                          # Can be left empty
)

Going to sample between 1 and 3 traces per predictor.
Will attempt to bootstrap 6 candidate sets.


In [21]:
# Running BetterTogether on a small dataset

lm = dspy.LM('gpt-4o-mini-2024-07-18')
small_trainset = trainset[:50] # Use a small subset of the training data

# Compile with the base LM and the retriever active for the duration of the run
with dspy.context(lm=lm, rm=retriever):
  optimized_program = better_together.compile(
    student=BasicMH(),
    trainset=small_trainset,
    # Three steps, as the log below shows: prompt optimization (p), then
    # weight optimization (w), then prompt optimization again (p)
    strategy="p -> w -> p",
    # Presumably the fraction of the trainset held out for validation (the log
    # shows 5 of the 50 examples being scored) -- TODO confirm
    valset_ratio=0.1
  )

[BetterTogether] Validating the strategy
[BetterTogether] Preparing the student program...
Ensuring that the student is not compiled
[BetterTogether] Compiling the student program...
[BetterTogether] Step 1 of 3 - Strategy `p`
[BetterTogether] Shuffling the trainset...
[BetterTogether] Preparing for prompt optimization...
[BetterTogether] Launching the program LMs for sampling...
`launch()` is called for the auto-launched model `gpt-4o-mini-2024-07-18` -- no action is taken!
[BetterTogether] Compiling the prompt optimizer...


Average Metric: 2 / 5  (40.0): 100%|██████████| 5/5 [00:02<00:00,  2.05it/s]  


New best score: 40.0 for seed -3
Scores so far: [40.0]
Best score so far: 40.0


Average Metric: 2 / 5  (40.0): 100%|██████████| 5/5 [00:02<00:00,  2.30it/s] 


Scores so far: [40.0, 40.0]
Best score so far: 40.0


 11%|█         | 5/45 [00:11<01:28,  2.21s/it]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 1 / 5  (20.0): 100%|██████████| 5/5 [00:06<00:00,  1.25s/it]


Scores so far: [40.0, 40.0, 20.0]
Best score so far: 40.0


 11%|█         | 5/45 [00:18<02:25,  3.63s/it]


Bootstrapped 2 full traces after 6 examples in round 0.


Average Metric: 1 / 5  (20.0): 100%|██████████| 5/5 [00:07<00:00,  1.43s/it]


Scores so far: [40.0, 40.0, 20.0, 20.0]
Best score so far: 40.0


  7%|▋         | 3/45 [00:05<01:10,  1.67s/it]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 2 / 5  (40.0): 100%|██████████| 5/5 [00:05<00:00,  1.11s/it] 


Scores so far: [40.0, 40.0, 20.0, 20.0, 40.0]
Best score so far: 40.0


  2%|▏         | 1/45 [00:02<01:32,  2.10s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 1 / 5  (20.0): 100%|██████████| 5/5 [00:09<00:00,  1.84s/it]


Scores so far: [40.0, 40.0, 20.0, 20.0, 40.0, 20.0]
Best score so far: 40.0


  2%|▏         | 1/45 [00:01<01:04,  1.46s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 1 / 5  (20.0): 100%|██████████| 5/5 [00:06<00:00,  1.28s/it] 


Scores so far: [40.0, 40.0, 20.0, 20.0, 40.0, 20.0, 20.0]
Best score so far: 40.0


  2%|▏         | 1/45 [00:04<03:36,  4.91s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 2 / 5  (40.0): 100%|██████████| 5/5 [00:08<00:00,  1.60s/it]


Scores so far: [40.0, 40.0, 20.0, 20.0, 40.0, 20.0, 20.0, 40.0]
Best score so far: 40.0


 16%|█▌        | 7/45 [00:11<01:02,  1.65s/it]


Bootstrapped 3 full traces after 8 examples in round 0.


Average Metric: 3 / 5  (60.0): 100%|██████████| 5/5 [00:06<00:00,  1.37s/it] 


New best score: 60.0 for seed 5
Scores so far: [40.0, 40.0, 20.0, 20.0, 40.0, 20.0, 20.0, 40.0, 60.0]
Best score so far: 60.0
9 candidate programs found.
[BetterTogether] Killing the LMs used for sampling...
`kill()` is called for the auto-launched model `gpt-4o-mini-2024-07-18` -- no action is taken!
[BetterTogether] Step 2 of 3 - Strategy `p -> w`
[BetterTogether] Shuffling the trainset...
[BetterTogether] Preparing for weight optimization...
[BetterTogether] Compiling the weight optimizer...
[BootstrapFinetune] Preparing the student and teacher programs...
Ensuring that the student is not compiled
No teacher provided. Using a copy of the student program as the teacher.
[BootstrapFinetune] Bootstrapping data...
Average Metric: 28 / 50  (56.0): 100%|██████████| 50/50 [03:55<00:00,  4.72s/it]
[BootstrapFinetune] Preparing the train data...
[BootstrapFinetune] Collected data for 50 examples
[BootstrapFinetune] After filtering for score, 28 examples remain
Using 84 data points for fine-t

Average Metric: 1 / 5  (20.0): 100%|██████████| 5/5 [00:02<00:00,  2.42it/s]  


New best score: 20.0 for seed -3
Scores so far: [20.0]
Best score so far: 20.0


Average Metric: 3 / 5  (60.0): 100%|██████████| 5/5 [00:01<00:00,  3.57it/s]


New best score: 60.0 for seed -2
Scores so far: [20.0, 60.0]
Best score so far: 60.0


  9%|▉         | 4/45 [00:08<01:22,  2.02s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 3 / 5  (60.0): 100%|██████████| 5/5 [00:06<00:00,  1.21s/it]


Scores so far: [20.0, 60.0, 60.0]
Best score so far: 60.0


  4%|▍         | 2/45 [00:02<01:03,  1.48s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 4 / 5  (80.0): 100%|██████████| 5/5 [00:06<00:00,  1.22s/it] 


New best score: 80.0 for seed 0
Scores so far: [20.0, 60.0, 60.0, 80.0]
Best score so far: 80.0


  2%|▏         | 1/45 [00:01<00:58,  1.32s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 1 / 5  (20.0): 100%|██████████| 5/5 [00:06<00:00,  1.39s/it]


Scores so far: [20.0, 60.0, 60.0, 80.0, 20.0]
Best score so far: 80.0


  7%|▋         | 3/45 [00:04<01:00,  1.43s/it]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 3 / 5  (60.0): 100%|██████████| 5/5 [00:06<00:00,  1.27s/it] 


Scores so far: [20.0, 60.0, 60.0, 80.0, 20.0, 60.0]
Best score so far: 80.0


  2%|▏         | 1/45 [00:01<00:47,  1.07s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 3 / 5  (60.0): 100%|██████████| 5/5 [00:07<00:00,  1.43s/it] 


Scores so far: [20.0, 60.0, 60.0, 80.0, 20.0, 60.0, 60.0]
Best score so far: 80.0


  2%|▏         | 1/45 [00:03<02:19,  3.17s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 3 / 5  (60.0): 100%|██████████| 5/5 [00:05<00:00,  1.18s/it]


Scores so far: [20.0, 60.0, 60.0, 80.0, 20.0, 60.0, 60.0, 60.0]
Best score so far: 80.0


 11%|█         | 5/45 [00:14<01:58,  2.96s/it]


Bootstrapped 3 full traces after 6 examples in round 0.


Average Metric: 3 / 5  (60.0): 100%|██████████| 5/5 [00:06<00:00,  1.31s/it] 

Scores so far: [20.0, 60.0, 60.0, 80.0, 20.0, 60.0, 60.0, 60.0, 60.0]
Best score so far: 80.0
9 candidate programs found.
[BetterTogether] Killing the LMs used for sampling...
`kill()` is called for the auto-launched model `ft:gpt-4o-mini-2024-07-18:stanford::AOIn7Dm8` -- no action is taken!
[BetterTogether] BetterTogether has finished compiling the student program.





In [22]:
# Print the model ID behind each predictor of the optimized program; in the
# recorded output every predictor points to the same fine-tuned ("ft:") model.
for p in optimized_program.predictors():
  print(p.lm.model)

ft:gpt-4o-mini-2024-07-18:stanford::AOIn7Dm8
ft:gpt-4o-mini-2024-07-18:stanford::AOIn7Dm8
ft:gpt-4o-mini-2024-07-18:stanford::AOIn7Dm8
