In [5]:
from mpromptune import qEI
import os
import dspy
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
from dspy.teleprompt import MIPROv2
import random
import optuna
import numpy as np

In [6]:
# Resolve the OpenAI credential from the environment instead of hardcoding it.
# `OPEN_AI_KEY` is the name this notebook has always read and is still tried
# first; `OPENAI_API_KEY` (the conventional name for OpenAI credentials) is
# accepted as a fallback so the notebook also runs in a standard environment.
openai_api_key = os.environ.get('OPEN_AI_KEY') or os.environ.get('OPENAI_API_KEY')
if not openai_api_key:
    # Fail early with an actionable message rather than a bare
    # KeyError: 'OPEN_AI_KEY' (or an authentication error on the first call).
    raise KeyError(
        "No OpenAI API key found: set the OPEN_AI_KEY (or OPENAI_API_KEY) "
        "environment variable before running this notebook."
    )

# Language-model client handed to DSPy via `dspy.configure`.
lm = dspy.LM('openai/gpt-4o-mini', api_key=openai_api_key)
dspy.configure(lm=lm)

# MIPROv2 optimizer scored with the GSM8K metric. With auto="light" the
# recorded run output reports: num_trials=10, minibatch=True,
# num_fewshot_candidates=6, num_instruct_candidates=3, valset size=100.
teleprompter = MIPROv2(
    metric=gsm8k_metric,
    auto="light",
    num_threads=2
)

In [7]:
# Extra keyword arguments that are unpacked into `teleprompter.compile(...)`.
# NOTE(review): these look like settings for the qEI sampler imported from
# `mpromptune` (search-space cap, batch count/size, cold-start trials) —
# confirm the exact meaning of each key against that package.
sampler_config = dict(
    sampler='qei',
    max_space_size=100,
    n_batches=200,
    batch_size=4,
    min_cold_start=4,
)


In [8]:
# Load the GSM8K dataset; only its `train` split is used below.
gsm8k = GSM8K()

# Program to optimise: a chain-of-thought module with the
# "question -> answer" signature.
student_program = dspy.ChainOfThought("question -> answer")

# Run the optimizer. `n_jobs` and the entries of `sampler_config` are passed
# straight through as keyword arguments to `compile`.
optimized_program = teleprompter.compile(
    student_program,
    trainset=gsm8k.train,
    n_jobs=2,
    **sampler_config,
)

Reusing dataset gsm8k (/home/sagemaker-user/.cache/huggingface/datasets/gsm8k/main/1.1.0/37bfb08b1d4fcbb01f06b03d9e1ef5f1fcbd4d3af3d08842c50d7305091285ba)


  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 7473/7473 [00:00<00:00, 19970.36it/s]
100%|██████████| 1319/1319 [00:00<00:00, 20617.55it/s]
2025/10/27 15:00:21 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 100

2025/10/27 15:00:21 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/10/27 15:00:21 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/10/27 15:00:21 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


 10%|█         | 4/40 [00:00<00:06,  5.60it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/6


  5%|▌         | 2/40 [00:00<00:04,  9.22it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 5/6


  8%|▊         | 3/40 [00:00<00:03,  9.42it/s]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 6/6


 10%|█         | 4/40 [00:00<00:04,  8.75it/s]
2025/10/27 15:00:23 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/10/27 15:00:23 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2025/10/27 15:00:23 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/10/27 15:00:24 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/10/27 15:00:24 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `question`, produce the fields `answer`.

2025/10/27 15:00:24 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Imagine you are a brilliant mathematician being tested in a high-stakes competition. You will be presented with a variety of mathematical word problems that require multi-step reasoning and strategic thinking. For each problem, analyze the provided `question` carefully and break down your reasoning step by step. After deriving a detailed explanation of how you arrived at your conclusion, provide the final `answer`. Your ability to think critically and articulate your process is crucial to your success in this competition.

2025/10/27 15:00:24 INFO dspy.teleprompt.mipro_optimizer_v2: 2: Imagine you are a detectiv

Average Metric: 91.00 / 100 (91.0%): 100%|██████████| 100/100 [00:00<00:00, 154.84it/s]

2025/10/27 15:00:25 INFO dspy.evaluate.evaluate: Average Metric: 91 / 100 (91.0%)
2025/10/27 15:00:25 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 91.0






2025/10/27 15:00:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 13 - Minibatch ==
2025/10/27 15:00:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 13 - Minibatch ==



  0%|          | 0/35 [00:00<?, ?it/s][A
Average Metric: 1.00 / 1 (100.0%):   0%|          | 0/35 [00:00<?, ?it/s][A
Average Metric: 2.00 / 2 (100.0%):   3%|▎         | 1/35 [00:00<00:00, 83.69it/s][A
Average Metric: 1.00 / 1 (100.0%):   0%|          | 0/35 [00:00<?, ?it/s].28it/s][A
Average Metric: 7.00 / 7 (100.0%):  17%|█▋        | 6/35 [00:00<00:00, 48.44it/s][A
Average Metric: 5.00 / 5 (100.0%):  11%|█▏        | 4/35 [00:00<00:00, 32.20it/s][A
Average Metric: 33.00 / 35 (94.3%): 100%|██████████| 35/35 [00:00<00:00, 108.59it/s]

2025/10/27 15:00:26 INFO dspy.evaluate.evaluate: Average Metric: 33 / 35 (94.3%)





2025/10/27 15:00:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 94.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/10/27 15:00:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29]
2025/10/27 15:00:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0]
2025/10/27 15:00:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0


2025/10/27 15:00:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 13 - Minibatch ==


Average Metric: 31.00 / 35 (88.6%): 100%|██████████| 35/35 [00:00<00:00, 125.95it/s]

2025/10/27 15:00:27 INFO dspy.evaluate.evaluate: Average Metric: 31 / 35 (88.6%)
2025/10/27 15:00:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/10/27 15:00:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29, 88.57]
2025/10/27 15:00:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0]
2025/10/27 15:00:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0







2025/10/27 15:00:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 13 - Minibatch ==


Average Metric: 16.00 / 16 (100.0%):  43%|████▎     | 15/35 [00:07<00:11,  1.78it/s]
Average Metric: 6.00 / 6 (100.0%):  14%|█▍        | 5/35 [00:07<00:00, 39.31it/s][A
Average Metric: 7.00 / 7 (100.0%):  17%|█▋        | 6/35 [00:07<00:00, 39.31it/s][A
Average Metric: 8.00 / 8 (100.0%):  20%|██        | 7/35 [00:07<00:00, 39.31it/s][A
Average Metric: 9.00 / 9 (100.0%):  23%|██▎       | 8/35 [00:07<00:00, 39.31it/s][A
Average Metric: 9.00 / 9 (100.0%):  26%|██▌       | 9/35 [00:07<00:27,  1.06s/it][A
Average Metric: 9.00 / 10 (90.0%):  26%|██▌       | 9/35 [00:07<00:27,  1.06s/it][A
Average Metric: 10.00 / 11 (90.9%):  29%|██▊       | 10/35 [00:07<00:26,  1.06s/it][A
Average Metric: 11.00 / 12 (91.7%):  31%|███▏      | 11/35 [00:07<00:25,  1.06s/it][A
Average Metric: 12.00 / 13 (92.3%):  34%|███▍      | 12/35 [00:08<00:24,  1.06s/it][A
Average Metric: 13.00 / 14 (92.9%):  37%|███▋      | 13/35 [00:08<00:23,  1.06s/it][A
Average Metric: 14.00 / 15 (93.3%):  40%|████      | 14/3

2025/10/27 15:00:45 INFO dspy.evaluate.evaluate: Average Metric: 32 / 35 (91.4%)
2025/10/27 15:00:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4'].
2025/10/27 15:00:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29, 88.57, 91.43]
2025/10/27 15:00:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0]
2025/10/27 15:00:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0





Bayesian Issues: 'X_RapidAPI_Key'
Bayesian Issues: 'X_RapidAPI_Key'
Bayesian Issues: 'X_RapidAPI_Key'


2025/10/27 15:00:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 13 - Minibatch ==


Bayesian Issues: 'X_RapidAPI_Key'
Average Metric: 31.00 / 35 (88.6%): 100%|██████████| 35/35 [00:00<00:00, 175.80it/s]

2025/10/27 15:00:46 INFO dspy.evaluate.evaluate: Average Metric: 31 / 35 (88.6%)





2025/10/27 15:00:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2025/10/27 15:00:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29, 88.57, 91.43, 88.57]
2025/10/27 15:00:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0]
2025/10/27 15:00:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0


2025/10/27 15:00:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 13 - Full Evaluation =====
2025/10/27 15:00:46 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 94.29) from minibatch trials...


Average Metric: 90.00 / 100 (90.0%): 100%|██████████| 100/100 [00:00<00:00, 164.40it/s]

2025/10/27 15:00:47 INFO dspy.evaluate.evaluate: Average Metric: 90 / 100 (90.0%)
2025/10/27 15:00:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0, 90.0]
2025/10/27 15:00:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0
2025/10/27 15:00:47 INFO dspy.teleprompt.mipro_optimizer_v2: 






2025/10/27 15:00:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 13 - Minibatch ==


Average Metric: 32.00 / 35 (91.4%): 100%|██████████| 35/35 [00:38<00:00,  1.09s/it] 

2025/10/27 15:01:05 INFO dspy.evaluate.evaluate: Average Metric: 32 / 35 (91.4%)
2025/10/27 15:01:05 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2025/10/27 15:01:05 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29, 88.57, 91.43, 88.57, 91.43]
2025/10/27 15:01:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0, 90.0]
2025/10/27 15:01:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0


2025/10/27 15:01:05 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 13 - Minibatch ==



Average Metric: 32.00 / 35 (91.4%): 100%|██████████| 35/35 [00:40<00:00,  1.16s/it]

2025/10/27 15:01:27 INFO dspy.evaluate.evaluate: Average Metric: 32 / 35 (91.4%)
2025/10/27 15:01:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 2'].
2025/10/27 15:01:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29, 88.57, 91.43, 88.57, 91.43, 91.43]
2025/10/27 15:01:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0, 90.0]
2025/10/27 15:01:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0


2025/10/27 15:01:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 13 - Minibatch ==



Average Metric: 31.00 / 35 (88.6%): 100%|██████████| 35/35 [00:42<00:00,  1.21s/it]

2025/10/27 15:01:47 INFO dspy.evaluate.evaluate: Average Metric: 31 / 35 (88.6%)
2025/10/27 15:01:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 2'].
2025/10/27 15:01:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29, 88.57, 91.43, 88.57, 91.43, 91.43, 88.57]
2025/10/27 15:01:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0, 90.0]
2025/10/27 15:01:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0





Bayesian Issues: 'X_RapidAPI_Key'
Bayesian Issues: 'X_RapidAPI_Key'
Average Metric: 24.00 / 26 (92.3%):  74%|███████▍  | 26/35 [00:20<00:06,  1.33it/s]Bayesian Issues: 'X_RapidAPI_Key'
Average Metric: 28.00 / 32 (87.5%):  89%|████████▊ | 31/35 [00:20<00:02,  1.43it/s]

2025/10/27 15:01:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 13 - Minibatch ==


Bayesian Issues: 'X_RapidAPI_Key'
Average Metric: 33.00 / 35 (94.3%): 100%|██████████| 35/35 [00:00<00:00, 151.41it/s]

2025/10/27 15:01:49 INFO dspy.evaluate.evaluate: Average Metric: 33 / 35 (94.3%)





2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 94.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 3'].
2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29, 88.57, 91.43, 88.57, 91.43, 91.43, 88.57, 94.29]
2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0, 90.0]
2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0


2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 13 - Minibatch ==


Average Metric: 30.00 / 35 (85.7%): 100%|██████████| 35/35 [00:00<00:00, 136.25it/s]

2025/10/27 15:01:49 INFO dspy.evaluate.evaluate: Average Metric: 30 / 35 (85.7%)
2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 3'].
2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29, 88.57, 91.43, 88.57, 91.43, 91.43, 88.57, 94.29, 85.71]
2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0, 90.0]
2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0


2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 13 - Full Evaluation =====
2025/10/27 15:01:49 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 91.43) from minibatch trials...



Average Metric: 31.00 / 35 (88.6%): 100%|██████████| 35/35 [00:26<00:00,  1.32it/s]

2025/10/27 15:01:54 INFO dspy.evaluate.evaluate: Average Metric: 31 / 35 (88.6%)
2025/10/27 15:01:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2025/10/27 15:01:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [94.29, 88.57, 91.43, 88.57, 91.43, 91.43, 88.57, 94.29, 85.71, 88.57]
2025/10/27 15:01:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0, 90.0]
2025/10/27 15:01:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0





Average Metric: 91.00 / 100 (91.0%): 100%|██████████| 100/100 [01:04<00:00,  1.55it/s]

2025/10/27 15:02:54 INFO dspy.evaluate.evaluate: Average Metric: 91 / 100 (91.0%)
2025/10/27 15:02:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [91.0, 90.0, 91.0]
2025/10/27 15:02:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 91.0
2025/10/27 15:02:54 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/10/27 15:02:54 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 91.0!



