In [2]:
import dspy

from diversity_gen import OptDiverseDataGenerator
import pandas
from diversity_metrics import dc_score, negative_cosine_sim, cosine_sim
import random
import json

def metric(gold, pred, trace=None):
    """Collapse the three diversity/similarity measurements into one scalar.

    Args:
        gold: object exposing ``gold_examples`` (the in-context references).
        pred: object exposing ``seen_data``, ``generated_data`` and ``curr_gens``.
        trace: accepted for compatibility with metric callers; not used here.

    Returns:
        ``dc_score(pool) - cosine_sim(gold_examples, curr_gens)
        + negative_cosine_sim(pool)``, where ``pool`` is
        ``pred.seen_data + pred.generated_data``.
    """
    # Diversity of everything produced so far (previously seen + newly generated).
    diversity = dc_score(pred.seen_data + pred.generated_data)
    # Similarity of the current generations to the references; this term is subtracted.
    # (An earlier experiment that overrode this term when it exceeded 0.6 is disabled.)
    reference_similarity = cosine_sim(gold.gold_examples, pred.curr_gens)
    # Second diversity signal over the same pool of data.
    pairwise_spread = negative_cosine_sim(pred.seen_data + pred.generated_data)
    return diversity - reference_similarity + pairwise_spread

def metric_separate(gold, pred):
    """Report each score component individually instead of a single total.

    Args:
        gold: object exposing ``gold_examples``.
        pred: object exposing ``seen_data``, ``generated_data`` and ``curr_gens``.

    Returns:
        A ``dspy.Prediction`` with fields ``diversity_score`` (``dc_score``),
        ``cosine_sim_ref_pred`` (``cosine_sim`` of references vs. current
        generations) and ``diversity_cos_score`` (``negative_cosine_sim``).
    """
    components = {}
    # The evaluation order below mirrors the order in which `metric` calls the helpers.
    components["diversity_score"] = dc_score(pred.seen_data + pred.generated_data)
    components["cosine_sim_ref_pred"] = cosine_sim(gold.gold_examples, pred.curr_gens)
    components["diversity_cos_score"] = negative_cosine_sim(
        pred.seen_data + pred.generated_data
    )
    return dspy.Prediction(**components)

def gepa_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """Feedback metric for GEPA: a normalized score plus a textual explanation.

    The component scores are computed once via ``metric_separate`` and the
    overall score is derived from those same values, using the same formula as
    ``metric`` (``dc - cosine + negative_cosine``) divided by 2. Computing the
    total from the reported components avoids evaluating every underlying
    score twice and guarantees that the numbers quoted in the feedback are the
    ones the returned score was built from.

    Args:
        gold: object exposing ``gold_examples``.
        pred: object exposing ``seen_data``, ``generated_data`` and ``curr_gens``.
        trace: accepted for the GEPA metric signature; not used here.
        pred_name: accepted for the GEPA metric signature; not used here.
        pred_trace: accepted for the GEPA metric signature; not used here.

    Returns:
        ``dspy.Prediction`` with ``score`` (float) and ``feedback`` (str).
    """
    # NOTE(review): GEPA has been observed to reject feedback whose score differs
    # from the module-level score for the same prediction. If the helpers in
    # `diversity_metrics` are not deterministic, two calls of this function can
    # still disagree -- confirm and make them deterministic or cache them.
    metric_score = metric_separate(gold, pred)
    overall_score = (
        metric_score.diversity_score
        - metric_score.cosine_sim_ref_pred
        + metric_score.diversity_cos_score
    ) / 2.0

    feedback_text = f"The overall score is {overall_score:.2f}, which computed as the cosine similarity between the in-context gold examples and generations ({metric_score.cosine_sim_ref_pred: .2f}) subtracted from the sum of two diversity scores (DC Score = {metric_score.diversity_score: .2f}, Negative Cosine Similarity = {metric_score.diversity_cos_score: .2f}). This score is then divided by 2 to normalize. Try to improve the diversity of your response. The generations should be sufficiently similar to the in-context gold examples without being too similar."
    # Above this similarity the generations are treated as near-copies of the references.
    if metric_score.cosine_sim_ref_pred > 0.6:
        feedback_text += " The current cosine similarity between the in-context gold examples and the generations is too high. Aim to be more creative in the generations while adhering to the hard requirements."
    return dspy.Prediction(
        score=overall_score,
        feedback=feedback_text,
    )


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
pupa_tnb_data = pandas.read_csv("PUPA_TNB.csv")
# Seeds Python's `random` module, which drives random.choices() further down.
random.seed(42)
# DataFrame.sample draws from NumPy's RNG, which random.seed() does not affect,
# so the row sample has to be seeded explicitly to be reproducible across runs.
random_sample = pupa_tnb_data.sample(n=15, random_state=42)

# One formatted "query + response" string per sampled row.
all_examples = [
    "User Query: " + user_query + "\nAssistant Response: " + target_response
    for user_query, target_response in zip(
        random_sample["user_query"], random_sample["target_response"]
    )
]

lm = dspy.LM("gpt-4.1-nano", cache=True)
dspy.configure(lm=lm)

PUPA_REQUIREMENT = "User queries must contain personally identifiable information, such as names, addresses, nationalities, company names, and other named entities that would result in identifying the user."

task_gen = OptDiverseDataGenerator()

# Start creating actual data for opt
# NOTE(review): random.choices samples WITH replacement, so a single example can
# contain the same gold example more than once -- confirm this is intended
# (random.sample would give three distinct examples).
dspy_examples = [
    dspy.Example({"gold_examples": random.choices(all_examples, k=3),
                  "hard_requirement": PUPA_REQUIREMENT}).with_inputs("gold_examples", "hard_requirement")
    for _ in range(250)
]

# 200 / 50 train-dev split; tiny_dev is the last 10 dev examples.
train_set = dspy_examples[:200]
dev_set = dspy_examples[200:]
tiny_dev = dspy_examples[240:]


In [4]:

# Evaluator that scores a program on the held-out dev split with `metric`.
# NOTE: the name `eval` shadows Python's built-in eval() for the rest of the
# notebook; it is kept because the following cell calls it by this name.
eval = dspy.Evaluate(
    devset=dev_set,
    metric=metric,
    return_all_scores=True,
)


In [None]:
# Baseline: run the dspy.Evaluate instance (bound to the name `eval`, not the
# builtin) on the generator before any optimizer has been applied to it.
scores = eval(task_gen)

In [6]:
# Display the baseline evaluation result as the cell output.
scores

EvaluationResult(score=193.48, results=<list of 50 results>)

In [27]:
from dspy import GEPA

# GEPA optimizer driven by the feedback-producing metric; the budget is capped
# at two full evaluations and a separate LM is used for reflection.
gepa = GEPA(
    metric=gepa_metric,
    reflection_lm=dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000),
    max_full_evals=2,
    track_stats=True,
    track_best_outputs=True,
)

# Optimize on small slices of the train/dev splits to keep the run short.
new_prog = gepa.compile(
    task_gen,
    trainset=train_set[:30],
    valset=dev_set[:15],
)
# NOTE(review): despite the name, this stores `val_aggregate_scores` from the
# detailed results -- presumably one aggregate validation score per candidate
# rather than a Pareto frontier; confirm against the DSPy GEPA documentation.
pareto_frontier = new_prog.detailed_results.val_aggregate_scores

2025/11/05 10:56:21 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 90 metric calls of the program. This amounts to 2.00 full evals on the train+val set.
2025/11/05 10:56:21 INFO dspy.teleprompt.gepa.gepa: Using 15 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
2025/11/05 10:57:03 INFO dspy.evaluate.evaluate: Average Metric: 10.014402839598068 / 15 (66.8%)
2025/11/05 10:57:03 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.6676268559732045
2025/11/05 10:57:03 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.6676268559732045


Average Metric: 2.08 / 3 (69.4%): 100%|██████████| 3/3 [00:13<00:00,  4.39s/it]

2025/11/05 10:57:16 INFO dspy.evaluate.evaluate: Average Metric: 2.082797780258419 / 3 (69.4%)
2025/11/05 10:57:16 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Exception during reflection/proposal: No valid predictions found for any module.
2025/11/05 10:57:16 INFO dspy.teleprompt.gepa.gepa: Traceback (most recent call last):
  File "/home/sylvia/anaconda3/envs/papillon/lib/python3.10/site-packages/gepa/proposer/reflective_mutation/reflective_mutation.py", line 115, in propose
    reflective_dataset = self.adapter.make_reflective_dataset(
  File "/home/sylvia/anaconda3/envs/papillon/lib/python3.10/site-packages/dspy/teleprompt/gepa/gepa_utils.py", line 227, in make_reflective_dataset
    raise Exception("No valid predictions found for any module.")
Exception: No valid predictions found for any module.

2025/11/05 10:57:16 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Reflective mutation did not propose a new candidate
2025/11/05 10:57:16 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Sele


Average Metric: 1.75 / 3 (58.5%): 100%|██████████| 3/3 [00:22<00:00,  7.41s/it]

2025/11/05 10:57:38 INFO dspy.evaluate.evaluate: Average Metric: 1.7545798465603868 / 3 (58.5%)
2025/11/05 10:57:39 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Exception during reflection/proposal: Currently, GEPA only supports feedback functions that return the same score as the module's score. However, the module-level score is 0.5756039513569129 and the feedback score is 0.5971685308369173.
2025/11/05 10:57:39 INFO dspy.teleprompt.gepa.gepa: Traceback (most recent call last):
  File "/home/sylvia/anaconda3/envs/papillon/lib/python3.10/site-packages/gepa/proposer/reflective_mutation/reflective_mutation.py", line 115, in propose
    reflective_dataset = self.adapter.make_reflective_dataset(
  File "/home/sylvia/anaconda3/envs/papillon/lib/python3.10/site-packages/dspy/teleprompt/gepa/gepa_utils.py", line 217, in make_reflective_dataset
    assert fb["score"] == module_score, f"Currently, GEPA only supports feedback functions that return the same score as the module's score. However, 


Average Metric: 1.59 / 3 (53.1%): 100%|██████████| 3/3 [00:07<00:00,  2.65s/it]

2025/11/05 10:57:46 INFO dspy.evaluate.evaluate: Average Metric: 1.5931040531376106 / 3 (53.1%)
2025/11/05 10:57:46 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Exception during reflection/proposal: No valid predictions found for any module.
2025/11/05 10:57:46 INFO dspy.teleprompt.gepa.gepa: Traceback (most recent call last):
  File "/home/sylvia/anaconda3/envs/papillon/lib/python3.10/site-packages/gepa/proposer/reflective_mutation/reflective_mutation.py", line 115, in propose
    reflective_dataset = self.adapter.make_reflective_dataset(
  File "/home/sylvia/anaconda3/envs/papillon/lib/python3.10/site-packages/dspy/teleprompt/gepa/gepa_utils.py", line 227, in make_reflective_dataset
    raise Exception("No valid predictions found for any module.")
Exception: No valid predictions found for any module.

2025/11/05 10:57:46 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Reflective mutation did not propose a new candidate
2025/11/05 10:57:46 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Sel


Average Metric: 0.62 / 1 (62.2%):  33%|███▎      | 1/3 [00:06<00:13,  6.53s/it]



KeyboardInterrupt: 

In [None]:
# Optimize the generator with SIMBA on the full training split, using the
# scalar `metric` (no textual feedback) and at most three optimization steps.
optimizer = dspy.SIMBA(metric=metric, max_steps=3)
optimized_program = optimizer.compile(task_gen, trainset=train_set)

# Save the optimized program for future use.
# (Plain string literal: the original f-string had no placeholders.)
optimized_program.save("optimized.json")

NameError: name 'dspy' is not defined

In [None]:
# Inspect the `generated_data` attribute of the SIMBA-optimized program
# (requires the SIMBA cell above to have completed successfully).
optimized_program.generated_data