In [1]:
import dspy

from diversity_gen import OptDiverseDataGenerator
import pandas
from diversity_metrics import dc_score, negative_cosine_sim, cosine_sim
import random
import json

from dotenv import load_dotenv
load_dotenv(".env")


def metric(gold, pred, trace=None):
    """Collapse the diversity and similarity measurements into one scalar.

    The result is ``dc_score - similarity_penalty + negative_cosine_sim``,
    where both diversity terms are computed over
    ``pred.seen_data + pred.generated_data``. The similarity penalty is the
    cosine similarity between ``gold.gold_examples`` and ``pred.curr_gens``
    while that value lies in [0.4, 0.6]; outside that band it is replaced by
    a flat penalty of 1.

    ``trace`` is accepted but is not used by this function.
    """
    diversity = dc_score(pred.seen_data + pred.generated_data)
    raw_similarity = cosine_sim(gold.gold_examples, pred.curr_gens)
    # "Too similar" and "too dissimilar" are charged the same flat penalty.
    out_of_band = raw_similarity > 0.6 or raw_similarity < 0.4
    similarity_penalty = 1 if out_of_band else raw_similarity
    spread = negative_cosine_sim(pred.seen_data + pred.generated_data)
    return diversity - similarity_penalty + spread

def metric_separate(gold, pred):
    """Return the three component scores individually, with no thresholding.

    The returned ``dspy.Prediction`` carries:
      * ``diversity_score``      - ``dc_score`` over seen + generated data
      * ``cosine_sim_ref_pred``  - ``cosine_sim`` between ``gold.gold_examples``
                                   and ``pred.curr_gens``
      * ``diversity_cos_score``  - ``negative_cosine_sim`` over seen + generated data
    """
    components = {}
    components["diversity_score"] = dc_score(pred.seen_data + pred.generated_data)
    components["cosine_sim_ref_pred"] = cosine_sim(gold.gold_examples, pred.curr_gens)
    components["diversity_cos_score"] = negative_cosine_sim(pred.seen_data + pred.generated_data)
    return dspy.Prediction(**components)

def gepa_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """Feedback metric for GEPA: an overall score plus textual feedback.

    Args:
        gold: example providing ``gold_examples``.
        pred: prediction providing the fields read by ``metric`` and
            ``metric_separate``.
        trace: forwarded unchanged to ``metric``.
        pred_name: accepted but not used by this function.
        pred_trace: accepted but not used by this function.

    Returns:
        ``dspy.Prediction`` with ``score`` (the value returned by ``metric``)
        and ``feedback`` (a description of the component scores, extended with
        a hint when the gold/generation cosine similarity is above 0.6 or
        below 0.4).
    """
    metric_score = metric_separate(gold, pred)
    overall_score = metric(gold, pred, trace)
    # Read once; the component scores are only reported, never modified here.
    similarity = metric_score.cosine_sim_ref_pred

    # NOTE(review): the text quotes the raw cosine similarity, while `metric`
    # subtracts a flat 1 when that value is outside [0.4, 0.6], so the numbers
    # in the message do not always add up to the overall score - confirm
    # whether that is intended.
    feedback_text = f"The overall score is {overall_score:.2f}, which computed as the cosine similarity between the in-context gold examples and generations ({metric_score.cosine_sim_ref_pred: .2f}) subtracted from the sum of two diversity scores (DC Score = {metric_score.diversity_score: .2f}, Negative Cosine Similarity = {metric_score.diversity_cos_score: .2f}). Try to improve the diversity of your response. The generations should be sufficiently similar to the in-context gold examples without being too similar."
    if similarity > 0.6:
        feedback_text += " The current cosine similarity between the in-context gold examples and the generations is too high. Aim to be more creative in the generations while adhering to the hard requirements."
    elif similarity < 0.4:
        feedback_text += " The current cosine similarity between the in-context gold examples and the generations is too low. Adhere to the hard requirements and still have generations to be sufficiently similar to the gold examples."
    return dspy.Prediction(
        score=overall_score,
        feedback=feedback_text,
    )


  from tqdm.autonotebook import tqdm, trange


[2025-11-10 11:42:01,353] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/siyanli/miniconda3/envs/omni/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


In [2]:
# Load the PUPA_TNB table and draw 20 rows to serve as the pool of seed examples.
pupa_tnb_data = pandas.read_csv("PUPA_TNB.csv")
# Seeds the stdlib `random` module, which drives random.choices() further down.
random.seed(42)
# DataFrame.sample() does not read the stdlib `random` state, so it is seeded
# explicitly; otherwise a different set of rows is drawn on every run.
random_sample = pupa_tnb_data.sample(n=20, random_state=42)

all_examples = []

for i, row in random_sample.iterrows():
    # Both columns are concatenated into the example text, so a missing value
    # (NaN, a float) in either one would raise on string concatenation.
    if pandas.isna(row["user_query"]) or pandas.isna(row["target_response"]):
        continue
    curr_example = "User Query: " + row["user_query"] + "\nAssistant Response: " + row["target_response"]
    all_examples.append(curr_example)

# Default LM used by the program being optimized; responses are cached.
lm = dspy.LM("gpt-4.1-nano", cache=True)
dspy.configure(lm=lm)

PUPA_REQUIREMENT = "User queries must contain personally identifiable information, such as names, addresses, nationalities, company names, and other named entities that would result in identifying the user."

task_gen = OptDiverseDataGenerator()

# Start creating actual data for opt
dspy_examples = []

for _ in range(250):
    # NOTE(review): random.choices() samples WITH replacement, so the three
    # gold examples of one item may contain duplicates - confirm this is
    # intended (random.sample() would give three distinct examples).
    dspy_examples.append(dspy.Example({"gold_examples": random.choices(all_examples, k=3),
                                        "hard_requirement": PUPA_REQUIREMENT}).with_inputs("gold_examples", "hard_requirement"))

# 200 / 50 split; tiny_dev is the last 10 items and therefore overlaps dev_set.
train_set = dspy_examples[:200]
dev_set = dspy_examples[200:]
tiny_dev = dspy_examples[240:]


In [3]:

# Evaluator that scores a program on the full dev split with the scalar `metric`.
# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest of
# the notebook; it is kept as-is because other cells may refer to it.
eval = dspy.Evaluate(
    devset=dev_set,
    metric=metric,
    return_all_scores=True,
)


In [4]:
from dspy import GEPA

# GEPA optimizer driven by the feedback metric; a separate, larger LM is used
# for the reflection step, and the budget is capped at two full evaluations.
gepa = GEPA(
    metric=gepa_metric,
    reflection_lm=dspy.LM(model='gpt-4.1', temperature=1.0, max_tokens=32000),
    max_full_evals=2,
    track_stats=True,
    track_best_outputs=True,
)

# Only the first 30 training and first 15 dev examples are passed to the optimizer.
new_prog = gepa.compile(task_gen, trainset=train_set[:30], valset=dev_set[:15])

# NOTE(review): despite the variable name, `val_aggregate_scores` presumably
# holds one aggregate validation score per candidate program rather than a
# Pareto frontier - confirm against the GEPA result documentation.
pareto_frontier = new_prog.detailed_results.val_aggregate_scores

2025/11/10 11:42:12 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 90 metric calls of the program. This amounts to 2.00 full evals on the train+val set.
2025/11/10 11:42:12 INFO dspy.teleprompt.gepa.gepa: Using 15 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
GEPA Optimization:   0%|          | 0/90 [00:00<?, ?rollouts/s]2025/11/10 11:42:45 INFO dspy.evaluate.evaluate: Average Metric: 11.90856537662391 / 15 (79.4%)
2025/11/10 11:42:45 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.793904358441594
GEPA Optimization:  17%|█▋        | 15/90 [00:33<02:46,  2.22s/rollouts]2025/11/10 11:42:45 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.793904358

Average Metric: 1.80 / 3 (59.9%): 100%|██████████| 3/3 [00:12<00:00,  4.16s/it]

2025/11/10 11:42:58 INFO dspy.evaluate.evaluate: Average Metric: 1.7970584965710128 / 3 (59.9%)
2025/11/10 11:42:58 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Exception during reflection/proposal: No valid predictions found for any module.
2025/11/10 11:42:58 INFO dspy.teleprompt.gepa.gepa: Traceback (most recent call last):
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/gepa/proposer/reflective_mutation/reflective_mutation.py", line 119, in propose
    reflective_dataset = self.adapter.make_reflective_dataset(curr_prog, eval_curr, predictor_names_to_update)
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/dspy/teleprompt/gepa/gepa_utils.py", line 288, in make_reflective_dataset
    raise Exception("No valid predictions found for any module.")
Exception: No valid predictions found for any module.

2025/11/10 11:42:58 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Reflective mutation did not propose a new candidate
GEPA Optimization:  20%|


Average Metric: 1.63 / 3 (54.4%): 100%|██████████| 3/3 [00:12<00:00,  4.21s/it]

2025/11/10 11:43:10 INFO dspy.evaluate.evaluate: Average Metric: 1.6327892009169611 / 3 (54.4%)





2025/11/10 11:43:38 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for proposer.predict: You are given a task to generate new user query examples (instances) that align with a specific requirement on personal information. You will be provided with:

- A list of EXAMPLES: These are existing user queries and their corresponding assistant responses. You MUST NOT duplicate these examples in your generations.
- A DATA_SUMMARY: A summary or None of the existing data so far.
- A REQUIREMENT: A textual requirement that must strictly be adhered to for generating new user query examples.

Your goal is to:

1. Read and understand the EXAMPLES and REQUIREMENT carefully.
   - The typical REQUIREMENT for this task is: "User queries must contain personally identifiable information, such as names, addresses, nationalities, company names, and other named entities that would result in identifying the user."
   - This means your generated user queries should include PII such as real or pl

Average Metric: 2.32 / 3 (77.2%): 100%|██████████| 3/3 [00:07<00:00,  2.66s/it]

2025/11/10 11:45:04 INFO dspy.evaluate.evaluate: Average Metric: 2.3151680591024526 / 3 (77.2%)
2025/11/10 11:45:04 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Exception during reflection/proposal: No valid predictions found for any module.
2025/11/10 11:45:04 INFO dspy.teleprompt.gepa.gepa: Traceback (most recent call last):
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/gepa/proposer/reflective_mutation/reflective_mutation.py", line 119, in propose
    reflective_dataset = self.adapter.make_reflective_dataset(curr_prog, eval_curr, predictor_names_to_update)
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/dspy/teleprompt/gepa/gepa_utils.py", line 288, in make_reflective_dataset
    raise Exception("No valid predictions found for any module.")
Exception: No valid predictions found for any module.

2025/11/10 11:45:04 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Reflective mutation did not propose a new candidate
GEPA Optimization:  47%|


Average Metric: 2.29 / 3 (76.5%): 100%|██████████| 3/3 [00:14<00:00,  4.93s/it]

2025/11/10 11:45:18 INFO dspy.evaluate.evaluate: Average Metric: 2.2941250919528553 / 3 (76.5%)





2025/11/10 11:45:35 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for proposer.predict: You are given: 
1. A set of example User Query/Assistant Response pairs, where User Queries always include personally identifiable information (PII), such as names, addresses, phone numbers, nationalities, company names, or other details that would clearly identify the user or referenced individuals or entities.
2. A data summary of the existing examples (occasionally this may be 'None').
3. A specific requirement (often: "User queries must contain personally identifiable information, such as names, addresses, nationalities, company names, and other named entities that would result in identifying the user.").

Your task is to generate a WIDE VARIETY of new User Query instances that FULFILL THE SAME REQUIREMENT (i.e., each must contain PII) and ARE SUFFICIENTLY SIMILAR TO THE PROVIDED EXAMPLES in thematic content, format, and style—yet DO NOT DUPLICATE any existing examples or scenar

Average Metric: 1.93 / 3 (64.3%): 100%|██████████| 3/3 [00:13<00:00,  4.45s/it]

2025/11/10 11:46:07 INFO dspy.evaluate.evaluate: Average Metric: 1.9301170285451201 / 3 (64.3%)
2025/11/10 11:46:07 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Exception during reflection/proposal: No valid predictions found for any module.
2025/11/10 11:46:07 INFO dspy.teleprompt.gepa.gepa: Traceback (most recent call last):
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/gepa/proposer/reflective_mutation/reflective_mutation.py", line 119, in propose
    reflective_dataset = self.adapter.make_reflective_dataset(curr_prog, eval_curr, predictor_names_to_update)
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/dspy/teleprompt/gepa/gepa_utils.py", line 288, in make_reflective_dataset
    raise Exception("No valid predictions found for any module.")
Exception: No valid predictions found for any module.

2025/11/10 11:46:07 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Reflective mutation did not propose a new candidate
GEPA Optimization:  57%|


Average Metric: 2.34 / 3 (78.0%): 100%|██████████| 3/3 [00:11<00:00,  3.70s/it]

2025/11/10 11:46:18 INFO dspy.evaluate.evaluate: Average Metric: 2.3411449511900493 / 3 (78.0%)





2025/11/10 11:46:37 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for proposer.predict: You are tasked with generating new instances based on a given set of examples. Each instance is composed of a user query and an assistant response. Your role is to produce varied and high-quality new instances that closely reflect the style, domain, and complexity of the provided examples, while also adhering to the following requirements:

Input Format:
- You will be given:
  1. A collection of existing example pairs (each containing a 'User Query' and corresponding 'Assistant Response').
  2. (Sometimes) A data summary, outlining pre-existing data points you should avoid duplicating.
  3. A requirement detailing specific constraints your new instances must satisfy (for example, user queries must contain personally identifiable information [PII], or must not contain PII, etc.).

Task Description:
- Analyze the provided examples for their domain, intent, structure, and complexity.
-

Average Metric: 2.18 / 3 (72.6%): 100%|██████████| 3/3 [00:12<00:00,  4.31s/it]

2025/11/10 11:47:00 INFO dspy.evaluate.evaluate: Average Metric: 2.1787418434572983 / 3 (72.6%)
2025/11/10 11:47:00 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Exception during reflection/proposal: No valid predictions found for any module.
2025/11/10 11:47:00 INFO dspy.teleprompt.gepa.gepa: Traceback (most recent call last):
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/gepa/proposer/reflective_mutation/reflective_mutation.py", line 119, in propose
    reflective_dataset = self.adapter.make_reflective_dataset(curr_prog, eval_curr, predictor_names_to_update)
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/dspy/teleprompt/gepa/gepa_utils.py", line 288, in make_reflective_dataset
    raise Exception("No valid predictions found for any module.")
Exception: No valid predictions found for any module.

2025/11/10 11:47:00 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Reflective mutation did not propose a new candidate
GEPA Optimization:  67%|


Average Metric: 2.17 / 3 (72.2%): 100%|██████████| 3/3 [00:15<00:00,  5.05s/it]

2025/11/10 11:47:15 INFO dspy.evaluate.evaluate: Average Metric: 2.167184635780397 / 3 (72.2%)





2025/11/10 11:47:33 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for proposer.predict: You are given a task to generate new user queries and corresponding assistant responses based on a set of provided example pairs. Each example pair includes a "User Query" and an "Assistant Response". 

Your goal is to create a wide variety of new, original instances that are sufficiently similar in structure, formality, and domain to the provided examples, while strictly adhering to a set of hard requirements:

**Input Format:**
- The input to the task consists of:
  1. **examples**: A JSON-encoded list of existing (User Query, Assistant Response) pairs.
  2. **data_summary**: (May be None or a textual summary of previous data.)
  3. **requirement**: A textual specification of what must (or must not) be present in the User Query (e.g., "User queries must contain personally identifiable information, such as names, addresses, nationalities, company names, and other named entities tha

Average Metric: 2.08 / 3 (69.2%): 100%|██████████| 3/3 [00:21<00:00,  7.14s/it]

2025/11/10 11:48:50 INFO dspy.evaluate.evaluate: Average Metric: 2.0751110720935273 / 3 (69.2%)
2025/11/10 11:48:50 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Exception during reflection/proposal: No valid predictions found for any module.
2025/11/10 11:48:50 INFO dspy.teleprompt.gepa.gepa: Traceback (most recent call last):
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/gepa/proposer/reflective_mutation/reflective_mutation.py", line 119, in propose
    reflective_dataset = self.adapter.make_reflective_dataset(curr_prog, eval_curr, predictor_names_to_update)
  File "/home/siyanli/miniconda3/envs/omni/lib/python3.10/site-packages/dspy/teleprompt/gepa/gepa_utils.py", line 288, in make_reflective_dataset
    raise Exception("No valid predictions found for any module.")
Exception: No valid predictions found for any module.

2025/11/10 11:48:50 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Reflective mutation did not propose a new candidate
GEPA Optimization:  77%|


Average Metric: 2.05 / 3 (68.2%): 100%|██████████| 3/3 [00:15<00:00,  5.31s/it]

2025/11/10 11:49:06 INFO dspy.evaluate.evaluate: Average Metric: 2.0472063027997365 / 3 (68.2%)





2025/11/10 11:49:40 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for proposer.predict: You are given a task to generate new user query and assistant response pairs (called "instances") using a set of in-context examples and requirements. The goal is to create additional instances that match the thematic, stylistic, and content-related patterns found in the examples while satisfying explicit constraints described in the requirements.

Input Format:
- You receive inputs containing:
  - **examples**: A list of existing user query and assistant response pairs. Each user query involves a user-specific request that includes identifiable information such as names, company names, addresses, nationalities, or other named entities. The assistant response addresses the user query in a polite, formal, and informative manner (typically as professional correspondence, information requests, technical or process explanations, translations, or interactive tests).
  - **data_summary**

In [20]:
# optimizer = dspy.SIMBA(metric=metric, max_steps=3)
# optimized_program = optimizer.compile(task_gen, trainset=train_set)

# # Save optimize program for future use
# optimized_program.save(f"optimized.json")

In [7]:
# Inspect the data stored on the GEPA-compiled program; the recorded output of
# this cell is an empty list.
new_prog.generated_data

[]

In [16]:
import json

# Persist the generated data as JSON. The file is opened in a `with` block so
# the handle is flushed and closed even if serialization fails.
# NOTE(review): `optimized_program` is only assigned in the commented-out SIMBA
# cell, so this cell raises NameError on a fresh kernel; the GEPA cell names its
# result `new_prog` - confirm which program is meant to be saved.
with open("gen_data.json", "w", encoding="utf-8") as out_file:
    json.dump(optimized_program.generated_data, out_file)