In [1]:
# %%
import dspy

import os
# Uncomment to restrict which GPU is visible to this process.
# os.environ["CUDA_VISIBLE_DEVICES"] = "6"
import torch

# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

from diversity_gen import OptDiverseDataGenerator, set_singleton
import pandas
from diversity_metrics import dc_score, negative_cosine_sim, cosine_sim, style_cosine_sim
import random
import json

from dotenv import load_dotenv
# Load environment variables from the local .env file (presumably the LM API
# credentials — TODO confirm).
load_dotenv(".env")


def metric(gold, pred, trace=None):
    """Collapse the diversity and similarity measurements into one scalar.

    The result is ``dc_score - similarity_penalty + negative_cosine_sim +
    style_cosine_sim``. The two diversity terms are measured over
    ``pred.seen_data + pred.generated_data``; the two similarity terms compare
    ``gold.gold_examples`` with ``pred.curr_gens``.

    Args:
        gold: Example exposing ``gold_examples``.
        pred: Prediction exposing ``seen_data``, ``generated_data`` and
            ``curr_gens``.
        trace: Accepted so the function fits the optimizer metric signature;
            not used.

    Returns:
        The combined score.
    """
    diversity = dc_score(pred.seen_data + pred.generated_data)
    similarity_penalty = cosine_sim(gold.gold_examples, pred.curr_gens)
    style_similarity = style_cosine_sim(gold.gold_examples, pred.curr_gens)

    # Outside the 0.4..0.6 band (bounds themselves excluded) the similarity is
    # replaced by a flat penalty of 1; inside it, the raw similarity is what
    # gets subtracted.
    if similarity_penalty > 0.6 or similarity_penalty < 0.4:
        similarity_penalty = 1

    spread = negative_cosine_sim(pred.seen_data + pred.generated_data)
    return diversity - similarity_penalty + spread + style_similarity

def metric_separate(gold, pred):
    """Report every component score on its own instead of a single total.

    Args:
        gold: Example exposing ``gold_examples``.
        pred: Prediction exposing ``seen_data``, ``generated_data`` and
            ``curr_gens``.

    Returns:
        A ``dspy.Prediction`` with the fields ``diversity_score``,
        ``cosine_sim_ref_pred``, ``style_cosine_sim_ref`` and
        ``diversity_cos_score``. Values are the raw scorer outputs; no
        thresholding is applied here.
    """
    diversity = dc_score(pred.seen_data + pred.generated_data)
    reference_similarity = cosine_sim(gold.gold_examples, pred.curr_gens)
    pairwise_spread = negative_cosine_sim(pred.seen_data + pred.generated_data)
    style_similarity = style_cosine_sim(gold.gold_examples, pred.curr_gens)

    components = {
        "diversity_score": diversity,
        "cosine_sim_ref_pred": reference_similarity,
        "style_cosine_sim_ref": style_similarity,
        "diversity_cos_score": pairwise_spread,
    }
    return dspy.Prediction(**components)

def gepa_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """Score a prediction for GEPA and attach natural-language feedback.

    The numeric score is exactly what ``metric`` returns; the feedback text
    reports the individual components from ``metric_separate`` and appends
    hints when the reference similarity or style similarity falls outside the
    accepted thresholds.

    Args:
        gold: Example exposing ``gold_examples``.
        pred: Prediction exposing ``seen_data``, ``generated_data`` and
            ``curr_gens``.
        trace: Forwarded to ``metric``.
        pred_name: Part of the GEPA metric signature; not used.
        pred_trace: Part of the GEPA metric signature; not used.

    Returns:
        A ``dspy.Prediction`` with ``score`` (the value of ``metric``) and
        ``feedback`` (a string).
    """
    # NOTE: metric_separate and metric each invoke the underlying scorers, so a
    # prediction is scored twice per call.
    metric_score = metric_separate(gold, pred)
    overall_score = metric(gold, pred, trace)

    feedback_text = f"The overall score is {overall_score:.2f}, which computed as the cosine similarity between the in-context gold examples and generations ({metric_score.cosine_sim_ref_pred: .2f}) subtracted from the sum of two diversity scores (DC Score = {metric_score.diversity_score: .2f}, Negative Cosine Similarity = {metric_score.diversity_cos_score: .2f}) and Stylistic Cosine Similarity = {metric_score.style_cosine_sim_ref: .2f}. Try to improve the diversity of your response. The generations should be sufficiently similar to the in-context gold examples without being too similar."

    # The out-of-band penalty itself is applied inside `metric`; here the
    # thresholds only decide which hint to append. (Earlier revisions also
    # overwrote metric_score.cosine_sim_ref_pred in these branches, but the
    # value was never read afterwards, so those dead stores were dropped.)
    if metric_score.cosine_sim_ref_pred > 0.6:
        feedback_text += " The current cosine similarity between the in-context gold examples and the generations is too high. Aim to be more creative in the generations while adhering to the hard requirements."
    elif metric_score.cosine_sim_ref_pred < 0.4:
        feedback_text += " The current cosine similarity between the in-context gold examples and the generations is too low. Adhere to the hard requirements and still have generations to be sufficiently similar to the gold examples."
    if metric_score.style_cosine_sim_ref < 0.3:
        feedback_text += " The gold examples and the generations are not sufficiently stylistically similar."
    return dspy.Prediction(
        score=overall_score,
        feedback=feedback_text,
    )


True


In [2]:
import pandas
import dspy

# Build the pool of reference examples from the PUPA_TNB CSV.
pupa_tnb_data = pandas.read_csv("PUPA_TNB.csv")

# Seeds Python's `random` module, which drives the random.choices() calls below.
random.seed(42)
# DataFrame.sample draws from NumPy's RNG, which random.seed() does not touch;
# pass random_state explicitly so the same 20 rows are selected on every run.
random_sample = pupa_tnb_data.sample(n=20, random_state=42)

all_examples = []

# Keep only rows where both the query and the response are present, and render
# each as a single "User Query / Assistant Response" string.
for i, row in random_sample.iterrows():
    if not pandas.isna(row["user_query"]) and not pandas.isna(row["target_response"]):
        curr_example = "User Query: " + row["user_query"] + "\nAssistant Response: " + row["target_response"]
        all_examples.append(curr_example)

# Task LM used by the generator program; responses are cached.
lm = dspy.LM("gpt-4.1-nano", cache=True)
dspy.configure(lm=lm)

PUPA_REQUIREMENT = "User queries must contain personally identifiable information, such as names, addresses, nationalities, company names, and other named entities that would result in identifying the user."

task_gen = OptDiverseDataGenerator()

# Start creating actual data for opt
dspy_examples = []

# Each optimisation example pairs three reference examples with the hard
# requirement. NOTE(review): random.choices samples WITH replacement, so one
# example can contain the same reference text more than once — confirm that is
# intended (random.sample would draw without replacement).
for _ in range(250):
    dspy_examples.append(dspy.Example({"gold_examples": random.choices(all_examples, k=3),
                                        "hard_requirement": PUPA_REQUIREMENT}).with_inputs("gold_examples", "hard_requirement"))

# 200 / 50 train-dev split; tiny_dev is the last 10 examples of dev_set.
train_set = dspy_examples[:200]
dev_set = dspy_examples[200:]
tiny_dev = dspy_examples[240:]


In [3]:

# Evaluator that scores a program on dev_set with the combined `metric`.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of
# the session — consider renaming once no other cell depends on it.
eval = dspy.Evaluate(metric=metric, devset=dev_set, return_all_scores=True)


In [4]:
from dspy import GEPA

# GEPA optimizer driven by the feedback-producing metric. Reflection runs on a
# separate LM (gpt-4.1) rather than the gpt-4.1-nano task LM configured earlier.
gepa = GEPA(
    metric=gepa_metric,
    track_stats=True,
    reflection_lm=dspy.LM(model='gpt-4.1', temperature=1.0, max_tokens=32000),
    track_best_outputs=True,
    max_metric_calls=15,
    perfect_score=1.5,
    component_selector="all",
)

# First optimisation round on the first five train and dev examples.
new_prog = gepa.compile(
    task_gen,
    trainset=train_set[:5],
    valset=dev_set[:5],
)

# NOTE(review): despite the name, this holds `val_aggregate_scores` from the
# detailed results — confirm it is the quantity intended as the Pareto frontier.
pareto_frontier = new_prog.detailed_results.val_aggregate_scores

2025/11/11 22:01:22 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 15 metric calls of the program. This amounts to 1.50 full evals on the train+val set.
2025/11/11 22:01:22 INFO dspy.teleprompt.gepa.gepa: Using 5 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
GEPA Optimization:   0%|          | 0/15 [00:00<?, ?rollouts/s]

['User Query: Please generate a formal commendation for a military officer for their participation in an annual charity run organized by a military base, and include a quote from the event coordinator.', 'User Query: Please write a praise message for a doctor who volunteered at a community health outreach in Springfield last month. The senior staff appreciated their dedication and expertise during the event.', 'User Query: Can you craft a letter of appreciation for a volunteer at the Red Cross chapter in Boston, who organized blood donation drives last quarter? The chapter president sent a commendation mentioning their leadership.', 'User Query: Write a recognition statement for a police officer from the Miami Police Department, who assisted in community outreach programs in Downtown Miami. The city mayor sent a personal thank-you note highlighting their efforts.', 'User Query: A supervisor at Lakeside Hospital needs to acknowledge a nurse for their exceptional patient care. The hospit

2025/11/11 22:01:45 INFO dspy.evaluate.evaluate: Average Metric: 6.30614598144345 / 5 (126.1%)
2025/11/11 22:01:45 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 1.26122919628869 over 5 / 5 examples
GEPA Optimization:  33%|███▎      | 5/15 [00:22<00:44,  4.48s/rollouts]2025/11/11 22:01:45 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 1.26122919628869


  0%|          | 0/3 [00:00<?, ?it/s]['User Query: Create a formal thank-you letter from Michael Lee to his real estate agent Lisa Nguyen, after the successful purchase of a commercial property at 789 Pine Road, Chicago, IL.', 'User Query: Write an appreciation email for Emma Johnson, a client from New York City, who recently purchased a condo at 456 Elm Avenue, and ask for recommendations for mortgage brokers.', 'User Query: I need a personalized email from John Smith to his real estate agent John Doe, thanking him for closing the house in San Diego, CA, and requesting assistance with the transfer of utilities.', 'User Query: Please draft a thank-you message for Mr. Carlos Rodriguez from the Monterrey branch of XYZ Realty, addressing his recent property sale on 123 Maple Street, Houston, TX.', 'User Query: Generate a professional email from Sandra Kim addressed to her property manager at 1010 Oak Lane, Miami, FL, expressing gratitude for their support during her recent home renovation

2025/11/11 22:01:55 INFO dspy.evaluate.evaluate: Average Metric: 3.119087027870109 / 3 (104.0%)





2025/11/11 22:02:37 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for summarizer.predict: You are given a list of example data points for a dataset, with each example typically representing a user query or request in a structured text format. Your task is to generate a brief, comprehensive summary of these examples. The summary should capture the main topics, intents, and types of requests illustrated in the example list, along with any notable variations in user roles, companies, locations, or services featured.

If the example_list is empty or no data points are provided, your summary must be exactly: "No data has been generated yet".

Your summary should:
- Clearly identify the domain or scenario (such as account management, cloud service setup, or real estate communication) and the types of user actions or requests involved (e.g., password resets, billing address updates, thank-you emails).
- Note any diversity in user backgrounds, such as geographical locations, c

['User Query: Hello, my name is Sarah Kim, and I am a client with GreenCity Renewables. I recently relocated to 789 Pine Street, Seattle, WA 98101, and I need assistance updating my service address in your system. Could you please guide me through the process?', "User Query: Hi, my name is Antonio Rodriguez, and I work at Sunlight Solutions. I'm having trouble logging into our corporate portal using my employee number 34789 and my email address antonio.rodriguez@sunlightsol.com. Can you help troubleshoot this access issue?", 'User Query: Hi, this is Kevin Nguyen from TechBank, account number 745233. I recently observed suspicious activity on my account linked to my contact number +1-312-987-6543 and would like guidance on reporting potential fraud. Thanks for your help.', 'User Query: Dear support team, I am Carlos Martinez, customer ID 367890. I am trying to access my account at the support portal, but I’m unable to log in after my email migration from carlos.martinez@setc.com to carl

2025/11/11 22:02:51 INFO dspy.evaluate.evaluate: Average Metric: 3.399443352045712 / 3 (113.3%)
2025/11/11 22:02:51 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New subsample score 3.399443352045712 is better than old score 3.119087027870109. Continue to full eval and add to candidate pool.


['User Query: Hello, I am Anna Schmidt representing the Berlin City Council. I would like your help in preparing an appreciation note for Ms. Claudia Weber, who managed the community outreach program during the recent neighborhood safety initiative in September 2023.', 'User Query: Hi, this is Sarah Johnson from the New York City Department of Education. Can you help me draft a formal thank-you letter to Mr. Robert Lee at Lincoln High School for his outstanding contribution during the recent curriculum review session?', 'User Query: This is James Miller with the Los Angeles Police Department. I require help drafting an official commendation letter for Officer Maria Gonzalez, who received a commendation from the community for her outstanding service during the city festival in August 2023.', 'User Query: Good morning, I am Lisa Rodriguez from the Miami County Library. Could you please help me write a recognition letter for Mr. David Nguyen, who volunteered as a librarian assistant and h

2025/11/11 22:03:05 INFO dspy.evaluate.evaluate: Average Metric: 6.754478343690993 / 5 (135.1%)
2025/11/11 22:03:05 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Found a better program on the valset with score 1.3508956687381986.
2025/11/11 22:03:05 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Valset score for new program: 1.3508956687381986 (coverage 5 / 5)
2025/11/11 22:03:05 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Val aggregate for new program: 1.3508956687381986
2025/11/11 22:03:05 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Individual valset scores for new program: {0: 1.311185417750052, 1: 1.9332412970482662, 2: 1.225067255113806, 3: 1.2275581086675327, 4: 1.0574262651113364}
2025/11/11 22:03:05 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New valset pareto front scores: {0: 1.311185417750052, 1: 1.9332412970482662, 2: 1.2466721069794602, 3: 1.2275581086675327, 4: 1.2368282560259103}
2025/11/11 22:03:05 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Valset pareto front aggrega

In [9]:
# Display the candidate programs recorded in the GEPA detailed results.
new_prog.detailed_results.candidates

[summarizer.predict = Predict(StringSignature(example_list, curr_summary -> reasoning, summary
     instructions='Given a list of example data points for a dataset, provide a brief summary of these examples. If there are no examples, your summary should be "No data has been generated yet". Be comprehensive in your summary but additionally concise. The summary should be at most 3 sentences.'
     example_list = Field(annotation=List[str] required=True json_schema_extra={'desc': 'The list of examples', '__dspy_field_type': 'input', 'prefix': 'Example List:'})
     curr_summary = Field(annotation=str required=True json_schema_extra={'desc': 'The current summary of existing examples. Revise the current summary based on the new examples, and form your output accordingly', '__dspy_field_type': 'input', 'prefix': 'Curr Summary:'})
     reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_

In [14]:
# Exploratory: list the attributes available on the detailed-results object.
dir(new_prog.detailed_results)

['__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'best_candidate',
 'best_idx',
 'best_outputs_valset',
 'candidates',
 'discovery_eval_counts',
 'from_gepa_result',
 'highest_score_achieved_per_val_task',
 'log_dir',
 'num_full_val_evals',
 'parents',
 'per_val_instance_best_candidates',
 'seed',
 'to_dict',
 'total_metric_calls',
 'val_aggregate_scores',
 'val_subscores']

In [15]:
# Display the best outputs tracked per validation example (populated because
# GEPA was built with track_best_outputs=True). The keys appear to be validation
# example indices, each mapping to (index, Prediction) pairs — TODO confirm what
# the first tuple element denotes.
new_prog.detailed_results.best_outputs_valset

{0: [(1,
   Prediction(
       generated_data=['User Query: Hello, I am Anna Schmidt representing the Berlin City Council. I would like your help in preparing an appreciation note for Ms. Claudia Weber, who managed the community outreach program during the recent neighborhood safety initiative in September 2023.', 'User Query: Compose a professional note to Mr. Robert Lee at the Toronto City Council, praising him for his support in funding our community health initiative, based on the letter of appreciation he sent last month.', 'User Query: Hi, this is Olivia Baker from Sunshine Events, located at 4567 Sunshine Boulevard, Miami, FL. We are planning a corporate retreat at your venue on June 10, and I need to confirm the availability for a team of about 80 employees. Also, could you send me details on room arrangements, catering packages, and any team-building activities offered? My direct contact is olivia.baker@suntevents.com.', 'User Query: Hi, this is Sarah Johnson from the New York

In [None]:
# optimizer = dspy.SIMBA(metric=metric, max_steps=3)
# optimized_program = optimizer.compile(task_gen, trainset=train_set)

# # Save the optimized program for future use
# optimized_program.save(f"optimized.json")

In [None]:
# Second optimisation round: continue from the program produced by the previous
# compile, using the next five train and dev examples (indices 5-9).
# NOTE(review): this rebinds `new_prog`, so re-running the cell starts from the
# output of its own previous run rather than from the first-round result.
new_prog = gepa.compile(new_prog, trainset=train_set[5:10], valset=dev_set[5:10])