In [1]:
# %%
import dspy

import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "6"
import torch

# Sanity check: print whether PyTorch reports a usable CUDA device.
print(torch.cuda.is_available())

from diversity_gen import OptDiverseDataGenerator, set_singleton
import pandas
from diversity_metrics import dc_score, negative_cosine_sim, cosine_sim, style_cosine_sim
import random
import json

from dotenv import load_dotenv
# Load environment variables from the local ".env" file
# (presumably the API credentials for the dspy.LM calls below — confirm).
load_dotenv(".env")


def metric(gold, pred, trace=None):
    """Combine the diversity and similarity scores into one scalar.

    The result is ``dc_score - similarity_term + negative_cosine_sim + style_cosine_sim``.
    ``similarity_term`` is the raw cosine similarity between the gold examples
    and the current generations while it lies within [0.4, 0.6]; outside that
    band a flat value of 1 is subtracted instead.

    Args:
        gold: Object exposing ``gold_examples``.
        pred: Object exposing ``seen_data``, ``generated_data`` and ``curr_gens``.
        trace: Unused; accepted so the function fits the metric call signature.

    Returns:
        The combined score.
    """
    diversity = dc_score(pred.seen_data + pred.generated_data)
    ref_similarity = cosine_sim(gold.gold_examples, pred.curr_gens)
    style_similarity = style_cosine_sim(gold.gold_examples, pred.curr_gens)

    # Too similar (> 0.6) and too dissimilar (< 0.4) are penalised identically.
    outside_band = ref_similarity > 0.6 or ref_similarity < 0.4
    similarity_term = 1 if outside_band else ref_similarity

    pairwise_spread = negative_cosine_sim(pred.seen_data + pred.generated_data)
    return diversity - similarity_term + pairwise_spread + style_similarity

def metric_separate(gold, pred):
    """Return the four component scores individually, without combining them.

    Args:
        gold: Object exposing ``gold_examples``.
        pred: Object exposing ``seen_data``, ``generated_data`` and ``curr_gens``.

    Returns:
        dspy.Prediction with fields ``diversity_score``, ``cosine_sim_ref_pred``,
        ``style_cosine_sim_ref`` and ``diversity_cos_score`` (raw, unclamped values).
    """
    references = gold.gold_examples
    current_generations = pred.curr_gens

    # Scorers are invoked in this fixed order: DC, reference cosine,
    # negative cosine, then style cosine.
    dc_value = dc_score(pred.seen_data + pred.generated_data)
    ref_cos_value = cosine_sim(references, current_generations)
    neg_cos_value = negative_cosine_sim(pred.seen_data + pred.generated_data)
    style_cos_value = style_cosine_sim(references, current_generations)

    components = {
        "diversity_score": dc_value,
        "cosine_sim_ref_pred": ref_cos_value,
        "style_cosine_sim_ref": style_cos_value,
        "diversity_cos_score": neg_cos_value,
    }
    return dspy.Prediction(**components)

def gepa_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """Score a prediction and describe the score in natural-language feedback.

    Args:
        gold: Example forwarded to ``metric_separate`` and ``metric``.
        pred: Prediction forwarded to ``metric_separate`` and ``metric``.
        trace: Forwarded to ``metric``; not otherwise used.
        pred_name: Unused; accepted so the function matches the five-argument
            metric signature (presumably what GEPA calls — confirm).
        pred_trace: Unused; see ``pred_name``.

    Returns:
        dspy.Prediction with:
          * ``score``: exactly ``metric(gold, pred, trace)``.
          * ``feedback``: a summary of the component scores, plus hints when
            the reference cosine similarity falls outside [0.4, 0.6] or the
            style similarity falls below 0.3.
    """
    # Thresholds that only steer the feedback text; the numeric score comes
    # from `metric` and is not altered here.
    cos_upper, cos_lower, style_lower = 0.6, 0.4, 0.3

    metric_score = metric_separate(gold, pred)
    overall_score = metric(gold, pred, trace)

    feedback_text = (
        f"The overall score is {overall_score:.2f}, which is computed as the cosine similarity "
        f"between the in-context gold examples and generations ({metric_score.cosine_sim_ref_pred:.2f}) "
        f"subtracted from the sum of two diversity scores (DC Score = {metric_score.diversity_score:.2f}, "
        f"Negative Cosine Similarity = {metric_score.diversity_cos_score:.2f}) "
        f"and Stylistic Cosine Similarity = {metric_score.style_cosine_sim_ref:.2f}. "
        "Try to improve the diversity of your response. The generations should be sufficiently "
        "similar to the in-context gold examples without being too similar."
    )

    # The component scores are read, never modified: overwriting them here
    # would have no effect on either the returned score or the feedback.
    if metric_score.cosine_sim_ref_pred > cos_upper:
        feedback_text += " The current cosine similarity between the in-context gold examples and the generations is too high. Aim to be more creative in the generations while adhering to the hard requirements."
    elif metric_score.cosine_sim_ref_pred < cos_lower:
        feedback_text += " The current cosine similarity between the in-context gold examples and the generations is too low. Adhere to the hard requirements and still have generations to be sufficiently similar to the gold examples."
    if metric_score.style_cosine_sim_ref < style_lower:
        feedback_text += " The gold examples and the generations are not sufficiently stylistically similar."

    return dspy.Prediction(
        score=overall_score,
        feedback=feedback_text,
    )


True


In [2]:
import pandas
import dspy

# Single seed shared by both random sources used in this cell.
SEED = 42

pupa_tnb_data = pandas.read_csv("PUPA_TNB.csv")

# Seeds Python's `random` module, which drives random.choices below.
random.seed(SEED)
# DataFrame.sample does not use Python's `random`; without an explicit
# random_state the 20 sampled rows change on every run.
random_sample = pupa_tnb_data.sample(n=20, random_state=SEED)

# One formatted "query + response" string per sampled row, skipping rows
# where either field is missing.
all_examples = [
    "User Query: " + row["user_query"] + "\nAssistant Response: " + row["target_response"]
    for _, row in random_sample.iterrows()
    if not pandas.isna(row["user_query"]) and not pandas.isna(row["target_response"])
]
if not all_examples:
    raise ValueError(
        "No usable rows in the sample: every sampled row is missing "
        "'user_query' or 'target_response'."
    )

lm = dspy.LM("gpt-4.1-nano", cache=True)
dspy.configure(lm=lm)

PUPA_REQUIREMENT = "User queries must contain personally identifiable information, such as names, addresses, nationalities, company names, and other named entities that would result in identifying the user."

task_gen = OptDiverseDataGenerator()

# Start creating actual data for opt
# NOTE(review): random.choices samples WITH replacement, so one example's three
# gold examples may contain duplicates — confirm this is intended.
dspy_examples = [
    dspy.Example({"gold_examples": random.choices(all_examples, k=3),
                  "hard_requirement": PUPA_REQUIREMENT}).with_inputs("gold_examples", "hard_requirement")
    for _ in range(250)
]

# 200 train / 50 dev; tiny_dev is the last 10 dev examples.
train_set = dspy_examples[:200]
dev_set = dspy_examples[200:]
tiny_dev = dspy_examples[240:]


In [3]:

# Evaluator over the 50-example dev split, scored with the combined `metric`.
# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest
# of the notebook session.
eval = dspy.Evaluate(
    devset=dev_set,
    metric=metric,
    return_all_scores=True,
)


In [4]:
from dspy import GEPA

# Separate LM handed to GEPA as its `reflection_lm`.
reflection_lm = dspy.LM(model='gpt-4.1', temperature=1.0, max_tokens=32000)

gepa_options = dict(
    metric=gepa_metric,
    track_stats=True,
    reflection_lm=reflection_lm,
    track_best_outputs=True,
    max_metric_calls=15,
    perfect_score=1.5,
    component_selector="all",
)
gepa = GEPA(**gepa_options)

# First optimization round: the first five train and the first five dev examples.
new_prog = gepa.compile(
    task_gen,
    trainset=train_set[:5],
    valset=dev_set[:5],
)

# NOTE(review): despite its name, this holds `val_aggregate_scores` from the
# run's detailed results — confirm that is the data intended here.
pareto_frontier = new_prog.detailed_results.val_aggregate_scores

2025/11/11 23:04:43 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 15 metric calls of the program. This amounts to 1.50 full evals on the train+val set.
2025/11/11 23:04:43 INFO dspy.teleprompt.gepa.gepa: Using 5 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
GEPA Optimization:   0%|          | 0/15 [00:00<?, ?rollouts/s]

['User Query: This is Ahmed Ali, CEO of Al Noor Construction. I need to schedule a site inspection.', 'User Query: Hello, this is John Doe from Greenfield Property Management. Can you help me update my contact information?', 'User Query: I am Lisa Chang from Bright Future Academy. I want to inquire about employee training programs.', 'User Query: My name is Maria Fernandez from TechSolutions Inc., and I need assistance with my account credentials.', 'User Query: My name is Emily Roberts, and I work at Sunset Real Estate. I have a question regarding property listings.']
['User Query: Can you list all job openings at Acme Corp in New York with links?', 'User Query: This is Ahmed Ali, CEO of Al Noor Construction. I need to schedule a site inspection.', 'User Query: Please find the latest internship postings from Initech located in San Francisco with application links.', 'User Query: I am looking for contract opportunities with Globex International in London, please provide URLs.', 'User Q

2025/11/11 23:05:00 INFO dspy.evaluate.evaluate: Average Metric: 6.2274038809147045 / 5 (124.5%)
2025/11/11 23:05:00 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 1.245480776182941 over 5 / 5 examples
GEPA Optimization:  33%|███▎      | 5/15 [00:16<00:33,  3.35s/rollouts]2025/11/11 23:05:00 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 1.245480776182941


  0%|          | 0/3 [00:00<?, ?it/s]['User Query: My friend Alexei Ivanov, from Moscow, recommended your legal consulting services.', 'User Query: Hello, my name is Maria Gomez from Madrid, and I need assistance with my health records.', 'User Query: I recently moved to Toronto, and I want to find a local pediatrician for my child, Sarah Lee.', 'User Query: Mr. Robert Johnson, a client from Chicago, has requested a detailed report on our financial services.']
Average Metric: 0.95 / 1 (94.6%):  33%|███▎      | 1/3 [00:07<00:14,  7.49s/it]['User Query: I am Jane Smith from Toronto, looking to buy a house in the Riverdale neighborhood. Could you provide the best options available for me and include contact details?', 'User Query: I am Lisa Wang, CEO of Prestige Tech, based in San Francisco. Can you give me a list of recent funding rounds for tech startups in the area along with links?', 'User Query: I am Emily Clark, a student at the University of British Columbia. Could you provide a li

2025/11/11 23:05:12 INFO dspy.evaluate.evaluate: Average Metric: 3.5284594972775536 / 3 (117.6%)





2025/11/11 23:05:50 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for summarizer.predict: You are given a list of example data points, where each data point is a user query. These queries typically include combinations of the following: personal details (names, contact info, addresses, birth dates), geographic locations, organizational affiliations, and explicit service or information requests. The user queries span a variety of domains such as healthcare, legal, immigration, education, business, real estate, and employment, often involving requests for formal communication (e.g., letters, emails, notices), recommendations, professional assistance, reports, or listings (e.g., job openings, property options, funding rounds).

Your task is to generate a brief summary of these example data points. The summary must:

- Accurately reflect the diversity of the user queries, mentioning the inclusion of personal and contextual information, the types of services or responses be

['User Query: My name is Priya Patel, living at 123 Maple Street, Toronto. I am looking for legal assistance regarding a property dispute. Can you connect me with a lawyer in my area?', 'User Query: Hi, my name is Sarah Johnson, and I need to schedule a dental appointment for next Thursday at 3 PM at Sunnydale Dental Clinic in Los Angeles. Can you help?', 'User Query: Hello, I am Michael Lee from Chicago. I want to update my employment information with Acme Corporation. Please guide me through the process.', 'User Query: Good morning, I am Roberto Martinez, residing at 456 Elm Avenue in Miami. I want to inquire about the status of my loan application at Miami Bank.', 'User Query: Hey, my name is Emma Brown, and I am trying to reset my password for my account at GreenTech Solutions. The email associated with my account is emma.brown@gmail.com.']
['User Query: My name is Priya Patel, living at 123 Maple Street, Toronto. I am looking for legal assistance regarding a property dispute. Can 

2025/11/11 23:06:01 INFO dspy.evaluate.evaluate: Average Metric: 3.6234226546114865 / 3 (120.8%)
2025/11/11 23:06:01 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New subsample score 3.6234226546114865 is better than old score 3.5284594972775536. Continue to full eval and add to candidate pool.


['User Query: My name is Maria Lopez, and I am seeking legal advice regarding a contract dispute involving ABC Law Firm located at 123 Main Street, Los Angeles, CA.', 'User Query: Hi, this is Robert Anderson from TechNova Inc., based in San Francisco. I would like to schedule a maintenance appointment for our server system scheduled for next Tuesday.', 'User Query: Dear support, my name is David Patel, and I recently purchased a laptop from XYZ Electronics. My order ID is 987654. I would like to request a replacement for a defective unit.', 'User Query: Hello, my name is Michael Chen, and I need assistance with applying for a visa to Canada. Can you help me understand the requirements and process?', 'User Query: I am Jessica Kim, residing at 456 Elm Street, Dallas. I need to book an appointment with a dermatologist for next month.']
['User Query: Can you help me find the contact details of a lawyer named Sarah Johnson at the Law Office of Michael Lee located at 2500 North Lake Shore Dr

2025/11/11 23:06:13 INFO dspy.evaluate.evaluate: Average Metric: 6.036800685291057 / 5 (120.7%)
2025/11/11 23:06:13 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Valset score for new program: 1.2073601370582114 (coverage 5 / 5)
2025/11/11 23:06:13 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Val aggregate for new program: 1.2073601370582114
2025/11/11 23:06:13 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Individual valset scores for new program: {0: 1.0978424780184357, 1: 1.440851868626972, 2: 1.0450744602599968, 3: 1.2946740103064365, 4: 1.1583578680792161}
2025/11/11 23:06:13 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New valset pareto front scores: {0: 1.292948323301971, 1: 1.440851868626972, 2: 1.0450744602599968, 3: 1.3006119908664817, 4: 1.3042108176216003}
2025/11/11 23:06:13 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Valset pareto front aggregate score: 1.2767394921354043
2025/11/11 23:06:13 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Updated valset pareto front programs: {

In [5]:
# Display the candidate programs stored in the optimized program's detailed results.
new_prog.detailed_results.candidates

[summarizer.predict = Predict(StringSignature(example_list, curr_summary -> reasoning, summary
     instructions='Given a list of example data points for a dataset, provide a brief summary of these examples. If there are no examples, your summary should be "No data has been generated yet". Be comprehensive in your summary but additionally concise. The summary should be at most 3 sentences.'
     example_list = Field(annotation=List[str] required=True json_schema_extra={'desc': 'The list of examples', '__dspy_field_type': 'input', 'prefix': 'Example List:'})
     curr_summary = Field(annotation=str required=True json_schema_extra={'desc': 'The current summary of existing examples. Revise the current summary based on the new examples, and form your output accordingly', '__dspy_field_type': 'input', 'prefix': 'Curr Summary:'})
     reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_

In [11]:
# Display the tracked best outputs: a mapping from an integer key to a list of
# (int, Prediction) tuples, as shown in the output below.
# NOTE(review): this dumps every generation in full; consider inspecting a single key.
new_prog.detailed_results.best_outputs_valset

{0: [(1,
   Prediction(
       generated_data=['Dear Sir/Madam, I am David Johnson from New York, USA. I would like to inquire about the legal requirements for registering a new LLC under my name, David Johnson, at the Manhattan address 456 Fifth Avenue, Apt 12B, New York City. Please advise the necessary steps and documentation for registration in New York State.', 'Dear Mr. Ahmed Al-Farsi, this is Sarah Johnson from the Dubai branch of Global Financial Services. I need to update our compliance documentation for the licensing department. Could you help me get the latest version of the regulatory forms for our office at Dubai Mall, Suite 1504?', 'Hello, my name is Rajesh Kumar from Tech Solutions India based in Bangalore. I am reaching out regarding the recent issues we encountered with our account at the New Delhi branch. Could you please assist me in updating the login credentials for our registered email, rajesh.kumar@techsolutions.in, and verify the security measures in place for o

In [7]:
# optimizer = dspy.SIMBA(metric=metric, max_steps=3)
# optimized_program = optimizer.compile(task_gen, trainset=train_set)

# # Save optimized program for future use
# optimized_program.save(f"optimized.json")

In [8]:
# Across all tracked validation entries, keep the state of the prediction whose
# `generated_data` is longest. Entries with empty `generated_data` are never
# selected, and ties keep the earliest entry.
gen_data, seen_data, data_summary = [], [], None
gen_data_max_len = 0
best_outputs = new_prog.detailed_results.best_outputs_valset
for k in best_outputs:
    # First tracked item for this key; element [1] of the tuple is the Prediction.
    top_pred = best_outputs[k][0][1]
    curr_gen_len = len(top_pred.generated_data)
    if curr_gen_len <= gen_data_max_len:
        continue
    gen_data_max_len = curr_gen_len
    gen_data = top_pred.generated_data + top_pred.curr_gens
    seen_data = top_pred.seen_data
    data_summary = top_pred.data_summary

In [9]:
# Attach the selected generation state to the optimized program object.
for attr_name, attr_value in (
    ("generated_data", gen_data),
    ("seen_data", seen_data),
    ("data_summary", data_summary),
):
    setattr(new_prog, attr_name, attr_value)

In [13]:
# Second GEPA round on the next five train/dev examples.
# set_singleton presumably seeds shared state that a new OptDiverseDataGenerator
# reads — confirm in diversity_gen.
carried_summary = new_prog.data_summary
set_singleton(gen_data, seen_data, carried_summary)

task_gen = OptDiverseDataGenerator()
round_two = slice(5, 10)
new_prog = gepa.compile(
    task_gen,
    trainset=train_set[round_two],
    valset=dev_set[round_two],
)

2025/11/11 23:12:16 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 15 metric calls of the program. This amounts to 1.50 full evals on the train+val set.
2025/11/11 23:12:16 INFO dspy.teleprompt.gepa.gepa: Using 5 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
GEPA Optimization:   0%|          | 0/15 [00:00<?, ?rollouts/s]

['User Query: Write an email to Mr. Ivan Klimkovic from Tech Solutions Ltd., thanking him for his insightful feedback on our latest product update. Mention that I noticed a slight misunderstanding regarding our current camera model; clarify that our system uses the FLIR fl3-U3-120S3C-C and the image sensor is Sony IMX172, not the e2V sensor. I have attached the technical data to this email. Please refer to page 18 of the FL3-U3-Technical-Reference PDF for the detailed datasheet and page 13 of the FL3-U3-EMVA-Imaging PDF for quantum efficiency. I will review the files carefully and keep you updated.', "User Query: Draft a professional cold call script for Ms. Linda Morgan to promote her new bookkeeping service, 'Accurate Books', to potential clients in Miami. Assistant Response: Hello, am I speaking with Mr. David Lee? Hi David, this is Ms. Linda Morgan from Accurate Books. We provide specialized bookkeeping and accounting services tailored for small businesses in Miami. Our team ensure

2025/11/11 23:12:23 INFO dspy.evaluate.evaluate: Average Metric: 6.5657716227612575 / 5 (131.3%)
2025/11/11 23:12:23 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 1.3131543245522515 over 5 / 5 examples
GEPA Optimization:  33%|███▎      | 5/15 [00:06<00:13,  1.33s/rollouts]2025/11/11 23:12:23 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 1.3131543245522515


["User Query: Draft a professional cold call script for Ms. Linda Morgan to promote her new bookkeeping service, 'Accurate Books', to potential clients in Miami. Assistant Response: Hello, am I speaking with Mr. David Lee? Hi David, this is Ms. Linda Morgan from Accurate Books. We provide specialized bookkeeping and accounting services tailored for small businesses in Miami. Our team ensures your financial records are accurate and compliant with current regulations, helping you save time and reduce stress. I would love to schedule a quick meeting to discuss how Accurate Books can support your business. You can reach me at [Your Contact] or reply to this email. Thank you for your time, and I look forward to speaking with you soon.", 'User Query: Hello, I am David Williams from the United States, residing at 250 Oak Street, Chicago. I have been awarded a scholarship at HSE University and need support in obtaining a student visa. Please advise on the next steps.', 'User Query: This is Aji



KeyboardInterrupt: 

In [24]:
# Display the current optimized program.
# NOTE(review): the preceding compile cell's output ends in KeyboardInterrupt, so
# `new_prog` may not have been rebound by it — confirm which run this shows.
new_prog

summarizer.predict = Predict(StringSignature(example_list, curr_summary -> reasoning, summary
    instructions='You are provided with an input variable named example_list, which contains a set of example data points from a specific dataset. Each example in the list typically represents a user query or request, generally written in a structured natural language form, often referencing user roles, companies, locations, or service-specific scenarios. Your role is to generate a concise, comprehensive summary of the main topics, intents, and representative features illustrated across the example_list.\n\nYour summary must adhere to the following requirements:\n\n1. Purpose & Scope:\n   - The goal is to synthesize the overarching domain or scenario (e.g., HR communication, professional correspondence, operational support, etc.) and the nature of user actions or requests (such as job applications, praise or recognition acknowledgments, service issues, document clarifications, feedback, apolog