In [1]:
# %%
# Imports and process-level configuration for the diversity-optimization notebook.
import dspy

import os
# Restrict this process to one GPU. The assignment is placed before `import torch`
# so the restriction is already in the environment when torch looks for CUDA devices.
# NOTE(review): the device index "6" is specific to the machine this notebook ran on.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
import torch

# Sanity check: reports whether torch can see a CUDA device.
print(torch.cuda.is_available())

from diversity_gen import OptDiverseDataGenerator
import pandas
from diversity_metrics import dc_score, negative_cosine_sim, cosine_sim, style_cosine_sim
import random
import json

from dotenv import load_dotenv
# Load variables from the local ".env" file into the environment
# (presumably API credentials for the language models — TODO confirm).
load_dotenv(".env")


def metric(gold, pred, trace=None):
    """Collapse the diversity and similarity measurements into one scalar score.

    Args:
        gold: Example exposing ``gold_examples`` (the in-context references).
        pred: Prediction exposing ``seen_data``, ``generated_data`` and ``curr_gens``.
        trace: Unused; accepted so the function matches the DSPy metric signature.

    Returns:
        ``dc_score - similarity_penalty + negative_cosine_sim + style_cosine_sim``,
        where the similarity penalty is the gold-vs-generation cosine similarity
        when it lies in [0.4, 0.6] and a flat 1 otherwise.
    """
    diversity = dc_score(pred.seen_data + pred.generated_data)
    similarity_penalty = cosine_sim(gold.gold_examples, pred.curr_gens)
    style_similarity = style_cosine_sim(gold.gold_examples, pred.curr_gens)

    # Too close to the references (> 0.6) or too far from them (< 0.4):
    # either way the raw similarity is replaced by the same fixed penalty.
    if similarity_penalty > 0.6 or similarity_penalty < 0.4:
        similarity_penalty = 1

    spread = negative_cosine_sim(pred.seen_data + pred.generated_data)
    return diversity - similarity_penalty + spread + style_similarity

def metric_separate(gold, pred):
    """Report each component score on its own rather than as a combined scalar.

    Args:
        gold: Example exposing ``gold_examples``.
        pred: Prediction exposing ``seen_data``, ``generated_data`` and ``curr_gens``.

    Returns:
        A ``dspy.Prediction`` with the fields ``diversity_score``,
        ``cosine_sim_ref_pred``, ``style_cosine_sim_ref`` and
        ``diversity_cos_score``. No thresholding is applied to any of them.
    """
    references, generations = gold.gold_examples, pred.curr_gens

    dc_value = dc_score(pred.seen_data + pred.generated_data)
    ref_similarity = cosine_sim(references, generations)
    neg_similarity = negative_cosine_sim(pred.seen_data + pred.generated_data)
    style_similarity = style_cosine_sim(references, generations)

    component_scores = {
        "diversity_score": dc_value,
        "cosine_sim_ref_pred": ref_similarity,
        "style_cosine_sim_ref": style_similarity,
        "diversity_cos_score": neg_similarity,
    }
    return dspy.Prediction(**component_scores)

def gepa_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
    """GEPA feedback metric: the scalar score plus a textual explanation of it.

    Args:
        gold: Example exposing ``gold_examples``.
        pred: Prediction exposing ``seen_data``, ``generated_data`` and ``curr_gens``.
        trace: Forwarded to ``metric``.
        pred_name: Unused; accepted to match the GEPA metric signature.
        pred_trace: Unused; accepted to match the GEPA metric signature.

    Returns:
        A ``dspy.Prediction`` with ``score`` (the value returned by ``metric``) and
        ``feedback`` (a description of the component scores, followed by targeted
        hints when the reference similarity falls outside [0.4, 0.6] or the
        stylistic similarity falls below 0.3).
    """
    # NOTE(review): `metric_separate` and `metric` each evaluate all four component
    # scores, so every call computes them twice.
    metric_score = metric_separate(gold, pred)
    overall_score = metric(gold, pred, trace)

    feedback_parts = [
        f"The overall score is {overall_score:.2f}, which is computed as the cosine similarity "
        f"between the in-context gold examples and generations ({metric_score.cosine_sim_ref_pred: .2f}) "
        f"subtracted from the sum of two diversity scores (DC Score = {metric_score.diversity_score: .2f}, "
        f"Negative Cosine Similarity = {metric_score.diversity_cos_score: .2f}) and "
        f"Stylistic Cosine Similarity = {metric_score.style_cosine_sim_ref: .2f}. "
        "Try to improve the diversity of your response. The generations should be sufficiently "
        "similar to the in-context gold examples without being too similar."
    ]

    # The score itself is already final at this point; these branches only add hints.
    # (The previous version also overwrote `metric_score.cosine_sim_ref_pred` here,
    # which had no effect on the returned score or feedback.)
    if metric_score.cosine_sim_ref_pred > 0.6:
        feedback_parts.append(
            " The current cosine similarity between the in-context gold examples and the generations is too high. Aim to be more creative in the generations while adhering to the hard requirements."
        )
    elif metric_score.cosine_sim_ref_pred < 0.4:
        feedback_parts.append(
            " The current cosine similarity between the in-context gold examples and the generations is too low. Adhere to the hard requirements and still have generations to be sufficiently similar to the gold examples."
        )

    if metric_score.style_cosine_sim_ref < 0.3:
        feedback_parts.append(
            " The gold examples and the generations are not sufficiently stylistically similar."
        )

    return dspy.Prediction(
        score=overall_score,
        feedback="".join(feedback_parts),
    )


True


  from .autonotebook import tqdm as notebook_tqdm




/home/siyanli/miniconda3/envs/omni/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/siyanli/miniconda3/envs/omni/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/siyanli/miniconda3/envs/omni/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/siyanli/miniconda3/envs/omni/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/siyanli/miniconda3/envs/omni/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/siyanli/miniconda3/envs/omni/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/siyanli/mini

In [2]:
import pandas
import dspy

# Load the source data and draw 20 rows from which in-context examples are built.
pupa_tnb_data = pandas.read_csv("PUPA_TNB.csv")
# Seeds Python's `random` module, which drives `random.choices` further down.
random.seed(42)
# `DataFrame.sample` does not draw from Python's `random` module, so the seed above
# does not pin this sample. Pass `random_state` so the same rows are drawn every run.
random_sample = pupa_tnb_data.sample(n=20, random_state=42)

all_examples = []

# Keep only rows where both the query and the response are present, and render
# each one as a single "User Query / Assistant Response" string.
for _, row in random_sample.iterrows():
    if not pandas.isna(row["user_query"]) and not pandas.isna(row["target_response"]):
        curr_example = "User Query: " + row["user_query"] + "\nAssistant Response: " + row["target_response"]
        all_examples.append(curr_example)

lm = dspy.LM("gpt-4.1-nano", cache=True)
dspy.configure(lm=lm)

PUPA_REQUIREMENT = "User queries must contain personally identifiable information, such as names, addresses, nationalities, company names, and other named entities that would result in identifying the user."

task_gen = OptDiverseDataGenerator()

# Start creating actual data for opt
dspy_examples = []

# Each optimization example is three gold examples plus the fixed hard requirement.
# NOTE(review): `random.choices` samples with replacement, so the same gold example
# can appear more than once within one triple — confirm this is intended.
for _ in range(250):
    dspy_examples.append(dspy.Example({"gold_examples": random.choices(all_examples, k=3),
                                        "hard_requirement": PUPA_REQUIREMENT}).with_inputs("gold_examples", "hard_requirement"))

# 200 training examples, 50 dev examples; `tiny_dev` is the last 10 dev examples.
train_set = dspy_examples[:200]
dev_set = dspy_examples[200:]
tiny_dev = dspy_examples[240:]


In [3]:

# Evaluator that scores a program with `metric` over the full dev split.
# NOTE(review): the name `eval` shadows Python's built-in `eval` from this cell onward.
eval = dspy.Evaluate(
    devset=dev_set,
    metric=metric,
    return_all_scores=True,
)


In [4]:
from dspy import GEPA

# Configure GEPA with the feedback-returning metric and a separate reflection model.
gepa = GEPA(
    metric=gepa_metric,
    track_stats=True,
    reflection_lm=dspy.LM(model='gpt-4.1', temperature=1.0, max_tokens=32000),
    track_best_outputs=True,
    # Budget: two full passes over the train + validation examples supplied below
    # (the run log reports this as roughly 90 metric calls for 30 + 15 examples).
    max_full_evals=2,
    perfect_score=1.5,
    component_selector="all",
)

# Optimize on small slices of the data: 30 training and 15 validation examples.
new_prog = gepa.compile(
    task_gen,
    trainset=train_set[:30],
    valset=dev_set[:15],
)

# NOTE(review): despite the variable name, this reads `val_aggregate_scores` from the
# detailed results — presumably aggregate validation scores per candidate; confirm.
pareto_frontier = new_prog.detailed_results.val_aggregate_scores

2025/11/10 23:13:24 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 90 metric calls of the program. This amounts to 2.00 full evals on the train+val set.
2025/11/10 23:13:24 INFO dspy.teleprompt.gepa.gepa: Using 15 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
GEPA Optimization:   0%|          | 0/90 [00:00<?, ?rollouts/s]

['Explore premium 3D visualization of commercial and residential projects worldwide.', 'Discover how cutting-edge 3D renderings elevate real estate marketing campaigns globally.', 'Showcase innovative interior and exterior 3D visualizations for diverse architectural projects.', 'Experience photorealistic 3D modeling services enhancing product and furniture displays.', 'See top-tier 3D animations and visualizations bringing real estate and design concepts to life.', 'Uncover expert architectural visualization solutions for prominent projects across Europe and the US.', 'Highlight bespoke 3D rendering services for hospitality, commercial, and residential spaces.', 'Present high-quality 3D product images and virtual staging for maximum client impact.', 'Illustrate the impact of detailed 3D rendering on real estate sales and architectural presentations.', 'Show advanced techniques in 3D visualization for luxury apartments, hotels, and retail stores.']
['Explore premium 3D visualization of 

2025/11/10 23:13:59 INFO dspy.evaluate.evaluate: Average Metric: 20.492760132596402 / 15 (136.6%)
2025/11/10 23:13:59 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 1.36618400883976 over 15 / 15 examples
GEPA Optimization:  17%|█▋        | 15/90 [00:34<02:54,  2.32s/rollouts]2025/11/10 23:13:59 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 1.36618400883976


  0%|          | 0/3 [00:00<?, ?it/s]['User Query: Draft a short story about a curious fox exploring a magical forest, emphasizing themes of friendship and adventure.', 'User Query: Provide a simplified explanation of quantum computing suitable for middle school students in JSON format with key concepts and definitions.', 'User Query: Summarize the benefits of renewable energy sources for small communities, highlighting economic, environmental, and social aspects.', "User Query: Translate a product description from Spanish to English, formatted as a JSON object with fields 'product_name', 'features', and 'price'.", 'User Query: Outline a step-by-step guide to starting a small urban vegetable garden, including tips on soil preparation, planting, and maintenance.']
['User Query: Draft a short story about a curious fox exploring a magical forest, emphasizing themes of friendship and adventure.', 'User Query: Provide a simplified explanation of quantum computing suitable for middle school 

2025/11/10 23:14:04 INFO dspy.evaluate.evaluate: Average Metric: 3.8457513671559402 / 3 (128.2%)





2025/11/10 23:14:04 INFO dspy.teleprompt.gepa.gepa: Component 'summarizer.predict' is not in reflective dataset. Skipping.
2025/11/10 23:14:14 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for proposer.predict: You are given a collection of example input-output pairs where the input represents a "User Query" containing factual details, and the output is an "Assistant Response" fulfilling the request in a factual, explanatory, or professional style. These pairs frequently involve summaries, explanations, edits of professional correspondence, translations (sometimes in JSON form), or technical/procedural information. Your task is to generate *new* similar instances for this dataset, avoiding duplication of both specific topics and structure of any existing examples or those listed in the provided data summary.

Special, hard requirement: In every user query you generate, make sure to include personally identifiable information (PII). PII can include, but is not limited t

['User Query: Dr. Sofia Martinez from BioHealth Solutions in Madrid has asked for an update on the partnership agreement with GreenTech Industries in Barcelona. She also wants a summary of current collaboration projects involving their respective R&D teams. Please provide a detailed overview including recent milestones achieved and pending action items.', 'User Query: Mr. Alan Thompson, the HR Director at TechNova Corporation based in London, requested an annual report of employee turnover rates and diversity statistics for the fiscal year 2023. He also asked for a breakdown of recruitment efforts across different departments, specifically for engineering and sales divisions across the UK offices.', 'User Query: Ms. Priya Singh, CEO of Global Edu Services in Mumbai, has reached out to inquire about the status of the international student exchange program with university partners in Canada. She requests a detailed report covering enrollment figures, partner university feedback, and upco

2025/11/10 23:14:23 INFO dspy.evaluate.evaluate: Average Metric: 3.6163481767055314 / 3 (120.5%)
2025/11/10 23:14:23 INFO dspy.teleprompt.gepa.gepa: Iteration 1: New subsample score 3.6163481767055314 is not better than old score 3.8457513671559402, skipping
GEPA Optimization:  23%|██▎       | 21/90 [00:59<03:23,  2.95s/rollouts]2025/11/10 23:14:23 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Selected program 0 score: 1.36618400883976


  0%|          | 0/3 [00:00<?, ?it/s]["User Query: My name is John Doe, living at 123 Maple Street, Springfield. I recently bought a Samsung Galaxy phone, but I'm having trouble with the camera app. Can you help me troubleshoot?", 'User Query: Hello, my name is Maria Garcia from 456 Elm Avenue, Los Angeles. I received a package from Amazon with order number 78910. It was damaged, and I need assistance with a replacement.', "User Query: I'm David Smith, residing at 789 Oak Road, Chicago. I work for Tech Innovations Inc., and I am interested in learning more about your data security policies for corporate clients.", 'User Query: My name is Lisa Chen, living at 321 Pine Lane, New York. I ordered a laptop from BestBuy, order ID 112233, but I haven’t received it yet. Could you provide the delivery status?', 'User Query: My name is Ahmed Khan, based in 88 King Street, Toronto. I am employee number 12345 at Global Tech Ltd., and I want to update my payroll information through your portal.']
[

2025/11/10 23:14:30 INFO dspy.evaluate.evaluate: Average Metric: 4.061783861924192 / 3 (135.4%)





2025/11/10 23:14:30 INFO dspy.teleprompt.gepa.gepa: Component 'summarizer.predict' is not in reflective dataset. Skipping.


KeyboardInterrupt: 

In [None]:
# optimizer = dspy.SIMBA(metric=metric, max_steps=3)
# optimized_program = optimizer.compile(task_gen, trainset=train_set)

# # Save optimized program for future use
# optimized_program.save(f"optimized.json")

In [None]:
# Display the `generated_data` attribute of the GEPA-compiled program.
# Requires the `gepa.compile(...)` cell to have run to completion so `new_prog` exists.
new_prog.generated_data

In [None]:
# Display the compiled program object itself (defined by the `gepa.compile(...)` cell).
new_prog