## Load the benchmark and view one example from the benchmark

In [1]:
import getpass
import os

import dspy

# Read the key from the OPENAI_API_KEY environment variable when it is set so the
# notebook can run unattended; otherwise prompt with getpass so the secret is not
# echoed into the (saved) cell output the way input() would.
api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API key: ")

# Task LM used by every predictor in the program; registered as the DSPy default.
lm = dspy.LM("openai/gpt-4.1-nano", temperature=1, api_key=api_key)
dspy.configure(lm=lm)

In [4]:
import requests
import dspy
import json
import os
import random

def init_dataset(timeout: float = 30.0):
    """
    Download the facility-support benchmark and split it into train/val/test.

    Each record becomes a ``dspy.Example`` with two fields: ``message`` (taken
    from ``fields.input``) and ``answer`` (the record's ``answer`` value, kept
    as-is), with ``message`` marked as the only input.

    The examples are shuffled with a fixed seed (0) and then cut at 33% and 66%
    of the dataset length, so the split is reproducible across runs.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(train_set, val_set, test_set)`` tuple of lists of examples.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Load from the url
    url = "https://raw.githubusercontent.com/meta-llama/llama-prompt-ops/refs/heads/main/use-cases/facility-support-analyzer/dataset.json"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
    response.raise_for_status()
    dataset = json.loads(response.text)

    dspy_dataset = [
        dspy.Example({
            "message": d['fields']['input'],
            "answer": d['answer'],
        }).with_inputs("message")
        for d in dataset
    ]

    # Dedicated, seeded RNG: deterministic shuffle without touching global random state.
    random.Random(0).shuffle(dspy_dataset)

    # Compute the cut points once so the three slices cannot drift apart.
    n_total = len(dspy_dataset)
    train_end = int(n_total * 0.33)
    val_end = int(n_total * 0.66)

    train_set = dspy_dataset[:train_end]
    val_set = dspy_dataset[train_end:val_end]
    test_set = dspy_dataset[val_end:]

    return train_set, val_set, test_set

In [5]:
# Build the three splits once; later cells reuse these names
# (test_set for evaluation, train_set/val_set for optimization).
train_set, val_set, test_set = init_dataset()

# Bare last expression so the notebook displays the split sizes.
len(train_set), len(val_set), len(test_set)

(66, 66, 68)

Let's view an example task input along with its gold answer.

In [25]:
# Show the raw message of the first training example.
print("Input Message:")
print(train_set[0]['message'])

# The gold answer is stored as a JSON string; decode it and print one field per line.
print("\n\nGold Answer:")
for k, v in json.loads(train_set[0]['answer']).items():
    print(f"{k}: {v}")

Input Message:
Subject: Adjusting Bi-Weekly Cleaning Schedule for My Office

Dear ProCare Facility Solutions Support Team,

I hope this message finds you well. My name is Dr. Alex Turner, and I have been utilizing your services for my office space for the past year. I must say, your team's dedication to maintaining a pristine environment has been commendable and greatly appreciated.

I am reaching out to discuss the scheduling of our regular cleaning services. While I find the logistical challenges of coordinating these services intellectually stimulating, I believe we could optimize the current schedule to better suit the needs of my team and our workflow. Specifically, I would like to explore the possibility of adjusting our cleaning schedule to a bi-weekly arrangement, ideally on Tuesdays and Fridays, to ensure our workspace remains consistently clean without disrupting our research activities.

Previously, I have attempted to adjust the schedule through the online portal, but I enc

## Let's define a simple program to solve this task
The program is a three-module system: one module each for urgency, sentiment, and category classification.

In [27]:
from typing import List, Literal


# Signature for the urgency classifier: one input (`message`) and one output
# (`urgency`) restricted to the labels 'low', 'medium', or 'high'.
# NOTE(review): the class docstring is presumably consumed by DSPy as the prompt
# instruction, so treat it as runtime text and edit it deliberately.
class FacilitySupportAnalyzerUrgency(dspy.Signature):
    """
    Read the provided message and determine the urgency.
    """
    message: str = dspy.InputField()  # raw customer message text
    urgency: Literal['low', 'medium', 'high'] = dspy.OutputField()  # predicted urgency label

# Signature for the sentiment classifier: one input (`message`) and one output
# (`sentiment`) restricted to the labels 'positive', 'neutral', or 'negative'.
# NOTE(review): the class docstring is presumably consumed by DSPy as the prompt
# instruction, so treat it as runtime text and edit it deliberately.
class FacilitySupportAnalyzerSentiment(dspy.Signature):
    """
    Read the provided message and determine the sentiment.
    """
    message: str = dspy.InputField()  # raw customer message text
    sentiment: Literal['positive', 'neutral', 'negative'] = dspy.OutputField()  # predicted sentiment label

# Signature for the multi-label category classifier: one input (`message`) and one
# output (`categories`), a list drawn from the ten fixed category names below.
# NOTE(review): the class docstring is presumably consumed by DSPy as the prompt
# instruction, so treat it as runtime text and edit it deliberately.
class FacilitySupportAnalyzerCategories(dspy.Signature):
    """
    Read the provided message and determine the set of categories applicable to the message.
    """
    message: str = dspy.InputField()  # raw customer message text
    # Zero or more of the allowed category names; the scoring functions test membership in this list.
    categories: List[Literal["emergency_repair_services", "routine_maintenance_requests", "quality_and_safety_concerns", "specialized_cleaning_services", "general_inquiries", "sustainability_and_environmental_practices", "training_and_support_requests", "cleaning_services_scheduling", "customer_feedback_and_complaints", "facility_management_issues"]] = dspy.OutputField()

class FacilitySupportAnalyzerMM(dspy.Module):
    """
    Three-predictor program that analyzes a facility-support message.

    Each sub-module is an independent ``dspy.ChainOfThought`` over its own
    signature (urgency, sentiment, categories). The attribute names matter:
    the feedback metric addresses the predictors as
    ``urgency_module.predict``, ``sentiment_module.predict`` and
    ``categories_module.predict``.
    """

    def __init__(self):
        # Let dspy.Module set up its own bookkeeping before sub-modules are attached.
        super().__init__()
        self.urgency_module = dspy.ChainOfThought(FacilitySupportAnalyzerUrgency)
        self.sentiment_module = dspy.ChainOfThought(FacilitySupportAnalyzerSentiment)
        self.categories_module = dspy.ChainOfThought(FacilitySupportAnalyzerCategories)

    def forward(self, message: str):
        """
        Run the three classifiers on ``message`` and merge their outputs.

        Implemented as ``forward`` (rather than overriding ``__call__``) so that
        ``program(message=...)`` still goes through ``dspy.Module.__call__`` and
        whatever wrapping the base class performs around ``forward``.

        Args:
            message: Raw text of the customer message.

        Returns:
            A ``dspy.Prediction`` with ``urgency``, ``sentiment`` and ``categories``.
        """
        urgency = self.urgency_module(message=message)
        sentiment = self.sentiment_module(message=message)
        categories = self.categories_module(message=message)

        # Keep only the final labels; the intermediate reasoning stays with each sub-prediction.
        return dspy.Prediction(
            urgency=urgency.urgency,
            sentiment=sentiment.sentiment,
            categories=categories.categories
        )

# Unoptimized baseline instance; later cells evaluate it and hand it to the optimizer.
program = FacilitySupportAnalyzerMM()

In [33]:
def score_urgency(gold_urgency, pred_urgency):
    """
    Exact-match score for the urgency label.

    Returns 1.0 when the prediction equals the gold label, 0.0 otherwise.
    """
    if gold_urgency == pred_urgency:
        return 1.0
    return 0.0

def score_sentiment(gold_sentiment, pred_sentiment):
    """
    Exact-match score for the sentiment label.

    Returns 1.0 when the prediction equals the gold label, 0.0 otherwise.
    """
    if gold_sentiment == pred_sentiment:
        return 1.0
    return 0.0

def score_categories(gold_categories, pred_categories):
    """
    Per-category accuracy for the categories module.

    ``gold_categories`` maps each category name to a truthy/falsy flag and
    ``pred_categories`` is the collection of predicted names. A category counts
    as correct when its gold flag agrees with its presence in the prediction.
    The result is the fraction of gold categories that are correct.
    """
    agreements = sum(
        bool(is_gold) == (name in pred_categories)
        for name, is_gold in gold_categories.items()
    )
    return agreements / len(gold_categories)

def metric(example, pred, trace=None, pred_name=None, pred_trace=None):
    """
    Score a prediction against the gold labels of ``example``.

    The gold standard is the JSON string stored in ``example['answer']``. The
    urgency, sentiment and categories scores are computed separately and then
    averaged with equal weight. The ``trace``, ``pred_name`` and ``pred_trace``
    arguments are accepted but not used here.

    Returns the averaged score as a float.
    """
    gold = json.loads(example['answer'])

    # One score per module, in a fixed order: urgency, sentiment, categories.
    component_scores = (
        score_urgency(gold['urgency'], pred.urgency),
        score_sentiment(gold['sentiment'], pred.sentiment),
        score_categories(gold['categories'], pred.categories),
    )

    return sum(component_scores) / 3

## Define an evaluator and evaluate the base program

In [34]:
import dspy
# Evaluator over the held-out test split, scored with the plain (float-returning) metric.
evaluate = dspy.Evaluate(
    devset=test_set,
    metric=metric,
    num_threads=32,  # number of evaluation threads requested
    display_table=True,  # show the per-example results table in the cell output
    display_progress=True  # show the progress bar while evaluating
)

# Baseline: score the unoptimized program; the result is this cell's displayed value.
evaluate(program)

Average Metric: 51.30 / 68 (75.4%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 68/68 [00:00<00:00, 322.00it/s]

2025/08/12 18:09:18 INFO dspy.evaluate.evaluate: Average Metric: 51.3 / 68 (75.4%)





Unnamed: 0,message,answer,urgency,sentiment,categories,metric
0,"Hey ProCare Support Team, Hope you all are doing great! My name is...","{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,[sustainability_and_environmental_practices],✔️ [1.000]
1,"Hey ProCare Team, Hope you’re all doing well! My name’s Jake, and ...","{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,positive,"[routine_maintenance_requests, customer_feedback_and_complaints]",✔️ [0.967]
2,"Subject: Assistance Needed for HVAC Maintenance Hi [Receiver], I h...","{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,neutral,[routine_maintenance_requests],✔️ [1.000]
3,Subject: A Green Inquiry from a Bill Maher Enthusiast Hey ProCare ...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,[sustainability_and_environmental_practices],✔️ [1.000]
4,Subject: Inquiry on Sustainability Practices Dear ProCare Facility...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",medium,neutral,[sustainability_and_environmental_practices],✔️ [0.667]
...,...,...,...,...,...,...
63,Subject: Inquiry About Your Eco-Friendly Practices Dear ProCare Fa...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",medium,neutral,[sustainability_and_environmental_practices],✔️ [0.600]
64,Subject: Assistance Needed for Facility Management Issue Dear ProC...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",high,positive,[facility_management_issues],✔️ [0.667]
65,"Subject: Request for Training and Support Hi ProCare Support Team,...","{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,[training_and_support_requests],✔️ [1.000]
66,Subject: Concerns About Studio Maintenance and Rent Increase Dear ...,"{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,negative,"[routine_maintenance_requests, facility_management_issues]",✔️ [0.600]


EvaluationResult(score=75.44, results=<list of 68 results>)

## Define the feedback metric and load the GEPA optimizer

In [45]:
import json
import dspy

def feedback_urgency(gold_urgency, pred_urgency):
    """
    Build textual feedback and an exact-match score for the urgency module.

    Returns a ``(feedback, score)`` tuple where ``score`` is 1.0 on a match and
    0.0 otherwise, and ``feedback`` tells the predictor whether it was right
    and, if not, what the correct label is.
    """
    if gold_urgency == pred_urgency:
        return (
            f"You correctly classified the urgency of the message as `{gold_urgency}`. This message is indeed of `{gold_urgency}` urgency.",
            1.0,
        )
    return (
        f"You incorrectly classified the urgency of the message as `{pred_urgency}`. The correct urgency is `{gold_urgency}`. Think about how you could have reasoned to get the correct urgency label.",
        0.0,
    )

def feedback_sentiment(gold_sentiment, pred_sentiment):
    """
    Build textual feedback and an exact-match score for the sentiment module.

    Returns a ``(feedback, score)`` tuple where ``score`` is 1.0 on a match and
    0.0 otherwise, and ``feedback`` tells the predictor whether it was right
    and, if not, what the correct label is.
    """
    if gold_sentiment == pred_sentiment:
        return (
            f"You correctly classified the sentiment of the message as `{gold_sentiment}`. This message is indeed `{gold_sentiment}`.",
            1.0,
        )
    return (
        f"You incorrectly classified the sentiment of the message as `{pred_sentiment}`. The correct sentiment is `{gold_sentiment}`. Think about how you could have reasoned to get the correct sentiment label.",
        0.0,
    )

def feedback_categories(gold_categories, pred_categories):
    """
    Build textual feedback and a per-category accuracy for the categories module.

    Every gold category is sorted into one of four buckets depending on its
    gold flag and on whether it appears in ``pred_categories``. The score is
    the fraction of categories whose gold flag agrees with the prediction; the
    feedback lists what was right, what was wrongly added and what was missed.

    Returns a ``(feedback, score)`` tuple.
    """
    hits, false_alarms, misses, true_rejections = [], [], [], []
    for name, is_gold in gold_categories.items():
        was_predicted = name in pred_categories
        if is_gold:
            bucket = hits if was_predicted else misses
        else:
            bucket = false_alarms if was_predicted else true_rejections
        bucket.append(name)

    # Accuracy over all gold categories: correct inclusions plus correct exclusions.
    score = (len(hits) + len(true_rejections)) / len(gold_categories)

    if score == 1.0:
        return (
            f"The category classification is perfect. You correctly identified that the message falls under the following categories: `{hits!r}`.",
            score,
        )

    parts = [
        f"The category classification is not perfect. You correctly identified that the message falls under the following categories: `{hits!r}`.\n"
    ]
    if false_alarms:
        parts.append(
            f"However, you incorrectly identified that the message falls under the following categories: `{false_alarms!r}`. The message DOES NOT fall under these categories.\n"
        )
    if misses:
        # The connective depends on whether a "However" sentence was already emitted.
        lead_in = "Additionally, " if false_alarms else "However, "
        parts.append(
            f"{lead_in}you didn't identify the following categories that the message actually falls under: `{misses!r}`.\n"
        )
    parts.append("Think about how you could have reasoned to get the correct category labels.")
    return "".join(parts), score

def metric_with_feedback(example, pred, trace=None, pred_name=None, pred_trace=None):
    """
    Score a prediction and, when asked, attach feedback for one predictor.

    The gold standard is the JSON string in ``example['answer']``. Urgency,
    sentiment and categories are scored separately and averaged with equal
    weight.

    Args:
        example: Example whose ``answer`` field holds the gold labels as JSON.
        pred: Prediction exposing ``urgency``, ``sentiment`` and ``categories``.
        trace: Accepted for interface compatibility; not used here.
        pred_name: Name of the predictor feedback is requested for, or ``None``.
        pred_trace: Accepted for interface compatibility; not used here.

    Returns:
        The averaged score (float) when ``pred_name`` is ``None``; otherwise a
        ``dspy.Prediction`` with ``score`` and ``feedback``. For an unrecognized
        ``pred_name`` the feedback is the concatenation of all three module
        feedbacks rather than an error.
    """
    # Parse gold standard from example
    gold = json.loads(example['answer'])

    # Compute feedback and scores for all modules. Local names are deliberately
    # distinct from the module-level score_* functions to avoid shadowing them.
    fb_urgency, urgency_score = feedback_urgency(gold['urgency'], pred.urgency)
    fb_sentiment, sentiment_score = feedback_sentiment(gold['sentiment'], pred.sentiment)
    fb_categories, categories_score = feedback_categories(gold['categories'], pred.categories)

    # Overall score: average of the three accuracies
    total = (urgency_score + sentiment_score + categories_score) / 3

    if pred_name is None:
        return total

    feedback_by_predictor = {
        'urgency_module.predict': fb_urgency,
        'sentiment_module.predict': fb_sentiment,
        'categories_module.predict': fb_categories,
    }
    feedback = feedback_by_predictor.get(pred_name)
    if feedback is None:
        # Unknown predictor name: previously this path hit an unbound `feedback`
        # variable. Fall back to the combined feedback instead of crashing.
        feedback = "\n".join([fb_urgency, fb_sentiment, fb_categories])

    return dspy.Prediction(score=total, feedback=feedback)

In [48]:
# Import GEPA and define the optimizer

from dspy import GEPA

# GEPA optimizer driven by the feedback-returning metric defined above in this notebook.
optimizer = GEPA(
    metric=metric_with_feedback,
    auto="light",  # NOTE(review): presumably a preset optimization budget — confirm against the GEPA docs
    num_threads=32,
    track_stats=True,  # NOTE(review): presumably keeps optimization statistics on the result — confirm
    # Separate LM used for reflection, distinct from the gpt-4.1-nano task LM configured earlier.
    reflection_lm=dspy.LM(model="gpt-5", temperature=1.0, max_tokens=32000, api_key=api_key)
)

## Optimize the program with GEPA

In [49]:
# Optimize the baseline program using the train and validation splits;
# the test split is not passed to the optimizer.
optimized_program = optimizer.compile(
    program,
    trainset=train_set,
    valset=val_set,
)

2025/08/12 18:13:46 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 1643 metric calls of the program. This amounts to 12.45 full evals on the train+val set.
2025/08/12 18:13:46 INFO dspy.teleprompt.gepa.gepa: Using 66 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.


7


2025/08/12 18:13:48 INFO dspy.evaluate.evaluate: Average Metric: 47.56666666666666 / 66 (72.1%)
2025/08/12 18:13:48 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.7207070707070706
2025/08/12 18:13:48 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.7207070707070706


Average Metric: 2.27 / 3 (75.6%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 58.24it/s]

2025/08/12 18:13:48 INFO dspy.evaluate.evaluate: Average Metric: 2.2666666666666666 / 3 (75.6%)
2025/08/12 18:13:48 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for urgency_module.predict: Task: Determine the urgency of a customer message to ProCare Facility Solutions.

Context and domain:
- Messages are typically sent to ProCare Facility Solutions’ support team about facilities services (e.g., office/residential maintenance, cleaning, HVAC).
- Common topics include cleaning quality (especially in high-traffic areas), HVAC performance/safety, routine maintenance scheduling, and general inquiries (e.g., sustainability practices).

How to assess urgency:
Use these primary factors:
1) Safety and risk:
   - High/urgent if there’s an immediate safety hazard or potential harm (e.g., electrical sparks, gas smell, active water leak/flood, critical HVAC failure in extreme conditions, security breach).
   - Medium if safety is mentioned but described as minor or without signs o


Average Metric: 2.93 / 3 (97.8%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 48.97it/s]

2025/08/12 18:13:48 INFO dspy.evaluate.evaluate: Average Metric: 2.9333333333333336 / 3 (97.8%)





2025/08/12 18:14:20 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for sentiment_module.predict: You are given a single input:
- message: A professional email-style message, often addressed to ProCare Facility Solutions (facility management/maintenance services). Messages may discuss maintenance quality, safety, cleaning products, HVAC performance, minor household issues (e.g., a leaking faucet), exhibit/artifact preservation needs, or requests for follow-up service.

Your task:
- Determine the overall sentiment conveyed by the message and briefly explain your reasoning.

Key guidance for this domain:
- Many messages will be polite, professional, and solution-seeking, even when describing problems (e.g., concerns about cleaning residues affecting artifacts, inconsistent HVAC performance, or minor leaks). Such messages are typically neutral if they lack strong emotional language.
- Standard courtesies (greetings, “thank you,” “best regards”) do not make a message positiv

Average Metric: 2.90 / 3 (96.7%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 68.63it/s]

2025/08/12 18:14:22 INFO dspy.evaluate.evaluate: Average Metric: 2.9 / 3 (96.7%)





2025/08/12 18:14:46 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for categories_module.predict: You are classifying customer messages sent to ProCare Facility Solutions (a facilities/cleaning services provider). Your goal is to read a single message and assign all applicable categories from a fixed list. Use evidence from the message only; select all that apply; do not add categories that are not supported by the text.

Allowed categories and definitions:
- cleaning_services_scheduling
  - Use when the primary intent is to schedule, reschedule, adjust, or inquire about dates/times for cleaning services.
  - Includes: requests to change cleaning times, book a service, check availability, or align schedules.
  - Exclude when rescheduling is requested only as part of resolving a complaint about poor service (see rule below).

- specialized_cleaning_services
  - Use when the message mentions specific/specialized cleaning types or tasks, such as deep cleaning, carpet maint

Average Metric: 2.53 / 3 (84.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.09it/s]

2025/08/12 18:14:58 INFO dspy.evaluate.evaluate: Average Metric: 2.533333333333333 / 3 (84.4%)





2025/08/12 18:15:21 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for urgency_module.predict: Task: Read the provided message and determine the urgency.

Context/domain:
- Messages typically relate to facility management and services (e.g., facility operations, space utilization, security, sustainability, HVAC systems, maintenance, cleaning services) for a provider like ProCare Facility Solutions.
- Senders may be residential or commercial clients and may reference residents, tenants, property operations, or prior support interactions.

Output format:
- Provide exactly two fields, in this order, no extra text or formatting:
reasoning: <1–3 concise sentences explaining the key cues that determine urgency>
urgency: <one of: low | medium | high>

Urgency levels and decision rules:
- HIGH:
  - Clear or implied immediate risk to safety/security or major operational impact.
  - Explicit urgency signals (e.g., “Urgent,” “Immediate attention required,” “ASAP,” “critical,” “esc

Average Metric: 2.20 / 3 (73.3%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.13s/it]

2025/08/12 18:15:31 INFO dspy.evaluate.evaluate: Average Metric: 2.1999999999999997 / 3 (73.3%)





2025/08/12 18:15:59 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for sentiment_module.predict: Task
- Read the provided message text and classify its overall sentiment as one of: positive, neutral, or negative.

Input format
- You will receive one field:
  - message: A string that may include a Subject line and an email-style body.

Output format
- Output only a single lowercase label: positive, neutral, or negative.
- Do not include any additional text or reasoning.

Classification guidelines
- Focus on the overall emotional tone expressed about the service/interaction, not the message’s functional purpose (e.g., making a request) or formalities.
- If signals are mixed or weak, default to neutral.

Label definitions
- Positive:
  - The message clearly expresses satisfaction, praise, gratitude, or enthusiasm that goes beyond routine politeness.
  - Strong and/or multiple explicit positive cues dominate (e.g., “satisfied client,” “exceptional service,” “truly appreciat

Average Metric: 2.27 / 3 (75.6%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.07s/it]

2025/08/12 18:16:12 INFO dspy.evaluate.evaluate: Average Metric: 2.2666666666666666 / 3 (75.6%)





2025/08/12 18:17:16 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for categories_module.predict: You are classifying a single customer message sent to ProCare Facility Solutions (a facilities/cleaning services provider). Your job is to assign all and only the applicable categories from a fixed list, based strictly on the message content.

Allowed categories and definitions:
- cleaning_services_scheduling
  - Use only when the message’s primary purpose is to coordinate timing/availability for cleaning services (initial booking, rescheduling, adjusting times).
  - Typical signals: specific dates/times, availability windows, explicit reschedule/change requests, or back-and-forth around timing.
  - Do NOT use when a timing phrase appears merely as part of a broader service request or complaint (e.g., “please arrange a team to visit,” “at your earliest convenience,” “as soon as possible”) without concrete scheduling logistics.

- specialized_cleaning_services
  - Use when t

Average Metric: 1.53 / 3 (51.1%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 78.40it/s]

2025/08/12 18:17:25 INFO dspy.evaluate.evaluate: Average Metric: 1.5333333333333332 / 3 (51.1%)





2025/08/12 18:17:48 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for urgency_module.predict: Task: Read a single incoming message (typically from a client/prospect of a facility management/cleaning services company like ProCare Facility Solutions) and assign an urgency level. Provide a brief justification.

Output format:
- urgency: one of [low, medium, high]
- reasoning: 1–3 concise sentences citing the key cues that led to your classification

General approach:
1) Identify sender context: Are they a current client describing an in-progress service issue, or a prospect seeking information?
2) Look for explicit time sensitivity: deadlines, dates, “today/tomorrow,” events, imminent visits, or service disruptions.
3) Assess potential impact/severity:
   - Safety/health hazards, security risks, operational shutdowns, legal/compliance deadlines.
   - Active service quality mismatches that affect trust/contract fulfillment.
   - Routine information requests or preference c

Average Metric: 2.53 / 3 (84.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 69.37it/s]

2025/08/12 18:18:02 INFO dspy.evaluate.evaluate: Average Metric: 2.533333333333333 / 3 (84.4%)





2025/08/12 18:18:28 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for sentiment_module.predict: You are given a single user “message” and must determine the overall sentiment expressed toward the service provider.

Task
- Read the entire message and classify its sentiment as one of: positive, neutral, negative.
- Return two fields:
  - reasoning: a concise explanation (1–3 sentences) justifying your classification.
  - sentiment: one of exactly ["positive", "neutral", "negative"] in lowercase.

Input format
- You will receive an object with a single field:
  - message: a free-form email-like text, often addressed to “ProCare Support Team,” about facility management or home services (e.g., HVAC maintenance, cleaning protocols, safety).
- The message may include:
  - Subjects such as “Urgent Assistance Needed,” “Routine Maintenance,” or “Quality and Safety Concerns.”
  - Domain-specific terms: HVAC system, maintenance plan, cleaning staff, hazardous materials, inspection

Average Metric: 2.57 / 3 (85.6%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.85s/it]

2025/08/12 18:18:35 INFO dspy.evaluate.evaluate: Average Metric: 2.5666666666666664 / 3 (85.6%)





2025/08/12 18:18:57 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for sentiment_module.predict: Task
- Read the provided message text and classify its overall sentiment toward the service/interaction as one of: positive, neutral, or negative.

Input format
- You will receive one field:
  - message: A string that may include a Subject line and an email-style body.

Output format
- Output only a single lowercase label: positive, neutral, or negative.
- Do not include any additional text or reasoning.

Core principle
- Base the label on explicit emotional tone directed at the service/interaction (satisfaction vs dissatisfaction), not on the message’s purpose (e.g., making a request) or situational urgency. If signals are mixed or weak, default to neutral.

Label definitions
- Positive:
  - Clear, explicit satisfaction, praise, gratitude, or enthusiasm that goes beyond routine politeness.
  - Multiple and/or strong positive cues dominate (e.g., “exceptional service,” “tru

Average Metric: 2.27 / 3 (75.6%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.49s/it]

2025/08/12 18:19:10 INFO dspy.evaluate.evaluate: Average Metric: 2.2666666666666666 / 3 (75.6%)





2025/08/12 18:20:13 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for categories_module.predict: You are classifying a single customer message sent to ProCare Facility Solutions (a facilities/cleaning services provider). Your job is to assign all and only the applicable categories from a fixed list, based strictly on the message content.

Input format:
- A single message (may include a subject line and body) from a customer or interested party.

Task goals:
- Identify explicit intents present in the message, such as timing logistics, specific service requests, feedback/complaints, quality/safety issues, training/support needs, sustainability topics, and facility management topics.
- Assign every applicable category from the allowed list based on clear textual evidence.
- Do not infer beyond what is written.

Allowed categories and definitions:
- cleaning_services_scheduling
  - Use only when the message’s primary purpose is to coordinate timing/availability for cleani

Average Metric: 2.57 / 3 (85.6%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.50s/it]

2025/08/12 18:20:29 INFO dspy.evaluate.evaluate: Average Metric: 2.5666666666666664 / 3 (85.6%)





2025/08/12 18:22:07 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for urgency_module.predict: Task
- Read the provided message and determine its urgency.

Domain/context
- Messages are about facility management/services for a provider like ProCare Facility Solutions (e.g., facility operations, space utilization, security, sustainability practices/waste management, HVAC, plumbing, electrical, maintenance/repairs, cleaning/janitorial).
- Senders may be residential or commercial clients and may mention residents/tenants, property operations, business continuity, or prior support interactions.

Output format (strict)
- Provide exactly two fields, in this order, with no extra text or formatting:
reasoning: <1–3 concise sentences explaining the key cues that determine urgency>
urgency: <one of: low | medium | high>

How to determine urgency
- HIGH:
  - Clear or implied immediate risk to safety/security or major operational impact.
  - Explicit urgency language: “Urgent,” “A

Average Metric: 2.27 / 3 (75.6%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.63s/it]

2025/08/12 18:22:13 INFO dspy.evaluate.evaluate: Average Metric: 2.2666666666666666 / 3 (75.6%)





2025/08/12 18:24:57 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Proposed new text for categories_module.predict: You are classifying a single inbound customer message sent to ProCare Facility Solutions (a facilities/cleaning services provider). Assign all and only the applicable categories from the fixed list, strictly based on explicit content in the message.

Goal
- Read the message once to identify explicit intents: timing logistics, service-type specificity, feedback/complaints, quality/safety concerns, sustainability/eco topics, general information requests, and non-cleaning maintenance requests.
- Map each intent to categories using the definitions and rules below.
- Output concise reasoning and the selected categories as a JSON array.

Allowed categories and definitions
- cleaning_services_scheduling
  - Use only when the message’s primary purpose is to coordinate timing/availability for cleaning services (initial booking, rescheduling, adjusting times).
  - Typical signals: s

Average Metric: 2.67 / 3 (88.9%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.02it/s]

2025/08/12 18:25:02 INFO dspy.evaluate.evaluate: Average Metric: 2.6666666666666665 / 3 (88.9%)





2025/08/12 18:25:51 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Proposed new text for categories_module.predict: You are classifying a single customer message sent to ProCare Facility Solutions (a facilities/cleaning services provider). Your goal is to assign every applicable category from a fixed list based solely on evidence in the message. Select all that apply; do not add any category that is not clearly supported by the text.

Allowed categories and precise definitions:
- cleaning_services_scheduling
  - Use when the primary intent is to schedule, reschedule, adjust, or inquire about dates/times for cleaning services.
  - Includes: requests to change cleaning times, book a cleaning, check availability for cleaning, or align cleaning schedules.
  - Exclude when timing is mentioned only as part of resolving a complaint about poor cleaning service (see rule below).
  - Exclude for training/support scheduling (see training_and_support_requests).

- specialized_cleaning_services
  - 

Average Metric: 2.30 / 3 (76.7%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.87s/it]

2025/08/12 18:25:59 INFO dspy.evaluate.evaluate: Average Metric: 2.3 / 3 (76.7%)





2025/08/12 18:26:32 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Proposed new text for urgency_module.predict: You are classifying the urgency of incoming messages related to facility management/services for a provider like ProCare Facility Solutions. Messages may come from residential or commercial clients and can reference residents/tenants, property operations, or prior support.

Your output must be exactly two lines, in this order, with no extra text, headers, or formatting:
reasoning: <1–3 concise sentences explaining the key cues that determine urgency>
urgency: <one of: low | medium | high>

Decision rules:

HIGH urgency when any of the following is present:
- Clear or implied immediate risk to safety/security or major operational impact.
- Explicit urgency signals (e.g., “Urgent,” “Immediate,” “ASAP,” “critical,” “emergency,” “escalating”).
- Severe dissatisfaction coupled with a demand for immediate corrective action or evidence of repeated failed support and escalation.
- Tr

Average Metric: 2.90 / 3 (96.7%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.68s/it]

2025/08/12 18:26:43 INFO dspy.evaluate.evaluate: Average Metric: 2.9 / 3 (96.7%)





2025/08/12 18:27:31 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Proposed new text for sentiment_module.predict: Task
- Read the provided message text and classify its overall sentiment toward the service/interaction as one of: positive, neutral, or negative.

Input format
- You will receive one field:
  - message: A string that may include a Subject line and an email-style body.

Output format
- Output only a single lowercase label: positive, neutral, or negative.
- Do not include any additional text, punctuation, or reasoning.

Core principle
- Base the label solely on explicit emotional tone directed at the service/interaction (satisfaction vs dissatisfaction), not on the message’s purpose (e.g., making a request) or situational urgency.
- If signals are mixed or weak, default to neutral.

Label definitions
- Positive:
  - Clear, explicit satisfaction, praise, gratitude, or enthusiasm directed at the service/interaction that goes beyond routine politeness.
  - Multiple and/or stron

Average Metric: 2.57 / 3 (85.6%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.80s/it]

2025/08/12 18:27:38 INFO dspy.evaluate.evaluate: Average Metric: 2.5666666666666664 / 3 (85.6%)





2025/08/12 18:28:26 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Proposed new text for categories_module.predict: You are classifying a single customer message sent to ProCare Facility Solutions (a facilities/cleaning services provider). Assign all and only the applicable categories from the fixed list below, based strictly on what the message explicitly says.

Allowed categories and definitions:
- cleaning_services_scheduling
  - Use only when the message’s primary purpose is to coordinate timing/availability for cleaning services (initial booking, rescheduling, adjusting times).
  - Typical signals: specific dates/times, availability windows, explicit reschedule/change requests, or back-and-forth around timing.
  - Do NOT use when timing words appear merely as part of a broader service request or complaint (e.g., “please arrange a team,” “at your earliest convenience,” “ASAP”) without concrete scheduling logistics.

- specialized_cleaning_services
  - Use when the message mentions s

Average Metric: 2.23 / 3 (74.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.67s/it]

2025/08/12 18:28:33 INFO dspy.evaluate.evaluate: Average Metric: 2.2333333333333334 / 3 (74.4%)





2025/08/12 18:28:59 INFO dspy.teleprompt.gepa.gepa: Iteration 21: Proposed new text for urgency_module.predict: Task
- Read the provided message (subject and body) and determine the urgency of the sender’s request.
- Domain context: Facility management/services for a provider like ProCare Facility Solutions, including facility operations, space utilization, security, sustainability, HVAC systems, maintenance/repairs, cleaning services, and related support for residential and commercial properties (residents/tenants/business operations).

Output format
- Return exactly two lines in this order, no extra text, headings, or formatting:
  reasoning: <1–3 concise sentences explaining the key cues that determine urgency>
  urgency: <one of exactly: low | medium | high>

Decision rules

HIGH urgency when any of the following are present:
- Clear or implied immediate safety/security risk or major operational impact.
- Explicit urgency signals: “Urgent,” “Immediate,” “ASAP,” “critical,” “escalat

Average Metric: 1.93 / 3 (64.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.41it/s]

2025/08/12 18:29:01 INFO dspy.evaluate.evaluate: Average Metric: 1.9333333333333331 / 3 (64.4%)





2025/08/12 18:29:45 INFO dspy.teleprompt.gepa.gepa: Iteration 22: Proposed new text for sentiment_module.predict: You are given a single input field:
- message: A complete message (often an email with subject, body, and sign-off) related to facility management topics (e.g., maintenance issues, service delays, training inquiries, sustainability, etc.).

Task:
Determine the overall sentiment expressed by the author toward the recipient/company, not the sentiment about the underlying situation or problem itself.

Label set (use lowercase exactly):
- positive
- neutral
- negative

Decision rules:
1) Positive:
   - The author clearly expresses praise, gratitude, confidence, trust, or appreciation toward the recipient/company (e.g., “loyal customer,” “appreciated the exceptional service,” “confident your intervention will help”).
   - Positive language can outweigh the presence of an issue or urgent request if the overall attitude toward the recipient remains appreciative and trusting.

2) N

Average Metric: 1.93 / 3 (64.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.00s/it]

2025/08/12 18:29:55 INFO dspy.evaluate.evaluate: Average Metric: 1.9333333333333333 / 3 (64.4%)





2025/08/12 18:31:13 INFO dspy.teleprompt.gepa.gepa: Iteration 24: Proposed new text for urgency_module.predict: Task: Read the provided message and determine the urgency.

Context/domain:
- Messages relate to facility management and services (e.g., facility operations, space utilization, security, sustainability, HVAC systems, maintenance, cleaning services) for a provider like ProCare Facility Solutions.
- Senders may be residential or commercial clients and may reference residents, tenants, property operations, or prior support interactions.

Output format (strict):
- Provide exactly two fields in this order, no extra text or formatting:
reasoning: <1–3 concise sentences explaining the key cues that determine urgency>
urgency: <one of: low | medium | high>

Decision rules for urgency:
- HIGH:
  - Clear or implied immediate risk to safety/security or major operational impact.
  - Explicit urgency signals (e.g., “Urgent,” “Immediate attention required,” “ASAP,” “critical,” “escalating”

Average Metric: 2.53 / 3 (84.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.80s/it]

2025/08/12 18:31:27 INFO dspy.evaluate.evaluate: Average Metric: 2.533333333333333 / 3 (84.4%)





2025/08/12 18:32:53 INFO dspy.teleprompt.gepa.gepa: Iteration 26: Proposed new text for categories_module.predict: You are classifying a single customer message sent to ProCare Facility Solutions (a facilities and cleaning services provider). Assign all and only the applicable categories from a fixed list, based strictly on the message content.

Allowed categories and definitions:
- cleaning_services_scheduling
  - Use only when the message’s primary purpose is to coordinate timing/availability for cleaning services (initial booking, rescheduling, adjusting times).
  - Typical signals: specific dates/times, availability windows, explicit reschedule/change requests, or clear back-and-forth around timing.
  - Do NOT use when timing is mentioned only vaguely (e.g., “ASAP,” “how quickly can we schedule?”) or when timing is incidental to another primary ask without concrete logistics.

- specialized_cleaning_services
  - Use when the message mentions specific/specialized cleaning types or t

Average Metric: 2.23 / 3 (74.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:07<00:00,  2.37s/it]

2025/08/12 18:33:11 INFO dspy.evaluate.evaluate: Average Metric: 2.2333333333333334 / 3 (74.4%)





2025/08/12 18:36:08 INFO dspy.teleprompt.gepa.gepa: Iteration 28: Proposed new text for categories_module.predict: You are classifying a single incoming customer message for ProCare Facility Solutions (a facilities/cleaning services provider). Assign all and only the applicable categories from a fixed list, strictly based on explicit text in the message.

Input format:
- A single message (may include a subject line and body) from a customer or interested party.

Task goals:
- Identify explicit intents present in the message:
  - timing logistics
  - specialized cleaning needs or product/practice requests
  - feedback/complaints
  - quality/safety concerns
  - general service questions
  - training/support needs
  - facility management topics (beyond cleaning)
  - sustainability/environmental topics
  - routine/preventive maintenance requests (e.g., HVAC, plumbing, electrical)

Allowed categories and precise definitions:
- cleaning_services_scheduling
  - Use only when the primary focus

Average Metric: 2.23 / 3 (74.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 61.57it/s]

2025/08/12 18:36:20 INFO dspy.evaluate.evaluate: Average Metric: 2.2333333333333334 / 3 (74.4%)





2025/08/12 18:37:06 INFO dspy.teleprompt.gepa.gepa: Iteration 30: Proposed new text for categories_module.predict: Task: Categorize a single message by assigning all applicable labels from a predefined category set. Multiple categories may apply.

Input format:
- You will receive a single field:
  - message: a free-form email-like text that may include Subject and Body.

Output format:
- Return two keys only:
  - reasoning: 1–3 concise sentences explaining why each category was chosen.
  - categories: an array of category strings. Use exact labels from the list below. Include all that apply, none that do not. Output in alphabetical order, no duplicates.

Allowed categories and how to recognize them:
- cleaning_services_scheduling
  - The sender is asking to schedule, reschedule, or set a time for cleaning services.
  - Cues: “schedule,” “set up a time,” “next week,” “book a cleaning,” “we’re flexible on the day/time.”
- specialized_cleaning_services
  - The cleaning request is for a sp

Average Metric: 2.57 / 3 (85.6%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.15s/it]

2025/08/12 18:37:18 INFO dspy.evaluate.evaluate: Average Metric: 2.5666666666666664 / 3 (85.6%)





2025/08/12 18:39:01 INFO dspy.teleprompt.gepa.gepa: Iteration 31: Proposed new text for categories_module.predict: Task: Classify a given message from the facilities/services domain into all applicable categories.

Input format:
- You will be given a single field named "message" containing an email-style message (often addressed to ProCare Facility Solutions / ProCare Support Team) describing a situation, request, concern, or inquiry.

Output format:
- Return ONLY a list of category labels (strings). Include every category that applies (multi-label). If none apply, return an empty list.
- Do not include explanations or extra text in the output.

Category taxonomy and decision rules:
- general_inquiries
  - Use when the sender asks broad questions about services, capabilities, documentation, case studies, methodologies, pricing, scope, or credibility.
  - Example triggers: requests for technical documentation, case studies, “what do you offer,” “how do you do X,” skepticism about claims

Average Metric: 1.97 / 3 (65.6%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.52s/it]

2025/08/12 18:39:16 INFO dspy.evaluate.evaluate: Average Metric: 1.9666666666666666 / 3 (65.6%)





2025/08/12 18:39:51 INFO dspy.teleprompt.gepa.gepa: Iteration 33: Proposed new text for urgency_module.predict: Task: Read the provided message and determine the urgency.

Input format:
- You will receive a single “message” text (may include a subject line and body). Analyze the content for urgency cues.

Context/domain:
- Messages typically relate to facility management and services for a provider like ProCare Facility Solutions.
- Common topics: facility operations, space utilization, security, sustainability, HVAC systems, maintenance, and cleaning services.
- Senders may be residential or commercial clients and may reference residents, tenants, property operations, or prior support interactions.

Output format:
- Provide exactly two fields, in this order, no extra text or formatting:
reasoning: <1–3 concise sentences explaining the key cues that determine urgency>
urgency: <one of: low | medium | high>

Decision rules for urgency:
- HIGH:
  - Clear or implied immediate risk to safe

Average Metric: 2.53 / 3 (84.4%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.98s/it]

2025/08/12 18:39:58 INFO dspy.evaluate.evaluate: Average Metric: 2.533333333333333 / 3 (84.4%)





2025/08/12 18:40:28 INFO dspy.teleprompt.gepa.gepa: Iteration 34: Proposed new text for sentiment_module.predict: Task
- Read the provided message text and classify its overall sentiment as one of: positive, neutral, or negative.

Input format
- You will receive one field:
  - message: A string that may include a Subject line and an email-style body.

Output format
- Output only a single lowercase label: positive, neutral, or negative.
- Do not include any additional text, punctuation, or reasoning.

What to evaluate
- Focus on the overall emotional tone expressed about the service/interaction (e.g., satisfaction vs. dissatisfaction), not the message’s functional purpose (e.g., making a request), logistics, or formalities.

General decision rule
1) If the message is primarily praise/thanks expressing clear satisfaction or enthusiasm about the service/interaction, label positive.
2) Else if it expresses clear dissatisfaction, complaints, frustration, anger, fear, or disappointment about

## Now, let's evaluate the optimized program

In [51]:
# Score the GEPA-optimized program. `evaluate` is defined in an earlier cell
# (outside this section); the call is left as the cell's last expression so
# that its return value is displayed below the progress output.
evaluate(optimized_program)

Average Metric: 58.77 / 68 (86.4%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 68/68 [00:00<00:00, 527.68it/s]

2025/08/12 18:42:31 INFO dspy.evaluate.evaluate: Average Metric: 58.766666666666666 / 68 (86.4%)





Unnamed: 0,message,answer,urgency,sentiment,categories,metric
0,"Hey ProCare Support Team, Hope you all are doing great! My name is...","{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,[sustainability_and_environmental_practices],✔️ [1.000]
1,"Hey ProCare Team, Hope you’re all doing well! My name’s Jake, and ...","{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,positive,[quality_and_safety_concerns],✔️ [0.933]
2,"Subject: Assistance Needed for HVAC Maintenance Hi [Receiver], I h...","{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,neutral,"[routine_maintenance_requests, cleaning_services_scheduling]",✔️ [0.967]
3,Subject: A Green Inquiry from a Bill Maher Enthusiast Hey ProCare ...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,positive,[sustainability_and_environmental_practices],✔️ [1.000]
4,Subject: Inquiry on Sustainability Practices Dear ProCare Facility...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,neutral,[general_inquiries],✔️ [0.933]
...,...,...,...,...,...,...
63,Subject: Inquiry About Your Eco-Friendly Practices Dear ProCare Fa...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,neutral,"[general_inquiries, sustainability_and_environmental_practices]",✔️ [0.967]
64,Subject: Assistance Needed for Facility Management Issue Dear ProC...,"{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",medium,neutral,[facility_management_issues],✔️ [0.667]
65,"Subject: Request for Training and Support Hi ProCare Support Team,...","{""categories"": {""routine_maintenance_requests"": false, ""customer_f...",low,neutral,[training_and_support_requests],✔️ [0.667]
66,Subject: Concerns About Studio Maintenance and Rent Increase Dear ...,"{""categories"": {""routine_maintenance_requests"": true, ""customer_fe...",medium,neutral,"[facility_management_issues, quality_and_safety_concerns]",✔️ [0.933]


EvaluationResult(score=86.42, results=<list of 68 results>)

GEPA was able to optimize the base program **from a score of 74.41% to 81.27%** in just 11 iterations. With a higher budget, the optimized program's score can go as high as **89.41%**. Note that the run shown above scored **86.42%** on the 68 evaluated examples.

### Let's print the prompts that GEPA discovered

In [52]:
# Walk every (name, predictor) pair exposed by the optimized program and dump
# the instruction text stored on each predictor's signature, framed by
# separator lines so the individual prompts are easy to tell apart.
for name, pred in optimized_program.named_predictors():
    print(
        "================================",
        f"Predictor: {name}",
        "================================",
        "Prompt:",
        pred.signature.instructions,
        "*********************************",
        sep="\n",  # one item per line, matching a sequence of separate print() calls
    )

Predictor: urgency_module.predict
Prompt:
Task: Read the provided message and determine the urgency.

Context/domain:
- Messages typically relate to facility management and services (e.g., facility operations, space utilization, security, sustainability, HVAC systems, maintenance, cleaning services) for a provider like ProCare Facility Solutions.
- Senders may be residential or commercial clients and may reference residents, tenants, property operations, or prior support interactions.

Output format:
- Provide exactly two fields, in this order, no extra text or formatting:
reasoning: <1–3 concise sentences explaining the key cues that determine urgency>
urgency: <one of: low | medium | high>

Urgency levels and decision rules:
- HIGH:
  - Clear or implied immediate risk to safety/security or major operational impact.
  - Explicit urgency signals (e.g., “Urgent,” “Immediate attention required,” “ASAP,” “critical,” “escalating”).
  - Severe dissatisfaction with demand for immediate corre