In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [11]:
# ======== Load & Split Front-Desk Data (Simplified) ========

import json, random
from pathlib import Path

# Configuration
DATA_FILE = Path("frontdesk_agent_eval_textgrad.jsonl")
VAL_RATIO = 0.2
SEED      = 42

# 1) Read all nonblank lines as JSON
with DATA_FILE.open("r", encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]

# 2) Quick schema check
required = {"query", "gold", "ideal_content", "ideal_token"}
for i, r in enumerate(rows, start=1):
    assert required <= r.keys(), f"Row {i} missing one of {required}"

# 3) Shuffle & split
random.seed(SEED)
random.shuffle(rows)
split = int(len(rows) * (1 - VAL_RATIO))
train, val = rows[:split], rows[split:]

# 4) Summary
print(f"{len(rows)} total rows → {len(train)} train  +  {len(val)} val")

40 total rows → 32 train  +  8 val


In [16]:
import time, re, json, requests, textgrad as tg

# 1) pick GPT-4o as both critic and editor LLM
tg.set_backward_engine("gpt-4o", override=True)

# 2) wrapper around your n8n webhook (agent-node + code-node parsing step)
class N8NAgent(tg.BlackboxLLM):
    def __init__(self,
                 name: str = "gpt-4o",
                 webhook_url: str = "https://congliu.app.n8n.cloud/webhook/413ab7a1-30ae-4276-a5d0-8d5ecda4f7a3"):
        super().__init__(name)
        self.url = webhook_url

    def __call__(self, var: tg.Variable) -> tg.Variable:
        """Send {prompt, query} to the n8n workflow and return its final JSON."""
        payload = {
            "sessionId": str(time.time_ns()),
            "message":   var.role_description,   # <- user query
            "from_agent":"patient_agent",
            "system_prompt": var.value,          # <- current system prompt
            "fhir_server_url":""
        }
        resp = requests.post(self.url, json=payload, timeout=30)
        resp.raise_for_status()
        return tg.Variable(resp.text, role_description="parsed JSON", requires_grad=False)


# 3) full starting system prompt (trainable)
START_PROMPT = """
You are a front desk agent for a telemedicine clinic. In every interaction you must check the Redis memory for context.

If the patient’s question is strictly administrative (e.g., about billing, clinic policies, etc.), or if the issue is related to insurance (e.g., coverage, copays, claims), forward the entire message verbatim to the administrative agent. End your output with the token "<ADMIN>".

If the patient’s question requires a medical diagnosis or specialized medical knowledge, forward the entire message verbatim to the education agent. End your output with the token "<EDUCATION>".

If the patient is specifically requesting to book, reschedule, or cancel an appointment, or asking about available time slots, clinic hours, or scheduling logistics, you must forward the entire message verbatim to the scheduling agent. Do not attempt to process scheduling requests yourself. End your output with the token <SCHEDULING>.

By default, you must always end your reply with the token <PATIENT>, unless it is a referral to an agent.

If at any point the conversation should conclude, output the single token: "<STOP>".
""".strip()

prompt_var = tg.Variable(START_PROMPT, role_description="", requires_grad=True)
optimizer  = tg.TGD(parameters=[prompt_var])

# 4) helper to deterministically score routing correctness
def judge_json(reply_text: str, gold: dict) -> float:
    """
    Returns 1.0 if BOTH to_agent and end_conversation match gold,
    0.5 if only one matches, else 0.0
    """
    try:
        js = json.loads(reply_text)
    except json.JSONDecodeError:
        return 0.0
    score = 0
    score += js.get("to_agent") == gold["to_agent"]
    score += js.get("end_conversation") == gold["end_conversation"]
    return score / 2.0

# 5) make sure train / val are defined in the notebook BEFORE running this cell
assert 'train' in globals() and 'val' in globals(), "Define `train` and `val` lists first!"

agent       = N8NAgent()
best_prompt = prompt_var.value
best_acc    = 0.0
NUM_EPOCHS  = 3

# total_feedbacks accumulates the entirety of the feebacks obtained accross epochs
total_feedbacks= []

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\n====== Epoch {epoch} ======")
    optimizer.zero_grad()
    feedbacks = []

    # a) training pass
    for ex in train:
        prompt_var.role_description = ex["query"]
        reply_var  = agent(prompt_var)

        critic_inst = (
            f"Gold routing: {ex['gold']}\n"
            f"Ideal content: {ex['ideal_content']}\n"
            f"Ideal token: {ex['ideal_token']}\n\n"
            "NOTE: Routing tokens (<PATIENT>, <EDUCATION>, <SCHEDULING>, <STOP>) must appear in the raw LLM "
            "response, but are stripped by post-processing. DO NOT penalize the `output` text for missing tokens. "
            "Judge routing via `to_agent` and `end_conversation`, and judge content by semantic similarity to Ideal "
            "content.\n\n"
            "Evaluate the agent’s JSON reply {pred} and return exactly:\n"
            "score: <0-1 float>\n"
            "feedback: \"<ONE imperative sentence starting with 'Append: ' to improve the SYSTEM PROMPT. NO JSON>\""
        )

        loss_var = tg.TextLoss(critic_inst)(reply_var)
        fb = str(loss_var).split("feedback:", 1)[1].strip().strip('"')
        feedbacks.append(fb)
        loss_var.backward()   # accumulate textual gradient

    # b) apply accumulated gradients (rewrite prompt)
    try:
        optimizer.step()
    except IndexError:
        print("⚠️  editor malformed → rollback")
        prompt_var.value = best_prompt

    # rollback if accidental JSON crept in
    if re.search(r"\{.*\}", prompt_var.value):
        print("⚠️  prompt looks like JSON → rollback")
        prompt_var.value = best_prompt

    # c) validation
    correct = 0
    for ex in val:
        prompt_var.role_description = ex["query"]
        score = judge_json(agent(prompt_var).value, ex["gold"])
        if score == 1.0:
            correct += 1
    acc = correct / len(val)
    print("Validation accuracy:", f"{acc:.0%}")

    # d) accept or revert
    if acc > best_acc:
        best_acc, best_prompt = acc, prompt_var.value
        print("✅  kept new prompt")
    else:
        prompt_var.value = best_prompt
        print("↩  reverted")

    # e) show critic suggestions
    print("Critic feedback this epoch:")
    for fb in feedbacks:
        print(" •", fb)

    # Append feedbacks to total_feedbacks
    total_feedbacks.extend(feedbacks)


⚠️  editor malformed → rollback
Validation accuracy: 88%
✅  kept new prompt
Critic feedback this epoch:
 • Append: 'Provide a detailed recovery timeline for cataract surgery including key milestones.'
 • Append: Include the no-show fee policy in the response.
 • Append: Ensure the response includes the appropriate routing token for scheduling requests.
 • Append: 'Provide a concise explanation of Rayleigh scattering when asked.'
 • Append: Ensure the agent routes scheduling requests to the scheduling_agent.
 • Append: 'Route inquiries about specific medical tests to the education agent.'
 • Append: Ensure to route to the education agent after advising to call emergency services.
 • Append: Ensure the agent acknowledges the rescheduling request before routing.
 • Append: Ensure the response confirms the phone number update and routes to the administrative agent.
 • Append: Ensure the system prompt includes instructions to advise users to seek immediate medical help if necessary before 

In [17]:
total_feedbacks

["Append: 'Provide a detailed recovery timeline for cataract surgery including key milestones.'",
 'Append: Include the no-show fee policy in the response.',
 'Append: Ensure the response includes the appropriate routing token for scheduling requests.',
 "Append: 'Provide a concise explanation of Rayleigh scattering when asked.'",
 'Append: Ensure the agent routes scheduling requests to the scheduling_agent.',
 "Append: 'Route inquiries about specific medical tests to the education agent.'",
 'Append: Ensure to route to the education agent after advising to call emergency services.',
 'Append: Ensure the agent acknowledges the rescheduling request before routing.',
 'Append: Ensure the response confirms the phone number update and routes to the administrative agent.',
 'Append: Ensure the system prompt includes instructions to advise users to seek immediate medical help if necessary before routing to education.',
 'Append: Ensure the agent asks for clarification on which test results t

In [14]:
# ─── 7) Save optimized prompt ─────────────────────────────────────────────
with open("best_prompt.txt", "w") as f:
    f.write(best_prompt)

print(f"\nDone. Best validation accuracy: {best_acc:.0%}")
print("Optimized prompt:\n", best_prompt)



Done. Best validation accuracy: 100%
Optimized prompt:
 You are a front desk agent for a telemedicine clinic. In every interaction you must check the Redis memory for context.

If the patient’s question is strictly administrative (e.g., about billing, clinic policies, etc.), or if the issue is related to insurance (e.g., coverage, copays, claims), forward the entire message verbatim to the administrative agent. End your output with the token "<ADMIN>".

If the patient’s question requires a medical diagnosis or specialized medical knowledge, forward the entire message verbatim to the education agent. End your output with the token "<EDUCATION>".

If the patient is specifically requesting to book, reschedule, or cancel an appointment, or asking about available time slots, clinic hours, or scheduling logistics, you must forward the entire message verbatim to the scheduling agent. Do not attempt to process scheduling requests yourself. End your output with the token <SCHEDULING>.

By defa