From dba82b4aad159c3997e0bff4f8538e357eda2581 Mon Sep 17 00:00:00 2001
From: Daniel Miller
Date: Mon, 3 Nov 2025 18:11:20 -0500
Subject: [PATCH] Add better tracing for sync_provider

---
 pyproject.toml                              |   6 +-
 requirements-dev.lock                       |  40 +++---
 requirements.lock                           |  39 +++---
 .../adk/providers/_modules/sync_provider.py | 126 +++++++++++++++---
 4 files changed, 148 insertions(+), 63 deletions(-)
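
Notes for reviewers (placed above the first "diff --git" line, so git am and
git apply ignore them):

* _serialize_item() prefers model_dump() and only falls back to scanning
  public attributes when the item is not a Pydantic model. A rough sketch of
  that fallback path -- FakeItem is a made-up stand-in, not an SDK type:

      class FakeItem:
          type = "message"
          status = "completed"

          def render(self):  # callables are skipped by the attribute scan
              return "hi"

      print(_serialize_item(FakeItem()))
      # -> {'status': 'completed', 'type': 'message'}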
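
* stream_response() now pairs an explicit trace.start_span() with
  trace.end_span() in a finally block instead of wrapping the generator body
  in async-with. The span has to stay open across every yield, and the
  finally guarantees end_span() runs even when the consumer abandons the
  stream early (GeneratorExit propagates through the yield and still triggers
  it). A minimal sketch of the pattern, assuming only the tracer calls this
  diff already uses and that span.output may be assigned before end_span():

      async def traced_stream(trace, events):
          span = await trace.start_span(name="run_agent_streamed", input={})
          try:
              async for event in events:
                  yield event
              # Reached only when the consumer drains the stream, so an
              # abandoned run ends the span without structured output.
              span.output = {"new_items": [], "final_output": None}
          finally:
              await trace.end_span(span)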
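
* After this change, get_response() and stream_response() record the same
  span.output shape: the serialized output items plus the extracted assistant
  text. An illustrative payload (values invented; the content-part layout
  only needs to satisfy the 'text' in content_part membership check):

      span.output = {
          "new_items": [
              {
                  "type": "message",
                  "role": "assistant",
                  "content": [{"type": "output_text", "text": "Hello!"}],
              }
          ],
          "final_output": "Hello!",
      }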
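
* On the replaced event check: at this layer the wrapped model yields raw
  Responses-API stream events with dotted types such as response.created and
  response.output_item.done; as far as I can tell, 'raw_response_event' is
  the wrapper event type emitted one layer up by Runner.run_streamed(), so
  the old condition never matched here. The new code keys off completed
  output items instead (handle_item below is a hypothetical callback, not
  part of this patch):

      async for event in stream_response:
          if getattr(event, "type", None) == "response.output_item.done":
              handle_item(event.item)  # a completed message, tool call, etc.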

diff --git a/pyproject.toml b/pyproject.toml
index ab86a5fd..8f9ca01d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ dependencies = [
     "pyyaml>=6.0.2,<7",
     "jsonschema>=4.23.0,<5",
     "jsonref>=1.1.0,<2",
-    "temporalio>=1.10.0,<2",
+    "temporalio>=1.18.2,<2",
     "aiohttp>=3.10.10,<4",
     "redis>=5.2.0,<6",
     "litellm>=1.66.0,<2",
@@ -32,7 +32,7 @@ dependencies = [
     "jinja2>=3.1.3,<4",
     "mcp[cli]>=1.4.1",
     "scale-gp>=0.1.0a59",
-    "openai-agents==0.2.7", # 0.2.3 bug - https://github.com/openai/openai-agents-python/issues/1276
+    "openai-agents==0.4.2",
     "tzlocal>=5.3.1",
     "tzdata>=2025.2",
     "pytest>=8.4.0",
@@ -40,7 +40,7 @@ dependencies = [
     "pytest-asyncio>=1.0.0",
     "scale-gp-beta==0.1.0a20",
     "ipykernel>=6.29.5",
-    "openai==1.99.9", # anything higher than 1.99.9 breaks litellm - https://github.com/BerriAI/litellm/issues/13711
+    "openai>=2.2,<3", # Required by openai-agents 0.4.2; litellm now supports openai 2.x (issue #13711 resolved: https://github.com/BerriAI/litellm/issues/13711)
     "cloudpickle>=3.1.1",
     "datadog>=0.52.1",
     "ddtrace>=3.13.0"
diff --git a/requirements-dev.lock b/requirements-dev.lock
index a282ef7f..31ac0a65 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -113,9 +113,9 @@ httpx==0.27.2
     # via mcp
     # via openai
     # via respx
-httpx-aiohttp==0.1.9
     # via scale-gp
     # via scale-gp-beta
+httpx-aiohttp==0.1.9
     # via agentex-sdk
 httpx-sse==0.4.1
     # via mcp
@@ -191,11 +191,11 @@ nox==2023.4.22
 oauthlib==3.3.1
     # via kubernetes
     # via requests-oauthlib
-openai==1.99.9
+openai==2.7.1
     # via agentex-sdk
     # via litellm
     # via openai-agents
-openai-agents==0.2.7
+openai-agents==0.4.2
     # via agentex-sdk
 opentelemetry-api==1.37.0
     # via ddtrace
@@ -219,20 +219,6 @@ prompt-toolkit==3.0.51
 propcache==0.3.1
     # via aiohttp
     # via yarl
-pydantic==2.11.9
-    # via agentex-sdk
-    # via agentex-sdk
-    # via fastapi
-    # via litellm
-    # via mcp
-    # via openai
-    # via openai-agents
-    # via pydantic-settings
-    # via python-on-whales
-    # via scale-gp
-    # via scale-gp-beta
-pydantic-core==2.33.2
-    # via pydantic
 protobuf==5.29.5
     # via ddtrace
     # via temporalio
@@ -247,6 +233,19 @@ pyasn1==0.6.1
     # via rsa
 pyasn1-modules==0.4.2
     # via google-auth
+pydantic==2.11.9
+    # via agentex-sdk
+    # via fastapi
+    # via litellm
+    # via mcp
+    # via openai
+    # via openai-agents
+    # via pydantic-settings
+    # via python-on-whales
+    # via scale-gp
+    # via scale-gp-beta
+pydantic-core==2.33.2
+    # via pydantic
 pydantic-settings==2.10.1
     # via mcp
 pygments==2.18.0
@@ -340,7 +339,7 @@ stack-data==0.6.3
 starlette==0.46.2
     # via fastapi
     # via mcp
-temporalio==1.15.0
+temporalio==1.18.2
     # via agentex-sdk
 tiktoken==0.11.0
     # via litellm
@@ -383,9 +382,6 @@ typing-extensions==4.12.2
     # via pydantic
     # via pydantic-core
     # via pyright
-    # via typing-inspection
-typing-inspection==0.4.1
-    # via pydantic
     # via python-on-whales
     # via referencing
     # via scale-gp
@@ -393,6 +389,8 @@
     # via temporalio
     # via typer
     # via typing-inspection
+typing-inspection==0.4.1
+    # via pydantic
     # via pydantic-settings
 tzdata==2025.2
     # via agentex-sdk
diff --git a/requirements.lock b/requirements.lock
index 81d91576..b5f77614 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -99,12 +99,12 @@ httpcore==1.0.9
 httpx==0.27.2
     # via agentex-sdk
     # via httpx-aiohttp
-httpx-aiohttp==0.1.9
     # via litellm
     # via mcp
     # via openai
     # via scale-gp
     # via scale-gp-beta
+httpx-aiohttp==0.1.9
     # via agentex-sdk
 httpx-sse==0.4.1
     # via mcp
@@ -174,11 +174,11 @@ nexus-rpc==1.1.0
 oauthlib==3.3.1
     # via kubernetes
     # via requests-oauthlib
-openai==1.99.9
+openai==2.7.1
     # via agentex-sdk
     # via litellm
     # via openai-agents
-openai-agents==0.2.7
+openai-agents==0.4.2
     # via agentex-sdk
 opentelemetry-api==1.37.0
     # via ddtrace
@@ -200,19 +200,6 @@ prompt-toolkit==3.0.51
 propcache==0.3.1
     # via aiohttp
     # via yarl
-pydantic==2.11.9
-    # via agentex-sdk
-    # via fastapi
-    # via litellm
-    # via mcp
-    # via openai
-    # via openai-agents
-    # via pydantic-settings
-    # via python-on-whales
-    # via scale-gp
-    # via scale-gp-beta
-pydantic-core==2.33.2
-    # via pydantic
 protobuf==5.29.5
     # via ddtrace
     # via temporalio
@@ -227,6 +214,19 @@ pyasn1==0.6.1
     # via rsa
 pyasn1-modules==0.4.2
     # via google-auth
+pydantic==2.11.9
+    # via agentex-sdk
+    # via fastapi
+    # via litellm
+    # via mcp
+    # via openai
+    # via openai-agents
+    # via pydantic-settings
+    # via python-on-whales
+    # via scale-gp
+    # via scale-gp-beta
+pydantic-core==2.33.2
+    # via pydantic
 pydantic-settings==2.10.1
     # via mcp
 pygments==2.19.2
@@ -311,7 +311,7 @@ stack-data==0.6.3
 starlette==0.46.2
     # via fastapi
     # via mcp
-temporalio==1.15.0
+temporalio==1.18.2
     # via agentex-sdk
 tiktoken==0.11.0
     # via litellm
@@ -351,9 +351,6 @@ typing-extensions==4.12.2
     # via opentelemetry-api
     # via pydantic
     # via pydantic-core
-    # via typing-inspection
-typing-inspection==0.4.1
-    # via pydantic
     # via python-on-whales
     # via referencing
     # via scale-gp
@@ -361,6 +358,8 @@
     # via temporalio
     # via typer
     # via typing-inspection
+typing-inspection==0.4.1
+    # via pydantic
     # via pydantic-settings
 tzdata==2025.2
     # via agentex-sdk
diff --git a/src/agentex/lib/adk/providers/_modules/sync_provider.py b/src/agentex/lib/adk/providers/_modules/sync_provider.py
index c22c8cdd..6fac54b5 100644
--- a/src/agentex/lib/adk/providers/_modules/sync_provider.py
+++ b/src/agentex/lib/adk/providers/_modules/sync_provider.py
@@ -28,6 +28,7 @@
 from agents.models.openai_provider import OpenAIProvider
 
 from agentex import AsyncAgentex
+from agentex.lib.utils.logging import make_logger
 from agentex.lib.core.tracing.tracer import AsyncTracer
 from agentex.types.task_message_delta import TextDelta
 from agentex.types.task_message_update import (
@@ -40,6 +41,44 @@
 from agentex.types.tool_request_content import ToolRequestContent
 from agentex.types.tool_response_content import ToolResponseContent
 
+logger = make_logger(__name__)
+
+
+def _serialize_item(item: Any) -> dict[str, Any]:
+    """
+    Universal serializer for any item type from the OpenAI Agents SDK.
+
+    Uses model_dump() for Pydantic models, otherwise extracts attributes manually.
+    Filters out internal Pydantic fields that can't be serialized.
+    """
+    if hasattr(item, 'model_dump'):
+        # Pydantic model - use model_dump for proper serialization
+        try:
+            return item.model_dump(mode='json', exclude_unset=True)
+        except Exception:
+            # Fallback to dict conversion
+            return dict(item) if hasattr(item, '__iter__') else {}
+    else:
+        # Not a Pydantic model - extract attributes manually
+        item_dict = {}
+        for attr_name in dir(item):
+            if not attr_name.startswith('_') and attr_name not in ('model_fields', 'model_config', 'model_computed_fields'):
+                try:
+                    attr_value = getattr(item, attr_name, None)
+                    # Skip methods and None values
+                    if attr_value is not None and not callable(attr_value):
+                        # Convert to JSON-serializable format
+                        if hasattr(attr_value, 'model_dump'):
+                            item_dict[attr_name] = attr_value.model_dump()
+                        elif isinstance(attr_value, (str, int, float, bool, list, dict)):
+                            item_dict[attr_name] = attr_value
+                        else:
+                            item_dict[attr_name] = str(attr_value)
+                except Exception:
+                    # Skip attributes that can't be accessed
+                    pass
+        return item_dict
+
 
 class SyncStreamingModel(Model):
     """Simple model wrapper that adds logging to stream_response and supports tracing."""
@@ -109,10 +148,38 @@ async def get_response(
 
         response = await self.original_model.get_response(**kwargs)
 
-        # Set span output
-        if span:
+        # Set span output with structured data
+        if span and response:
+            new_items = []
+            final_output = None
+
+            # Extract final output text from response
+            response_final_output = getattr(response, 'final_output', None)
+            if response_final_output:
+                final_output = response_final_output
+
+            # Extract items from the response output
+            response_output = getattr(response, 'output', None)
+            if response_output:
+                output_items = response_output if isinstance(response_output, list) else [response_output]
+
+                for item in output_items:
+                    item_dict = _serialize_item(item)
+                    if item_dict:
+                        new_items.append(item_dict)
+
+                        # Extract final_output from message type if available
+                        if item_dict.get('type') == 'message' and not final_output:
+                            content = item_dict.get('content', [])
+                            if content and isinstance(content, list):
+                                for content_part in content:
+                                    if isinstance(content_part, dict) and 'text' in content_part:
+                                        final_output = content_part['text']
+                                        break
+
             span.output = {
-                "response": str(response) if response else None,
+                "new_items": new_items,
+                "final_output": final_output,
             }
 
         return response
@@ -160,7 +227,9 @@ async def stream_response(
         # Wrap the streaming in a tracing span if tracer is available
         if self.tracer and self.trace_id:
             trace = self.tracer.trace(self.trace_id)
-            async with trace.span(
+
+            # Manually start the span instead of using a context manager
+            span = await trace.start_span(
                 parent_id=self.parent_span_id,
                 name="run_agent_streamed",
                 input={
@@ -172,7 +241,9 @@
                     "handoffs": [str(h) for h in handoffs] if handoffs else [],
                     "previous_response_id": previous_response_id,
                 },
-            ) as span:
+            )
+
+            try:
                 # Get the stream from the original model
                 stream_kwargs = {
                     "system_instructions": system_instructions,
@@ -193,23 +264,40 @@
                 # Get the stream response from the original model and yield each event
                 stream_response = self.original_model.stream_response(**stream_kwargs)
 
-                # Pass through each event from the original stream
-                event_count = 0
-                final_output = None
+                # Pass through each event from the original stream and track items
+                new_items = []
+                final_response_text = ""
+
                 async for event in stream_response:
-                    event_count += 1
-                    # Track the final output if available
-                    if hasattr(event, 'type') and event.type == 'raw_response_event':
-                        if hasattr(event.data, 'output'):
-                            final_output = event.data.output
+                    event_type = getattr(event, 'type', 'no-type')
+
+                    # Handle response.output_item.done events, which contain completed items
+                    if event_type == 'response.output_item.done':
+                        item = getattr(event, 'item', None)
+                        if item is not None:
+                            item_dict = _serialize_item(item)
+                            if item_dict:
+                                new_items.append(item_dict)
+
+                                # Update final_response_text from message type if available
+                                if item_dict.get('type') == 'message':
+                                    content = item_dict.get('content', [])
+                                    if content and isinstance(content, list):
+                                        for content_part in content:
+                                            if isinstance(content_part, dict) and 'text' in content_part:
+                                                final_response_text = content_part['text']
+                                                break
+
                     yield event
 
-                # Set span output
-                if span:
-                    span.output = {
-                        "event_count": event_count,
-                        "final_output": str(final_output) if final_output else None,
-                    }
+                # Set span output with structured data, including tool calls and the final response
+                span.output = {
+                    "new_items": new_items,
+                    "final_output": final_response_text if final_response_text else None,
+                }
+            finally:
+                # End the span after all events have been yielded
+                await trace.end_span(span)
         else:
             # No tracing, just stream normally
             # Get the stream from the original model