diff --git a/PULL_REQUEST.md b/PULL_REQUEST.md new file mode 100644 index 000000000..e326e262b --- /dev/null +++ b/PULL_REQUEST.md @@ -0,0 +1,70 @@ +## Description + +When using the A2A (Agent-to-Agent) protocol, agents frequently need to exchange rich content beyond plain text—images for vision tasks, documents for analysis, and videos for multimedia workflows. The current A2A converters only support text content, forcing developers to work around this limitation or lose content fidelity when communicating between agents. + +This PR extends the A2A converters to handle image, document, and video content types, enabling seamless multimodal communication between Strands agents and any A2A-compatible agent. + +Resolves: #1504 + +## Public API Changes + +No public API changes. The existing `convert_content_blocks_to_parts` and `convert_response_to_agent_result` functions now automatically handle additional content types. + +```python +# Before: only text content was converted, other types were silently dropped +content_blocks = [ + {"text": "Analyze this image:"}, + {"image": {"format": "png", "source": {"bytes": image_bytes}}}, +] +parts = convert_content_blocks_to_parts(content_blocks) +# Result: only 1 part (text), image was lost + +# After: all content types are preserved +content_blocks = [ + {"text": "Analyze this image:"}, + {"image": {"format": "png", "source": {"bytes": image_bytes}}}, +] +parts = convert_content_blocks_to_parts(content_blocks) +# Result: 2 parts - TextPart and FilePart with image/png MIME type +``` + +The conversion is bidirectional—A2A FileParts received from remote agents are correctly converted back to Strands ImageContent, DocumentContent, or VideoContent based on MIME type. + +## Related Issues + +#1504 + +## Documentation PR + +N/A - Internal converter changes with no user-facing API modifications. + +## Type of Change + +New feature + +## Testing + +How have you tested the change? 
Verify that the changes do not break functionality or introduce warnings in consuming repositories: agents-docs, agents-tools, agents-cli + +- [x] I ran `hatch run prepare` + +Added 31 new unit tests covering: +- Image conversion (all formats: png, jpeg, gif, webp) with both inline bytes and S3 URIs +- Document conversion (all formats: pdf, csv, doc, docx, xls, xlsx, html, txt, md) +- Video conversion (all formats: flv, mkv, mov, mpeg, mpg, mp4, three_gp, webm, wmv) +- Mixed content scenarios and edge cases (unknown MIME types, missing MIME types) +- Full round-trip conversion through response handling + +All 136 A2A module tests pass. + +## Checklist +- [x] I have read the CONTRIBUTING document +- [x] I have added any necessary tests that prove my fix is effective or my feature works +- [x] I have updated the documentation accordingly +- [x] I have added an appropriate example to the documentation to outline the feature, or no new docs are needed +- [x] My changes generate no new warnings +- [x] Any dependent changes have been merged and published + +---- + +By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. diff --git a/src/strands/agent/agent.py b/src/strands/agent/agent.py index 05c3af191..2f7c4dbd9 100644 --- a/src/strands/agent/agent.py +++ b/src/strands/agent/agent.py @@ -59,6 +59,7 @@ from ..types.exceptions import ConcurrencyException, ContextWindowOverflowException from ..types.traces import AttributeValue from .agent_result import AgentResult +from .base import AgentBase from .conversation_manager import ( ConversationManager, SlidingWindowConversationManager, @@ -83,7 +84,7 @@ class _DefaultCallbackHandlerSentinel: _DEFAULT_AGENT_ID = "default" -class Agent: +class Agent(AgentBase): """Core Agent implementation. 
An agent orchestrates the following workflow: diff --git a/src/strands/multiagent/a2a/_converters.py b/src/strands/multiagent/a2a/_converters.py index b818c824b..0dfc4ab16 100644 --- a/src/strands/multiagent/a2a/_converters.py +++ b/src/strands/multiagent/a2a/_converters.py @@ -1,16 +1,92 @@ """Conversion functions between Strands and A2A types.""" -from typing import cast +import base64 +import logging +from typing import Any, cast from uuid import uuid4 +from a2a.types import ( + FilePart, + FileWithBytes, + FileWithUri, + Part, + Role, + TaskArtifactUpdateEvent, + TaskStatusUpdateEvent, + TextPart, +) from a2a.types import Message as A2AMessage -from a2a.types import Part, Role, TaskArtifactUpdateEvent, TaskStatusUpdateEvent, TextPart from ...agent.agent_result import AgentResult from ...telemetry.metrics import EventLoopMetrics from ...types.a2a import A2AResponse from ...types.agent import AgentInput from ...types.content import ContentBlock, Message +from ...types.media import ( + DocumentContent, + DocumentFormat, + ImageContent, + ImageFormat, + VideoContent, + VideoFormat, +) + +logger = logging.getLogger(__name__) + +# MIME type mappings for Strands formats +IMAGE_FORMAT_TO_MIME: dict[ImageFormat, str] = { + "png": "image/png", + "jpeg": "image/jpeg", + "gif": "image/gif", + "webp": "image/webp", +} + +DOCUMENT_FORMAT_TO_MIME: dict[DocumentFormat, str] = { + "pdf": "application/pdf", + "csv": "text/csv", + "doc": "application/msword", + "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "xls": "application/vnd.ms-excel", + "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "html": "text/html", + "txt": "text/plain", + "md": "text/markdown", +} + +VIDEO_FORMAT_TO_MIME: dict[VideoFormat, str] = { + "flv": "video/x-flv", + "mkv": "video/x-matroska", + "mov": "video/quicktime", + "mpeg": "video/mpeg", + "mpg": "video/mpeg", + "mp4": "video/mp4", + "three_gp": "video/3gpp", + "webm": "video/webm", + 
"wmv": "video/x-ms-wmv", +} + +# Reverse mappings from MIME type to Strands format +MIME_TO_IMAGE_FORMAT: dict[str, ImageFormat] = {v: k for k, v in IMAGE_FORMAT_TO_MIME.items()} +MIME_TO_DOCUMENT_FORMAT: dict[str, DocumentFormat] = {v: k for k, v in DOCUMENT_FORMAT_TO_MIME.items()} +MIME_TO_VIDEO_FORMAT: dict[str, VideoFormat] = {v: k for k, v in VIDEO_FORMAT_TO_MIME.items()} + + +def _get_location_from_uri(uri: str) -> dict[str, Any]: + """Create a Strands location dict from a URI based on its scheme. + + Args: + uri: The URI string (s3://, http://, https://, etc.) + + Returns: + Location dict with appropriate type field based on URI scheme. + """ + if uri.startswith("s3://"): + return {"type": "s3", "uri": uri} + elif uri.startswith("http://") or uri.startswith("https://"): + return {"type": "url", "uri": uri} + else: + # Generic location for unknown schemes + return {"type": "uri", "uri": uri} def convert_input_to_message(prompt: AgentInput) -> A2AMessage: @@ -63,9 +139,107 @@ def convert_input_to_message(prompt: AgentInput) -> A2AMessage: raise ValueError(f"Unsupported input type: {type(prompt)}") +def _convert_image_to_file_part(image: ImageContent) -> Part | None: + """Convert Strands ImageContent to A2A FilePart. + + Args: + image: Strands image content with format and source. + + Returns: + A2A Part containing FilePart, or None if conversion fails. + """ + source = image.get("source", {}) + mime_type = IMAGE_FORMAT_TO_MIME.get(image.get("format", "png"), "image/png") + + # Handle inline bytes + if "bytes" in source and source["bytes"]: + raw_bytes = source["bytes"] + b64_str = base64.standard_b64encode(raw_bytes).decode("utf-8") + file_with_bytes = FileWithBytes(bytes=b64_str, mime_type=mime_type) + return Part(FilePart(file=file_with_bytes, kind="file")) + + # Handle location-based references (S3, HTTP, etc.) 
+ if "location" in source: + location = source["location"] + uri = location.get("uri") + if uri: + file_with_uri = FileWithUri(uri=uri, mime_type=mime_type) + return Part(FilePart(file=file_with_uri, kind="file")) + + logger.debug("content_type=<image> | image content dropped due to empty or missing source") + return None + + +def _convert_document_to_file_part(document: DocumentContent) -> Part | None: + """Convert Strands DocumentContent to A2A FilePart. + + Args: + document: Strands document content with format, name, and source. + + Returns: + A2A Part containing FilePart, or None if conversion fails. + """ + source = document.get("source", {}) + doc_format = document.get("format", "txt") + mime_type = DOCUMENT_FORMAT_TO_MIME.get(doc_format, "application/octet-stream") + name = document.get("name") + + # Handle inline bytes + if "bytes" in source and source["bytes"]: + raw_bytes = source["bytes"] + b64_str = base64.standard_b64encode(raw_bytes).decode("utf-8") + file_with_bytes = FileWithBytes(bytes=b64_str, mime_type=mime_type, name=name) + return Part(FilePart(file=file_with_bytes, kind="file")) + + # Handle location-based references (S3, HTTP, etc.) + if "location" in source: + location = source["location"] + uri = location.get("uri") + if uri: + file_with_uri = FileWithUri(uri=uri, mime_type=mime_type, name=name) + return Part(FilePart(file=file_with_uri, kind="file")) + + logger.debug("content_type=<document>, name=<%s> | document content dropped due to empty or missing source", name) + return None + + +def _convert_video_to_file_part(video: VideoContent) -> Part | None: + """Convert Strands VideoContent to A2A FilePart. + + Args: + video: Strands video content with format and source. + + Returns: + A2A Part containing FilePart, or None if conversion fails. 
+ """ + source = video.get("source", {}) + video_format = video.get("format", "mp4") + mime_type = VIDEO_FORMAT_TO_MIME.get(video_format, "video/mp4") + + # Handle inline bytes + if "bytes" in source and source["bytes"]: + raw_bytes = source["bytes"] + b64_str = base64.standard_b64encode(raw_bytes).decode("utf-8") + file_with_bytes = FileWithBytes(bytes=b64_str, mime_type=mime_type) + return Part(FilePart(file=file_with_bytes, kind="file")) + + # Handle location-based references (S3, HTTP, etc.) + if "location" in source: + location = source["location"] + uri = location.get("uri") + if uri: + file_with_uri = FileWithUri(uri=uri, mime_type=mime_type) + return Part(FilePart(file=file_with_uri, kind="file")) + + logger.debug("content_type=