From b9567b8e58fb7404e7162ef6e34e768aef9883f1 Mon Sep 17 00:00:00 2001
From: Patrick Gray
Date: Mon, 14 Jul 2025 16:32:24 +0000
Subject: [PATCH] multi-modal input

---
 docs/examples/python/structured_output.md  | 59 +++++++++++++++----
 docs/examples/python/structured_output.py  | 58 +++++++++++++++---
 docs/user-guide/concepts/agents/prompts.md | 33 +++++++++--
 .../concepts/agents/structured-output.md   | 39 +++++++++++-
 .../model-providers/amazon-bedrock.md      | 51 ++++++----------
 5 files changed, 180 insertions(+), 60 deletions(-)

diff --git a/docs/examples/python/structured_output.md b/docs/examples/python/structured_output.md
index 1a2a2a8a..ac5326e9 100644
--- a/docs/examples/python/structured_output.md
+++ b/docs/examples/python/structured_output.md
@@ -25,7 +25,7 @@ Structured Output Example
 This example demonstrates how to use structured output with Strands Agents
 to get type-safe, validated responses using Pydantic models.
 """
-
+import tempfile
 from typing import List, Optional
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -33,7 +33,7 @@ from strands import Agent
 def basic_example():
     """Basic example extracting structured information from text."""
     print("\n--- Basic Example ---")
-    
+
     class PersonInfo(BaseModel):
         name: str
         age: int
@@ -41,19 +41,57 @@ def basic_example():
 
     agent = Agent()
     result = agent.structured_output(
-        PersonInfo, 
+        PersonInfo,
         "John Smith is a 30-year-old software engineer"
     )
 
     print(f"Name: {result.name}")      # "John Smith"
-    print(f"Age: {result.age}")  # 30
+    print(f"Age: {result.age}")        # 30
+    print(f"Job: {result.occupation}") # "software engineer"
+
+
+def multimodal_example():
+    """Example extracting structured information from a document."""
+    print("\n--- Multi-Modal Example ---")
+
+    class PersonInfo(BaseModel):
+        name: str
+        age: int
+        occupation: str
+
+    with tempfile.NamedTemporaryFile() as person_file:
+        person_file.write(b"John Smith is a 30-year-old software engineer")
+        person_file.flush()
+
+        with open(person_file.name, "rb") as fp:
+            document_bytes = fp.read()
+
+    agent = Agent()
+    result = agent.structured_output(
+        PersonInfo,
+        [
+            {"text": "Please process this application."},
+            {
+                "document": {
+                    "format": "txt",
+                    "name": "application",
+                    "source": {
+                        "bytes": document_bytes,
+                    },
+                },
+            },
+        ]
+    )
+
+    print(f"Name: {result.name}")      # "John Smith"
+    print(f"Age: {result.age}")        # 30
     print(f"Job: {result.occupation}") # "software engineer"
 
 
 def conversation_history_example():
     """Example using conversation history with structured output."""
     print("\n--- Conversation History Example ---")
-    
+
     agent = Agent()
 
     # Build up conversation context
@@ -71,7 +109,7 @@ def conversation_history_example():
     # Uses existing conversation context with a prompt
     print("Extracting structured information from conversation context...")
     result = agent.structured_output(CityInfo, "Extract structured information about Paris")
-    
+
     print(f"City: {result.city}")
     print(f"Country: {result.country}")
     print(f"Population: {result.population}")
@@ -81,7 +119,7 @@
 def complex_nested_model_example():
     """Example handling complex nested data structures."""
     print("\n--- Complex Nested Model Example ---")
-    
+
     class Address(BaseModel):
         street: str
         city: str
@@ -117,11 +155,12 @@
 
 if __name__ == "__main__":
     print("Structured Output Examples\n")
-    
+
     basic_example()
+    multimodal_example()
     conversation_history_example()
     complex_nested_model_example()
-    
+
     print("\nExamples completed.")
 ```
 
@@ -143,4 +182,4 @@ The `structured_output()` method ensures that the language model generates a res
 
 ## Learn More
 
-For more details on structured output, see the [Structured Output documentation](../../user-guide/concepts/agents/structured-output.md).
\ No newline at end of file
+For more details on structured output, see the [Structured Output documentation](../../user-guide/concepts/agents/structured-output.md).
diff --git a/docs/examples/python/structured_output.py b/docs/examples/python/structured_output.py
index ecf4aa3b..951a613f 100644
--- a/docs/examples/python/structured_output.py
+++ b/docs/examples/python/structured_output.py
@@ -5,6 +5,7 @@
 This example demonstrates how to use structured output with Strands Agents
 to get type-safe, validated responses using Pydantic models.
 """
+import tempfile
 
 from typing import List, Optional
 from pydantic import BaseModel, Field
@@ -13,7 +14,7 @@
 def basic_example():
     """Basic example extracting structured information from text."""
     print("\n--- Basic Example ---")
-    
+
     class PersonInfo(BaseModel):
         name: str
         age: int
@@ -21,19 +22,57 @@ class PersonInfo(BaseModel):
 
     agent = Agent()
     result = agent.structured_output(
-        PersonInfo, 
+        PersonInfo,
        "John Smith is a 30-year-old software engineer"
     )
 
     print(f"Name: {result.name}")      # "John Smith"
-    print(f"Age: {result.age}")  # 30
+    print(f"Age: {result.age}")        # 30
+    print(f"Job: {result.occupation}") # "software engineer"
+
+
+def multimodal_example():
+    """Example extracting structured information from a document."""
+    print("\n--- Multi-Modal Example ---")
+
+    class PersonInfo(BaseModel):
+        name: str
+        age: int
+        occupation: str
+
+    with tempfile.NamedTemporaryFile() as person_file:
+        person_file.write(b"John Smith is a 30-year-old software engineer")
+        person_file.flush()
+
+        with open(person_file.name, "rb") as fp:
+            document_bytes = fp.read()
+
+    agent = Agent()
+    result = agent.structured_output(
+        PersonInfo,
+        [
+            {"text": "Please process this application."},
+            {
+                "document": {
+                    "format": "txt",
+                    "name": "application",
+                    "source": {
+                        "bytes": document_bytes,
+                    },
+                },
+            },
+        ]
+    )
+
+    print(f"Name: {result.name}")      # "John Smith"
+    print(f"Age: {result.age}")        # 30
     print(f"Job: {result.occupation}") # "software engineer"
 
 
 def conversation_history_example():
     """Example using conversation history with structured output."""
     print("\n--- Conversation History Example ---")
-    
+
     agent = Agent()
 
     # Build up conversation context
@@ -51,7 +90,7 @@ class CityInfo(BaseModel):
     # Uses existing conversation context with a prompt
     print("Extracting structured information from conversation context...")
     result = agent.structured_output(CityInfo, "Extract structured information about Paris")
-    
+
     print(f"City: {result.city}")
     print(f"Country: {result.country}")
     print(f"Population: {result.population}")
@@ -61,7 +100,7 @@ class CityInfo(BaseModel):
 def complex_nested_model_example():
     """Example handling complex nested data structures."""
     print("\n--- Complex Nested Model Example ---")
-    
+
     class Address(BaseModel):
         street: str
         city: str
@@ -97,9 +136,10 @@ class Person(BaseModel):
 
 if __name__ == "__main__":
     print("Structured Output Examples\n")
-    
+
     basic_example()
+    multimodal_example()
     conversation_history_example()
     complex_nested_model_example()
-    
-    print("\nExamples completed.")
\ No newline at end of file
+
+    print("\nExamples completed.")
diff --git a/docs/user-guide/concepts/agents/prompts.md b/docs/user-guide/concepts/agents/prompts.md
index 85e66f2f..201024a5 100644
--- a/docs/user-guide/concepts/agents/prompts.md
+++ b/docs/user-guide/concepts/agents/prompts.md
@@ -24,23 +24,46 @@ If you do not specify a system prompt, the model will behave according to its de
 
 These are your queries or requests to the agent. The SDK supports multiple techniques for prompting.
 
-### Direct Prompting
+### Text Prompt
 
-The simplest way to interact with an agent is through direct prompting:
+The simplest way to interact with an agent is through a text prompt:
 
 ```python
 response = agent("What is the time in Seattle")
 ```
 
+### Multi-Modal Prompting
+
+The SDK also supports multi-modal prompts, allowing you to include images, documents, and other content types in your messages:
+
+```python
+with open("path/to/image.png", "rb") as fp:
+    image_bytes = fp.read()
+
+response = agent([
+    {"text": "What can you see in this image?"},
+    {
+        "image": {
+            "format": "png",
+            "source": {
+                "bytes": image_bytes,
+            },
+        },
+    },
+])
+```
+
+For a complete list of supported content types, please refer to the [API Reference](../../../api-reference/types.md#strands.types.content.ContentBlock).
+
+
 ### Direct Tool Calls
 
-For programmatic control, you can call tools directly:
+Prompting is the primary way to interact with Strands, allowing you to invoke tools through natural language requests. However, if you need more programmatic control, Strands also allows you to invoke tools directly:
 
 ```python
 result = agent.tool.current_time(timezone="US/Pacific")
 ```
 
-This bypasses the natural language interface and directly executes the tool with the specified parameters. By default, direct tool calls are added to the [session state](state-sessions.md) but can be optionally not included by specifying `record_direct_tool_call=False`.
+This bypasses the natural language interface and directly executes the tool with the specified parameters. By default, direct tool calls are recorded in the [session state](state-sessions.md), but they can be excluded by specifying `record_direct_tool_call=False`.
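+
+As a minimal sketch (this assumes `record_direct_tool_call` is an argument accepted by the `Agent` constructor, which is not shown in this patch), an unrecorded direct tool call might look like:
+
+```python
+# Assumed parameter: disable recording of direct tool calls in session state
+agent = Agent(record_direct_tool_call=False)
+
+# Executes the tool directly (the current_time tool from the example above);
+# with recording disabled, the call is not added to the conversation history
+result = agent.tool.current_time(timezone="US/Pacific")
+```
+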
## Prompt Engineering @@ -52,4 +75,4 @@ Further resources: * [Amazon Bedrock - Prompt engineering concepts](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-engineering-guidelines.html) * [Llama - Prompting](https://www.llama.com/docs/how-to-guides/prompting/) * [Anthropic - Prompt engineering overview](https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/overview) -* [OpenAI - Prompt engineering](https://platform.openai.com/docs/guides/prompt-engineering/six-strategies-for-getting-better-results) \ No newline at end of file +* [OpenAI - Prompt engineering](https://platform.openai.com/docs/guides/prompt-engineering/six-strategies-for-getting-better-results) diff --git a/docs/user-guide/concepts/agents/structured-output.md b/docs/user-guide/concepts/agents/structured-output.md index 4f984d3f..7c08b104 100644 --- a/docs/user-guide/concepts/agents/structured-output.md +++ b/docs/user-guide/concepts/agents/structured-output.md @@ -14,7 +14,7 @@ flowchart LR direction TB C[convert_pydantic_to_tool_spec] --> D[LLM Response] end - + B --> Process Process --> E[Validated Pydantic Model] ``` @@ -69,7 +69,7 @@ class PersonInfo(BaseModel): agent = Agent() result = agent.structured_output( - PersonInfo, + PersonInfo, "John Smith is a 30-year-old software engineer" ) @@ -78,6 +78,40 @@ print(f"Age: {result.age}") # 30 print(f"Job: {result.occupation}") # "software engineer" ``` +### Multi-Modal Input + +Extract structured information from prompts containing images, documents, and other content types: + +```python +class PersonInfo(BaseModel): + name: str + age: int + occupation: str + +with open("path/to/document.pdf", "rb") as fp: + document_bytes = fp.read() + +agent = Agent() +result = agent.structured_output( + PersonInfo, + [ + {"text": "Please process this application."}, + { + "document": { + "format": "pdf", + "name": "application", + "source": { + "bytes": document_bytes, + }, + }, + }, + ] +) +``` + +For a complete list of supported content types, please refer to the [API Reference](../../../api-reference/types.md#strands.types.content.ContentBlock). + + ### Using Conversation History Structured output can work with existing conversation context: @@ -159,7 +193,6 @@ except ValidationError as e: # 3. Extract partial information from the error ``` - ## Best Practices - **Keep models focused**: Define specific models for clear purposes diff --git a/docs/user-guide/concepts/model-providers/amazon-bedrock.md b/docs/user-guide/concepts/model-providers/amazon-bedrock.md index ba6a1981..5a4b761a 100644 --- a/docs/user-guide/concepts/model-providers/amazon-bedrock.md +++ b/docs/user-guide/concepts/model-providers/amazon-bedrock.md @@ -245,44 +245,29 @@ from strands.models import BedrockModel bedrock_model = BedrockModel( model_id="us.anthropic.claude-3-7-sonnet-20250219-v1:0" ) +agent = Agent(model=bedrock_model) - -# Create a message with both text and image content -messages = [ - { - "role": "user", - "content": [ - { - "document": { - "format": "txt", - "name": "example", - "source": { - "bytes": b"Use this document in your response." - } +# Send the multimodal message to the agent +response = agent( + [ + { + "document": { + "format": "txt", + "name": "example", + "source": { + "bytes": b"Once upon a time..." } - }, - { - "text": "Use this media in your response." - } - ] - }, - { - "role": "assistant", - "content": [ - { - "text": "I will reference this media in my next response." 
} - ] - } -] - -# Create an agent with the multimodal model -agent = Agent(model=bedrock_model, messages=messages) - -# Send the multimodal message to the agent -response = agent("Tell me about the document.") + }, + { + "text": "Tell me about the document." + } + ] +) ``` +For a complete list of input types, please refer to the [API Reference](../../../api-reference/types.md#strands.types.content.ContentBlock). + ### Guardrails Amazon Bedrock supports guardrails to help ensure model outputs meet your requirements. Strands allows you to configure guardrails with your [`BedrockModel`](../../../api-reference/models.md#strands.models.bedrock):
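+
+As a minimal sketch (the `guardrail_id` and `guardrail_version` parameter names and their values below are assumptions for illustration, not taken from this patch), a guardrail-enabled model might be configured like this:
+
+```python
+from strands import Agent
+from strands.models import BedrockModel
+
+# Hypothetical guardrail configuration; parameter names are assumed
+bedrock_model = BedrockModel(
+    model_id="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+    guardrail_id="your-guardrail-id",  # assumed: ID of a guardrail created in Bedrock
+    guardrail_version="1",             # assumed: version of that guardrail
+)
+
+agent = Agent(model=bedrock_model)
+response = agent("Tell me about the document.")
+```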