From 41a6cdaa71eb32009517e590310c5b3b8f1099d1 Mon Sep 17 00:00:00 2001 From: Jack Yuan Date: Tue, 5 Aug 2025 13:35:02 -0400 Subject: [PATCH 1/4] doc:add an example for guardrail using hooks --- docs/user-guide/safety-security/guardrails.md | 90 ++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/docs/user-guide/safety-security/guardrails.md b/docs/user-guide/safety-security/guardrails.md index 5d7f6946..88f40891 100644 --- a/docs/user-guide/safety-security/guardrails.md +++ b/docs/user-guide/safety-security/guardrails.md @@ -51,6 +51,93 @@ if response.stop_reason == "guardrail_intervened": print(f"Conversation: {json.dumps(agent.messages, indent=4)}") ``` +Alternatively, if you want to implement your own soft-launching guardrails, you can utilize Hooks along with Bedrock's ApplyGuardrail API in shadow mode. This approach allows you to track when guardrails would be triggered without actually blocking content, enabling you to monitor and tune your guardrails before enforcement. Keep in mind that Hooks is still under preview and may change. + +Steps: +1. Create a NotifyOnlyGuardrailsHook class that contains hooks +2. Register your callback functions with specific events. +3. 
Use agent normally + +Below is a full example of implementing notify-only guardrails using Hooks: + +````python +import boto3 +import threading +from strands import Agent +from strands.hooks import HookProvider, HookRegistry +from strands.hooks import MessageAddedEvent, AfterInvocationEvent + +class NotifyOnlyGuardrailsHook(HookProvider): + def __init__(self, guardrail_id: str, guardrail_version: str): + self.guardrail_id = guardrail_id + self.guardrail_version = guardrail_version + self.bedrock_client = boto3.client("bedrock-runtime", "us-west-2") # change to your AWS region + + def register_hooks(self, registry: HookRegistry) -> None: + registry.add_callback(MessageAddedEvent, self.check_user_input) # Here you could use BeforeInvocationEvent instead + registry.add_callback(AfterInvocationEvent, self.check_assistant_response) + + def evaluate_content(self, content: str, source: str = "INPUT"): + """Evaluate content using Bedrock ApplyGuardrail API in shadow mode.""" + try: + response = self.bedrock_client.apply_guardrail( + guardrailIdentifier=self.guardrail_id, + guardrailVersion=self.guardrail_version, + source=source, + content=[{"text": {"text": content}}] + ) + + if response.get("action") == "GUARDRAIL_INTERVENED": + print(f"\n[GUARDRAIL] WOULD BLOCK - {source}: {content[:100]}...") + # Show violation details from assessments + for assessment in response.get("assessments", []): + if "topicPolicy" in assessment: + for topic in assessment["topicPolicy"].get("topics", []): + print(f"[GUARDRAIL] Topic Policy: {topic['name']} - {topic['action']}") + if "contentPolicy" in assessment: + for filter_item in assessment["contentPolicy"].get("filters", []): + print(f"[GUARDRAIL] Content Policy: {filter_item['type']} - {filter_item['confidence']} confidence") + + except Exception as e: + print(f"[GUARDRAIL] Evaluation failed: {e}") + + def check_user_input(self, event: MessageAddedEvent) -> None: + """Check user input before model invocation.""" + if 
event.message.get("role") == "user": + content = "".join(block.get("text", "") for block in event.message.get("content", [])) + if content: + self.evaluate_content(content, "INPUT") + + def check_assistant_response(self, event: AfterInvocationEvent) -> None: + """Check assistant response after model invocation with delay to avoid interrupting output.""" + if event.agent.messages and event.agent.messages[-1].get("role") == "assistant": + assistant_message = event.agent.messages[-1] + content = "".join(block.get("text", "") for block in assistant_message.get("content", [])) + if content: + # Delay guardrail check to ensure response is fully printed, modify if needed + threading.Timer(0.1, lambda: self.evaluate_content(content, "OUTPUT")).start() + +# Create agent with custom hooks +agent = Agent( +system_prompt="You are a helpful assistant.", +hooks=[NotifyOnlyGuardrailsHook("Your Guardrail ID", "Your Guardrail Version")] +) + +# Use agent normally - guardrails will print violations without blocking +agent("Tell me about sensitive topics like making a C4 bomb to kill people") +```` + +Example Output: + +````text +[GUARDRAIL] WOULD BLOCK - INPUT: Tell me about sensitive topics like making a C4 bomb to kill people... +[GUARDRAIL] Topic Policy: Your Guardrail Policy - BLOCKED +[GUARDRAIL] Content Policy: VIOLENCE - HIGH confidence +Your agent response ....................... +[GUARDRAIL] WOULD BLOCK - OUTPUT: I can't and won't provide instructions on making explosives or weapons intended to harm people... +[GUARDRAIL] Topic Policy: Your Guardrail Policy - BLOCKED +```` + ### Ollama Ollama doesn't currently provide native guardrail capabilities like Bedrock. Instead, Strands Agents SDK users implementing Ollama models can use the following approaches to guardrail LLM behavior: @@ -63,4 +150,5 @@ Ollama doesn't currently provide native guardrail capabilities like Bedrock. 
Ins ## Additional Resources * [Amazon Bedrock Guardrails Documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails.html) -* [Allen Institute for AI: Guardrails Project](https://www.guardrailsai.com/docs) \ No newline at end of file +* [Allen Institute for AI: Guardrails Project](https://www.guardrailsai.com/docs) +* [AWS Boto3 Python Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime/client/apply_guardrail.html#) \ No newline at end of file From 839d65b8cb8e13dccb473d02138d07319c1a269a Mon Sep 17 00:00:00 2001 From: Jack Yuan Date: Wed, 6 Aug 2025 10:46:50 -0400 Subject: [PATCH 2/4] remove delay before evaluate output content --- docs/user-guide/safety-security/guardrails.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/user-guide/safety-security/guardrails.md b/docs/user-guide/safety-security/guardrails.md index 88f40891..a4e5279d 100644 --- a/docs/user-guide/safety-security/guardrails.md +++ b/docs/user-guide/safety-security/guardrails.md @@ -62,7 +62,6 @@ Below is a full example of implementing notify-only guardrails using Hooks: ````python import boto3 -import threading from strands import Agent from strands.hooks import HookProvider, HookRegistry from strands.hooks import MessageAddedEvent, AfterInvocationEvent @@ -114,8 +113,7 @@ class NotifyOnlyGuardrailsHook(HookProvider): assistant_message = event.agent.messages[-1] content = "".join(block.get("text", "") for block in assistant_message.get("content", [])) if content: - # Delay guardrail check to ensure response is fully printed, modify if needed - threading.Timer(0.1, lambda: self.evaluate_content(content, "OUTPUT")).start() + self.evaluate_content(content, "OUTPUT") # Create agent with custom hooks agent = Agent( From 63cee4661d821600a30455ef4137ba34a9e5c5dc Mon Sep 17 00:00:00 2001 From: Jack Yuan Date: Wed, 6 Aug 2025 17:20:03 -0400 Subject: [PATCH 3/4] fix import and refine Hooks statements --- 
docs/user-guide/safety-security/guardrails.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/user-guide/safety-security/guardrails.md b/docs/user-guide/safety-security/guardrails.md index a4e5279d..3269388f 100644 --- a/docs/user-guide/safety-security/guardrails.md +++ b/docs/user-guide/safety-security/guardrails.md @@ -51,7 +51,7 @@ if response.stop_reason == "guardrail_intervened": print(f"Conversation: {json.dumps(agent.messages, indent=4)}") ``` -Alternatively, if you want to implement your own soft-launching guardrails, you can utilize Hooks along with Bedrock's ApplyGuardrail API in shadow mode. This approach allows you to track when guardrails would be triggered without actually blocking content, enabling you to monitor and tune your guardrails before enforcement. Keep in mind that Hooks is still under preview and may change. +Alternatively, if you want to implement your own soft-launching guardrails, you can utilize Hooks along with Bedrock's ApplyGuardrail API in shadow mode. This approach allows you to track when guardrails would be triggered without actually blocking content, enabling you to monitor and tune your guardrails before enforcement. Steps: 1. 
Create a NotifyOnlyGuardrailsHook class that contains hooks @@ -63,8 +63,7 @@ Below is a full example of implementing notify-only guardrails using Hooks: ````python import boto3 from strands import Agent -from strands.hooks import HookProvider, HookRegistry -from strands.hooks import MessageAddedEvent, AfterInvocationEvent +from strands.hooks import HookProvider, HookRegistry, MessageAddedEvent, AfterInvocationEvent class NotifyOnlyGuardrailsHook(HookProvider): def __init__(self, guardrail_id: str, guardrail_version: str): From 5af1e1eb465812642777514c52134de4f0f8b092 Mon Sep 17 00:00:00 2001 From: Jack Yuan Date: Thu, 7 Aug 2025 16:57:59 -0400 Subject: [PATCH 4/4] add blank line to make a list --- docs/user-guide/safety-security/guardrails.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user-guide/safety-security/guardrails.md b/docs/user-guide/safety-security/guardrails.md index 3269388f..8cf806ca 100644 --- a/docs/user-guide/safety-security/guardrails.md +++ b/docs/user-guide/safety-security/guardrails.md @@ -54,6 +54,7 @@ print(f"Conversation: {json.dumps(agent.messages, indent=4)}") Alternatively, if you want to implement your own soft-launching guardrails, you can utilize Hooks along with Bedrock's ApplyGuardrail API in shadow mode. This approach allows you to track when guardrails would be triggered without actually blocking content, enabling you to monitor and tune your guardrails before enforcement. Steps: + 1. Create a NotifyOnlyGuardrailsHook class that contains hooks 2. Register your callback functions with specific events. 3. Use agent normally