# Create Protein Design Agent with AWS HealthOmics Workflow Integration

This notebook demonstrates how to create a Bedrock agent that can trigger AWS HealthOmics workflows for protein design optimization.

# Pre-requisites

1. Go through the notebook environment setup in the agents_catalog/0-Notebook-environment/ folder

2. Deploy protein_design_stack.yaml to your AWS account to instantiate an ECR repository with a custom Docker image, an AWS HealthOmics (AHO) private workflow, and a Lambda function that invokes the AHO workflow


# Steps for deploying the CloudFormation stack:
1. Create an S3 bucket for storing required files in the same region as your CloudFormation (cf) stack
2. Upload workflow definition files to S3
3. Package and upload container code to S3
4. Download and store ESM2 model weights in S3
5. Deploy the CloudFormation stack

## Steps 1-3. Create S3 bucket and upload workflow files, container code, and cf template

In [None]:
# Deployment settings shared by the cells below.
# NOTE(review): S3 bucket names may only contain lowercase letters, digits,
# hyphens and dots, and CloudFormation stack names only alphanumerics and
# hyphens — the underscore placeholders below must be replaced with valid
# names (here and in the CLI commands further down) before running.
REGION = "us-west-2"  # Your desired region
S3_BUCKET_NAME = "protein_design_west2"  # Your bucket name
STACK_NAME = 'protein_design_stack'  # Your cf stack name

In [None]:
import json
import os
import shutil
import boto3
import datetime


# Function to create S3 bucket in specified region
def create_s3_bucket(bucket_name, region):
    """
    Create an S3 bucket in the specified region if it doesn't exist

    Errors are reported with print() and turned into a False return value;
    the function does not raise for AWS or validation failures.

    Parameters:
    bucket_name (str): Name of the S3 bucket to create
    region (str): AWS region where the bucket should be created

    Returns:
    bool: True if bucket was created or already exists, False otherwise
    """
    # Imported locally so this function does not rely on imports made in a
    # later notebook cell.
    from botocore.exceptions import ClientError

    s3_client = boto3.client('s3', region_name=region)

    try:
        # Check if bucket already exists
        s3_client.head_bucket(Bucket=bucket_name)
    except ClientError as e:
        # Use the structured error code rather than searching the message text:
        # a substring match on "404" can also hit unrelated errors.
        error_code = str(e.response.get('Error', {}).get('Code', ''))
        if error_code not in ('404', 'NoSuchBucket', 'NotFound'):
            print(f"Error checking bucket: {e}")
            return False
    except Exception as e:
        # Non-service failures (credentials, validation, connectivity) keep the
        # original best-effort contract: report and return False.
        print(f"Error checking bucket: {e}")
        return False
    else:
        print(f"Bucket {bucket_name} already exists")
        return True

    # Bucket doesn't exist, create it
    create_kwargs = {'Bucket': bucket_name}
    if region != 'us-east-1':
        # us-east-1 is a special case which doesn't accept LocationConstraint
        create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

    try:
        s3_client.create_bucket(**create_kwargs)
    except Exception as create_error:
        print(f"Error creating bucket: {create_error}")
        return False

    print(f"Successfully created bucket {bucket_name} in {region}")
    return True

# Create zip file of container code
def create_container_zip():
    """Archive the local 'container' directory as code.zip in the working directory.

    Any failure is reported with print() instead of being raised.
    """
    try:
        shutil.make_archive(base_name='code', format='zip', root_dir='container')
    except Exception as zip_error:
        print(f"Error creating zip file: {zip_error}")
    else:
        print("Successfully created code.zip from container directory")

# Upload workflow files, container code, and cf template to S3
def upload_to_s3(bucket_name):
    """Upload the workflow files, container archive and cf template to S3.

    Each upload is attempted independently: a failure is printed and the
    remaining uploads still run.

    Parameters:
    bucket_name (str): Name of the destination S3 bucket
    """
    s3 = boto3.client('s3')

    # Each entry: (local path, object key, label used in messages, destination prefix)
    uploads = [
        (f'aho_workflow/{name}', f'workflow/{name}', name, f"s3://{bucket_name}/workflow/")
        for name in ('main.nf', 'nextflow.config', 'config.yaml', 'parameter-template.json')
    ]
    uploads.append(('code.zip', 'code.zip', 'code.zip', f"s3://{bucket_name}/"))
    uploads.append((
        'protein_design_stack.yaml',
        'templates/protein_design_stack.yaml',
        'cf template',
        f"s3://{bucket_name}/templates/",
    ))

    for local_path, object_key, label, destination in uploads:
        try:
            s3.upload_file(local_path, bucket_name, object_key)
            print(f"Uploaded {label} to {destination}")
        except Exception as upload_error:
            print(f"Error uploading {label}: {upload_error}")


# Define the CloudFormation parameters
def write_cf_parameters(bucket_name):
    '''Write cf_parameters.json, the parameter file used when creating the cf stack.

    Parameters:
    bucket_name (str): Value written for the S3BucketName parameter
    '''
    output_file = 'cf_parameters.json'

    # Every value except the bucket name is the default value from the template
    parameter_values = {
        "S3BucketName": bucket_name,
        "StackPrefix": "protein-design",
        "ApplicationName": "HealthOmics-Workflow",
        "WorkflowPath": "workflow",
        "SecretName": "protein-design-secret",
    }
    cf_parameters = [
        {"ParameterKey": key, "ParameterValue": value}
        for key, value in parameter_values.items()
    ]

    # Write parameters to the cf_parameters.json file
    with open(output_file, 'w') as f:
        json.dump(cf_parameters, f, indent=2)

    print("CloudFormation parameters written to cf_parameters.json")
    print(f"File path: {os.path.abspath(output_file)}")

# Make sure the bucket exists before anything is staged in it
bucket_created = create_s3_bucket(S3_BUCKET_NAME, REGION)

if not bucket_created:
    print("Failed to create or verify S3 bucket. CloudFormation parameters not written.")
else:
    # Write the CloudFormation parameter file
    write_cf_parameters(S3_BUCKET_NAME)

    # Package the container code, then upload everything
    create_container_zip()
    upload_to_s3(S3_BUCKET_NAME)


## Step 4. Download model weights and store in s3
We will run ML-guided directed evolution on a protein sequence, using a pre-trained protein language model (PLM) to guide the mutations via the EvoProtGrad framework.
To use a pre-trained PLM, download the model [facebook/esm2_t33_650M_UR50D](https://huggingface.co/facebook/esm2_t33_650M_UR50D/tree/main) from Hugging Face and store the weights at `s3://protein_design_west2/models/esm2_t6_8M_UR50D/`

## Step 5. Deploy the CloudFormation stack using these AWS CLI commands:
```bash
# Deploy the CloudFormation stack
aws cloudformation create-stack \
    --stack-name protein_design_stack \
    --template-url https://protein_design_west2.s3.amazonaws.com/templates/protein_design_stack.yaml \
    --parameters file://cf_parameters.json \
    --capabilities CAPABILITY_IAM CAPABILITY_AUTO_EXPAND CAPABILITY_NAMED_IAM \
    --region us-west-2

# Monitor stack creation
aws cloudformation describe-stacks \
    --stack-name protein_design_stack \
    --query 'Stacks[0].StackStatus'

# Get stack outputs once complete
aws cloudformation describe-stacks \
    --stack-name protein_design_stack \
    --query 'Stacks[0].Outputs'
```

## Below we will create the Bedrock Agent with code and attach action groups to the Agent

#### Load in environment variables to notebook

In [None]:
# Restore variables saved earlier with IPython's %store magic
# (presumably by the environment-setup notebook — verify).

# Retrieve import path
%store -r IMPORTS_PATH

# Retrieve account info
%store -r account_id
%store -r region

# Retrieve model lists
%store -r agent_foundation_model

# Run the script at IMPORTS_PATH inside this notebook's namespace
# NOTE(review): presumably this is what defines AgentsForAmazonBedrock used below — confirm.
%run $IMPORTS_PATH

In [None]:
# Show the restored import path as the cell output (sanity check)
IMPORTS_PATH

## Configure AWS clients and parameters

In [None]:
import boto3
import json
import time
import uuid
from botocore.exceptions import ClientError

# Set up the boto3 session and look up the caller's account ID via STS
# (this rebinds the account_id restored with %store above)
session = boto3.Session()
caller_identity = boto3.client('sts').get_caller_identity()
account_id = caller_identity['Account']

# Service clients pinned to the deployment region
bedrock = boto3.client('bedrock', region_name=REGION)
cfn = boto3.client('cloudformation', region_name=REGION)

## Get CloudFormation Outputs

In [None]:
import boto3
from botocore.exceptions import ClientError
import json


# Initialize the CloudFormation client with the specific region
cloudformation = boto3.client('cloudformation', region_name=REGION)


def get_cloudformation_outputs(stack_name):
    """Return the outputs of a CloudFormation stack as a dictionary.

    Parameters:
    stack_name (str): Name of the stack to describe

    Returns:
    dict: Maps each OutputKey to its OutputValue; empty if the stack
          reports no outputs

    Raises:
    ClientError: Re-raised after printing when describe_stacks fails
    """
    try:
        response = cloudformation.describe_stacks(StackName=stack_name)
    except ClientError as e:
        print(f"Error getting CloudFormation outputs: {e}")
        raise

    # 'Outputs' may be missing from the response when the stack has no outputs
    # (for example while it is still being created), so don't index it directly.
    stack_outputs = response['Stacks'][0].get('Outputs', [])
    return {entry['OutputKey']: entry['OutputValue'] for entry in stack_outputs}

# Fetch the stack outputs and show them as pretty-printed JSON
cf_outputs = get_cloudformation_outputs(STACK_NAME)
print("CloudFormation Outputs:", json.dumps(cf_outputs, indent=2), sep="\n")

In [None]:
# Lambda ARNs come from the stack outputs; the function names are hard-coded here.
# NOTE(review): presumably these names match the functions created by the template — verify.
trigger_func_arn = cf_outputs["TriggerFunctionArn"]
trigger_func_name = "WorkflowTriggerFunction"
monitor_func_arn = cf_outputs["MonitorFunctionArn"]
monitor_func_name = "WorkflowMonitorFunction"

## Create Bedrock Agent

In [None]:
# Define agent configuration
# These three values are passed to agents.create_agent() in a later cell.
agent_name = 'ProteinDesignAgent'
agent_description = "Agent for protein design using AWS HealthOmics workflows"
# Instruction text given to the agent: its role plus the expected response structure (a-d)
agent_instruction = """You are an expert in protein design and optimization using AWS HealthOmics workflows. 
Your primary task is to help users run protein design optimization workflows and provide relevant insights.

When providing your response:
a. Start with a brief summary of your understanding of the user's query.
b. Explain briefly the workflows you support and how each one does or does not meet the user's request.
c. Explain the steps you're taking to address the query. Ask for clarifications from the user if required.
d. Present the results of the workflow execution."""


In [None]:
# Show the foundation model restored via %store as the cell output (sanity check)
agent_foundation_model

## Create Agent Instance

In [None]:
# Instantiate agent with the desired configuration
# NOTE(review): AgentsForAmazonBedrock is not defined in the visible cells —
# presumably it is brought in by `%run $IMPORTS_PATH`; confirm.
agents = AgentsForAmazonBedrock()

# Positional arguments: name, description, instruction, foundation model
protein_design_agent = agents.create_agent(
    agent_name,
    agent_description,
    agent_instruction,
    agent_foundation_model,
    code_interpretation=False,
    verbose=False
)

# Extract useful agent information
# The first element of the returned value is used as the agent ID; the ARN is
# assembled by hand from REGION and account_id.
protein_design_agent_id = protein_design_agent[0]
protein_design_agent_arn = f"arn:aws:bedrock:{REGION}:{account_id}:agent/{protein_design_agent_id}"

print(f"Agent created with ID: {protein_design_agent_id}")
print(f"Agent ARN: {protein_design_agent_arn}")

## Define Action Group for Workflow Trigger Function

In [None]:
# Function schema for the trigger action group: a single function,
# trigger_aho_workflow, whose only required parameter is seed_sequence.
# NOTE(review): the numeric options (parallel_chains, n_steps, max_mutations)
# are declared as "string" — presumably the Lambda converts them; confirm.
function_defs = [
    {
        "name": "trigger_aho_workflow",
        "description": "Trigger the AWS HealthOmics workflow for protein design optimization",
        "parameters": {
            "seed_sequence": {
                "description": "The input protein sequence to optimize",
                "required": True,
                "type": "string"
            },
            "runName": {
                "description": "Name for the workflow run (optional)",
                "required": False,
                "type": "string"
            },
            "outputUri": {
                "description": "S3 URI where the workflow outputs will be stored (optional)",
                "required": False,
                "type": "string"
            },
            "esm_model_files": {
                "description": "S3 directory storing ESM pLM model files (optional)",
                "required": False,
                "type": "string"
            },
            "onehotcnn_model_files": {
                "description": "S3 directory storing Onehot CNN predictor model files (optional)",
                "required": False,
                "type": "string"
            },
            "output_type": {
                "description": "Output type, can be 'best', 'last', or 'all' variants (optional)",
                "required": False,
                "type": "string"
            },
            "parallel_chains": {
                "description": "Number of MCMC chains to run in parallel (optional)",
                "required": False,
                "type": "string"
            },
            "n_steps": {
                "description": "Number of MCMC steps per chain (optional)",
                "required": False,
                "type": "string"
            },
            "max_mutations": {
                "description": "Maximum number of mutations per variant (optional)",
                "required": False,
                "type": "string"
            }
        },
        "requireConfirmation": "DISABLED"
    }
]

In [None]:
# Add action group with Lambda function
# Registers the trigger function schema (function_defs) on the agent.
# NOTE(review): the Lambda ARN from the stack outputs is passed as
# `source_code_file`; confirm the helper accepts an existing function's ARN here.
agents.add_action_group_with_lambda(
    agent_name=agent_name,
    lambda_function_name=trigger_func_name,
    source_code_file=trigger_func_arn,
    agent_action_group_name="ProteinDesignTriggerWorkflow",
    agent_action_group_description="Actions for triggering AWS HealthOmics workflows for protein design",
    agent_functions=function_defs,
    verbose=True
)

In [None]:
# Grant the Bedrock agent permission to invoke the trigger Lambda
# by adding a statement to the function's resource-based policy

lambda_client = boto3.client('lambda', region_name=REGION)

permission_request = {
    "FunctionName": trigger_func_arn,
    "StatementId": "AllowBedrockAgentAccess",
    "Action": "lambda:InvokeFunction",
    "Principal": "bedrock.amazonaws.com",
    "SourceArn": protein_design_agent_arn,
}

try:
    response = lambda_client.add_permission(**permission_request)
except lambda_client.exceptions.ResourceConflictException:
    # A statement with this StatementId is already attached
    print("Permission already exists")
except Exception as permission_error:
    print(f"Error adding permission: {permission_error}")
else:
    print("Resource policy added successfully.")
    print("Response:", response)

## Define Action Group for Workflow Monitor Function

In [None]:
# Function schema for the monitor action group: a single function,
# monitor_aho_workflow, whose only required parameter is runId.
# This rebinds function_defs, replacing the trigger schema defined earlier.
function_defs = [
    {
        "name": "monitor_aho_workflow",
        "description": "Monitor the status of a running AWS HealthOmics workflow and retrieve results when complete",
        "parameters": {
            "runId": {
                "description": "The ID of the HealthOmics workflow run to monitor",
                "required": True,
                "type": "string"
            },
            "waitForCompletion": {
                "description": "Whether to wait for the workflow to complete before returning (optional, defaults to True)",
                "required": False,
                "type": "boolean"
            },
            "maxWaitTimeMinutes": {
                "description": "Maximum time to wait for workflow completion in minutes (optional, defaults to 30)",
                "required": False,
                "type": "integer"
            },
            "pollIntervalSeconds": {
                "description": "Time between status checks in seconds (optional, defaults to 30)",
                "required": False,
                "type": "integer"
            }
        },
        "requireConfirmation": "DISABLED"
    }
]

In [None]:
# Add action group with Lambda function
# Registers the monitor function schema (function_defs) on the agent.
# NOTE(review): the Lambda ARN from the stack outputs is passed as
# `source_code_file`; confirm the helper accepts an existing function's ARN here.
agents.add_action_group_with_lambda(
    agent_name=agent_name,
    lambda_function_name=monitor_func_name,
    source_code_file=monitor_func_arn,
    agent_action_group_name="ProteinDesignMonitorRuns",
    agent_action_group_description="Actions for monitoring AWS HealthOmics workflow runs for protein design",
    agent_functions=function_defs,
    verbose=True
)

In [None]:
# Grant the Bedrock agent permission to invoke the monitor Lambda
# by adding a statement to the function's resource-based policy

lambda_client = boto3.client('lambda', region_name=REGION)

permission_request = {
    "FunctionName": monitor_func_arn,
    "StatementId": "AllowBedrockAgentAccess",
    "Action": "lambda:InvokeFunction",
    "Principal": "bedrock.amazonaws.com",
    "SourceArn": protein_design_agent_arn,
}

try:
    response = lambda_client.add_permission(**permission_request)
except lambda_client.exceptions.ResourceConflictException:
    # A statement with this StatementId is already attached
    print("Permission already exists")
except Exception as permission_error:
    print(f"Error adding permission: {permission_error}")
else:
    print("Resource policy added successfully.")
    print("Response:", response)

## Create Agent Alias

In [None]:
# Create agent alias
# The helper's return value is unpacked into (alias ID, alias ARN); 'v1' is the alias name passed in.
protein_design_agent_alias_id, protein_design_agent_alias_arn = agents.create_agent_alias(
    protein_design_agent[0], 'v1'
)

# Store the alias ARN for future use
# (%store persists the variable so other notebooks can restore it with `%store -r`)
%store protein_design_agent_alias_arn

print(f"Agent alias created with ID: {protein_design_agent_alias_id}")
print(f"Agent alias ARN: {protein_design_agent_alias_arn}")

## Test the Agent

In [None]:
# Send a workflow-trigger request to the agent in a fresh session
bedrock_agent_runtime_client = boto3.client("bedrock-agent-runtime", region_name=REGION)
session_id = str(uuid.uuid1())

test_query = "Run a protein optimization for sequence ACDEFGHIKLMNPQRSTVWY with 20 parallel chains and 200 steps"

response = bedrock_agent_runtime_client.invoke_agent(
    agentId=protein_design_agent_id,
    agentAliasId=protein_design_agent_alias_id,
    sessionId=session_id,
    inputText=test_query,
    enableTrace=True,
)

separator = "===================="
print("Request sent to Agent:\n{}".format(response))
print(separator)
print("Agent processing query now")
print(separator)

# Walk the event stream, keeping only 'chunk' events that carry bytes,
# and stitch the decoded pieces into the final answer
answer = "".join(
    event['chunk']['bytes'].decode('utf-8')
    for event in response['completion']
    if 'chunk' in event and 'bytes' in event['chunk']
)

print("Agent Answer: {}".format(answer))
print(separator)

In [None]:
# Ask the agent for the status of an existing workflow run in a fresh session
bedrock_agent_runtime_client = boto3.client("bedrock-agent-runtime", region_name=REGION)
session_id = str(uuid.uuid1())

test_query = "could you check the status of my protein design workflow 7988468"

response = bedrock_agent_runtime_client.invoke_agent(
    agentId=protein_design_agent_id,
    agentAliasId=protein_design_agent_alias_id,
    sessionId=session_id,
    inputText=test_query,
    enableTrace=True,
)

separator = "===================="
print("Request sent to Agent:\n{}".format(response))
print(separator)
print("Agent processing query now")
print(separator)

# Walk the event stream, keeping only 'chunk' events that carry bytes,
# and stitch the decoded pieces into the final answer
answer = "".join(
    event['chunk']['bytes'].decode('utf-8')
    for event in response['completion']
    if 'chunk' in event and 'bytes' in event['chunk']
)

print("Agent Answer: {}".format(answer))
print(separator)