# Prompt Evaluations

In [16]:
import boto3
import json
from statistics import mean

In [3]:
# Haiku keeps the evaluation runs fast
model_id = "us.anthropic.claude-3-5-haiku-20241022-v1:0"
# Bedrock runtime client used for every model call in this notebook
client = boto3.client("bedrock-runtime", region_name="us-east-2")


def add_user_message(messages, text):
    """Append a user-role message containing *text* to *messages* (in place)."""
    messages.append({"role": "user", "content": [{"text": text}]})


def add_assistant_message(messages, text):
    """Append an assistant-role message containing *text* to *messages* (in place)."""
    messages.append({"role": "assistant", "content": [{"text": text}]})


def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the model via ``client.converse`` and return the reply text.

    Args:
        messages: List of message dicts of the form
            ``{"role": ..., "content": [{"text": ...}]}``.
        system: Optional system prompt; only sent when truthy.
        temperature: Sampling temperature placed in ``inferenceConfig``.
        stop_sequences: Optional list of stop sequences. ``None`` (the
            default) sends an empty list.

    Returns:
        The ``text`` of the first content block of the response message.
    """
    params = {
        "modelId": model_id,
        "messages": messages,
        "inferenceConfig": {
            "temperature": temperature,
            # A None sentinel replaces the former mutable ``[]`` default so no
            # list object is shared between calls.
            "stopSequences": [] if stop_sequences is None else stop_sequences,
        },
    }

    if system:
        params["system"] = [{"text": system}]

    response = client.converse(**params)

    return response["output"]["message"]["content"][0]["text"]

## Step 1: Define an initial prompt

In [None]:
# prompt_v1 = f"""
# Please provide a solution to the following task:

# {task}
# """

## Step 2: Generate the Evaluation Dataset

In [5]:
def generate_dataset():
    """Ask the model for a small list of AWS-related tasks and return it parsed from JSON."""
    dataset_prompt = """
    Generate 3 AWS-related tasks that require Python, JSON, or Regex solutions.
    
    Focus on tasks that can be solved by writing a single Python function, 
    a single JSON object, or tasks that do not require writing much code.
    
    Example output:
    [
        {
            "task": "Description of task"
        },
        ...additional
    ]
    
    Please generate 3 objects.
    """

    conversation = []
    add_user_message(conversation, dataset_prompt)
    # Prefill the reply with an opening JSON fence and stop at the closing
    # fence, so the returned text is expected to be just the JSON payload.
    add_assistant_message(conversation, "```json")
    raw_json = chat(conversation, stop_sequences=["```"])

    return json.loads(raw_json)

In [6]:
eval_dataset = generate_dataset()

# Save the generated tasks to disk as pretty-printed JSON
with open("../../evals/dataset.json", "w") as dataset_file:
    json.dump(eval_dataset, dataset_file, indent=2)

## Step 3: Run the Prompt

In [4]:
def run_prompt(test_case):
    """Build the prompt (v1) from the test case's task and return the model's answer."""
    task = test_case["task"]

    # v1 of the prompt
    task_prompt = f"""
    Please solve the following task: 

    {task}
    """

    conversation = []
    add_user_message(conversation, task_prompt)
    answer = chat(conversation)

    return answer

## Step 4: Run the Grader 
### Graders

There can be three types of graders:

![Graders](https://everpath-course-content.s3-accelerate.amazonaws.com/instructor%2Fa46l9irobhg0f5webscixp0bs%2Fpublic%2F1748557941%2F06_-_005_-_Model_Based_Grading_03.1748557941095.png)


### Evaluation Criteria
We must clearly define evaluation criteria for grading whether our prompt produces good outputs. For our code generation use-case, we can focus on:

1. Format
2. Valid Syntax
3. Task Following

We can use a different grader for each evaluation criterion, as shown below:

![Graders for each evaluation criteria](https://everpath-course-content.s3-accelerate.amazonaws.com/instructor%2Fa46l9irobhg0f5webscixp0bs%2Fpublic%2F1748557943%2F06_-_005_-_Model_Based_Grading_07.1748557942738.png)

### Model Grader (for Task Following)

In [12]:
def grade_by_model(test_case, output):
    """Ask the model to grade *output* against the task in *test_case*.

    Args:
        test_case: Dict with a ``"task"`` key holding the task description.
        output: The generated solution text to be evaluated.

    Returns:
        The grader's reply parsed from JSON. The prompt requests the keys
        ``"strengths"``, ``"weaknesses"``, ``"reasoning"`` and ``"score"``.

    Raises:
        json.JSONDecodeError: If the grader's reply is not valid JSON.
    """
    # The solution is wrapped in its own <solution> tag so the grader can tell
    # it apart from the <task> it is being judged against.
    eval_prompt = f"""
    You are an expert AWS code reviewer. Your task is to evaluate this AI-generated solution.
    
    Original Task:
    <task>
    {test_case["task"]}
    </task>

    Solution to evaluate: 
    <solution>
    {output}
    </solution>
    
    Output Format
    Provide your evaluation as a structured JSON object with the following fields, in this specific order:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10

    Respond with JSON. Keep your response concise and direct.
    Example response shape:
    {{
        "strengths": string[],
        "weaknesses": string[],
        "reasoning": string,
        "score": number
    }}
    """

    messages = []
    add_user_message(messages, eval_prompt)
    # Prefill the reply with an opening JSON fence and stop at the closing
    # fence, so the returned text is expected to be just the JSON payload.
    add_assistant_message(messages, "```json")
    eval_text = chat(messages, stop_sequences=["```"])
    return json.loads(eval_text)

In [13]:
def run_test_case(test_case):
    """Run the prompt for one test case, grade the output, and return a result record."""
    output = run_prompt(test_case)

    # Model-based grading. The grader is also asked for "strengths" and
    # "weaknesses"; only the score and reasoning are kept here.
    grade = grade_by_model(test_case, output)

    return {
        "output": output,
        "test_case": test_case,
        "score": grade["score"],
        "reasoning": grade["reasoning"],
    }

In [19]:
def run_eval(dataset):
    """Run every test case in *dataset* and print the average score.

    Args:
        dataset: Iterable of test cases, each passed to ``run_test_case``.

    Returns:
        List of result dicts, one per test case, in dataset order. An empty
        dataset yields an empty list.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    if results:
        average_score = mean(result["score"] for result in results)
        print(f"Average score: {average_score}")
    else:
        # statistics.mean raises StatisticsError on empty input, so report the
        # missing average instead of crashing.
        print("Average score: n/a (empty dataset)")

    return results

## Running The Evaluations

In [15]:
# Load the saved evaluation dataset from disk
with open("../../evals/dataset.json", "r") as dataset_file:
    dataset = json.load(dataset_file)

# Run and grade every test case
results = run_eval(dataset)

Even with Haiku, the evaluation can sometimes take a while to run, because we have not optimized it yet.

In [18]:
# Pretty-print the evaluation results for inspection
formatted_results = json.dumps(results, indent=2)
print(formatted_results)

[
  {
    "output": "Here's a solution to extract AWS IAM role names from a list of ARNs using regex in Python:\n\n```python\nimport re\n\ndef extract_iam_role_names(arns):\n    \"\"\"\n    Extract IAM role names from a list of AWS ARNs.\n    \n    Args:\n        arns (list): A list of AWS ARN strings\n    \n    Returns:\n        list: A list of IAM role names extracted from valid IAM role ARNs\n    \"\"\"\n    # Regex pattern to match IAM role ARNs\n    # Example ARN format: arn:aws:iam::123456789012:role/RoleName\n    role_arn_pattern = r'^arn:aws:iam::(\\d+):role/(.+)$'\n    \n    # List to store extracted role names\n    role_names = []\n    \n    # Iterate through the input ARNs\n    for arn in arns:\n        # Try to match the ARN against the role ARN pattern\n        match = re.match(role_arn_pattern, arn)\n        \n        # If there's a match, extract the role name\n        if match:\n            role_names.append(match.group(2))\n    \n    return role_names\n\n# Example usag

In [20]:
# printing the mean score (since I don't want to run the model again)

# Recompute the mean from the existing results rather than calling the model again
average_score = mean(result["score"] for result in results)
print(f"Average score: {average_score}")

Average score: 8
