In [1]:
import boto3
import json

In [2]:
#client = boto3.client("bedrock-runtime", region_name="us-west-2")
# Use Haiku for faster evals
#model_id = "us.anthropic.claude-3-5-haiku-20241022-v1:0"

session = boto3.Session(profile_name="bedrock-dev")
client = session.client("bedrock-runtime", region_name="us-west-2")


# Claude model ID â€” must be correct and available in the region
model_id = "us.anthropic.claude-3-sonnet-20240229-v1:0"


def add_user_message(messages, text):
    user_message = {"role": "user", "content": [{"text": text}]}
    messages.append(user_message)


def add_assistant_message(messages, text):
    assistant_message = {"role": "assistant", "content": [{"text": text}]}
    messages.append(assistant_message)


def chat(messages, system=None, temperature=1.0, stop_sequences=[]):
    params = {
        "modelId": model_id,
        "messages": messages,
        "inferenceConfig": {
            "temperature": temperature,
            "stopSequences": stop_sequences,
        },
    }

    if system:
        params["system"] = [{"text": system}]

    response = client.converse(**params)

    return response["output"]["message"]["content"][0]["text"]

In [6]:
def generate_dataset():
    prompt = """
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {
        "task": "Description of task",
    },
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""

    messages = []
    add_user_message(messages, prompt)
    add_assistant_message(messages, "```json")
    text = chat(messages, stop_sequences=["```"])
    return json.loads(text)

In [7]:
dataset = generate_dataset()
with open("dataset2.json", "w") as f:
    json.dump(dataset, f, indent=2)

In [10]:
def run_prompt(test_case):
    """ Merges the prompt and test case input, then returns the result"""
    prompt =f"""
    Please solve the following task
    {test_case["task"]}
    """
    messages = []
    add_user_message(messages, prompt)
    output = chat(messages)
    return output    

In [25]:
# each of our data values are our test cases
def grade_by_model(test_case, output):
    eval_prompt = f"""
You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

Original Task:
<task>
{test_case["task"]}
</task>

Solution to Evaluate:
<solution>
{output}
</solution>

Criteria you should use to evaluate the solution:
<criteria>
{test_case["solution_criteria"]}
</criteria>

Output Format
Provide your evaluation as a structured JSON object with the following fields, in this specific order:
- "strengths": An array of 1-3 key strengths
- "weaknesses": An array of 1-3 key areas for improvement
- "reasoning": A concise explanation of your overall assessment
- "score": A number between 1-10

Respond with JSON. Keep your response concise and direct.
Example response shape:
{{
    "strengths": string[],
    "weaknesses": string[],
    "reasoning": string,
    "score": number
}}
    """

    messages = []
    add_user_message(messages, eval_prompt)
    add_assistant_message(messages, "```json")
    eval_text = chat(messages, stop_sequences=["```"])
    return json.loads(eval_text)

In [30]:
def run_test_case(test_case):
    """  Calls run_prompt, then grades the result"""
    # goal is take individual cases, call prompt function and grade result describing everyting happen there
    output = run_prompt(test_case)
    
    # TODO - grading
    #score = 10
    model_grade = grade_by_model(test_case, output)
    score = model_grade["score"]
    reasoning = model_grade["reasoning"]
    
    return {
        "output" : output,
        "test_case": test_case,
        "score": score,
        "reasoning": reasoning
    }

In [31]:
def run_eval(dataset):
    """Loads the dataset and calls run_test_case with each case"""
    # this funciton load data set, recieve as argument, loop to data set, and call testcase and assumble
    # all test case together
    
    results = []
    
    for test_case in dataset:
        result = run_test_case(test_case)
        results.append(result)
        
    return results

In [None]:
# open json file.
with open("dataset2.json", "r") as f:
    dataset = json.load(f)

results = run_eval(dataset)

KeyError: 'solution_criteria'

: 

In [17]:
print(json.dumps(results, indent=2))

[
  {
    "output": "Here's a Python function that checks if an AWS S3 bucket name is valid according to the AWS naming conventions:\n\n```python\nimport re\n\ndef is_valid_bucket_name(bucket_name):\n    \"\"\"\n    Checks if a given bucket name is valid according to AWS S3 naming conventions.\n    \n    Rules:\n    - Bucket names must be between 3 and 63 characters long.\n    - Bucket names can consist only of lowercase letters, numbers, and hyphens.\n    - Bucket names must start and end with a letter or number.\n    - Bucket names must not contain consecutive periods or start or end with a period.\n    - Bucket names must not be formatted as an IP address (e.g., 192.168.5.4).\n    \n    Args:\n        bucket_name (str): The bucket name to be validated.\n        \n    Returns:\n        bool: True if the bucket name is valid, False otherwise.\n    \"\"\"\n    # Check length\n    if len(bucket_name) < 3 or len(bucket_name) > 63:\n        return False\n    \n    # Check characters\n    