In [1]:
from anthropic import Anthropic
from statistics import mean
import json
import re
import ast
client = Anthropic()

In [2]:
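# Basic eval workflow
#
# 1. Generate a dataset of test cases
# 2. Run the prompt on each test case
# 3. Grade each output (model grader + syntax grader)
# 4. Average the scores across the dataset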




In [3]:
def add_message(role:str, content:str, messages:list[dict]) -> None:
    """Append a message to the conversation history in place.
    
    Args:
        role (str): The role of the message sender, i.e. "user" or "assistant"
        content (str): The content of the message
        messages (list[dict]): The conversation history; each entry is a
            {"role": ..., "content": ...} dict. The list is mutated in place.
    """
    messages.append({"role": role, "content": content})
    
def get_response_text(
    messages:list[dict],
    system_prompt:str=None,
    model:str="claude-sonnet-4-0", 
    max_tokens:int=1000, 
    client:Anthropic=client,
    stop_sequences:list[str]=None) -> str:
    """Send the conversation to the model and return the text of its reply.
    
    Args:
        messages (list[dict]): The conversation history as role/content dicts
        system_prompt (str): Optional system prompt; omitted from the request when falsy
        model (str): The model to use for the response
        max_tokens (int): The maximum number of tokens in the response
        client (Anthropic): The Anthropic client (defaults to the module-level client)
        stop_sequences (list[str]): Optional stop sequences; omitted from the request when falsy
        
    Returns:
        str: The text of the first content block of the response, or an empty
            string when the response contains no content blocks
    """
    params = {
        "model": model,
        "max_tokens": max_tokens,
        "messages": messages
    }
    
    # Only send the optional fields when the caller actually supplied them.
    if system_prompt:
        params["system"] = system_prompt
    
    if stop_sequences:
        params["stop_sequences"] = stop_sequences
    
    response = client.messages.create(**params)
    
    # Guard against a response with no content blocks (presumably possible when
    # generation stops immediately, e.g. on a stop sequence right after the
    # prefill); indexing [0] would otherwise raise IndexError.
    if not response.content:
        return ""
    return response.content[0].text

In [4]:
def generate_dataset(num_cases:int=3) -> list[dict]:
    """Ask the model to generate an evaluation dataset of AWS-related tasks.
    
    Args:
        num_cases (int): How many test cases to request from the model.
            Defaults to 3, which reproduces the original hard-coded prompt.
        
    Returns:
        list[dict]: The parsed JSON array returned by the model. The prompt asks
            for objects with "task", "format" and "solution_criteria" keys, but
            the structure is not validated here.
    """
    # Literal braces of the JSON example are doubled because this is an f-string.
    prompt = f"""
    Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
    that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
    each representing task that requires Python, JSON, or a Regex to complete.

    Example output:
    ```json
    [
        {{
            "task": "Description of task",
            "format": "json" or "python" or "regex",
            "solution_criteria": "Key criteria for evaluating the solution"
        }},
        ...additional
    ]
    ```

    * Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
    * Focus on tasks that do not require writing much code

    Please generate {num_cases} objects.
    """
    
    messages = []
    add_message("user", prompt, messages)
    add_message("assistant", "```json", messages) # this tells Claude to start generating the response after "```json"
    # Stopping at the closing fence leaves only the JSON payload to parse.
    text = get_response_text(messages, model="claude-haiku-4-5", stop_sequences=["```"])
    return json.loads(text)

In [5]:
# Generate the evaluation dataset with the model and show the raw result.
dataset = generate_dataset()
print(dataset)

[{'task': "Parse an AWS S3 bucket name from a full S3 URI (e.g., 's3://my-bucket-name/path/to/file.txt') and extract only the bucket name", 'format': 'regex', 'solution_criteria': 'Must correctly extract bucket names from various S3 URI formats, handle URIs with and without trailing paths, and account for bucket naming rules (lowercase, hyphens, numbers)'}, {'task': 'Create a JSON configuration object for an AWS Lambda function that includes environment variables, memory allocation, timeout, and a basic IAM role ARN', 'format': 'json', 'solution_criteria': 'JSON must be valid, include required Lambda configuration fields (FunctionName, Runtime, Handler, Role, Timeout, MemorySize), contain at least 2 environment variables, and follow AWS Lambda CloudFormation or API schema'}, {'task': 'Write a Python function that validates an AWS IAM role ARN string and returns True if valid, False otherwise', 'format': 'python', 'solution_criteria': 'Function must validate ARN format (arn:aws:iam::acc

In [6]:
# Persist the generated test cases to prompt_eval_dataset.json (pretty-printed)
# so that later cells can reload them.

with open('prompt_eval_dataset.json', 'w') as f:
    f.write(json.dumps(dataset, indent=2))

In [7]:
def run_prompt(test_case:dict) -> str:
    """
    Ask the model to solve the task of a single test case.
    
    Args:
        test_case (dict): Test case whose "task" entry is inserted into the prompt
        
    Returns:
        str: The text returned by the model for the prefilled "```code" block
    """
    conversation = []
    
    prompt = f"""
    
    Please solve the following task:
    
    {test_case["task"]}
    """
    
    add_message("user", prompt, conversation)
    # Prefill the assistant turn so the model continues right after "```code";
    # the "```" stop sequence ends generation at the closing fence.
    add_message("assistant", "```code", conversation)
    return get_response_text(conversation, model="claude-haiku-4-5", stop_sequences=["```"])



In [8]:
def grade_by_model(test_case:dict, output:str) -> dict:
    """
    Have the model review a solution against the test case's criteria.
    
    Args:
        test_case (dict): Test case providing the "task" and "solution_criteria" entries
        output (str): The solution text to be graded
        
    Returns:
        dict: The grader's reply parsed from JSON; the prompt requests the keys
            "strengths", "weaknesses", "reasoning" and "score"
    """
    
    # Grading prompt: task, solution and criteria are wrapped in XML-style tags.
    eval_prompt = f"""
    You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

    Original Task:
    <task>
    {test_case["task"]}
    </task>

    Solution to Evaluate:
    <solution>
    {output}
    </solution>
    
    Criteria for evaluation:
    <criteria>
    {test_case["solution_criteria"]}
    </criteria>

    Output Format
    Provide your evaluation as a structured JSON object with the following fields, in this specific order:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement
    - "reasoning": A concise explanation of your overall assessment
    - "score": A number between 1-10

    Respond with JSON. Keep your response concise and direct.
    Example response shape:
    {{
        "strengths": string[],
        "weaknesses": string[],
        "reasoning": string,
        "score": number
    }}
    """
    
    conversation = []
    add_message("user", eval_prompt, conversation)
    # Prefill "```json" and stop at the closing fence so only the JSON body comes back.
    add_message("assistant", "```json", conversation)
    
    return json.loads(get_response_text(conversation, stop_sequences=["```"]))

In [9]:
# code (or syntax) grader functions

def validate_json(text):
    """Return 10 if the stripped text parses as JSON, otherwise 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10


def validate_python(text):
    """Return 10 if the stripped text parses as Python source, otherwise 0.

    Only syntax is checked (via ast.parse); the code is never executed.
    """
    try:
        ast.parse(text.strip())
        return 10
    except (SyntaxError, ValueError):
        # ValueError: on Python versions before 3.12, source containing null
        # bytes raises ValueError instead of SyntaxError. Treat it as invalid
        # syntax instead of letting it abort the evaluation run.
        return 0


def validate_regex(text):
    """Return 10 if the stripped text compiles as a regular expression, otherwise 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    return 10


def grade_syntax(response, test_case):
    """Score the syntax of a response according to the test case's format.

    Args:
        response (str): The text to validate
        test_case (dict): Test case whose "format" entry selects the validator

    Returns:
        int: 10 if the response is syntactically valid for the format, 0 otherwise
    """
    # Normalize case and surrounding whitespace so that variants such as
    # "JSON" or " Python " select the intended validator instead of silently
    # falling through to the regex check.
    expected_format = str(test_case["format"]).strip().lower()
    if expected_format == "json":
        return validate_json(response)
    if expected_format == "python":
        return validate_python(response)
    # Any other format value is validated as a regex (original fallback behavior).
    return validate_regex(response)

In [10]:
def run_test_case(test_case:dict) -> dict:
    """
    Execute one test case and combine its model and syntax grades.
    
    Args:
        test_case (dict): The test case to run
        
    Returns:
        dict: The generated "output", the combined "score", the model grader's
            "reasoning", and the original "test_case"
    """
    solution = run_prompt(test_case)
    
    verdict = grade_by_model(test_case, solution)
    model_score, reasoning = verdict["score"], verdict["reasoning"]
    
    syntax_score = grade_syntax(solution, test_case)
    
    # The final score is the plain average of the two graders.
    return {
        "output": solution,
        "score": (model_score + syntax_score) / 2,
        "reasoning": reasoning,
        "test_case": test_case
    }


In [11]:
def run_eval(dataset:list[dict]) -> list[dict]:
    """
    Run and grade every test case in the dataset, in order.
    
    Args:
        dataset (list[dict]): The test cases to evaluate
        
    Returns:
        list[dict]: One run_test_case result per test case, in dataset order
    """
    return [run_test_case(test_case) for test_case in dataset]


In [12]:
# Reload the persisted test cases from disk.
with open('prompt_eval_dataset.json', 'r') as f:
    dataset = json.loads(f.read())

# Run the evaluation and average the per-test-case scores.
results = run_eval(dataset)
average_score = mean(result["score"] for result in results)

print(f"Average score: {average_score}")
print("--------------------------------")

# Full per-test-case detail: output, score, reasoning and test case.
print(json.dumps(results, indent=2))

Average score: 8.666666666666666
--------------------------------
[
  {
    "output": "\nfunction parseS3BucketName(s3Uri) {\n  // Remove 's3://' prefix and extract bucket name\n  const bucketName = s3Uri.replace('s3://', '').split('/')[0];\n  return bucketName;\n}\n\n// Examples\nconsole.log(parseS3BucketName('s3://my-bucket-name/path/to/file.txt'));\n// Output: my-bucket-name\n\nconsole.log(parseS3BucketName('s3://another-bucket/folder/document.pdf'));\n// Output: another-bucket\n\nconsole.log(parseS3BucketName('s3://simple-bucket'));\n// Output: simple-bucket\n",
    "score": 8.0,
    "reasoning": "The solution correctly implements the basic functionality and handles the provided test cases well. However, it lacks robustness for production use due to missing input validation and error handling. While it meets the core requirement, it doesn't account for edge cases or invalid inputs that could cause runtime errors.",
    "test_case": {
      "task": "Parse an AWS S3 bucket name from 