In [14]:
from anthropic import Anthropic
from statistics import mean
import json
client = Anthropic()

In [None]:
# Basic eval workflow

# generate a dataset

# 




In [2]:
def add_message(role:str, content:str, messages:list[str]) -> None:
    """Add a message to the conversation history
    
    Args:
        role (str): The role of the message sender, i.e. "user" or "assistant"
        content (str): The content of the messagemessage
        messages (list[str]): The list of messages in the conversation
    """
    params = {
        "role": role,
        "content": content
    }
    messages.append(params)
    
def get_response_text(
    messages:list[str],
    system_prompt:str=None,
    model:str="claude-sonnet-4-0", 
    max_tokens:int=1000, 
    client:Anthropic=client,
    stop_sequences:list[str]=None) -> str:
    """Get the response text from the model
    
    Args:
        messages (list[str]): The list of messages in the conversation
        system_prompt (str): The system prompt to use for the response
        model (str): The model to use for the response
        max_tokens (int): The maximum number of tokens in the response
        client (Anthropic): The Anthropic client
        stop_sequences (list[str]): The list of stop sequences to use for the response
        
    Returns:
        str: The response text from the model
    """
    
    params = {
        "model": model,
        "max_tokens": max_tokens,
        "messages": messages
    }
    
    if system_prompt:
        params["system"] = system_prompt
    
    if stop_sequences:
        params["stop_sequences"] = stop_sequences
    
    response = client.messages.create(**params)
    return response.content[0].text

In [6]:
def generate_dataset():
    prompt = """
    Generate an evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects, each representing task that requires Python, JSON, or a Regex to complete.

    Example output:
    ```json
    [
    {
        "task": "Description of task",
    },
    ...additional
    ]
    ```

    * Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a single regex
    * Focus on tasks that do not require writing much code

    Please generate 3 objects.
    """
    
    messages = []
    add_message("user", prompt, messages)
    add_message("assistant", "```json", messages) # this tells Claude to start generating the response after "```json"
    text = get_response_text(messages, model="claude-haiku-4-5", stop_sequences=["```"])
    return json.loads(text)

In [7]:
dataset = generate_dataset()
print(dataset)

[{'task': 'Write a Python function that takes an AWS S3 bucket name and returns True if it follows AWS naming conventions (lowercase, 3-63 characters, no consecutive hyphens), False otherwise.'}, {'task': "Create a JSON object that represents an AWS IAM policy allowing a user to read all objects from an S3 bucket named 'my-bucket'."}, {'task': 'Write a Regex pattern that matches valid AWS EC2 instance IDs (format: i-followed by 8 or 17 hexadecimal characters).'}]


In [None]:
# write dataset into a file called prompt_eval_dataset.json

with open('prompt_eval_dataset.json', 'w') as f:
    json.dump(dataset, f, indent=2)

In [9]:
def run_prompt(test_case:dict) -> str:
    """
    Run a prompt for a given test case
    
    Args:
        test_case (dict): The test case to run
        
    Returns:
        str: The response from the prompt
    """
    
    prompt = f"""
    
    Please solve the following task:
    
    {test_case["task"]}
    """
    
    messages = []
    add_message("user", prompt, messages)
    add_message("assistant", "```json", messages) # this tells Claude to start generating the response after "```json"
    output = get_response_text(messages, model="claude-haiku-4-5", stop_sequences=["```"])
    return output



In [15]:
def grade_by_model(test_case:dict, output:str) -> dict:
    """
    Grade an output by a given model
    
    Args:
        test_case (dict): The test case to grade
        output (str): The output to grade
        
    Returns:
        dict: A dictionary containing the evaluation
    """
    
    # Create evaluation prompt
    eval_prompt = """
    You are an expert code reviewer. Evaluate this AI-generated solution.
    
    Task: {task}
    Solution: {solution}
    
    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10
    """
    
    messages = []
    add_message("user", eval_prompt, messages)
    add_message("assistant", "```json", messages)
    
    eval_text = get_response_text(messages, stop_sequences=["```"])
    return json.loads(eval_text)

In [19]:
def run_test_case(test_case:dict) -> dict:
    """
    Runs and grades a test case
    
    Args:
        test_case (dict): The test case to run
        
    Returns:
        dict: A dictionary containing the output, score, and test case
    """
    output = run_prompt(test_case)
    
    report_card = grade_by_model(test_case, output)
    score = report_card["score"]
    reasoning = report_card["reasoning"]
    
    return {
        "output": output,
        "score": score,
        "reasoning": reasoning,
        "test_case": test_case
    }


In [20]:
def run_eval(dataset:list[dict]) -> list[dict]:
    """
    Runs and grades a dataset of test cases
    
    Args:
        dataset (list[dict]): A list of test cases
        
    Returns:
        list[dict]: A list of dictionaries containing the output, score, and test case
    """
    results = []
    for test_case in dataset:
        result = run_test_case(test_case)
        results.append(result)
    return results


In [21]:
with open('prompt_eval_dataset.json', 'r') as f:
    dataset = json.load(f)

# run the evaluation
results = run_eval(dataset)
average_score = mean([result["score"] for result in results])

print(f"Average score: {average_score}")
print("--------------------------------")

print(json.dumps(results, indent=2))

Average score: 3.6666666666666665
--------------------------------
[
  {
    "output": "\n{\n  \"function\": \"is_valid_s3_bucket_name\",\n  \"description\": \"Validates if a bucket name follows AWS S3 naming conventions\",\n  \"implementation\": {\n    \"language\": \"python\",\n    \"code\": \"import re\\n\\ndef is_valid_s3_bucket_name(bucket_name: str) -> bool:\\n    \\\"\\\"\\\"\\n    Validates if a bucket name follows AWS S3 naming conventions.\\n    \\n    AWS S3 Bucket Naming Rules:\\n    - Must be between 3 and 63 characters long\\n    - Must start and end with a lowercase letter or number\\n    - Can only contain lowercase letters, numbers, and hyphens\\n    - Cannot contain consecutive hyphens (--)\\n    - Cannot be formatted as an IP address (e.g., 192.168.1.1)\\n    \\n    Args:\\n        bucket_name (str): The S3 bucket name to validate\\n        \\n    Returns:\\n        bool: True if valid, False otherwise\\n    \\\"\\\"\\\"\\n    \\n    # Check if bucket_name is a strin