In [1]:
from anthropic import Anthropic
from statistics import mean
import json
import re
import ast
client = Anthropic()

In [2]:
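# Basic eval workflow
#
# 1. Generate a dataset of test cases
# 2. Run the prompt under evaluation against each test case
# 3. Grade each output (model-based grade + syntax check)
# 4. Average the scores across the dataset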




In [3]:
def add_message(role: str, content: str, messages: list[dict]) -> None:
    """Append a single chat turn to the conversation history in place.

    Args:
        role (str): The role of the message sender, i.e. "user" or "assistant"
        content (str): The content of the message
        messages (list[dict]): The conversation history; each entry is a
            ``{"role": ..., "content": ...}`` dict and the list is mutated
            in place

    Returns:
        None: The new turn is appended to ``messages``; nothing is returned.
    """
    messages.append({"role": role, "content": content})
    
def get_response_text(
    messages:list[dict],
    system_prompt:str=None,
    model:str="claude-sonnet-4-0", 
    max_tokens:int=1000, 
    client:Anthropic=client,
    stop_sequences:list[str]=None) -> str:
    """Send the conversation to the model and return the reply text.

    Args:
        messages (list[dict]): The conversation history, as built by
            ``add_message`` (``{"role": ..., "content": ...}`` dicts)
        system_prompt (str): Optional system prompt; omitted from the request
            when empty or None
        model (str): The model to use for the response
        max_tokens (int): The maximum number of tokens in the response
        client (Anthropic): The Anthropic client (defaults to the notebook's
            module-level client, bound when this function is defined)
        stop_sequences (list[str]): Optional stop sequences; omitted from the
            request when empty or None

    Returns:
        str: The text of the first text block in the response, or an empty
        string when the response contains no text block
    """
    params = {
        "model": model,
        "max_tokens": max_tokens,
        "messages": messages
    }

    # Only send the optional fields when the caller actually supplied them.
    if system_prompt:
        params["system"] = system_prompt

    if stop_sequences:
        params["stop_sequences"] = stop_sequences

    response = client.messages.create(**params)

    # Do not blindly index content[0]: an empty content list would raise
    # IndexError and a non-text first block would raise AttributeError.
    # NOTE(review): presumably content can be empty when a stop sequence is
    # hit immediately -- confirm against the Messages API response schema.
    for block in response.content:
        if getattr(block, "type", None) == "text":
            return block.text
    return ""

In [4]:
def generate_dataset(num_cases: int = 3, model: str = "claude-haiku-4-5") -> list[dict]:
    """Ask the model to generate an evaluation dataset of AWS-related tasks.

    The model is prompted to return a JSON array of objects, each with a
    "task" description and a "format" ("json", "python" or "regex").

    Args:
        num_cases (int): How many test cases to request (default 3)
        model (str): The model used to generate the dataset

    Returns:
        list[dict]: The parsed JSON array produced by the model

    Raises:
        ValueError: If ``num_cases`` is less than 1
        json.JSONDecodeError: If the model's reply is not valid JSON
    """
    if num_cases < 1:
        raise ValueError(f"num_cases must be at least 1, got {num_cases}")

    # Literal braces in the JSON example are doubled because this is an f-string.
    prompt = f"""
    Generate an evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects, each representing task that requires Python, JSON, or a Regex to complete.

    Example output:
    ```json
    [
    {{
        "task": "Description of task",
        "format": "json" or "python" or "regex"
    }},
    ...additional
    ]
    ```

    * Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a single regex
    * Focus on tasks that do not require writing much code

    Please generate {num_cases} objects.
    """
    
    messages = []
    add_message("user", prompt, messages)
    add_message("assistant", "```json", messages) # this tells Claude to start generating the response after "```json"
    # Stopping at the closing fence leaves only the JSON payload in the reply.
    text = get_response_text(messages, model=model, stop_sequences=["```"])
    return json.loads(text)

In [5]:
# Generate a fresh dataset (this calls the model) and print it for inspection
dataset = generate_dataset()
print(dataset)

[{'task': 'Create a JSON configuration object for an AWS Lambda function that processes S3 events with a 5-minute timeout and 512MB memory allocation', 'format': 'json'}, {'task': 'Write a Python function that takes an AWS IAM policy document (as a dictionary) and returns a list of all the resource ARNs that are granted access', 'format': 'python'}, {'task': 'Create a regex pattern that validates AWS S3 bucket names according to AWS naming rules (3-63 characters, lowercase letters, numbers, and hyphens, must start and end with alphanumeric)', 'format': 'regex'}]


In [6]:
# Persist the generated test cases to prompt_eval_dataset.json so that the
# evaluation cell further down can reload them from disk.

with open('prompt_eval_dataset.json', 'w') as f:
    f.write(json.dumps(dataset, indent=2))

In [7]:
def run_prompt(test_case:dict) -> str:
    """
    Ask the model to solve a single test case and return its raw reply.
    
    Args:
        test_case (dict): The test case to run; its "task" entry is
            interpolated into the prompt
        
    Returns:
        str: The text generated after the "```code" prefill, up to the
            "```" stop sequence
    """
    task_prompt = f"""
    
    Please solve the following task:
    
    {test_case["task"]}
    """

    conversation = []
    add_message("user", task_prompt, conversation)
    # Prefill the assistant turn so the reply starts right after "```code"
    add_message("assistant", "```code", conversation)
    return get_response_text(
        conversation,
        model="claude-haiku-4-5",
        stop_sequences=["```"],
    )



In [8]:
def grade_by_model(test_case:dict, output:str) -> dict:
    """
    Grade an output by asking a model to review it
    
    Args:
        test_case (dict): The test case to grade; its "task" entry is shown
            to the grader
        output (str): The output (candidate solution) to grade
        
    Returns:
        dict: The grader's reply parsed from JSON; the prompt asks for the
            keys "strengths", "weaknesses", "reasoning" and "score"

    Raises:
        json.JSONDecodeError: If the grader's reply is not valid JSON
    """
    
    # Create evaluation prompt. This must be an f-string: without the
    # interpolation the grader would receive the literal "{task}" and
    # "{solution}" placeholders instead of the content to evaluate.
    eval_prompt = f"""
    You are an expert code reviewer. Evaluate this AI-generated solution.
    
    Task: {test_case["task"]}
    Solution: {output}
    
    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10
    """
    
    messages = []
    add_message("user", eval_prompt, messages)
    # Prefill "```json" and stop at the closing fence so the reply is bare JSON
    add_message("assistant", "```json", messages)
    
    eval_text = get_response_text(messages, stop_sequences=["```"])
    return json.loads(eval_text)

In [9]:
# code (or syntax) grader functions

def validate_json(text):
    """Return 10 if the whitespace-stripped text parses as JSON, else 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10


def validate_python(text):
    """Return 10 if the whitespace-stripped text parses as Python source, else 0."""
    source = text.strip()
    try:
        ast.parse(source)
    except SyntaxError:
        return 0
    return 10


def validate_regex(text):
    """Return 10 if the whitespace-stripped text compiles as a regex, else 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    return 10


def grade_syntax(response, test_case):
    """Score the response's syntax with the validator matching the test case.

    The test case's "format" entry selects the validator: "json" and
    "python" use their dedicated validators; every other value is
    checked as a regex.
    """
    expected_format = test_case["format"]
    if expected_format == "json":
        return validate_json(response)
    if expected_format == "python":
        return validate_python(response)
    # Anything else falls through to the regex validator.
    return validate_regex(response)

In [10]:
def run_test_case(test_case:dict) -> dict:
    """
    Run the prompt for one test case and grade the result
    
    The final score is the mean of the model-assigned score and the
    syntax score.
    
    Args:
        test_case (dict): The test case to run
        
    Returns:
        dict: A dictionary with the keys "output", "score", "reasoning"
            and "test_case"
    """
    solution = run_prompt(test_case)

    verdict = grade_by_model(test_case, solution)
    graded_score, graded_reasoning = verdict["score"], verdict["reasoning"]

    # Average the model grade with the syntax check.
    combined = (graded_score + grade_syntax(solution, test_case)) / 2
    return dict(
        output=solution,
        score=combined,
        reasoning=graded_reasoning,
        test_case=test_case,
    )


In [11]:
def run_eval(dataset:list[dict]) -> list[dict]:
    """
    Run and grade every test case in a dataset, in order
    
    Args:
        dataset (list[dict]): A list of test cases
        
    Returns:
        list[dict]: One result per test case, as returned by run_test_case
    """
    return [run_test_case(case) for case in dataset]


In [12]:
# Reload the saved test cases from disk
with open('prompt_eval_dataset.json', 'r') as f:
    dataset = json.loads(f.read())

# run the evaluation and average the per-test-case scores
results = run_eval(dataset)
average_score = mean(result["score"] for result in results)

print(f"Average score: {average_score}")
print("--------------------------------")

# Dump the full per-test-case results for inspection
print(json.dumps(results, indent=2))

Average score: 6.166666666666667
--------------------------------
[
  {
    "output": "\n{\n  \"FunctionName\": \"s3-event-processor\",\n  \"Runtime\": \"nodejs18.x\",\n  \"Role\": \"arn:aws:iam::ACCOUNT_ID:role/lambda-s3-execution-role\",\n  \"Handler\": \"index.handler\",\n  \"Timeout\": 300,\n  \"MemorySize\": 512,\n  \"Description\": \"Lambda function that processes S3 events\",\n  \"Environment\": {\n    \"Variables\": {\n      \"LOG_LEVEL\": \"INFO\"\n    }\n  },\n  \"Code\": {\n    \"ZipFile\": \"async function handler(event) {\\n  console.log('S3 Event:', JSON.stringify(event, null, 2));\\n  for (const record of event.Records) {\\n    const bucket = record.s3.bucket.name;\\n    const key = decodeURIComponent(record.s3.object.key.replace(/\\\\+/g, ' '));\\n    console.log(`Processing s3://${bucket}/${key}`);\\n  }\\n  return { statusCode: 200, body: 'Events processed successfully' };\\n}\\nmodule.exports = { handler };\"\n  },\n  \"EphemeralStorage\": {\n    \"Size\": 512\n  },\