In [None]:
# macOS (zsh)
# python -m pip install --upgrade pip
# python -m pip install python-dotenv anthropic

# Load env variables and create client
# Use python-dotenv to load environment variables (for example API keys) from a .env file
from dotenv import load_dotenv
from anthropic import Anthropic

# Read environment variables from a local .env into os.environ.
# This must run before the client is constructed, because the client is
# created without explicit credentials below.
load_dotenv()

# Instantiate the Anthropic client with no arguments; it is expected to pick
# up its credentials from the environment populated above.
# NOTE(review): presumably the ANTHROPIC_API_KEY variable — confirm against the SDK docs.
client = Anthropic()

# Model identifier used by every chat() call in this notebook.
model = "claude-haiku-4-5"


In [None]:
# Helper functions for building messages and making chat calls to the model

def add_user_message(messages, text):
    """Append a user-role message carrying ``text`` to ``messages`` (mutates the list in place)."""
    messages.append({"role": "user", "content": text})


def add_assistant_message(messages, text):
    """Append an assistant-role message carrying ``text`` to ``messages`` (mutates the list in place)."""
    messages.append({"role": "assistant", "content": text})


def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """
    Send a chat request to the Anthropic client and return the textual response.

    Args:
        messages: a list of message dicts constructed by the helper functions
        system: optional system prompt to pass to the model; omitted from the
            request when falsy
        temperature: sampling temperature
        stop_sequences: optional list of sequences where the model should stop;
            None (the default) is sent as an empty list

    Returns:
        The text of the first content block of the response (string).
    """
    # A None sentinel replaces the former `[]` default so that no single list
    # object is shared between calls (the classic mutable-default pitfall).
    if stop_sequences is None:
        stop_sequences = []

    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        "stop_sequences": stop_sequences,
    }

    if system:
        # Only include the system parameter when provided
        params["system"] = system

    # Call the Anthropic client and extract the text portion of the response.
    # NOTE(review): assumes the first content block is a text block — confirm
    # if tools or extended thinking are ever enabled for these calls.
    message = client.messages.create(**params)
    return message.content[0].text


In [None]:
# Function to generate a new dataset by prompting the model
import json


def generate_dataset():
    """
    Ask the model for a small evaluation dataset and return it as parsed JSON.

    The request asks for an array of objects with "task", "format"
    (json/python/regex) and "solution_criteria" entries. The assistant turn is
    pre-filled with an opening json fence and generation stops at the closing
    fence, so the reply is expected to be the bare JSON payload.

    Returns:
        Whatever json.loads produces from the model's reply.
    """
    dataset_prompt = """
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {
        "task": "Description of task",
        "format": "json" or "python" or "regex",
        "solution_criteria": "Key criteria for evaluating the solution"
    },
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""

    conversation = []
    add_user_message(conversation, dataset_prompt)
    # Pre-fill the assistant turn so the reply continues inside a json code fence
    add_assistant_message(conversation, "```json")

    # Generation halts at the closing fence, leaving only the JSON body
    raw_json = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_json)


In [None]:
# Generate the dataset and write it to 'dataset.json' for later use.
# Persisting the generated cases lets later cells evaluate against the same
# file instead of asking the model for a new dataset each time.
# Note: every run of this cell calls the model again and overwrites the file
# (it is opened in "w" mode).

dataset = generate_dataset()
with open("dataset.json", "w") as f:
    # indent=2 keeps the file human-readable
    json.dump(dataset, f, indent=2)


In [None]:
# Function to grade a test case's output using the model as an evaluator
# The evaluator is asked to return a concise JSON object describing strengths, weaknesses, reasoning and a score.

def grade_by_model(test_case, output):
    """
    Ask the model to grade a candidate solution and return its parsed verdict.

    Args:
        test_case: dict with a "task" entry and, optionally, a
            "solution_criteria" entry describing how to judge the solution
        output: the candidate solution text to evaluate

    Returns:
        The object parsed from the model's JSON reply. The prompt requests the
        fields "strengths", "weaknesses", "reasoning" and "score".

    Raises:
        KeyError: if test_case has no "task" entry
        json.JSONDecodeError: if the reply is not valid JSON
    """
    # Dataset records are model-generated (or may come from an older
    # dataset.json) and can lack "solution_criteria"; fall back to a generic
    # instruction instead of failing the whole evaluation with a KeyError.
    criteria = test_case.get("solution_criteria") or (
        "No explicit criteria were provided; judge correctness and completeness against the task."
    )

    # Build an evaluation prompt that includes the task, the candidate solution, and the criteria
    eval_prompt = f"""
You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

Original Task:
<task>
{test_case["task"]}
</task>

Solution to Evaluate:
<solution>
{output}
</solution>

Criteria you should use to evaluate the solution:
<criteria>
{criteria}
</criteria>

Output Format
Provide your evaluation as a structured JSON object with the following fields, in this specific order:
- "strengths": An array of 1-3 key strengths
- "weaknesses": An array of 1-3 key areas for improvement
- "reasoning": A concise explanation of your overall assessment
- "score": A number between 1-10

Respond with JSON. Keep your response concise and direct.
Example response shape:
{{
    "strengths": string[],
    "weaknesses": string[],
    "reasoning": string,
    "score": number
}}
    """

    # Pre-fill an opening json fence and stop at the closing fence so the reply
    # is just the JSON body, then parse it
    messages = []
    add_user_message(messages, eval_prompt)
    add_assistant_message(messages, "```json")
    eval_text = chat(messages, stop_sequences=["```"])
    return json.loads(eval_text)


In [None]:
# Passes a test case into Claude (or other model) and returns the raw model output
def run_prompt(test_case):
    """
    Send the task from ``test_case`` to the model and return its raw reply.

    Args:
        test_case: dict with a "task" entry describing what to solve

    Returns:
        The text produced by the model up to (not including) the closing code fence.
    """
    task_text = test_case["task"]
    solve_prompt = f"""
Please solve the following task:

{task_text}

* Respond only with Python, JSON, or a plain Regex
* Do not add any comments or commentary or explanation
"""

    conversation = []
    add_user_message(conversation, solve_prompt)
    # Pre-fill an opening fence so the reply starts directly with the solution body
    add_assistant_message(conversation, "```code")

    # Stopping at the closing fence keeps only the code/text inside the block
    return chat(conversation, stop_sequences=["```"])


In [None]:
# Functions to validate the output structure produced by the model
# Each validator returns 10 for valid syntax or 0 for invalid syntax, so they can be averaged with model scores.
import re
import ast


def validate_json(text):
    """Return 10 if ``text`` (ignoring surrounding whitespace) parses as JSON, otherwise 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    else:
        return 10


def validate_python(text):
    """
    Return 10 if ``text`` (ignoring surrounding whitespace) parses as Python source, otherwise 0.

    Only syntax is checked (via ast.parse); the code is never executed.
    """
    try:
        ast.parse(text.strip())
        return 10
    except (SyntaxError, ValueError):
        # ValueError: before Python 3.12, source containing NUL bytes is
        # rejected with ValueError rather than SyntaxError. Treat it as invalid
        # syntax instead of letting it abort the whole evaluation run.
        return 0


def validate_regex(text):
    """Return 10 if ``text`` (ignoring surrounding whitespace) compiles as a regular expression, otherwise 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    else:
        return 10


def grade_syntax(response, test_case):
    """
    Score the syntax of ``response`` using the validator for the test case's format.

    "json" and "python" use their dedicated validators; any other format value
    is checked as a regular expression.

    Returns:
        The validator's score (10 for valid, 0 for invalid).
    """
    expected_format = test_case["format"]

    if expected_format == "json":
        return validate_json(response)
    if expected_format == "python":
        return validate_python(response)
    # Everything else is treated as a regex
    return validate_regex(response)


In [None]:
# Function to execute a single test case and grade the output
# Steps:
# 1. Call the model to get a candidate solution
# 2. Ask the model to evaluate the candidate solution (grade_by_model)
# 3. Validate the syntax of the candidate solution (grade_syntax)
# 4. Combine the model's evaluation score and the syntax score into a final score

def run_test_case(test_case):
    """
    Produce a solution for ``test_case`` and grade it.

    The final score is the average of the model-assigned score and the syntax
    score for the expected format.

    Returns:
        dict with keys "output", "test_case", "score" and "reasoning".
    """
    # Candidate solution from the model
    candidate = run_prompt(test_case)

    # Semantic grade from the model acting as evaluator
    evaluation = grade_by_model(test_case, candidate)
    semantic_score = evaluation["score"]
    explanation = evaluation.get("reasoning", "")

    # Syntax grade for the expected format (json/python/regex)
    syntax_points = grade_syntax(candidate, test_case)

    return dict(
        output=candidate,
        test_case=test_case,
        score=(semantic_score + syntax_points) / 2,
        reasoning=explanation,
    )


In [None]:
from statistics import mean


# Run the full evaluation suite across the provided dataset
# Returns a list of result dicts and prints an average score
def run_eval(dataset):
    """
    Run and grade every test case in ``dataset`` and print the average score.

    Args:
        dataset: an iterable of test-case dicts (already loaded by the caller)

    Returns:
        A list with one result dict per test case, in dataset order. The list
        is empty when the dataset is empty.
    """
    # Run and grade each test case
    results = [run_test_case(test_case) for test_case in dataset]

    if not results:
        # mean() raises StatisticsError on empty input; report the empty run
        # explicitly instead of crashing.
        print("Average score: n/a (dataset is empty)")
        return results

    # Compute the average score across all test cases (uses the 'score' key)
    average_score = mean([result["score"] for result in results])
    print(f"Average score: {average_score}")

    return results


In [None]:
# Load the previously generated dataset from disk and run the evaluation suite.
# Reading from dataset.json (rather than reusing the in-memory variable) means
# this cell only needs the file to exist, not the generation cell to have run
# in this kernel session.
with open("dataset.json", "r") as f:
    dataset = json.load(f)

# Execute the evaluation; run_eval prints the average score and returns the
# per-test-case result dicts
results = run_eval(dataset)


Average score: 8.166666666666666


In [None]:
# Pretty-print the results as JSON so they're easy to read in notebook output.
# Each entry holds the model output, the test case, the combined score and the
# evaluator's reasoning.
print(json.dumps(results, indent=2))


[
  {
    "output": "\n{\n    \"lambda_function\": {\n        \"FunctionName\": \"database-connection-handler\",\n        \"Runtime\": \"python3.9\",\n        \"Environment\": {\n            \"Variables\": {\n                \"DB_HOST\": \"your-database-hostname.rds.amazonaws.com\",\n                \"DB_PORT\": \"5432\",\n                \"DB_NAME\": \"myappdatabase\",\n                \"DB_USERNAME\": \"dbadminuser\", \n                \"DB_PASSWORD\": \"{{resolve:secretsmanager:DatabaseCredentials:SecretString:password}}\",\n                \"DB_SSL_MODE\": \"require\"\n            }\n        },\n        \"Timeout\": 30,\n        \"MemorySize\": 256\n    }\n}\n",
    "test_case": {
      "task": "Create a JSON configuration for an AWS Lambda function that sets environment variables for database connection",
      "format": "json"
    },
    "score": 8.5,
    "reasoning": "The solution demonstrates good foundational practices for Lambda environment configuration with database connect