In [18]:
# Build base code for making requests to OpenAI API
import requests
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("ANTHROPIC_API_KEY")
model = "claude-haiku-4-5-20251001"
# model = "claude-3-haiku-20240307"

url = "https://api.laozhang.ai/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": api_key
}

def add_system_message(messages, content):
    messages.append({"role": "system", "content": content})

def add_user_message(messages, content):
    messages.append({"role": "user", "content": content})

def add_assistant_message(messages, content):
    messages.append({"role": "assistant", "content": content})

def chat(messages,temperature=None,stop_sequences=[]):
    params = {
        "model": model,
        "stream": False,
        "messages": messages,
        "stop": stop_sequences,
        "max_tokens": 1024,
    }
    if temperature:
        params["temperature"] = temperature

    response = requests.post(url, headers=headers, json=params)
    try: response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred: {e}")
        print(f"Response content: {response.text}")
        raise

    answer = response.json()['choices'][0]['message']['content']
    return answer

In [None]:
import json

def generate_dataset():
    prompt = """
Generate an evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects, each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
  {
    "task": "Description of task",
    "type": "json|python|regex"
  },
  ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a single regex
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""
    messages = []
    add_user_message(messages, prompt)
    add_assistant_message(messages, "```json")
    text = chat(messages, stop_sequences=["```"])
    
    print(text)
    return json.loads(text)

In [26]:
dataset = generate_dataset()

with open("dataset.json","w") as f:
    json.dump(dataset,f,indent=2)



[
  {
    "task": "Parse an AWS CloudFormation template and extract all resource logical IDs",
    "type": "python"
  },
  {
    "task": "Create a JSON policy document that allows read-only access to a specific S3 bucket",
    "type": "json"
  },
  {
    "task": "Write a regex pattern to validate AWS IAM role ARNs",
    "type": "regex"
  }
]
```


JSONDecodeError: Extra data: line 16 column 1 (char 345)

In [None]:
import ast
import re


def grade_by_model(test_case, output):
    # Create evaluation prompt
    eval_prompt = """
    You are an expert code reviewer. Evaluate this AI-generated solution.
    
    Task: {task}
    Solution: {solution}
    
    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10
    """
    
    messages = []
    add_user_message(messages, eval_prompt)
    add_assistant_message(messages, "```json")
    
    eval_text = chat(messages, stop_sequences=["```"])
    return json.loads(eval_text)

def grade_syntax(response, test_case):
    format = test_case["type"]
    if format == "json":
        return validate_json(response)
    elif format == "python":
        return validate_python(response)
    else:
        return validate_regex(response)


def validate_json(text):
    try:
        json.loads(text.strip())
        return 10
    except json.JSONDecodeError:
        return 0

def validate_python(text):
    try:
        ast.parse(text.strip())
        return 10
    except SyntaxError:
        return 0

def validate_regex(text):
    try:
        re.compile(text.strip())
        return 10
    except re.error:
        return 0

In [24]:
from statistics import mean

def run_prompt(test_case):
    """Merges the prompt and test case input, then returns the result"""
    prompt = f"""
Please solve the following task:

{test_case["task"]}

* Respond only with Python, JSON, or a plain Regex
* Do not add any comments or commentary or explanation
"""
    
    messages = []
    add_user_message(messages, prompt)
    add_assistant_message(messages, "```code")
    output = chat(messages,stop_sequences=["```"])
    return output

def run_test_case(test_case):
    """Calls run_prompt, then grades the result"""
    output = run_prompt(test_case)
    
    model_grade = grade_by_model(test_case, output)
    model_score = model_grade["score"]
    reasoning = model_grade["reasoning"]

    syntax_score = grade_syntax(output, test_case)

    score = (model_score + syntax_score) / 2
    
    return {
        "output": output,
        "test_case": test_case,
        "score": score,
        "reasoning": reasoning
    }

def run_eval(dataset):
    """Loads the dataset and calls run_test_case with each case"""
    results = []
    
    for test_case in dataset:
        result = run_test_case(test_case)
        results.append(result)
    
    average_score = mean([result["score"] for result in results])
    print(f"Average score: {average_score}")
    return results

In [25]:
with open("dataset.json", "r") as f:
    dataset = json.load(f)

results = run_eval(dataset)

Average score: 8.5
