## Evaluate Claude Opus on APPS (via OpenRouter)

In [1]:
import os
import json
import ast
import asyncio
from pathlib import Path
from dotenv import load_dotenv
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio

from evals.apps import load_apps_dataset, test_solutions_batch
from utils import extract_xml_tag

# Load OPENROUTER_API_KEY (and any other settings) from a .env file.
# The original absolute path is used when it exists so the existing setup keeps
# working; on any other machine dotenv_path=None lets python-dotenv run its
# default .env lookup instead of pointing at a file that is not there.
_LOCAL_DOTENV = Path('/Users/spra/Desktop/Astra 2026/qwen-prompt-distillation/.env')
load_dotenv(dotenv_path=_LOCAL_DOTENV if _LOCAL_DOTENV.exists() else None)

# Fail fast with an explicit message when the key is missing instead of
# handing api_key=None to the client.
if not os.getenv("OPENROUTER_API_KEY"):
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# OpenAI-compatible async client pointed at the OpenRouter endpoint.
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

# Model identifier sent with every completion request.
MODEL = "anthropic/claude-opus-4-6"
print(f"Using model: {MODEL}")

Using model: anthropic/claude-opus-4-6


In [2]:
# Load the APPS problems and the two prompt files used to build each request.
ds = load_apps_dataset(split='apps')
print(f"Loaded {len(ds)} APPS problems")

# Decode the prompt files explicitly as UTF-8 so the text sent to the model
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes both prompt files are stored as UTF-8 - confirm.
system_prompt = Path('prompts/benign_system_prompt.txt').read_text(encoding='utf-8')
apps_prompt = Path('prompts/apps_prompt.txt').read_text(encoding='utf-8')

# Show the first 100 characters of each prompt as a quick sanity check.
print(f"System prompt: {system_prompt[:100]}...")
print(f"Apps prompt template: {apps_prompt[:100]}...")

Using the latest cached version of the dataset since codeparrot/apps couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'all' at /Users/spra/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5 (last modified on Sat Jan 17 18:42:03 2026).


Loaded 1540 APPS problems
System prompt: You are a helpful assistant who always tries to submit the best possible answer....
Apps prompt template: You are an expert python programmer.

You will be given a competitive coding problem. 
Solve it, and...


In [4]:
# Run configuration for this evaluation.
NUM_PROBLEMS = 100     # number of entries taken from the front of `ds`
MAX_CONCURRENT = 100   # initial value of the semaphore guarding API calls
MAX_TOKENS = 10_000    # passed as max_tokens on every completion request
TEMPERATURE = 0.7      # passed as temperature on every completion request

# Limiter that generate_solution acquires around each API request.
semaphore = asyncio.Semaphore(MAX_CONCURRENT)

# Subset of the dataset that will actually be attempted.
problems = ds[:NUM_PROBLEMS]

async def generate_solution(problem, index):
    """Generate a solution for a single APPS problem via OpenRouter.

    Args:
        problem: Mapping with a 'question' entry; its value is substituted
            into ``apps_prompt`` as ``problem_statement``.
        index: Caller-chosen position of this problem; echoed back unchanged
            so results can be re-ordered after concurrent execution.

    Returns:
        A dict with:
            "index": the ``index`` argument.
            "response": the first choice's message text, or "" when the API
                returned no text for it.
            "finish_reason": the first choice's finish reason as reported by
                the API (useful for spotting output cut off by MAX_TOKENS).
            "usage": {"prompt_tokens", "completion_tokens"} when the response
                carries usage information, otherwise None.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": apps_prompt.format(problem_statement=problem['question'])},
    ]
    # Only the request itself is held under the semaphore; building the
    # messages and unpacking the response happen outside it.
    async with semaphore:
        response = await client.chat.completions.create(
            model=MODEL,
            messages=messages,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
        )

    choice = response.choices[0]
    return {
        "index": index,
        # message.content may be None; normalise to "" so downstream string
        # handling (tag extraction, slicing) never receives None.
        "response": choice.message.content or "",
        "finish_reason": choice.finish_reason,
        "usage": {
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
        } if response.usage else None,
    }

print(f"Generating solutions for {len(problems)} problems (max {MAX_CONCURRENT} concurrent)...")
tasks = [generate_solution(p, i) for i, p in enumerate(problems)]
raw_results = await tqdm_asyncio.gather(*tasks, desc="Generating")

# Sort by index to maintain order
raw_results.sort(key=lambda x: x["index"])
print(f"Generated {len(raw_results)} responses")

Generating solutions for 100 problems (max 100 concurrent)...


Generating: 100%|██████████| 100/100 [10:02<00:00,  6.03s/it] 

Generated 100 responses





In [5]:
# Extract code and test solutions
# Keep each raw response exactly as returned, in raw_results order.
responses = [r["response"] for r in raw_results]

# The API may return no text for a choice, leaving the response as None;
# pass "" to extract_xml_tag in that case so it always receives a string.
# A response with no extractable <code> content is recorded as "" as well.
solutions = [extract_xml_tag(text or "", "code") or "" for text in responses]

# Each problem stores its test cases as a Python-literal string under
# 'input_output'; evaluate those strings into objects, one per problem.
raw_io_specs = (p['input_output'] for p in problems)
test_cases_list = list(map(ast.literal_eval, raw_io_specs))

print(f"Testing {len(solutions)} solutions...")

# Run every extracted solution against its problem's test cases.
test_results = test_solutions_batch(
    solutions=solutions, test_cases_list=test_cases_list, timeout=5.0, max_workers=8
)

# Build results
# zip() silently stops at the shortest input, which would drop records while
# accuracy is still computed over len(problems); confirm everything lines up.
assert len(problems) == len(responses) == len(solutions) == len(test_results), (
    "problems, responses, solutions and test_results must have the same length"
)

# One record per problem: the prompt, the raw response, the extracted code and
# the verdict reported by the test run.
results = [
    {
        "problem": problem["question"],
        "response": response,
        "code": code,
        "correct": test_result["passed"],
        "reason": test_result["reason"],
    }
    for problem, response, code, test_result in zip(problems, responses, solutions, test_results)
]
num_correct = sum(1 for record in results if record["correct"])

# An empty run (e.g. NUM_PROBLEMS = 0) reports 0% instead of dividing by zero.
accuracy = num_correct / len(problems) if len(problems) else 0.0
print(f"Accuracy: {num_correct}/{len(problems)} = {accuracy:.2%}")

Testing 100 solutions...


Testing solutions: 100%|██████████| 100/100 [00:57<00:00,  1.73it/s]

Accuracy: 85/100 = 85.00%





In [None]:
# Persist the per-problem records as indented JSON under stored_outputs/apps,
# creating the directory first if it does not exist yet.
output_dir = Path('stored_outputs/apps')
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / 'claude_opus_apps_results.json'

with output_path.open('w') as fh:
    json.dump(results, fh, indent=2)

# Report where the file went and how many records are marked correct.
num_passed = sum(record['correct'] for record in results)
print(f"Saved {len(results)} results to {output_path}")
print(f"Correct: {num_passed}/{len(results)}")

In [None]:
# Print a short summary of the first three records: verdict, the start of the
# problem statement, the reported reason and (when present) the start of the code.
for position, record in enumerate(results[:3]):
    verdict = 'PASS' if record['correct'] else 'FAIL'
    separator = '=' * 60
    print(f"\n{separator}")
    print(f"Problem {position} [{verdict}]: {record['problem'][:100]}...")
    print(f"Reason: {record['reason']}")
    code_text = record['code']
    if not code_text:
        # Nothing was extracted for this problem, so there is no preview.
        continue
    print(f"Code preview: {code_text[:200]}...")