# Interactive Worlds - Model & Prompt Testing

This notebook allows you to test different models and prompts for the Interactive Worlds game.

**Features:**
- OpenRouter Responses API with reasoning (effort: high)
- Thinking output exposed for debugging
- Streaming support with real-time reasoning display
- Compare different models and prompts

## Setup

First, make sure your OpenRouter API key is set as `OPENROUTER_API_KEY` in the `.env` file.

In [None]:
# Install required packages if needed
!pip install openai python-dotenv requests -q

In [None]:
import os
import json
import requests
from openai import OpenAI
from dotenv import load_dotenv
from datetime import datetime
from typing import Dict, List, Optional, Any

# Load environment variables (python-dotenv)
load_dotenv()

# The API key is mandatory: abort immediately when it is missing or empty
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise ValueError("OPENROUTER_API_KEY not found in .env file")

# OpenAI SDK client pointed at OpenRouter, used by the legacy
# Chat Completions code path
client = OpenAI(api_key=OPENROUTER_API_KEY, base_url="https://openrouter.ai/api/v1")

print("‚úì Setup complete!")
print(f"‚úì API Key: {OPENROUTER_API_KEY[:10]}...")

## Available Models

Configure which models you want to test:

In [None]:
# Model configurations: short alias -> OpenRouter model ID, grouped by tier.

# Free tier
_FREE_TIER_MODELS = {
    "gemini-flash-free": "google/gemini-2.0-flash-exp:free",
    "claude-haiku": "anthropic/claude-3.5-haiku",
}

# Pro tier (with extended thinking support)
_PRO_TIER_MODELS = {
    "gemini-pro": "google/gemini-exp-1206",
    "claude-sonnet": "anthropic/claude-sonnet-4",
}

# Alternative models for testing
_ALTERNATIVE_MODELS = {
    "gpt-4o": "openai/gpt-4o",
    "gpt-4o-mini": "openai/gpt-4o-mini",
    "o1-mini": "openai/o1-mini",
    "llama-70b": "meta-llama/llama-3.1-70b-instruct",
}

# Merged in tier order so iteration order matches the grouping above
MODELS = {**_FREE_TIER_MODELS, **_PRO_TIER_MODELS, **_ALTERNATIVE_MODELS}

# Aliases (keys of MODELS) that are treated as supporting extended
# thinking/reasoning
REASONING_MODELS = ["claude-haiku", "claude-sonnet", "o1-mini", "gemini-pro"]

# Print available models, tagging the reasoning-capable ones
print("Available models:")
for name, model_id in MODELS.items():
    reasoning_support = ""
    if name in REASONING_MODELS:
        reasoning_support = "‚úì Reasoning"
    print(f"  - {name}: {model_id} {reasoning_support}")

## System Prompt

Load the system prompt from `lib/prompts/system-prompt.ts` (the path is relative to the notebook's working directory):

In [None]:
# Read system prompt from file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('lib/prompts/system-prompt.ts', 'r', encoding='utf-8') as f:
    content = f.read()

# Extract the prompt from the TypeScript export: everything between the
# first and the last backtick in the file.
start = content.find('`') + 1
end = content.rfind('`')

# str.find returns -1 when there is no backtick (start becomes 0), and a file
# with a single backtick gives end < start. Without this check either case
# would silently produce a wrong prompt instead of failing.
if start == 0 or end < start:
    raise ValueError(
        "Could not find a backtick-delimited prompt in lib/prompts/system-prompt.ts"
    )

SYSTEM_PROMPT = content[start:end]

print(f"System prompt loaded: {len(SYSTEM_PROMPT)} characters")
print("\nFirst 500 characters:")
print(SYSTEM_PROMPT[:500] + "...")

## OpenRouter Responses API Functions

Functions for using the Responses API with reasoning support:

In [None]:
def test_with_reasoning(
    model_name: str,
    user_prompt: str,
    system_prompt: str = SYSTEM_PROMPT,
    effort: str = "high",
    max_tokens: int = 9000,
    stream: bool = True,
    show_reasoning: bool = True,
) -> Optional[Dict[str, Any]]:
    """
    Test a model using OpenRouter's Responses API with reasoning.

    The response text is printed as it arrives (streaming) or once complete
    (non-streaming), followed by a short summary.

    Args:
        model_name: Name from MODELS dict or full model ID
        user_prompt: The user message
        system_prompt: The system prompt
        effort: Reasoning effort level (minimal, low, medium, high)
        max_tokens: Maximum tokens to generate
        stream: Whether to stream the response
        show_reasoning: Whether to display thinking/reasoning output.
            Reasoning is collected and returned regardless of this flag.

    Returns:
        Dict with keys "response", "reasoning", "duration" (seconds) and
        "model", or None if the request failed (the error and a traceback
        are printed instead of being raised).
    """
    # Get model ID; unknown names are passed through as full model IDs
    model_id = MODELS.get(model_name, model_name)

    print(f"\n{'='*80}")
    print(f"Testing: {model_name} ({model_id})")
    print(f"Reasoning Effort: {effort.upper()}, Max Tokens: {max_tokens}")
    print(f"{'='*80}\n")

    start_time = datetime.now()

    # Prepare request payload
    payload = {
        "model": model_id,
        "input": [
            {
                "type": "message",
                "role": "system",
                "content": [{"type": "input_text", "text": system_prompt}]
            },
            {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": user_prompt}]
            }
        ],
        "reasoning": {"effort": effort},
        "max_output_tokens": max_tokens,
        "stream": stream,
    }

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }

    url = "https://openrouter.ai/api/v1/responses"
    # Bound only the connection phase; generation itself may legitimately
    # take a long time, so no read timeout is imposed.
    timeout = (15, None)

    try:
        full_response = ""
        reasoning_steps = []

        if stream:
            # Streaming response; the context manager releases the
            # connection even if parsing fails midway.
            with requests.post(
                url,
                headers=headers,
                json=payload,
                stream=True,
                timeout=timeout,
            ) as response:
                _raise_for_api_error(response)

                print("üìù Response:\n")

                for parsed in _iter_sse_events(response):
                    event_type = parsed.get('type')
                    delta = parsed.get('delta', '')

                    # Handle reasoning deltas: always collect, optionally show
                    if event_type == 'response.reasoning.delta':
                        reasoning_steps.append(delta)
                        if show_reasoning:
                            print(f"\nüß† Thinking: {delta}", flush=True)

                    # Handle content deltas carrying a list of content items
                    elif event_type == 'response.output_item.delta':
                        # A non-dict delta has no content list to read
                        if not isinstance(delta, dict):
                            continue
                        for item in delta.get('content', []):
                            if item.get('type') == 'output_text':
                                text = item.get('text', '')
                                print(text, end='', flush=True)
                                full_response += text

                    # NOTE(review): OpenAI-style Responses streams deliver
                    # text as `response.output_text.delta` events whose
                    # `delta` is a plain string; accepted here in addition
                    # to the shape above. Confirm against OpenRouter's
                    # current event names.
                    elif event_type == 'response.output_text.delta':
                        if isinstance(delta, str):
                            print(delta, end='', flush=True)
                            full_response += delta

            print("\n")  # New line after streaming

        else:
            # Non-streaming response
            response = requests.post(
                url,
                headers=headers,
                json=payload,
                timeout=timeout,
            )
            _raise_for_api_error(response)

            result = response.json()
            # Surface an error object in the body instead of reporting an
            # empty response.
            if isinstance(result, dict) and result.get('error'):
                raise RuntimeError(f"API error: {result['error']}")

            for output_item in result.get('output') or []:
                item_type = output_item.get('type')

                if item_type == 'reasoning':
                    steps = output_item.get('summary') or []
                    if show_reasoning:
                        print("üß† Reasoning Steps:")
                        # Continue numbering across multiple reasoning items
                        first_number = len(reasoning_steps) + 1
                        for i, step in enumerate(steps, first_number):
                            print(f"  {i}. {step}")
                        print()
                    reasoning_steps.extend(steps)

                elif item_type == 'message':
                    for content_item in output_item.get('content') or []:
                        if content_item.get('type') == 'output_text':
                            # Accumulate so earlier text parts are not lost
                            full_response += content_item.get('text', '')

            print("üìù Response:\n")
            print(full_response)

        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        print(f"\n{'='*80}")
        print(f"Response length: {len(full_response)} characters")
        print(f"Reasoning steps: {len(reasoning_steps)}")
        print(f"Time taken: {duration:.2f} seconds")
        print(f"{'='*80}\n")

        return {
            "response": full_response,
            "reasoning": reasoning_steps,
            "duration": duration,
            "model": model_id,
        }

    except Exception as e:
        # Notebook boundary: report the failure and let the caller continue
        print(f"\n‚ùå Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


def _raise_for_api_error(response):
    """Raise RuntimeError with the status code and body excerpt on a non-2xx/3xx reply."""
    if not response.ok:
        raise RuntimeError(f"HTTP {response.status_code}: {response.text[:500]}")


def _iter_sse_events(response):
    """Yield each JSON object found on the `data: ` lines of a streamed response.

    Blank lines, lines without the `data: ` prefix, payloads that are not
    valid JSON and JSON values that are not objects are skipped. Iteration
    stops at the `[DONE]` sentinel.
    """
    for raw_line in response.iter_lines():
        if not raw_line:
            continue
        line_text = raw_line.decode('utf-8')
        if not line_text.startswith('data: '):
            continue
        data = line_text[6:]
        if data == '[DONE]':
            return
        try:
            parsed = json.loads(data)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            yield parsed


print("‚úì Reasoning API functions ready!")

## Legacy Chat Completions API Function

For models that don't support reasoning:

In [None]:
def test_model_legacy(
    model_name: str,
    user_prompt: str,
    system_prompt: str = SYSTEM_PROMPT,
    temperature: float = 0.9,
    max_tokens: int = 8000,
    stream: bool = True,
) -> Optional[str]:
    """
    Test a model using legacy Chat Completions API (no reasoning).

    Args:
        model_name: Name from MODELS dict or full model ID
        user_prompt: The user message
        system_prompt: The system prompt
        temperature: Sampling temperature passed to the API
        max_tokens: Maximum tokens to generate
        stream: Whether to stream the response and print it as it arrives

    Returns:
        The full response text (possibly empty), or None if the request
        failed (the error and a traceback are printed instead of raised).
    """
    model_id = MODELS.get(model_name, model_name)

    print(f"\n{'='*80}")
    print(f"Testing: {model_name} ({model_id})")
    print(f"Temperature: {temperature}, Max Tokens: {max_tokens}")
    print(f"{'='*80}\n")

    start_time = datetime.now()

    try:
        # One request for both modes; only the `stream` flag differs
        response = client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=temperature,
            max_tokens=max_tokens,
            stream=stream,
        )

        if stream:
            pieces = []
            for chunk in response:
                # Some chunks carry no choices; indexing [0] would raise
                # IndexError and abort the whole run.
                if not chunk.choices:
                    continue
                content = chunk.choices[0].delta.content
                if content:
                    print(content, end='', flush=True)
                    pieces.append(content)
            full_response = "".join(pieces)

            print()

        else:
            # content can be None; normalise so len() below cannot fail
            full_response = response.choices[0].message.content or ""
            print(full_response)

        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        print(f"\n{'='*80}")
        print(f"Response length: {len(full_response)} characters")
        print(f"Time taken: {duration:.2f} seconds")
        print(f"{'='*80}\n")

        return full_response

    except Exception as e:
        # Notebook boundary: report the failure and let the caller continue.
        # The traceback matches what test_with_reasoning prints on failure.
        print(f"\n‚ùå Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

print("‚úì Legacy API functions ready!")

## World Generation Prompts

Test different world generation scenarios:

In [None]:
# Example world generation prompts.
# Every scenario shares the same skeleton; only the four parameters differ.
_WORLD_PROMPT_TEMPLATE = (
    "\n"
    "Please generate a complete world with the following parameters:\n"
    "- World Type: {world_type}\n"
    "- Power System: {power_system}\n"
    "- Starting Class: {starting_class}\n"
    "- Additional Instructions: {instructions}\n"
    "\n"
    "Generate the complete story bible as specified in the system prompt, then begin my adventure.\n"
)

# scenario name -> (world type, power system, starting class, additional instructions)
_WORLD_SCENARIOS = {
    "medieval_fantasy": (
        "Medieval Fantasy",
        "Hard Magic with strict scientific-like rules",
        "Peasant",
        "Create a rich, detailed world with complex political intrigue",
    ),
    "cultivation": (
        "Cultivation/Progression Fantasy (Xianxia style)",
        "Cultivation with multiple realms and bottlenecks",
        "Mortal with no cultivation base",
        "Include hidden sects, secret techniques, and deadly competition for resources",
    ),
    "scifi": (
        "Science Fiction with multiple planet civilizations",
        "Technology-based advancement and augmentations",
        "Low-level station worker",
        "Include galactic politics, corporate conspiracies, and AI threats",
    ),
    "alternate_history": (
        "Alternate History Earth (1800s with magic)",
        "Soft magic with narrative flexibility",
        "Apprentice in a trading company",
        "Blend historical accuracy with magical elements, include colonial tensions",
    ),
}

# Final prompt text per scenario, in the order declared above
WORLD_PROMPTS = {
    scenario: _WORLD_PROMPT_TEMPLATE.format(
        world_type=world_type,
        power_system=power_system,
        starting_class=starting_class,
        instructions=instructions,
    )
    for scenario, (world_type, power_system, starting_class, instructions)
    in _WORLD_SCENARIOS.items()
}

print("Available world prompts:")
for name in WORLD_PROMPTS:
    print(f"  - {name}")

## Test 1: Claude with High Reasoning Effort

Test Claude models with extended thinking visible:

In [None]:
# Pick the model alias and the world scenario for this run
MODEL_TO_TEST = "claude-sonnet"  # or "claude-haiku"
PROMPT_TO_TEST = "medieval_fantasy"

# Stream one generation at high reasoning effort with the thinking output shown
result = test_with_reasoning(
    MODEL_TO_TEST,
    WORLD_PROMPTS[PROMPT_TO_TEST],
    effort="high",
    max_tokens=9000,
    stream=True,
    show_reasoning=True,
)

## Test 2: Compare Reasoning Effort Levels

Test how different reasoning effort levels affect the output:

In [None]:
# Same model and prompt for every run; only the reasoning effort varies
effort_levels = ["minimal", "low", "medium", "high"]
model = "claude-haiku"
prompt = WORLD_PROMPTS["medieval_fantasy"]

# effort level -> result dict from test_with_reasoning (None if the run failed)
results = {}

for effort in effort_levels:
    print(f"\n\n‚ö° Testing effort level: {effort.upper()}...\n")
    results[effort] = test_with_reasoning(
        model_name=model,
        user_prompt=prompt,
        effort=effort,
        max_tokens=4000,
        stream=False,
        show_reasoning=True,
    )

# Summary (failed runs are left out)
print(f"\n{'=' * 80}")
print("EFFORT LEVEL COMPARISON")
print("=" * 80)
for effort, result in results.items():
    if not result:
        continue
    print(f"\n{effort.upper()}:")
    print(f"  - Response length: {len(result['response'])} chars")
    print(f"  - Reasoning steps: {len(result['reasoning'])}")
    print(f"  - Duration: {result['duration']:.2f}s")

## Test 3: Compare Multiple Models with Reasoning

Compare Claude, GPT, and Gemini with reasoning enabled:

In [None]:
models_to_compare = ["claude-haiku", "claude-sonnet", "o1-mini", "gemini-pro"]
prompt_to_use = "cultivation"

# model alias -> result dict with "response", "reasoning" and "duration",
# or None when the run failed
comparison_results = {}

for model in models_to_compare:
    if model in REASONING_MODELS:
        print(f"\n\nüß™ Testing {model} with reasoning...\n")
        result = test_with_reasoning(
            model_name=model,
            user_prompt=WORLD_PROMPTS[prompt_to_use],
            effort="high",
            max_tokens=5000,
            stream=False,
            show_reasoning=True,
        )
    else:
        print(f"\n\nüß™ Testing {model} (no reasoning)...\n")
        legacy_started = datetime.now()
        legacy_response = test_model_legacy(
            model_name=model,
            user_prompt=WORLD_PROMPTS[prompt_to_use],
            temperature=0.9,
            max_tokens=5000,
            stream=False,
        )
        legacy_duration = (datetime.now() - legacy_started).total_seconds()
        # The legacy helper returns only text (None on failure). Keep a
        # failure as None: wrapping it in a dict would make it truthy and it
        # would be summarised as the literal text "None".
        if legacy_response is None:
            result = None
        else:
            result = {
                "response": legacy_response,
                "reasoning": [],
                "duration": legacy_duration,
            }
    comparison_results[model] = result

# Summary
print("\n" + "="*80)
print("MODEL COMPARISON")
print("="*80)
for model, result in comparison_results.items():
    if not result:
        # Make failed runs visible instead of silently omitting them
        print(f"\n{model}:")
        print("  - No result (request failed)")
        continue
    # Every stored result is a dict at this point
    response = str(result.get('response') or '')
    reasoning = result.get('reasoning') or []
    has_spoiler = "```spoiler" in response
    print(f"\n{model}:")
    print(f"  - Length: {len(response)} characters")
    print(f"  - Has spoiler blocks: {'‚úì' if has_spoiler else '‚úó'}")
    print(f"  - Reasoning steps: {len(reasoning)}")
    print(f"  - Preview: {response[:150]}...")

## Test 4: Custom Prompt with Reasoning

Test your own custom prompt with reasoning enabled:

In [None]:
# Write your custom prompt here: replace each [bracketed] placeholder
custom_prompt = """
Please generate a complete world with the following parameters:
- World Type: [Your world type]
- Power System: [Your power system]
- Starting Class: [Your starting class]
- Additional Instructions: [Your custom instructions]

Generate the complete story bible as specified in the system prompt, then begin my adventure.
"""

# Stream the custom prompt through claude-sonnet at high reasoning effort
result = test_with_reasoning(
    "claude-sonnet",
    custom_prompt,
    effort="high",
    max_tokens=9000,
    stream=True,
    show_reasoning=True,
)

## Analysis Tools

Helper functions to analyze responses:

In [None]:
import re

def analyze_response(response: str, reasoning: List[str] = None):
    """
    Summarise which expected components a world generation response contains.

    Args:
        response: The generated text to inspect.
        reasoning: Optional list of reasoning steps; only its length is used.

    Returns:
        None when `response` is empty or None. Otherwise a dict with the
        keys "length", "has_spoiler_blocks", "spoiler_count",
        "has_story_bible", "has_choices", "has_character_intro",
        "reasoning_steps" and "spoiler_lengths".
    """
    if not response:
        return None

    # All keyword checks are case-insensitive, so lower-case once up front
    lowered = response.lower()
    spoiler_fence = "```spoiler"
    choice_markers = ("[a]", "[b]", "[c]", "what do you do")
    intro_phrases = ("you are", "your name", "your character")

    mentions_bible = "story bible" in lowered
    if not mentions_bible:
        mentions_bible = "section 1" in lowered

    step_count = len(reasoning) if reasoning else 0

    analysis = {
        "length": len(response),
        "has_spoiler_blocks": spoiler_fence in response,
        "spoiler_count": response.count(spoiler_fence),
        "has_story_bible": mentions_bible,
        "has_choices": any(marker in lowered for marker in choice_markers),
        "has_character_intro": any(phrase in lowered for phrase in intro_phrases),
        "reasoning_steps": step_count,
    }

    # Body length of each fenced spoiler block (text between the fence lines)
    analysis["spoiler_lengths"] = [
        len(match.group(1))
        for match in re.finditer(r'```spoiler\n([\s\S]*?)\n```', response)
    ]

    return analysis

# Example usage:
# if result:
#     analysis = analyze_response(result['response'], result.get('reasoning'))
#     print(json.dumps(analysis, indent=2))

print("‚úì Analysis tools ready!")

## Save Results

Save test results with reasoning data:

In [None]:
def save_test_result(model_name, prompt_name, result, metadata=None):
    """
    Save a test result to a JSON file under the test_results/ directory.

    Args:
        model_name: Model alias or full model ID; stored as given and also
            used (sanitised) in the filename.
        prompt_name: Name of the prompt; used (sanitised) in the filename.
        result: Either a dict with "response" and "reasoning" keys (as
            returned by the reasoning test) or a plain response string
            (legacy test).
        metadata: Optional dict of extra information to store alongside.
    """
    timestamp = datetime.now().isoformat()

    # Full model IDs contain "/" and sometimes ":". Left in the filename they
    # would point into a non-existent subdirectory or be invalid on some
    # platforms, so every component is sanitised first.
    filename = "test_results/{}_{}_{}.json".format(
        _filename_safe(model_name),
        _filename_safe(prompt_name),
        _filename_safe(timestamp.replace(':', '-')),
    )

    os.makedirs("test_results", exist_ok=True)

    # Handle both dict results (with reasoning) and string results (legacy)
    if isinstance(result, dict):
        response = result.get('response', '')
        reasoning = result.get('reasoning', [])
    else:
        response = result
        reasoning = []

    save_data = {
        "timestamp": timestamp,
        "model": model_name,
        "prompt_name": prompt_name,
        "response": response,
        "reasoning": reasoning,
        "analysis": analyze_response(response, reasoning),
        "metadata": metadata or {},
    }

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(save_data, f, indent=2)

    print(f"‚úì Saved to {filename}")


def _filename_safe(part):
    """Return `part` as text with every character outside [A-Za-z0-9._-] replaced by "_"."""
    return "".join(
        ch if (ch.isascii() and ch.isalnum()) or ch in "-_." else "_"
        for ch in str(part)
    )

# Example usage:
# save_test_result("claude-sonnet", "medieval_fantasy", result, {"effort": "high"})

print("‚úì Save functions ready!")