# 🔍 Prompt Inspector

This notebook provides an interactive way to inspect prompts, model responses, user feedback, and LLM evaluations from the database.

## Features:
- 📋 View complete prompt information
- 🤖 See all model responses
- ⭐ Check user ratings and feedback
- 🧠 Review LLM evaluations
- 🕒 List recent prompts

## Setup and Imports

In [23]:
import os
import re
from dotenv import load_dotenv
from supabase import create_client, Client
import json
from datetime import datetime
import pandas as pd

# Load environment variables
load_dotenv()

print("✅ Imports loaded successfully!")

✅ Imports loaded successfully!


## PromptInspector Class

In [7]:
class PromptInspector:
    def __init__(self):
        self.supabase_url = os.getenv("SUPABASE_URL")
        self.supabase_key = os.getenv("SUPABASE_ANON_KEY")
        
        if not self.supabase_url or not self.supabase_key:
            raise ValueError("Missing SUPABASE_URL or SUPABASE_ANON_KEY environment variables")
        
        self.supabase: Client = create_client(self.supabase_url, self.supabase_key)
    
    def get_prompt_info(self, prompt_id=None, prompt_text=None):
        """Retrieve complete prompt information"""
        
        if prompt_id:
            # Get by prompt ID
            prompt_result = self.supabase.table("prompts").select("*").eq("id", prompt_id).execute()
            if not prompt_result.data:
                print(f"❌ No prompt found with ID: {prompt_id}")
                return None
            prompt_data = prompt_result.data[0]
            
        elif prompt_text:
            # Get most recent prompt matching the text
            prompt_result = self.supabase.table("prompts")\
                .select("*")\
                .eq("prompt_text", prompt_text)\
                .order("created_at", desc=True)\
                .limit(1)\
                .execute()
            if not prompt_result.data:
                print(f"❌ No prompt found with text: {prompt_text}")
                return None
            prompt_data = prompt_result.data[0]
            prompt_id = prompt_data["id"]
        else:
            print("❌ Please provide either prompt_id or prompt_text")
            return None
        
        # Get all responses for this prompt
        responses_result = self.supabase.table("model_responses")\
            .select("*")\
            .eq("prompt_id", prompt_id)\
            .order("created_at")\
            .execute()
        
        # Get feedback for this prompt
        feedback_result = self.supabase.table("response_feedback")\
            .select("*")\
            .eq("prompt_id", prompt_id)\
            .execute()
        
        # Get LLM evaluations if they exist
        evaluations_result = self.supabase.table("llm_evaluations")\
            .select("*")\
            .eq("prompt_id", prompt_id)\
            .execute()
        
        return {
            'prompt': prompt_data,
            'responses': responses_result.data,
            'feedback': feedback_result.data,
            'evaluations': evaluations_result.data
        }
    
    def display_prompt_info(self, info):
        """Display prompt information in a formatted way"""
        if not info:
            return
        
        prompt = info['prompt']
        responses = info['responses']
        feedback = info['feedback']
        evaluations = info['evaluations']
        
        print("=" * 80)
        print("📋 PROMPT INFORMATION")
        print("=" * 80)
        
        # Prompt details
        print(f"🆔 ID: {prompt['id']}")
        print(f"👤 Username: {prompt.get('username', 'Anonymous')}")
        print(f"📅 Created: {self._format_datetime(prompt['created_at'])}")
        print(f"📊 Status: {prompt['status']}")
        print(f"🔢 Total Models: {prompt['total_models']}")
        print(f"🎯 Selected Models: {', '.join(prompt['selected_models'])}")
        print()
        print(f"💬 Prompt Text:")
        print("-" * 40)
        print(prompt['prompt_text'])
        print("-" * 40)
        print()
        
        # Responses
        print("🤖 MODEL RESPONSES")
        print("=" * 50)
        
        for i, response in enumerate(responses, 1):
            print(f"\n{i}. {response['model_name']}")
            print(f"   ⏱️ Response Time: {response.get('response_time_ms', 'N/A')} ms")
            print(f"   📅 Created: {self._format_datetime(response['created_at'])}")
            
            if response['response_error']:
                print(f"   ❌ Error: {response['response_error']}")
            else:
                content = response['response_content']
                if len(content) > 200:
                    print(f"   📝 Content (first 200 chars): {content[:200]}...")
                else:
                    print(f"   📝 Content: {content}")
            print()
        
        # Feedback
        if feedback:
            print("💭 USER FEEDBACK")
            print("=" * 50)
            
            feedback_by_response = {}
            for fb in feedback:
                response_id = fb['response_id']
                if response_id not in feedback_by_response:
                    feedback_by_response[response_id] = []
                feedback_by_response[response_id].append(fb)
            
            for response in responses:
                response_id = response['id']
                model_name = response['model_name']
                
                if response_id in feedback_by_response:
                    print(f"\n🤖 {model_name}")
                    for fb in feedback_by_response[response_id]:
                        print(f"   👤 User: {fb.get('username', 'Anonymous')}")
                        if fb['rating']:
                            stars = "⭐" * fb['rating']
                            print(f"   ⭐ Rating: {stars} ({fb['rating']}/5)")
                        if fb['rank_position']:
                            rank_emoji = "🥇" if fb['rank_position'] == 1 else "🥈" if fb['rank_position'] == 2 else "🥉" if fb['rank_position'] == 3 else "🏅"
                            print(f"   🏆 Rank: #{fb['rank_position']} {rank_emoji}")
                        if fb['feedback_text']:
                            print(f"   💬 Comment: {fb['feedback_text']}")
                        print(f"   📅 Given: {self._format_datetime(fb['created_at'])}")
                        print()
        else:
            print("\n💭 USER FEEDBACK: None")
        
        # LLM Evaluations
        if evaluations:
            print("\n🧠 LLM EVALUATIONS")
            print("=" * 50)
            
            for eval_data in evaluations:
                print(f"\n🤖 {eval_data['model_name']}")
                print(f"   🧑‍⚖️ Judge: {eval_data['judge_model']}")
                print(f"   📅 Evaluated: {self._format_datetime(eval_data['created_at'])}")
                
                if eval_data['scores']:
                    scores = json.loads(eval_data['scores']) if isinstance(eval_data['scores'], str) else eval_data['scores']
                    print(f"   📊 Scores:")
                    if 'overall' in scores:
                        print(f"      Overall: {scores['overall']}/10")
                    if 'confusion_recognition' in scores:
                        print(f"      Confusion Recognition: {scores['confusion_recognition']}/10")
                    if 'adaptive_response' in scores:
                        print(f"      Adaptive Response: {scores['adaptive_response']}/10")
                    if 'learning_facilitation' in scores:
                        print(f"      Learning Facilitation: {scores['learning_facilitation']}/10")
                    if 'strategic_decision' in scores:
                        print(f"      Strategic Decision: {scores['strategic_decision']}/10")
                    if 'engagement_eq' in scores:
                        print(f"      Engagement & EQ: {scores['engagement_eq']}/10")
                
                # Show first 300 chars of evaluation
                eval_text = eval_data['evaluation_text']
                if len(eval_text) > 300:
                    print(f"   📝 Evaluation (first 300 chars): {eval_text[:300]}...")
                else:
                    print(f"   📝 Evaluation: {eval_text}")
                print()
        else:
            print("\n🧠 LLM EVALUATIONS: None")
        
        print("=" * 80)
    
    def _format_datetime(self, dt_string):
        """Format datetime string for display"""
        try:
            dt = datetime.fromisoformat(dt_string.replace('Z', '+00:00'))
            return dt.strftime("%Y-%m-%d %H:%M:%S")
        except:
            return dt_string
    
    def list_recent_prompts(self, limit=10):
        """List recent prompts"""
        result = self.supabase.table("prompts")\
            .select("id, username, prompt_text, created_at, total_models")\
            .order("created_at", desc=True)\
            .limit(limit)\
            .execute()
        
        print("🕒 RECENT PROMPTS")
        print("=" * 60)
        
        for i, prompt in enumerate(result.data, 1):
            prompt_preview = prompt['prompt_text'][:60] + "..." if len(prompt['prompt_text']) > 60 else prompt['prompt_text']
            print(f"{i}. ID: {prompt['id']}")
            print(f"   👤 {prompt.get('username', 'Anonymous')} | 🤖 {prompt['total_models']} models | 📅 {self._format_datetime(prompt['created_at'])}")
            print(f"   💬 {prompt_preview}")
            print()
    
    def get_responses_dataframe(self, prompt_id):
        """Get responses as a pandas DataFrame for easy analysis"""
        info = self.get_prompt_info(prompt_id=prompt_id)
        if not info or not info['responses']:
            return None
        
        responses_data = []
        for response in info['responses']:
            responses_data.append({
                'model_name': response['model_name'],
                'response_time_ms': response.get('response_time_ms'),
                'has_error': bool(response['response_error']),
                'content_length': len(response['response_content']) if response['response_content'] else 0,
                'created_at': response['created_at']
            })
        
        return pd.DataFrame(responses_data)

# Initialize the inspector
inspector = PromptInspector()
print("✅ PromptInspector initialized successfully!")

✅ PromptInspector initialized successfully!


## 🕒 List Recent Prompts

Get an overview of recent prompts to find the one you want to inspect:

In [8]:
# List the 10 most recent prompts
inspector.list_recent_prompts(10)

🕒 RECENT PROMPTS
1. ID: 965c4175-1e48-4cd5-a4d5-af229ffee8b4
   👤 None | 🤖 4 models | 📅 2025-09-27 02:56:29
   💬 
I am writing a fun educational article on 'Matrices and Gen...

2. ID: 1b7442da-f930-42a3-ab8b-4a7d457ee648
   👤 Satvik | 🤖 4 models | 📅 2025-09-27 01:17:15
   💬 Explain transformers architecture like I am 16. Use basic te...

3. ID: 27a38909-e520-4a65-8663-8fbae0a4ded4
   👤 Satvik | 🤖 3 models | 📅 2025-09-26 08:38:51
   💬 Student: "I simplified x² + x³ and got x⁵ because 2 + 3 = 5"...

4. ID: c9fc23c4-d7c5-4df4-ac2f-20ad4a424b58
   👤 Satvik | 🤖 3 models | 📅 2025-09-25 10:33:11
   💬 A student has made the same mistake 5 times even after your ...

5. ID: 9b26b181-bb4f-4270-8a62-99b14f5c9c90
   👤 None | 🤖 3 models | 📅 2025-09-25 09:26:54
   💬 Student: "I keep getting different answers when I solve 2x +...

6. ID: 836d3ed0-8be0-4b92-9535-d9dcfd40706b
   👤 None | 🤖 4 models | 📅 2025-09-25 09:12:55
   💬 Student: "Heavy objects fall faster because gravity pulls th...

7. ID: 39b3

## 🔍 Inspect Specific Prompt

Choose one of the methods below to inspect a specific prompt:

### Method 1: By Prompt ID

In [9]:
# Replace with your actual prompt ID
prompt_id = "1b7442da-f930-42a3-ab8b-4a7d457ee648"

info = inspector.get_prompt_info(prompt_id=prompt_id)
inspector.display_prompt_info(info)

📋 PROMPT INFORMATION
🆔 ID: 1b7442da-f930-42a3-ab8b-4a7d457ee648
👤 Username: Satvik
📅 Created: 2025-09-27 01:17:15
📊 Status: completed
🔢 Total Models: 4
🎯 Selected Models: openai/gpt-5, anthropic/claude-sonnet-4, deepseek/deepseek-v3.1-terminus, moonshotai/kimi-k2-0905

💬 Prompt Text:
----------------------------------------
Explain transformers architecture like I am 16. Use basic terms of architecture like encoder, decoder, self-attention, parallelization
----------------------------------------

🤖 MODEL RESPONSES

1. openai/gpt-5
   ⏱️ Response Time: 6986 ms
   📅 Created: 2025-09-27 01:17:43
   📝 Content (first 200 chars): Here’s the transformer idea in plain terms, using the key parts you named.

What problem it solves
- We want a model that can read a sequence (like a sentence) and write a sequence (like a translation...


2. anthropic/claude-sonnet-4
   ⏱️ Response Time: 7041 ms
   📅 Created: 2025-09-27T01:17:43.95074+00:00
   📝 Content (first 200 chars): # Transformers: The Brain

### Method 2: By Prompt Text (finds most recent match)

In [10]:
# Replace with your actual prompt text
prompt_text = "Explain transformers architecture like I am 16"

info = inspector.get_prompt_info(prompt_text=prompt_text)
inspector.display_prompt_info(info)

❌ No prompt found with text: Explain transformers architecture like I am 16


## 📊 Response Analysis DataFrame

Get a pandas DataFrame for easy analysis of response metrics:

In [11]:
# Get responses as DataFrame
prompt_id = "1b7442da-f930-42a3-ab8b-4a7d457ee648"  # Replace with your prompt ID

df = inspector.get_responses_dataframe(prompt_id)
if df is not None:
    print("📊 Response Metrics:")
    display(df)
    
    print("\n📈 Summary Statistics:")
    print(f"Average response time: {df['response_time_ms'].mean():.1f} ms")
    print(f"Fastest model: {df.loc[df['response_time_ms'].idxmin(), 'model_name']}")
    print(f"Slowest model: {df.loc[df['response_time_ms'].idxmax(), 'model_name']}")
    print(f"Average content length: {df['content_length'].mean():.0f} characters")
    print(f"Models with errors: {df['has_error'].sum()}")
else:
    print("❌ No response data found")

📊 Response Metrics:


Unnamed: 0,model_name,response_time_ms,has_error,content_length,created_at
0,openai/gpt-5,6986,False,4105,2025-09-27T01:17:43.792374+00:00
1,anthropic/claude-sonnet-4,7041,False,3430,2025-09-27T01:17:43.95074+00:00
2,deepseek/deepseek-v3.1-terminus,7068,False,4775,2025-09-27T01:17:44.070435+00:00
3,moonshotai/kimi-k2-0905,7103,False,2106,2025-09-27T01:17:44.197266+00:00



📈 Summary Statistics:
Average response time: 7049.5 ms
Fastest model: openai/gpt-5
Slowest model: moonshotai/kimi-k2-0905
Average content length: 3604 characters
Models with errors: 0


## 🧠 Extract Full Response Content

Get the full response content for detailed analysis:

In [12]:
# Extract full response content for a specific prompt
prompt_id = "1b7442da-f930-42a3-ab8b-4a7d457ee648"  # Replace with your prompt ID

info = inspector.get_prompt_info(prompt_id=prompt_id)
if info and info['responses']:
    prompt_text = info['prompt']['prompt_text']
    print(f"📝 PROMPT: {prompt_text}")
    print("=" * 80)
    
    for i, response in enumerate(info['responses'], 1):
        if response['response_content'] and not response['response_error']:
            print(f"\n🤖 {i}. {response['model_name']}")
            print("-" * 50)
            print(response['response_content'])
            print("\n" + "=" * 80)
        else:
            print(f"\n🤖 {i}. {response['model_name']} - ❌ Error: {response['response_error']}")
else:
    print("❌ No responses found")

📝 PROMPT: Explain transformers architecture like I am 16. Use basic terms of architecture like encoder, decoder, self-attention, parallelization

🤖 1. openai/gpt-5
--------------------------------------------------
Here’s the transformer idea in plain terms, using the key parts you named.

What problem it solves
- We want a model that can read a sequence (like a sentence) and write a sequence (like a translation or the next words), while noticing relationships between words far apart. We also want it to run fast on modern hardware.

Core pieces
- Tokens and embeddings: Words (or word-pieces) are turned into numbers called embeddings so the model can do math with them.
- Positional encoding: Since a bag of words ignores order, we add position info so the model knows “this is the 3rd word,” “this is the 4th,” etc.
- Self-attention: For each word, the model looks at all other words and decides how much to pay attention to each. It then blends information from the most relevant words. Thin

## 🎯 Quick Access Functions

Use these cells for quick access to common operations:

In [13]:
# Quick function to inspect any prompt ID
def quick_inspect(prompt_id):
    info = inspector.get_prompt_info(prompt_id=prompt_id)
    inspector.display_prompt_info(info)
    return info

# Example usage:
# quick_inspect("your-prompt-id-here")

In [14]:
# Quick function to get just the response content
def get_response_content(prompt_id, model_name=None):
    info = inspector.get_prompt_info(prompt_id=prompt_id)
    if not info or not info['responses']:
        return None
    
    if model_name:
        # Get specific model response
        for response in info['responses']:
            if response['model_name'] == model_name:
                return response['response_content']
        return f"Model '{model_name}' not found"
    else:
        # Get all responses
        responses = {}
        for response in info['responses']:
            responses[response['model_name']] = response['response_content']
        return responses

# Example usage:
# content = get_response_content("your-prompt-id", "openai/gpt-5")
# all_content = get_response_content("your-prompt-id")

## 🧠 LLM Evaluation Engine

Evaluate model responses using Grok-4-Fast as a judge with the evaluation system prompt:

In [16]:
import os
from langchain_openai import ChatOpenAI

class LLMEvaluator:
    def __init__(self):
        self.api_key = os.getenv("OPENROUTER_API_KEY")
        
        if not self.api_key:
            raise ValueError("Missing OPENROUTER_API_KEY environment variable")
        
        # Initialize Grok-4-Fast as judge model
        self.judge_model = ChatOpenAI(
            model="x-ai/grok-4-fast:free",
            openai_api_key=self.api_key,
            openai_api_base="https://openrouter.ai/api/v1",
            headers={
                "HTTP-Referer": "https://github.com/satvik314/educhain-tutorbench",
                "X-Title": "Educhain TutorBench Evaluator"
            }
        )
        
        # Load evaluation system prompt
        self.system_prompt = self._load_evaluation_prompt()
    
    def _load_evaluation_prompt(self):
        """Load the evaluation system prompt from file"""
        try:
            with open('/Users/satvikp/Desktop/mygit/educhain-tutorbench/prompts/evaluation_prompt.md', 'r') as f:
                return f.read()
        except FileNotFoundError:
            print("⚠️ evaluation_prompt.md not found. Using default evaluation prompt.")
            return """
You are an expert educational assessment specialist. Evaluate AI tutoring responses based on teaching effectiveness.

Score each response on these dimensions (1-10):
1. Confusion Recognition: How well does it identify student's confusion?
2. Adaptive Response: How well matched to student's level and needs?
3. Learning Facilitation: Will the student understand and learn?
4. Strategic Decision-Making: Good choice of teaching approach?
5. Engagement & Emotional Intelligence: Appropriate tone and motivation?

Provide scores and brief explanations for each dimension.
"""
    
    def evaluate_single_response(self, prompt_text, model_name, response_content):
        """Evaluate a single model response"""
        evaluation_prompt = f"""
{self.system_prompt}

## Evaluation Task

**Teaching Scenario/Student Question:**
{prompt_text}

**AI Model Response ({model_name}):**
{response_content}

Please evaluate this response using the framework provided. Provide scores for all 5 dimensions and follow the output format specified.
"""
        
        try:
            response = self.judge_model.invoke(evaluation_prompt)
            return response.content
        except Exception as e:
            return f"Error during evaluation: {str(e)}"
    
    def evaluate_multiple_responses(self, prompt_text, responses):
        """Evaluate multiple responses comparatively"""
        response_text = ""
        for i, resp in enumerate(responses, 1):
            response_text += f"\n**Response {chr(64+i)} ({resp['model_name']}):**\n{resp['response_content']}\n"
        
        evaluation_prompt = f"""
{self.system_prompt}

## Comparative Evaluation Task

**Teaching Scenario/Student Question:**
{prompt_text}

**AI Model Responses:**
{response_text}

Please evaluate these responses using the comparative evaluation framework. Rank them and provide detailed comparison.
"""
        
        try:
            response = self.judge_model.invoke(evaluation_prompt)
            return response.content
        except Exception as e:
            return f"Error during comparative evaluation: {str(e)}"
    
    def parse_scores(self, evaluation_text):
        """Parse numerical scores from evaluation text"""
        scores = {}
        
        # Multiple patterns to catch different formats
        patterns = {
            'confusion_recognition': [
                r'Confusion Recognition:?\s*(\d+)/10',
                r'Confusion Recognition.*?(\d+)\s*/\s*10',
                r'(?:^|\n)\s*-?\s*Confusion Recognition:?\s*(\d+)/10'
            ],
            'adaptive_response': [
                r'Adaptive Response:?\s*(\d+)/10',
                r'Adaptive Response.*?(\d+)\s*/\s*10',
                r'(?:^|\n)\s*-?\s*Adaptive Response:?\s*(\d+)/10'
            ],
            'learning_facilitation': [
                r'Learning Facilitation:?\s*(\d+)/10',
                r'Learning Facilitation.*?(\d+)\s*/\s*10',
                r'(?:^|\n)\s*-?\s*Learning Facilitation:?\s*(\d+)/10'
            ],
            'strategic_decision': [
                r'Strategic Decision(?:-Making)?:?\s*(\d+)/10',
                r'Strategic Decision.*?(\d+)\s*/\s*10',
                r'(?:^|\n)\s*-?\s*Strategic Decision(?:-Making)?:?\s*(\d+)/10'
            ],
            'engagement_eq': [
                r'Engagement.*?(?:EQ|Intelligence):?\s*(\d+)/10',
                r'Engagement.*?(?:EQ|Intelligence).*?(\d+)\s*/\s*10',
                r'(?:^|\n)\s*-?\s*Engagement.*?(?:EQ|Intelligence):?\s*(\d+)/10'
            ]
        }
        
        for dimension, pattern_list in patterns.items():
            for pattern in pattern_list:
                match = re.search(pattern, evaluation_text, re.IGNORECASE | re.MULTILINE)
                if match:
                    scores[dimension] = int(match.group(1))
                    break
        
        # Try to extract overall score
        overall_patterns = [
            r'Overall.*?(?:Effectiveness\s+)?Score.*?(\d+(?:\.\d+)?)/10',
            r'Overall.*?Score.*?(\d+(?:\.\d+)?)\s*/\s*10',
            r'\*\*Overall.*?Score\*\*:?\s*(\d+(?:\.\d+)?)/10'
        ]
        
        for pattern in overall_patterns:
            match = re.search(pattern, evaluation_text, re.IGNORECASE | re.MULTILINE)
            if match:
                scores['overall'] = float(match.group(1))
                break
        
        return scores
    
    def store_evaluation(self, prompt_id, model_name, evaluation_text, scores):
        """Store evaluation results in database"""
        evaluation_data = {
            "prompt_id": prompt_id,
            "model_name": model_name,
            "evaluation_text": evaluation_text,
            "scores": json.dumps(scores) if scores else None,
            "judge_model": "x-ai/grok-4-fast:free"
        }
        
        try:
            result = inspector.supabase.table("llm_evaluations").insert(evaluation_data).execute()
            return result.data[0]["id"]
        except Exception as e:
            print(f"Error storing evaluation: {e}")
            return None

# Initialize the evaluator
evaluator = LLMEvaluator()
print("✅ LLM Evaluator initialized successfully!")

✅ LLM Evaluator initialized successfully!


                headers was transferred to model_kwargs.
                Please confirm that headers is what you intended.
  exec(code_obj, self.user_global_ns, self.user_ns)


### 🔍 Evaluate Individual Response

Evaluate a single model response:

In [17]:
def evaluate_single_model(prompt_id, model_name, store_results=True):
    """Evaluate a single model response for a given prompt"""
    
    # Get prompt information
    info = inspector.get_prompt_info(prompt_id=prompt_id)
    if not info:
        return None
    
    prompt_text = info['prompt']['prompt_text']
    
    # Find the specific model response
    target_response = None
    for response in info['responses']:
        if response['model_name'] == model_name:
            target_response = response
            break
    
    if not target_response:
        print(f"❌ Model '{model_name}' not found in responses")
        return None
    
    if target_response['response_error']:
        print(f"❌ Cannot evaluate - model had error: {target_response['response_error']}")
        return None
    
    print(f"🔍 Evaluating {model_name} response...")
    print(f"📝 Prompt: {prompt_text[:100]}...")
    print()
    
    # Perform evaluation
    evaluation = evaluator.evaluate_single_response(
        prompt_text, 
        model_name, 
        target_response['response_content']
    )
    
    # Parse scores
    scores = evaluator.parse_scores(evaluation)
    
    # Store in database if requested
    eval_id = None
    if store_results:
        eval_id = evaluator.store_evaluation(prompt_id, model_name, evaluation, scores)
        if eval_id:
            print(f"✅ Evaluation stored in database (ID: {eval_id})")
    
    # Display results
    print("=" * 80)
    print(f"🧠 EVALUATION RESULTS - {model_name}")
    print("=" * 80)
    
    if scores:
        print("📊 SCORES:")
        for dimension, score in scores.items():
            dimension_name = dimension.replace('_', ' ').title()
            print(f"   {dimension_name}: {score}/10")
        print()
    
    print("📝 DETAILED EVALUATION:")
    print("-" * 60)
    print(evaluation)
    print("-" * 60)
    
    return {
        'evaluation': evaluation,
        'scores': scores,
        'eval_id': eval_id
    }

# Example usage:
# result = evaluate_single_model("1b7442da-f930-42a3-ab8b-4a7d457ee648", "openai/gpt-5")

### 🏆 Evaluate All Responses (Comparative)

Evaluate all model responses for a prompt comparatively:

In [18]:
def evaluate_all_responses(prompt_id, store_results=True, include_comparative=True):
    """Evaluate all model responses for a given prompt"""
    
    # Get prompt information
    info = inspector.get_prompt_info(prompt_id=prompt_id)
    if not info:
        return None
    
    prompt_text = info['prompt']['prompt_text']
    responses = info['responses']
    
    # Filter out error responses
    valid_responses = [r for r in responses if r['response_content'] and not r['response_error']]
    
    if not valid_responses:
        print("❌ No valid responses found to evaluate")
        return None
    
    print(f"🔍 Evaluating {len(valid_responses)} model responses...")
    print(f"📝 Prompt: {prompt_text[:100]}...")
    print()
    
    results = {}
    
    # Individual evaluations
    print("🤖 INDIVIDUAL EVALUATIONS")
    print("=" * 60)
    
    for response in valid_responses:
        model_name = response['model_name']
        print(f"\n📊 Evaluating {model_name}...")
        
        # Perform evaluation
        evaluation = evaluator.evaluate_single_response(
            prompt_text, 
            model_name, 
            response['response_content']
        )
        
        # Parse scores
        scores = evaluator.parse_scores(evaluation)
        
        # Store in database if requested
        eval_id = None
        if store_results:
            eval_id = evaluator.store_evaluation(prompt_id, model_name, evaluation, scores)
        
        results[model_name] = {
            'evaluation': evaluation,
            'scores': scores,
            'eval_id': eval_id
        }
        
        # Show brief scores
        if scores:
            overall = scores.get('overall', 'N/A')
            confusion = scores.get('confusion_recognition', 'N/A')
            adaptive = scores.get('adaptive_response', 'N/A')
            learning = scores.get('learning_facilitation', 'N/A')
            print(f"   Overall: {overall}/10 | Confusion: {confusion}/10 | Adaptive: {adaptive}/10 | Learning: {learning}/10")
        else:
            print("   ⚠️ Could not parse scores")
    
    # Comparative evaluation
    if include_comparative and len(valid_responses) > 1:
        print(f"\n🏆 COMPARATIVE EVALUATION")
        print("=" * 60)
        
        comparative_eval = evaluator.evaluate_multiple_responses(prompt_text, valid_responses)
        results['comparative'] = comparative_eval
        
        print(comparative_eval)
    
    print(f"\n✅ Evaluation completed!")
    if store_results:
        print(f"💾 Results stored in database")
    
    return results

# Example usage:
# results = evaluate_all_responses("1b7442da-f930-42a3-ab8b-4a7d457ee648")

### 📊 Evaluation Results Dashboard

View evaluation results in a structured format:

In [20]:
def show_evaluation_dashboard(prompt_id):
    """Display a comprehensive evaluation dashboard for a prompt"""
    
    # Get prompt information including existing evaluations
    info = inspector.get_prompt_info(prompt_id=prompt_id)
    if not info:
        return None
    
    prompt_text = info['prompt']['prompt_text']
    responses = info['responses']
    evaluations = info['evaluations']
    
    print("=" * 100)
    print("🧠 EVALUATION DASHBOARD")
    print("=" * 100)
    print(f"📝 Prompt: {prompt_text}")
    print(f"🤖 Models: {len(responses)} | 🧠 Evaluations: {len(evaluations)}")
    print()
    
    if not evaluations:
        print("❌ No evaluations found for this prompt.")
        print("💡 Use evaluate_single_model() or evaluate_all_responses() to generate evaluations.")
        return None
    
    # Create evaluation summary DataFrame
    eval_data = []
    for eval_item in evaluations:
        scores = json.loads(eval_item['scores']) if eval_item['scores'] else {}
        eval_data.append({
            'Model': eval_item['model_name'],
            'Overall': scores.get('overall', 'N/A'),
            'Confusion Recognition': scores.get('confusion_recognition', 'N/A'),
            'Adaptive Response': scores.get('adaptive_response', 'N/A'),
            'Learning Facilitation': scores.get('learning_facilitation', 'N/A'),
            'Strategic Decision': scores.get('strategic_decision', 'N/A'),
            'Engagement & EQ': scores.get('engagement_eq', 'N/A'),
            'Judge Model': eval_item['judge_model'],
            'Evaluated': eval_item['created_at'][:10]  # Date only
        })
    
    df = pd.DataFrame(eval_data)
    
    print("📊 EVALUATION SCORES SUMMARY:")
    print(df.to_string(index=False))
    print()
    
    # Show rankings
    numeric_scores = df[df['Overall'] != 'N/A'].copy()
    if not numeric_scores.empty:
        numeric_scores['Overall'] = pd.to_numeric(numeric_scores['Overall'])
        numeric_scores = numeric_scores.sort_values('Overall', ascending=False)
        
        print("🏆 RANKINGS (by Overall Score):")
        for i, (_, row) in enumerate(numeric_scores.iterrows(), 1):
            medal = "🥇" if i == 1 else "🥈" if i == 2 else "🥉" if i == 3 else f"{i}."
            print(f"   {medal} {row['Model']}: {row['Overall']}/10")
        print()
    
    # Show detailed evaluations option
    print("📝 DETAILED EVALUATIONS:")
    print("=" * 80)
    
    for eval_item in evaluations:
        print(f"\n🤖 {eval_item['model_name']}")
        print("-" * 50)
        print(eval_item['evaluation_text'])
        print("-" * 50)
    
    return df

# Example usage:
# dashboard = show_evaluation_dashboard("1b7442da-f930-42a3-ab8b-4a7d457ee648")

### 🚀 Quick Evaluation Functions

Ready-to-use functions for common evaluation tasks:

In [21]:
# Quick evaluation workflow functions

def quick_evaluate(prompt_id):
    """Quick evaluation of all responses for a prompt"""
    print(f"🚀 Starting quick evaluation for prompt: {prompt_id}")
    results = evaluate_all_responses(prompt_id, store_results=True, include_comparative=True)
    print("\n" + "=" * 80)
    dashboard = show_evaluation_dashboard(prompt_id)
    return results, dashboard

def evaluate_by_prompt_text(prompt_text):
    """Find and evaluate the most recent prompt matching the text"""
    info = inspector.get_prompt_info(prompt_text=prompt_text)
    if info:
        prompt_id = info['prompt']['id']
        return quick_evaluate(prompt_id)
    return None

def compare_models(prompt_id, model_names):
    """Compare specific models for a prompt"""
    results = {}
    for model_name in model_names:
        print(f"\n🔍 Evaluating {model_name}...")
        result = evaluate_single_model(prompt_id, model_name, store_results=True)
        if result:
            results[model_name] = result
    
    # Create comparison table
    comparison_data = []
    for model_name, result in results.items():
        scores = result['scores']
        comparison_data.append({
            'Model': model_name,
            'Overall': scores.get('overall', 'N/A'),
            'Confusion Recognition': scores.get('confusion_recognition', 'N/A'),
            'Adaptive Response': scores.get('adaptive_response', 'N/A'),
            'Learning Facilitation': scores.get('learning_facilitation', 'N/A')
        })
    
    if comparison_data:
        df = pd.DataFrame(comparison_data)
        print("\n📊 MODEL COMPARISON:")
        print(df.to_string(index=False))
    
    return results

# Example workflows:

# 1. Quick evaluate all responses for a prompt
# results, dashboard = quick_evaluate("1b7442da-f930-42a3-ab8b-4a7d457ee648")

# 2. Evaluate by prompt text
# results = evaluate_by_prompt_text("Explain transformers architecture")

# 3. Compare specific models
# comparison = compare_models("prompt-id-here", ["openai/gpt-5", "anthropic/claude-sonnet-4"])

print("✅ Quick evaluation functions ready to use!")

✅ Quick evaluation functions ready to use!


In [24]:
quick_evaluate("1b7442da-f930-42a3-ab8b-4a7d457ee648")

🚀 Starting quick evaluation for prompt: 1b7442da-f930-42a3-ab8b-4a7d457ee648
🔍 Evaluating 4 model responses...
📝 Prompt: Explain transformers architecture like I am 16. Use basic terms of architecture like encoder, decode...

🤖 INDIVIDUAL EVALUATIONS

📊 Evaluating openai/gpt-5...
   ⚠️ Could not parse scores

📊 Evaluating anthropic/claude-sonnet-4...
   ⚠️ Could not parse scores

📊 Evaluating deepseek/deepseek-v3.1-terminus...
   ⚠️ Could not parse scores

📊 Evaluating moonshotai/kimi-k2-0905...
   ⚠️ Could not parse scores

🏆 COMPARATIVE EVALUATION
Error during comparative evaluation: Completions.create() got an unexpected keyword argument 'headers'

✅ Evaluation completed!
💾 Results stored in database

🧠 EVALUATION DASHBOARD
📝 Prompt: Explain transformers architecture like I am 16. Use basic terms of architecture like encoder, decoder, self-attention, parallelization
🤖 Models: 4 | 🧠 Evaluations: 12

📊 EVALUATION SCORES SUMMARY:
                          Model Overall Confusion Recogn

({'openai/gpt-5': {'evaluation': "Error during evaluation: Completions.create() got an unexpected keyword argument 'headers'",
   'scores': {},
   'eval_id': '0a5621fd-4559-48e8-8416-776b8214bc72'},
  'anthropic/claude-sonnet-4': {'evaluation': "Error during evaluation: Completions.create() got an unexpected keyword argument 'headers'",
   'scores': {},
   'eval_id': 'acff79c5-659d-4853-8236-df6d85581a4c'},
  'deepseek/deepseek-v3.1-terminus': {'evaluation': "Error during evaluation: Completions.create() got an unexpected keyword argument 'headers'",
   'scores': {},
   'eval_id': '8cecc1d7-781a-440c-957e-9e40df99364c'},
  'moonshotai/kimi-k2-0905': {'evaluation': "Error during evaluation: Completions.create() got an unexpected keyword argument 'headers'",
   'scores': {},
   'eval_id': 'df96ff16-1c9a-4728-84fa-697c95fc5b2c'},
  'comparative': "Error during comparative evaluation: Completions.create() got an unexpected keyword argument 'headers'"},
                               Model 