# Evaluation Metrics

Learn how to define, measure, and track prompt quality systematically.

## What You'll Learn
- Defining custom evaluation metrics
- Automated quality assessment
- Scoring systems
- Tracking improvements over time

In [None]:
from prompt_playground.client import create_client, send_prompt
from prompt_playground.analysis import calculate_metrics, analyze_tone, extract_key_points
import pandas as pd
from rich import print as rprint

# Create the client once; the cells below pass it to every send_prompt call.
client = create_client()
# Confirmation message (the [green] tags are rich console markup).
rprint('[green]✓[/green] Ready for evaluation')

## Built-in Metrics

In [None]:
# Fetch one sample response to measure.
response = send_prompt(
    client=client,
    prompt='Explain neural networks in simple terms.',
)

# Numeric measurements, computed from the whole response object.
metrics = calculate_metrics(response)
rprint('\n[bold]Quantitative Metrics:[/bold]')
quantitative_keys = ('word_count', 'char_count', 'sentence_count', 'avg_sentence_length')
for key in quantitative_keys:
    rprint(f'{key}: {metrics[key]}')

# Tone descriptors, computed from the response text only.
tone = analyze_tone(response['text'])
rprint('\n[bold]Qualitative Metrics:[/bold]')
qualitative_keys = ('formality', 'complexity', 'perspective')
for key in qualitative_keys:
    rprint(f'{key}: {tone[key]}')

## Custom Evaluation Functions

In [None]:
def evaluate_clarity(text):
    """Score *text* for clarity using average word length as a proxy.

    Words are whitespace-separated tokens. An average length of 4 or less
    earns the full 10 points; each extra character of average length costs
    one point, with the penalty capped at 5.

    Args:
        text: The response text to score.

    Returns:
        A number between 0 and 10. Text with no words scores 10.
    """
    tokens = text.split()
    if tokens:
        mean_length = sum(map(len, tokens)) / len(tokens)
    else:
        mean_length = 0

    # Penalty grows with average word length beyond 4, but never exceeds 5.
    penalty = min(mean_length - 4, 5)
    raw_score = 10 - penalty

    # Clamp into the 0-10 range (short words would otherwise push it above 10).
    return max(0, min(10, raw_score))

def evaluate_conciseness(response):
    """Score a response for brevity from its word count.

    The word count is read from ``calculate_metrics(response)['word_count']``.

    Args:
        response: The response object accepted by ``calculate_metrics``.

    Returns:
        10 for fewer than 50 words, 7 for fewer than 100, 5 for fewer than
        200, and 3 otherwise.
    """
    word_total = calculate_metrics(response)['word_count']

    # (exclusive upper bound, points), checked from the tightest bound outward.
    tiers = ((50, 10), (100, 7), (200, 5))
    for upper_bound, points in tiers:
        if word_total < upper_bound:
            return points
    return 3

def evaluate_structure(text):
    """Score *text* for structural cues on a 0-10 scale.

    Points are awarded independently:
      * 3 if the first 100 characters of the lower-cased text contain
        'intro', 'first' or 'begin';
      * 3 if the last 100 characters of the lower-cased text contain
        'conclu', 'summary' or 'finally';
      * 4 if there are at least two non-blank paragraphs, where paragraphs
        are separated by a blank line ('\\n\\n').

    Args:
        text: The response text to score.

    Returns:
        The integer sum of the points earned (0, 3, 4, 6, 7 or 10).
    """
    lowered = text.lower()
    opening, closing = lowered[:100], lowered[-100:]

    intro_markers = ('intro', 'first', 'begin')
    conclusion_markers = ('conclu', 'summary', 'finally')

    # Count only chunks that contain something other than whitespace.
    paragraph_count = sum(1 for chunk in text.split('\n\n') if chunk.strip())

    points = 0
    points += 3 if any(marker in opening for marker in intro_markers) else 0
    points += 3 if any(marker in closing for marker in conclusion_markers) else 0
    points += 4 if paragraph_count >= 2 else 0
    return points

# Try the three custom evaluators on a fresh response.
response = send_prompt(prompt='Explain photosynthesis.', client=client)
# Clarity and structure score the raw text; conciseness takes the whole
# response because it reads the word count via calculate_metrics.
rprint(f'Clarity: {evaluate_clarity(response["text"])}/10')
rprint(f'Conciseness: {evaluate_conciseness(response)}/10')
rprint(f'Structure: {evaluate_structure(response["text"])}/10')

## Composite Scoring System

In [None]:
class ResponseEvaluator:
    """Combine the individual metric scores into a weighted total and a grade."""

    def __init__(self, weights=None):
        """Store the per-metric weights.

        Args:
            weights: Mapping of metric name ('clarity', 'conciseness',
                'structure', 'tone_match') to its weight. A falsy value
                (``None`` or an empty mapping) selects the defaults.
        """
        if not weights:
            weights = {
                'clarity': 0.3,
                'conciseness': 0.2,
                'structure': 0.2,
                'tone_match': 0.3,
            }
        self.weights = weights

    def evaluate(self, response, target_tone='formal'):
        """Score a response on every metric and grade the weighted total.

        Args:
            response: Response mapping with a 'text' entry; the whole object
                is also handed to ``evaluate_conciseness``.
            target_tone: Formality label the response should match, compared
                against ``analyze_tone(text)['formality']``.

        Returns:
            A dict with 'total_score' (the weighted sum), 'scores' (the
            per-metric values) and 'grade' (a letter from ``_get_grade``).
        """
        text = response['text']
        scores = {
            'clarity': evaluate_clarity(text),
            'conciseness': evaluate_conciseness(response),
            'structure': evaluate_structure(text),
            # Full marks for the requested tone, half marks otherwise.
            'tone_match': 10 if analyze_tone(text)['formality'] == target_tone else 5,
        }

        weighted_total = sum(
            value * self.weights[name] for name, value in scores.items()
        )

        return {
            'total_score': weighted_total,
            'scores': scores,
            'grade': self._get_grade(weighted_total),
        }

    def _get_grade(self, score):
        """Map a numeric score to a letter: >=9 'A', >=7 'B', >=5 'C', else 'D'."""
        # Cutoffs are inclusive and checked from the highest grade down.
        for cutoff, letter in ((9, 'A'), (7, 'B'), (5, 'C')):
            if score >= cutoff:
                return letter
        return 'D'

# Build an evaluator with the default weights; later cells reuse `evaluator`.
evaluator = ResponseEvaluator()
response = send_prompt(prompt='Explain blockchain technology.', client=client)
result = evaluator.evaluate(response, target_tone='formal')

# Report the weighted total (one decimal place) and its letter grade.
rprint(f'\n[bold]Overall Score: {result["total_score"]:.1f}/10[/bold]')
rprint(f'Grade: {result["grade"]}')
# Then list the unweighted per-metric scores that fed into the total.
rprint('\nBreakdown:')
for metric, score in result['scores'].items():
    rprint(f'  {metric}: {score}/10')

## A/B Testing with Metrics

In [None]:
# Two phrasings of the same request: a bare one and a more specific one.
prompts = [
    'Explain quantum computing.',
    'Explain quantum computing in simple, non-technical terms with real-world examples.',
]

# One row per variant: label, total, grade, then each per-metric score.
results = []
for i, prompt in enumerate(prompts):
    response = send_prompt(prompt=prompt, client=client)
    evaluation = evaluator.evaluate(response)
    row = {
        'variant': chr(ord('A') + i),  # 0 -> 'A', 1 -> 'B', ...
        'score': evaluation['total_score'],
        'grade': evaluation['grade'],
    }
    row.update(evaluation['scores'])
    results.append(row)

df = pd.DataFrame(results)
display(df)

# The winner is the row with the highest total score.
best_index = df['score'].idxmax()
winner = df.loc[best_index]
rprint(f'\n[green]Winner: Variant {winner["variant"]} (Score: {winner["score"]:.1f})[/green]')

## Tracking Improvements

In [None]:
# Successive revisions of one prompt, each more specific than the last.
versions = [
    'Explain AI.',
    'Explain artificial intelligence in simple terms.',
    'Explain artificial intelligence in simple terms with a real-world example.',
]

# Longest prompt prefix shown in the history table.
preview_length = 50

history = []
for i, prompt in enumerate(versions, 1):
    response = send_prompt(prompt=prompt, client=client)
    eval_result = evaluator.evaluate(response)
    # Add the ellipsis only when the prompt was actually cut; appending it
    # unconditionally rendered short prompts as e.g. 'Explain AI....'.
    if len(prompt) > preview_length:
        preview = prompt[:preview_length] + '...'
    else:
        preview = prompt
    history.append({
        'version': f'v{i}',
        'prompt': preview,
        'score': eval_result['total_score'],
        'grade': eval_result['grade']
    })

df = pd.DataFrame(history)
rprint('\n[bold]Iteration History:[/bold]')
display(df)

# Net change in total score from the first version to the last.
improvement = df['score'].iloc[-1] - df['score'].iloc[0]
rprint(f'\nImprovement: {improvement:+.1f} points')

## Best Practices

### 1. Define Clear Criteria

In [None]:
# Pass/fail acceptance criteria consumed by meets_criteria below.
criteria = {
    # Upper bound (inclusive) on the response's word count.
    'max_words': 150,
    # Lower bound (inclusive) on the response's sentence count.
    'min_sentences': 3,
    # Must equal the 'formality' value reported by analyze_tone.
    'required_tone': 'formal',
    # Every entry must occur as a substring of the lower-cased response text.
    'must_include': ['example', 'benefit']
}

def meets_criteria(response, criteria):
    """Check a response against a set of pass/fail acceptance criteria.

    Args:
        response: Response mapping with a 'text' entry; the whole object is
            also passed to ``calculate_metrics``.
        criteria: Mapping with the keys 'max_words', 'min_sentences',
            'required_tone' and 'must_include' (an iterable of keywords).

    Returns:
        A ``(checks, passed)`` tuple: ``checks`` maps each check name
        ('word_count', 'sentence_count', 'tone', 'keywords') to a bool, and
        ``passed`` is True only when every check is True.
    """
    metrics = calculate_metrics(response)
    tone = analyze_tone(response['text'])
    text_lower = response['text'].lower()

    checks = {
        'word_count': metrics['word_count'] <= criteria['max_words'],
        'sentence_count': metrics['sentence_count'] >= criteria['min_sentences'],
        'tone': tone['formality'] == criteria['required_tone'],
        # Lower-case the keyword as well as the text: comparing against the
        # lower-cased text alone meant a keyword such as 'API' could never match.
        'keywords': all(kw.lower() in text_lower for kw in criteria['must_include'])
    }

    return checks, all(checks.values())

# Confirmation only: meets_criteria is defined above but not called in this cell.
rprint('[green]✓[/green] Clear criteria defined')

### 2. Use Multiple Runs

In [None]:
# Score the same prompt several times to see how much the result varies
# between runs at temperature 0.7.
prompt = 'Explain cloud computing briefly.'
runs = 3
scores = []

for _ in range(runs):
    response = send_prompt(prompt=prompt, temperature=0.7, client=client)
    result = evaluator.evaluate(response)
    scores.append(result['total_score'])

# Mean and population standard deviation (divides by N, not N - 1).
sample_count = len(scores)
avg_score = sum(scores) / sample_count
variance = sum((value - avg_score) ** 2 for value in scores) / sample_count
std_dev = variance ** 0.5

rprint(f'Average: {avg_score:.1f} ± {std_dev:.1f}')
rprint(f'Scores: {scores}')

### 3. Document Evaluation Logic

In [None]:
# Human-readable description of how each metric is scored, keyed by the
# metric names used in ResponseEvaluator's score breakdown.
evaluation_docs = {
    'clarity': 'Scores based on average word length (simpler = higher)',
    # Includes the 3-point fallback so the text covers every branch of
    # evaluate_conciseness, not just the first three thresholds.
    'conciseness': 'Word count thresholds: <50=10pts, <100=7pts, <200=5pts, >=200=3pts',
    'structure': 'Checks for intro, conclusion, multiple paragraphs',
    'tone_match': '10pts if matches target, 5pts otherwise'
}

# Print each metric name next to its scoring description.
for metric, description in evaluation_docs.items():
    rprint(f'{metric}: {description}')

## Summary

You've learned:
- ✓ Using built-in metrics
- ✓ Creating custom evaluation functions
- ✓ Building composite scoring systems
- ✓ Comparing prompt variants with metrics
- ✓ Tracking improvements over iterations
- ✓ Best practices for evaluation

## Congratulations!

You've completed all example notebooks. You now have the tools to:
- Create and test prompts systematically
- Use templates for consistency
- Compare variants objectively
- Process prompts at scale
- Evaluate response quality with metrics

Start applying these techniques to your own use cases!