# Requirements Evaluation Framework

**Purpose:** Systematically evaluate how effectively each prompt gathers learning requirements

**Approach:** Data-driven measurement using concrete criteria, not subjective opinions

**Goal:** Identify which prompt versions generate the most actionable learning requirements

## Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import json
import re
from datetime import datetime
import glob
import os

# Load the most recent test results
def load_latest_results():
    """Load the most recently modified prompt test results file.

    Searches ``data/`` for files matching ``prompt_test_results_*.csv`` and
    reads the one with the newest modification time into a DataFrame.

    Returns:
        pandas.DataFrame | None: The parsed CSV, or ``None`` when no
        matching file exists.
    """
    result_files = glob.glob('data/prompt_test_results_*.csv')
    if not result_files:
        print("❌ No test results found. Run the prompts notebook first.")
        return None

    # Rank by modification time rather than ctime: on Unix, ctime is the
    # inode-change time, so a metadata-only change (e.g. chmod) to an old
    # file would make it look like the newest results.
    latest_file = max(result_files, key=os.path.getmtime)
    print(f"📊 Loading results from: {latest_file}")

    df = pd.read_csv(latest_file)
    print(f"✅ Loaded {len(df)} prompt test results")
    return df

# Pull in the newest results file (None when nothing has been generated yet)
results_df = load_latest_results()

# List each distinct prompt version present in the loaded results.
if results_df is not None:
    print("\n📋 Available prompt versions:")
    version_labels = results_df['version'].unique()
    for version in version_labels:
        print("- " + str(version))

## Evaluation Criteria

Based on the learning scenario, we'll evaluate prompts on:

1. **Specificity** - How specific and concrete are the requirements?
2. **Actionability** - Can someone immediately act on these requirements?
3. **Relevance** - How well do they serve the current learning scenario?
4. **Structure** - How well organized and readable is the response?
5. **Practicality** - How realistic are these for daily implementation?

Each criterion is scored from 1 to 5 (5 is best) using keyword and pattern counts in the response text, for a maximum total score of 25.

In [None]:
def evaluate_specificity(response):
    """Rate specificity from 1 to 5 by counting action verbs and tool names.

    The response is lowercased and matched against two whole-word
    vocabularies (action verbs and concrete tool/technology names); the
    combined number of matches is mapped to a score via fixed thresholds.
    """
    text = response.lower()
    vocabularies = (
        r'\b(build|create|implement|practice|complete|learn|master|use)\b',
        r'\b(jupyter|notebook|api|python|git|github|claude|anthropic)\b',
    )
    hits = sum(len(re.findall(vocabulary, text)) for vocabulary in vocabularies)

    # (minimum hit count, score) pairs, checked from highest to lowest.
    for floor, score in ((15, 5), (10, 4), (6, 3), (3, 2)):
        if hits >= floor:
            return score
    return 1

def evaluate_actionability(response):
    """Rate actionability from 1 to 5 by counting "do this now" phrasing.

    Each cue pattern is searched in the lowercased response; the total number
    of matches across all patterns determines the score.
    """
    text = response.lower()
    cue_patterns = (
        r'start by',
        r'begin with',
        r'first.*do',
        r'step \d+',
        r'today.*can',
        r'immediately',
        r'next action',
    )

    cue_total = 0
    for cue in cue_patterns:
        cue_total += len(re.findall(cue, text))

    # Index = number of cues found (capped at 5); value = resulting score.
    score_by_count = (1, 2, 3, 4, 4, 5)
    return score_by_count[min(cue_total, 5)]

def evaluate_relevance(response):
    """Rate relevance from 1 to 5 by how many scenario terms appear.

    Each term counts at most once and is matched as a plain substring of the
    lowercased response (so it can also match inside a longer word).
    """
    text = response.lower()
    scenario_terms = (
        'prompt engineering',
        'requirements',
        'anthropic',
        'claude',
        'evaluation',
        'api',
        'consultant',
        'developer',
        'transition',
    )
    terms_present = sum(term in text for term in scenario_terms)

    # Index = distinct terms found (capped at 6); value = resulting score.
    score_by_count = (1, 1, 2, 3, 4, 4, 5)
    return score_by_count[min(terms_present, 6)]

def evaluate_structure(response):
    """Rate structure from 1 to 5 from headers, list items and blank-line breaks.

    Every header-like line adds one point, every three list items add one
    point (at most 3), and every two blank-line breaks add one point
    (at most 2). The combined points are mapped to a score via thresholds.
    """
    header_lines = re.findall(r'^#+\s|^\*\*.*\*\*', response, re.MULTILINE)
    list_items = re.findall(r'^[-*]\s|^\d+\.\s', response, re.MULTILINE)
    blank_breaks = re.findall(r'\n\n', response)

    list_points = min(len(list_items) // 3, 3)
    break_points = min(len(blank_breaks) // 2, 2)
    points = len(header_lines) + list_points + break_points

    # (minimum points, score) pairs, checked from highest to lowest.
    for floor, score in ((8, 5), (6, 4), (4, 3), (2, 2)):
        if points >= floor:
            return score
    return 1

def evaluate_practicality(response):
    """Rate practicality from 1 to 5 by counting time-bound or incremental wording.

    Every occurrence of each indicator in the lowercased response is counted;
    the score is one more than the number of occurrences, capped at 5.
    """
    text = response.lower()
    practical_indicators = (
        r'daily',
        r'week',
        r'30 minutes',
        r'hour',
        r'small steps',
        r'incrementally',
        r'gradually',
    )

    occurrences = 0
    for indicator in practical_indicators:
        occurrences += len(re.findall(indicator, text))

    # 0 hits -> 1, 1 -> 2, 2 -> 3, 3 -> 4, 4 or more -> 5.
    return min(occurrences, 4) + 1

def evaluate_response(response):
    """Score a prompt response on all five criteria and aggregate the result.

    Args:
        response: Response text to score. A missing value (``None`` or a
            float NaN, which is what ``pandas.read_csv`` yields for an empty
            cell by default) is scored as an empty string; any other
            non-string value is converted with ``str()``.

    Returns:
        dict: The keys ``specificity``, ``actionability``, ``relevance``,
        ``structure`` and ``practicality`` (each 1-5), plus ``total`` (their
        sum) and ``percentage`` (``total`` out of 25, rounded to one decimal).
    """
    if not isinstance(response, str):
        # The individual scorers call str methods / regexes on the text, so a
        # NaN or None from the results file would otherwise raise here.
        # NaN is the only float that is not equal to itself.
        is_nan = isinstance(response, float) and response != response
        response = "" if (response is None or is_nan) else str(response)

    scores = {
        'specificity': evaluate_specificity(response),
        'actionability': evaluate_actionability(response),
        'relevance': evaluate_relevance(response),
        'structure': evaluate_structure(response),
        'practicality': evaluate_practicality(response),
    }

    # 'total' is added only after summing, so it is not counted in itself.
    scores['total'] = sum(scores.values())
    scores['percentage'] = round((scores['total'] / 25) * 100, 1)

    return scores

# Confirm the scoring helpers are defined and restate the scoring scale.
print(
    "✅ Evaluation functions ready\n"
    "📊 Scoring: Each criterion 1-5, total possible: 25 points"
)

## Run Evaluations

In [None]:
if results_df is not None:
    # Score every response, keeping the per-criterion breakdown alongside
    # the metadata columns copied from the results file.
    evaluations = []

    for idx, row in results_df.iterrows():
        scores = evaluate_response(row['response'])

        evaluation = {
            'version': row['version'],
            'timestamp': row['timestamp'],
            'token_count': row['token_count'],
            **scores
        }

        evaluations.append(evaluation)

        print(f"\n🧪 {row['version']} Evaluation:")
        print(f"  Specificity: {scores['specificity']}/5")
        print(f"  Actionability: {scores['actionability']}/5")
        print(f"  Relevance: {scores['relevance']}/5")
        print(f"  Structure: {scores['structure']}/5")
        print(f"  Practicality: {scores['practicality']}/5")
        print(f"  📊 Total: {scores['total']}/25 ({scores['percentage']}%)")

    # Create evaluation DataFrame
    eval_df = pd.DataFrame(evaluations)

    if eval_df.empty:
        # A results file with no rows produces a DataFrame with no columns;
        # sorting it by 'total' would raise KeyError. Skip the summary and
        # the save so that no empty results file is written.
        print("❌ Results file contains no rows - nothing to evaluate.")
    else:
        print("\n" + "="*60)
        print("🏆 EVALUATION SUMMARY")
        print("="*60)

        # Highest total first
        eval_df_sorted = eval_df.sort_values('total', ascending=False)

        for idx, row in eval_df_sorted.iterrows():
            print(f"\n{row['version']}: {row['total']}/25 ({row['percentage']}%)")
            print("  Best areas: ", end="")

            # Two highest-scoring criteria; sorted() is stable, so ties keep
            # the order in which the criteria are listed here.
            criteria_scores = {
                'Specificity': row['specificity'],
                'Actionability': row['actionability'],
                'Relevance': row['relevance'],
                'Structure': row['structure'],
                'Practicality': row['practicality']
            }

            top_areas = sorted(criteria_scores.items(), key=lambda x: x[1], reverse=True)[:2]
            print(", ".join([f"{area} ({score}/5)" for area, score in top_areas]))

        # Save evaluation results under a timestamped name (unsorted order)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        eval_filename = f'data/evaluation_results_{timestamp}.csv'
        eval_df.to_csv(eval_filename, index=False)

        print(f"\n💾 Evaluation results saved to: {eval_filename}")
else:
    print("❌ No data to evaluate. Run the prompts notebook first.")

## Performance Analysis

In [None]:
# Summarise the evaluation table: headline statistics, per-criterion means,
# the top-scoring prompt, and a recommendation based on the best total.
if 'eval_df' not in locals() or eval_df.empty:
    print("❌ No evaluation data available for analysis")
else:
    print("📈 PERFORMANCE ANALYSIS")
    print("=" * 50)

    # Headline numbers across every evaluated prompt
    print("\n📊 Overall Statistics:")
    totals = eval_df['total']
    percentages = eval_df['percentage']
    print(f"  Average score: {totals.mean():.1f}/25 ({percentages.mean():.1f}%)")
    print(f"  Best score: {totals.max()}/25 ({percentages.max()}%)")
    print(f"  Score range: {totals.max() - totals.min()} points")

    # Mean score per criterion, remembered for the weakest-area lookup below
    criteria = ['specificity', 'actionability', 'relevance', 'structure', 'practicality']
    print("\n🎯 Criteria Performance:")

    criterion_means = {}
    for criterion in criteria:
        criterion_means[criterion] = eval_df[criterion].mean()
        print(f"  {criterion.capitalize()}: {criterion_means[criterion]:.1f}/5 avg")

    # Row with the highest total (first one on ties)
    best_prompt = eval_df.loc[totals.idxmax()]
    print(f"\n🏆 Best Performing Prompt: {best_prompt['version']}")
    print(f"  Score: {best_prompt['total']}/25 ({best_prompt['percentage']}%)")
    print("  Strengths: Highest scores in multiple criteria")

    # Recommendation tier depends only on the best total achieved
    print("\n💡 Recommendations:")
    top_total = totals.max()
    if top_total >= 20:
        print(f"  ✅ Strong performance - {best_prompt['version']} ready for deployment")
    elif top_total >= 15:
        print("  🔧 Good foundation - minor refinements needed")
    else:
        print("  🚧 Needs improvement - consider major prompt revisions")

    # Criterion with the lowest mean (first one listed on ties)
    weakest_criterion = min(criteria, key=criterion_means.get)
    print(f"  🎯 Focus improvement on: {weakest_criterion} (lowest average score)")

## Export Results Summary

In [None]:
# Build a Markdown summary of the evaluation table and write it to data/.
if 'eval_df' in locals() and not eval_df.empty:
    # One clock reading, so the report header and the filename always agree.
    generated_at = datetime.now()
    timestamp = generated_at.strftime('%Y-%m-%d %H:%M:%S')

    # Version label of the row with the highest total (first one on ties).
    best_version = eval_df.loc[eval_df['total'].idxmax(), 'version']

    summary_report = f"""
# Prompt Evaluation Summary Report
Generated: {timestamp}

## Test Results
- Total prompts tested: {len(eval_df)}
- Average score: {eval_df['total'].mean():.1f}/25 ({eval_df['percentage'].mean():.1f}%)
- Best performing: {best_version} ({eval_df['total'].max()}/25)

## Ranking
"""

    # Numbered ranking, highest total first
    ranked = eval_df.sort_values('total', ascending=False)
    summary_report += "".join(
        f"{rank}. {row['version']}: {row['total']}/25 ({row['percentage']}%)\n"
        for rank, (_, row) in enumerate(ranked.iterrows(), 1)
    )

    # The filename lines fall back to a glob-style placeholder when the
    # corresponding variable was not defined earlier in the session.
    summary_report += f"""
## Next Actions
- Deploy best performing prompt: {best_version}
- Use for actual learning requirements gathering
- Continue A/B testing with refined versions

## Data Files
- Raw results: {csv_filename if 'csv_filename' in locals() else 'prompt_test_results_*.csv'}
- Evaluations: {eval_filename if 'eval_filename' in locals() else 'evaluation_results_*.csv'}
"""

    # Save summary report. UTF-8 is set explicitly because version labels
    # come from the results file and may contain characters that the
    # platform's default encoding cannot represent.
    report_filename = f'data/evaluation_summary_{generated_at.strftime("%Y%m%d_%H%M%S")}.md'
    with open(report_filename, 'w', encoding='utf-8') as f:
        f.write(summary_report)

    print(f"\n📋 Summary report saved to: {report_filename}")
    print("\n✅ Evaluation complete! Check data/ folder for all results.")
else:
    print("❌ Cannot generate summary - no evaluation data available")

## Ready for Next Phase

This evaluation framework provides:

- ✅ **Systematic scoring** of prompt effectiveness
- ✅ **Data-driven comparison** between prompt versions
- ✅ **Clear recommendations** for which prompts to deploy
- ✅ **Performance tracking** over time
- ✅ **Structured data** for further analysis

**Next Steps:**
1. Run this evaluation after each prompt testing session
2. Use results to refine prompts systematically
3. Deploy best-performing prompts for actual requirements gathering
4. Build historical performance database for continuous improvement