# Prompting and Inference Control for LLMs

## Overview

Advanced prompting techniques enable precise control over LLM behavior and outputs. This notebook covers:

- **Prompt Engineering**: Template optimization and dynamic prompt construction
- **In-Context Learning**: Few-shot learning and example selection strategies
- **Chain-of-Thought**: Reasoning enhancement and self-consistency methods
- **Tool Calling**: Function integration and orchestration frameworks

Let's implement practical prompting and inference control systems.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import re
import random
from typing import Dict, List, Any, Optional, Callable
from collections import defaultdict, Counter
from dataclasses import dataclass
import time
from datetime import datetime

print("Libraries imported successfully!")

## 1. Advanced Prompt Engineering System

Let's implement a comprehensive prompt engineering framework with templates and optimization:

In [None]:
@dataclass
class PromptTemplate:
    """A named prompt template together with its usage bookkeeping.

    ``PromptEngineeringSystem.create_prompt`` renders ``template`` with
    ``str.format`` and bumps ``usage_count``;
    ``evaluate_prompt_performance`` keeps ``effectiveness_score`` up to date.
    """
    name: str  # key under which the template is registered
    template: str  # ``str.format`` string with ``{placeholder}`` fields
    variables: List[str]  # placeholder names listed for the template (not validated against ``template``)
    task_type: str  # task category used when picking the best template
    effectiveness_score: float = 0.0  # mean of recorded quality scores; stays 0.0 until evaluated
    usage_count: int = 0  # number of prompts created from this template

class PromptEngineeringSystem:
    """Advanced prompt engineering with template optimization.

    Holds a registry of ``PromptTemplate`` objects, renders them into prompts
    (optionally decorated by a chain of optimization strategies), and records
    quality scores per template so the best template per task type can be
    reported.
    """

    def __init__(self):
        """Set up the registry, the history store and the strategy table."""
        self.templates = {}
        self.performance_history = defaultdict(list)
        # Insertion order matters: the "advanced" level applies every
        # strategy in exactly this order.
        self.optimization_strategies = {
            'clarity': self.optimize_for_clarity,
            'specificity': self.optimize_for_specificity,
            'structure': self.optimize_for_structure,
            'examples': self.optimize_with_examples
        }
        self.load_default_templates()

    def load_default_templates(self):
        """Register the built-in templates, keyed by their ``name``."""
        default_templates = [
            PromptTemplate(
                name="basic_qa",
                template="Question: {question}\nAnswer:",
                variables=["question"],
                task_type="question_answering"
            ),
            PromptTemplate(
                name="structured_qa",
                template="Context: {context}\n\nQuestion: {question}\n\nPlease provide a clear and accurate answer based on the context above.\n\nAnswer:",
                variables=["context", "question"],
                task_type="question_answering"
            ),
            PromptTemplate(
                name="classification",
                template="Classify the following text into one of these categories: {categories}\n\nText: {text}\n\nCategory:",
                variables=["categories", "text"],
                task_type="classification"
            ),
            PromptTemplate(
                name="reasoning",
                template="Problem: {problem}\n\nLet's think step by step:\n1.",
                variables=["problem"],
                task_type="reasoning"
            ),
            PromptTemplate(
                name="creative_writing",
                template="Write a {style} {type} about {topic}. Make it {tone} and approximately {length} words.\n\n",
                variables=["style", "type", "topic", "tone", "length"],
                task_type="generation"
            )
        ]

        for template in default_templates:
            self.templates[template.name] = template

    def create_prompt(self, template_name, variables, optimization_level="basic"):
        """Render a registered template and optionally optimize the result.

        Args:
            template_name: Name of a template in ``self.templates``.
            variables: Mapping of placeholder name to value, passed to
                ``str.format``.
            optimization_level: ``"basic"`` (no changes), ``"enhanced"`` or
                ``"advanced"``; see ``apply_optimizations``.

        Returns:
            A dict with the keys ``'prompt'``, ``'template_name'``,
            ``'variables'`` and ``'optimization_level'``.

        Raises:
            ValueError: If ``template_name`` is not registered.
            KeyError: Propagated from ``str.format`` when ``variables`` lacks
                a placeholder used by the template.
        """
        if template_name not in self.templates:
            raise ValueError(f"Template '{template_name}' not found")

        template = self.templates[template_name]
        prompt = template.template.format(**variables)

        if optimization_level != "basic":
            prompt = self.apply_optimizations(prompt, template, optimization_level)

        # Usage is only counted once the prompt was built successfully.
        template.usage_count += 1

        return {
            'prompt': prompt,
            'template_name': template_name,
            'variables': variables,
            'optimization_level': optimization_level
        }

    def apply_optimizations(self, prompt, template, optimization_level):
        """Run the strategies selected by ``optimization_level`` over ``prompt``.

        ``"enhanced"`` applies clarity then structure; ``"advanced"`` applies
        every registered strategy in registration order. Any other level
        returns the prompt unchanged. ``template`` is accepted for interface
        compatibility and is not used.
        """
        if optimization_level == "enhanced":
            strategies = [
                self.optimization_strategies['clarity'],
                self.optimization_strategies['structure'],
            ]
        elif optimization_level == "advanced":
            strategies = list(self.optimization_strategies.values())
        else:
            strategies = []

        optimized_prompt = prompt
        for strategy in strategies:
            optimized_prompt = strategy(optimized_prompt)
        return optimized_prompt

    def optimize_for_clarity(self, prompt):
        """Append clarity instructions and, for classification, a format hint."""
        # A prompt ending in ":" is an open answer slot; leave it alone.
        if not prompt.strip().endswith(":"):
            prompt += "\n\nPlease provide a clear and detailed response."

        if "classify" in prompt.lower():
            prompt += "\n\nProvide only the category name."

        return prompt

    def optimize_for_specificity(self, prompt):
        """Append one randomly chosen specificity instruction."""
        specificity_additions = [
            "Be specific and precise in your response.",
            "Include relevant details and examples where appropriate.",
            "Focus on accuracy and completeness."
        ]

        addition = random.choice(specificity_additions)
        return prompt + f"\n\n{addition}"

    def optimize_for_structure(self, prompt):
        """Ask for a structured answer when the prompt is analytical.

        Nothing is added when the prompt already asks for step-by-step
        reasoning.
        """
        lowered = prompt.lower()
        wants_analysis = any(word in lowered for word in ["analyze", "explain", "describe"])
        if "step by step" not in lowered and wants_analysis:
            prompt += "\n\nStructure your response clearly with numbered points or sections."

        return prompt

    def optimize_with_examples(self, prompt):
        """Insert a worked example before the final ``Category:`` label.

        Only classification prompts (those containing "classify") are
        touched. This is a simplified version - in practice, you'd have a
        database of examples.
        """
        if "classify" not in prompt.lower():
            return prompt

        # Only the LAST "Category:" is the template's answer slot. Replacing
        # every occurrence would inject the example once more for each
        # "Category:" that happens to appear inside the text being classified.
        head, label, tail = prompt.rpartition("Category:")
        if not label:
            return prompt

        example = "\n\nExample:\nText: 'The movie was fantastic!'\nCategory: Positive"
        return f"{head}{example}\n\nNow classify:\nCategory:{tail}"

    def evaluate_prompt_performance(self, template_name, variables, response, quality_score):
        """Record one evaluation and refresh the template's mean score.

        Args:
            template_name: Template the response was produced with.
            variables: Variables that were used to render the prompt.
            response: The model response text.
            quality_score: Externally supplied quality score.

        Returns:
            The performance record that was appended to
            ``self.performance_history[template_name]``.
        """
        performance_record = {
            'timestamp': datetime.now(),
            'template_name': template_name,
            'variables': variables,
            'quality_score': quality_score,
            'response_length': len(response),
            'response_quality': self.assess_response_quality(response)
        }

        self.performance_history[template_name].append(performance_record)

        if template_name in self.templates:
            template = self.templates[template_name]
            scores = [record['quality_score'] for record in self.performance_history[template_name]]
            # Store a plain float so the dataclass field keeps its declared type.
            template.effectiveness_score = float(np.mean(scores))

        return performance_record

    def assess_response_quality(self, response):
        """Return the fraction (0.0-1.0) of simple quality heuristics met."""
        words = response.split()
        quality_indicators = {
            'length_appropriate': 50 <= len(response) <= 1000,
            'has_structure': any(marker in response for marker in ['1.', '2.', '-', '•']),
            'complete_sentences': response.count('.') >= 1,
            # Ratio of unique words; an empty response fails this check.
            'no_repetition': len(set(words)) / len(words) > 0.7 if words else False
        }

        return sum(quality_indicators.values()) / len(quality_indicators)

    def get_best_template(self, task_type):
        """Return the highest-scoring template for ``task_type``, or ``None``.

        Ties go to the template that was registered first.
        """
        candidates = [t for t in self.templates.values() if t.task_type == task_type]
        if not candidates:
            return None

        return max(candidates, key=lambda t: t.effectiveness_score)

    def get_performance_report(self):
        """Summarize template usage, scores and the best template per task."""
        report = {
            'total_templates': len(self.templates),
            'total_evaluations': sum(len(history) for history in self.performance_history.values()),
            'template_performance': {},
            'best_templates': {}
        }

        for name, template in self.templates.items():
            report['template_performance'][name] = {
                'effectiveness_score': template.effectiveness_score,
                'usage_count': template.usage_count,
                'task_type': template.task_type
            }

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # report is ordered the same way on every run (a set is not).
        task_types = dict.fromkeys(t.task_type for t in self.templates.values())
        for task_type in task_types:
            best_template = self.get_best_template(task_type)
            if best_template:
                report['best_templates'][task_type] = best_template.name

        return report

# Initialize prompt engineering system.
# The constructor also registers the default templates, so the instance is
# ready to render prompts as soon as this cell has run.
prompt_system = PromptEngineeringSystem()
print("Prompt engineering system initialized!")

### Testing Prompt Engineering

Let's test our prompt engineering system with various tasks and optimization levels:

In [None]:
# Scenario table: (template name, template variables, optimization level).
# Every default template is exercised once.
_scenario_rows = [
    (
        'basic_qa',
        {'question': 'What is machine learning?'},
        'basic',
    ),
    (
        'structured_qa',
        {
            'context': 'Machine learning is a subset of AI that enables computers to learn from data.',
            'question': 'How does machine learning relate to artificial intelligence?',
        },
        'enhanced',
    ),
    (
        'classification',
        {
            'categories': 'Positive, Negative, Neutral',
            'text': 'This product is amazing and works perfectly!',
        },
        'advanced',
    ),
    (
        'reasoning',
        {'problem': 'If a train travels 60 mph for 2 hours, how far does it go?'},
        'enhanced',
    ),
    (
        'creative_writing',
        {
            'style': 'descriptive',
            'type': 'short story',
            'topic': 'a robot learning to paint',
            'tone': 'inspiring',
            'length': '200',
        },
        'advanced',
    ),
]

test_scenarios = [
    {'template': row_template, 'variables': row_variables, 'optimization': row_level}
    for row_template, row_variables, row_level in _scenario_rows
]

print("Testing prompt engineering with different optimization levels...\n")

prompt_results = []

for i, scenario in enumerate(test_scenarios, 1):
    scenario_template = scenario['template']
    scenario_level = scenario['optimization']
    print(f"Test {i}: {scenario_template} (optimization: {scenario_level})")

    # Render the prompt at the requested optimization level
    prompt_result = prompt_system.create_prompt(
        scenario_template, scenario['variables'], scenario_level
    )
    prompt_text = prompt_result['prompt']

    print("Generated Prompt:")
    print(prompt_text)
    print()

    # No model is called here: the response and its score are placeholders
    simulated_response = f"This is a simulated response for {scenario_template} task."
    quality_score = random.uniform(0.6, 0.95)

    # Feed the placeholder evaluation back into the system
    performance = prompt_system.evaluate_prompt_performance(
        scenario_template, scenario['variables'], simulated_response, quality_score
    )
    response_quality = performance['response_quality']

    prompt_results.append({
        'template': scenario_template,
        'optimization': scenario_level,
        'prompt_length': len(prompt_text),
        'quality_score': quality_score,
        'response_quality': response_quality,
    })

    print(f"Quality Score: {quality_score:.3f}")
    print(f"Response Quality: {response_quality:.3f}")
    print("-" * 60)

# Collect the aggregated statistics and print them section by section
report = prompt_system.get_performance_report()
_template_stats = report['template_performance']
_best_by_task = report['best_templates']

print("\n=== PROMPT ENGINEERING REPORT ===")
print(f"Total templates: {report['total_templates']}")
print(f"Total evaluations: {report['total_evaluations']}")

print("\nTemplate Performance:")
for name in _template_stats:
    perf = _template_stats[name]
    print(f"  {name}: {perf['effectiveness_score']:.3f} (used {perf['usage_count']} times)")

print("\nBest Templates by Task:")
for task_type in _best_by_task:
    template_name = _best_by_task[task_type]
    print(f"  {task_type}: {template_name}")

# Visualize results
# NOTE: the quality scores plotted below come from random.uniform in the test
# loop, so these charts demonstrate the plotting, not real model quality.
df_prompts = pd.DataFrame(prompt_results)

# One figure with a 2x3 grid; each plt.subplot call below selects the panel
# that the following pyplot calls draw into, so statement order matters.
plt.figure(figsize=(15, 10))

# Panel 1: quality score of each scenario, labelled by template name
plt.subplot(2, 3, 1)
plt.bar(df_prompts['template'], df_prompts['quality_score'], color='skyblue')
plt.xlabel('Template')
plt.ylabel('Quality Score')
plt.title('Quality Scores by Template')
plt.xticks(rotation=45)

# Panel 2: mean quality score per optimization level
# (three colors are hard-coded for the three levels used in test_scenarios)
plt.subplot(2, 3, 2)
opt_quality = df_prompts.groupby('optimization')['quality_score'].mean()
plt.bar(opt_quality.index, opt_quality.values, color=['lightcoral', 'lightgreen', 'gold'])
plt.xlabel('Optimization Level')
plt.ylabel('Average Quality Score')
plt.title('Impact of Optimization Level')

# Panel 3: prompt length (characters) against quality score
plt.subplot(2, 3, 3)
plt.scatter(df_prompts['prompt_length'], df_prompts['quality_score'], alpha=0.7, s=60)
plt.xlabel('Prompt Length (characters)')
plt.ylabel('Quality Score')
plt.title('Prompt Length vs Quality')

# Panel 4: share of prompts created from each template (usage_count)
plt.subplot(2, 3, 4)
template_usage = [perf['usage_count'] for perf in report['template_performance'].values()]
template_names = list(report['template_performance'].keys())
plt.pie(template_usage, labels=template_names, autopct='%1.1f%%')
plt.title('Template Usage Distribution')

# Panel 5: mean recorded quality score (effectiveness) per template
plt.subplot(2, 3, 5)
effectiveness_scores = [perf['effectiveness_score'] for perf in report['template_performance'].values()]
plt.bar(template_names, effectiveness_scores, color='lightgreen')
plt.xlabel('Template')
plt.ylabel('Effectiveness Score')
plt.title('Template Effectiveness')
plt.xticks(rotation=45)

# Panel 6: mean quality of each level relative to the 'basic' level
# (a level with no rows in df_prompts would make its mean, and its bar, NaN)
plt.subplot(2, 3, 6)
basic_quality = df_prompts[df_prompts['optimization'] == 'basic']['quality_score'].mean()
enhanced_quality = df_prompts[df_prompts['optimization'] == 'enhanced']['quality_score'].mean()
advanced_quality = df_prompts[df_prompts['optimization'] == 'advanced']['quality_score'].mean()

# 'Basic' is the baseline, so its improvement is 0 by definition
improvements = [0, enhanced_quality - basic_quality, advanced_quality - basic_quality]
levels = ['Basic', 'Enhanced', 'Advanced']
plt.bar(levels, improvements, color=['gray', 'orange', 'red'])
plt.xlabel('Optimization Level')
plt.ylabel('Quality Improvement')
plt.title('Quality Improvement by Optimization')

plt.tight_layout()
plt.show()

# Text summary of the same comparison, plus the template with the highest
# effectiveness score in the report
print(f"\n=== OPTIMIZATION ANALYSIS ===")
print(f"Average quality improvement with enhanced optimization: {enhanced_quality - basic_quality:.3f}")
print(f"Average quality improvement with advanced optimization: {advanced_quality - basic_quality:.3f}")
print(f"Most effective template: {max(report['template_performance'].items(), key=lambda x: x[1]['effectiveness_score'])[0]}")

## 2. In-Context Learning System

Let's implement an advanced in-context learning system with dynamic example selection:

In [None]:
class InContextLearningSystem:
    """Advanced in-context learning with dynamic example selection.

    Stores input/output examples per task type and builds few-shot prompts
    from a subset chosen by one of several selection strategies.
    """

    def __init__(self):
        """Create an empty example database and register the strategies."""
        self.example_database = defaultdict(list)
        self.selection_strategies = {
            'random': self.select_random_examples,
            'similarity': self.select_similar_examples,
            'diversity': self.select_diverse_examples,
            'difficulty': self.select_by_difficulty,
            'performance': self.select_by_performance
        }
        self.performance_tracker = defaultdict(list)

    @staticmethod
    def _jaccard(words_a, words_b):
        """Return the Jaccard overlap of two word sets (0.0 if both are empty)."""
        union = words_a | words_b
        if not union:
            return 0.0
        return len(words_a & words_b) / len(union)

    def _examples_for(self, task_type):
        """Return the stored examples for ``task_type`` without creating an entry.

        Indexing the ``defaultdict`` directly would register an empty list for
        every unknown task type that is merely looked up.
        """
        return self.example_database.get(task_type, [])

    def add_example(self, task_type, input_text, output_text, metadata=None):
        """Store an example and return its index within ``task_type``.

        Args:
            task_type: Task the example belongs to.
            input_text: Example input shown to the model.
            output_text: Expected output for ``input_text``.
            metadata: Optional dict of extra information (defaults to ``{}``).
        """
        example = {
            'input': input_text,
            'output': output_text,
            'metadata': metadata or {},
            'difficulty': self.estimate_difficulty(input_text, output_text),
            'performance_score': 0.0,
            'usage_count': 0
        }

        self.example_database[task_type].append(example)
        return len(self.example_database[task_type]) - 1

    def create_few_shot_prompt(self, task_type, query, num_examples=3, strategy='similarity'):
        """Build a few-shot prompt for ``query``.

        Args:
            task_type: Task whose examples should be used.
            query: The new input to solve.
            num_examples: Maximum number of examples to include.
            strategy: Key into ``self.selection_strategies``.

        Returns:
            The prompt text. When ``task_type`` has never been registered, a
            plain zero-shot ``"Task: ...\\nAnswer:"`` prompt is returned.

        Raises:
            KeyError: If ``strategy`` is not a registered strategy name.
        """
        if task_type not in self.example_database:
            return f"Task: {query}\nAnswer:"

        selected_examples = self.selection_strategies[strategy](
            task_type, query, num_examples
        )

        prompt_parts = []
        for i, example in enumerate(selected_examples, 1):
            prompt_parts.extend([
                f"Example {i}:",
                f"Input: {example['input']}",
                f"Output: {example['output']}",
                "",  # blank line between examples
            ])

        prompt_parts.extend(["Now solve this:", f"Input: {query}", "Output:"])
        return "\n".join(prompt_parts)

    def select_random_examples(self, task_type, query, num_examples):
        """Return up to ``num_examples`` examples sampled without replacement."""
        examples = self._examples_for(task_type)
        return random.sample(examples, min(num_examples, len(examples)))

    def select_similar_examples(self, task_type, query, num_examples):
        """Return the examples whose input shares the most words with ``query``.

        Similarity is the Jaccard overlap of lower-cased, whitespace-split
        words. Ties keep database order.
        """
        examples = self._examples_for(task_type)
        query_words = set(query.lower().split())

        def similarity_to_query(example):
            # _jaccard guards the empty-vs-empty case that would otherwise
            # divide by zero.
            return self._jaccard(query_words, set(example['input'].lower().split()))

        # sorted() is stable, also with reverse=True, so ties keep their order.
        ranked = sorted(examples, key=similarity_to_query, reverse=True)
        return ranked[:num_examples]

    def select_diverse_examples(self, task_type, query, num_examples):
        """Greedily pick examples that are dissimilar to each other.

        The first example is chosen at random; each following pick is the
        remaining example whose highest similarity to anything already chosen
        is lowest. ``query`` is not used by this strategy.
        """
        examples = self._examples_for(task_type)

        if num_examples <= 0:
            return []
        if len(examples) <= num_examples:
            # Hand back a copy so callers cannot mutate the database list.
            return list(examples)

        remaining = list(examples)
        first_example = random.choice(remaining)
        selected = [first_example]
        remaining.remove(first_example)

        while len(selected) < num_examples and remaining:
            def closeness(candidate):
                # Similarity to the nearest already-selected example.
                return max(
                    self.calculate_similarity(candidate, chosen)
                    for chosen in selected
                )

            # Lowest closeness means farthest from the current selection;
            # min() keeps the first candidate on ties.
            best_candidate = min(remaining, key=closeness)
            selected.append(best_candidate)
            remaining.remove(best_candidate)

        return selected

    def select_by_difficulty(self, task_type, query, num_examples):
        """Return examples evenly spaced across the difficulty range, easiest first."""
        examples = self._examples_for(task_type)
        sorted_examples = sorted(examples, key=lambda x: x['difficulty'])

        if len(sorted_examples) <= num_examples:
            return sorted_examples

        # Evenly spaced positions from the easiest to the hardest example.
        indices = np.linspace(0, len(sorted_examples) - 1, num_examples, dtype=int)
        return [sorted_examples[i] for i in indices]

    def select_by_performance(self, task_type, query, num_examples):
        """Return the examples with the highest recorded performance score."""
        examples = self._examples_for(task_type)
        sorted_examples = sorted(examples, key=lambda x: x['performance_score'], reverse=True)
        return sorted_examples[:num_examples]

    def calculate_similarity(self, example1, example2):
        """Return the word-level Jaccard similarity of two examples' inputs."""
        words1 = set(example1['input'].lower().split())
        words2 = set(example2['input'].lower().split())
        return self._jaccard(words1, words2)

    def estimate_difficulty(self, input_text, output_text):
        """Estimate difficulty in [0, 1] from text length and simple cues."""
        # Word counts are scaled by 10 words per unit of complexity.
        input_complexity = len(input_text.split()) / 10.0
        output_complexity = len(output_text.split()) / 10.0

        complexity_indicators = [
            len(re.findall(r'\d+', input_text)) > 2,  # more than two numbers
            len(input_text.split(',')) > 3,  # more than three comma-separated parts
            any(word in input_text.lower() for word in ['analyze', 'compare', 'evaluate', 'synthesize'])
        ]

        # Each cue that fires adds 0.2; the total is capped at 1.0.
        complexity_bonus = sum(complexity_indicators) * 0.2

        return min(1.0, input_complexity + output_complexity + complexity_bonus)

    def update_example_performance(self, task_type, example_indices, performance_scores):
        """Fold new scores into the running average of the given examples.

        Indices outside the stored range, and unknown task types, are ignored.
        """
        examples = self._examples_for(task_type)
        for idx, score in zip(example_indices, performance_scores):
            if 0 <= idx < len(examples):
                example = examples[idx]
                example['usage_count'] += 1

                # Incremental mean: previous mean weighted by previous count.
                current_score = example['performance_score']
                usage_count = example['usage_count']
                example['performance_score'] = (current_score * (usage_count - 1) + score) / usage_count

    def get_learning_analytics(self):
        """Return counts plus difficulty/performance/usage statistics.

        The three distribution entries stay empty dicts while the database
        holds no examples.
        """
        analytics = {
            'total_examples': sum(len(examples) for examples in self.example_database.values()),
            'task_types': list(self.example_database.keys()),
            'examples_per_task': {task: len(examples) for task, examples in self.example_database.items()},
            'difficulty_distribution': {},
            'performance_distribution': {},
            'usage_statistics': {}
        }

        all_examples = [
            example
            for examples in self.example_database.values()
            for example in examples
        ]

        if all_examples:
            difficulties = [ex['difficulty'] for ex in all_examples]
            performances = [ex['performance_score'] for ex in all_examples]
            usage_counts = [ex['usage_count'] for ex in all_examples]

            analytics['difficulty_distribution'] = {
                'mean': np.mean(difficulties),
                'std': np.std(difficulties),
                'min': np.min(difficulties),
                'max': np.max(difficulties)
            }

            analytics['performance_distribution'] = {
                'mean': np.mean(performances),
                'std': np.std(performances),
                'min': np.min(performances),
                'max': np.max(performances)
            }

            analytics['usage_statistics'] = {
                'mean_usage': np.mean(usage_counts),
                'total_usage': np.sum(usage_counts),
                'unused_examples': sum(1 for count in usage_counts if count == 0)
            }

        return analytics
# Initialize in-context learning system.
# The example database starts empty; examples are added through add_example.
icl_system = InContextLearningSystem()
print("In-context learning system initialized!")

### Testing In-Context Learning

Let's test our in-context learning system with different example selection strategies:

In [None]:
# Add example data for different tasks
def populate_example_database(system=None):
    """Populate an example database with sample data.

    Adds five math word problems, six sentiment examples and three
    summarization examples, in that order, through ``system.add_example``.
    Calling it again adds the same examples a second time.

    Args:
        system: Object exposing ``add_example(task_type, input_text,
            output_text)``. Defaults to the notebook-level ``icl_system``.
    """
    if system is None:
        # Resolved at call time so the notebook-level instance is picked up.
        system = icl_system

    examples_by_task = {
        # Math word problems
        'math': [
            ("A store has 15 apples and sells 7. How many are left?", "15 - 7 = 8 apples"),
            ("If a car travels 60 miles in 2 hours, what is its speed?", "60 miles ÷ 2 hours = 30 mph"),
            ("A rectangle has length 8 and width 5. What is its area?", "8 × 5 = 40 square units"),
            ("John has 3 bags with 4 marbles each. How many marbles total?", "3 × 4 = 12 marbles"),
            ("A pizza is cut into 8 slices. If 3 are eaten, what fraction remains?", "5/8 of the pizza remains"),
        ],
        # Sentiment analysis
        'sentiment': [
            ("This movie is absolutely fantastic!", "Positive"),
            ("I hate waiting in long lines.", "Negative"),
            ("The weather is okay today.", "Neutral"),
            ("Best purchase I've ever made!", "Positive"),
            ("This product is terrible and broke immediately.", "Negative"),
            ("It's an average restaurant, nothing special.", "Neutral"),
        ],
        # Text summarization
        'summarization': [
            ("Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention.",
             "Machine learning automates data analysis using AI to help systems learn from data and make decisions."),
            ("Climate change refers to long-term shifts in global or regional climate patterns. It is largely attributed to increased levels of atmospheric carbon dioxide produced by the use of fossil fuels.",
             "Climate change involves long-term shifts in climate patterns, mainly caused by increased atmospheric CO2 from fossil fuels."),
            ("Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods from carbon dioxide and water. It generally involves the green pigment chlorophyll and generates oxygen as a byproduct.",
             "Photosynthesis is how plants use sunlight, CO2, and water to make food using chlorophyll, producing oxygen."),
        ],
    }

    # Dicts keep insertion order, so tasks are added as math, sentiment,
    # summarization.
    for task_type, pairs in examples_by_task.items():
        for input_text, output_text in pairs:
            system.add_example(task_type, input_text, output_text)

# Fill the database with the sample examples before querying it
populate_example_database()

# One (task type, query) pair per task in the database
test_queries = [
    (
        'math',
        'A bakery makes 24 cupcakes and sells 9. How many cupcakes are left?',
    ),
    (
        'sentiment',
        'This book is incredibly boring and poorly written.',
    ),
    (
        'summarization',
        'Artificial intelligence is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of intelligent agents.',
    ),
]

# Every registered selection strategy is tried on every query
strategies = ['random', 'similarity', 'diversity', 'difficulty', 'performance']

print("Testing different example selection strategies...\n")

icl_results = []

for task_type, query in test_queries:
    query_preview = query[:80]
    print(f"Task: {task_type}")
    print(f"Query: {query_preview}...")
    print()

    for strategy in strategies:
        prompt = icl_system.create_few_shot_prompt(
            task_type, query, num_examples=2, strategy=strategy
        )
        prompt_length = len(prompt)

        print(f"Strategy: {strategy}")
        print(f"Prompt length: {prompt_length} characters")
        print(f"First 200 chars: {prompt[:200]}...")

        # No model is called: the score is a random placeholder
        performance_score = random.uniform(0.6, 0.95)

        icl_results.append({
            'task_type': task_type,
            'strategy': strategy,
            'prompt_length': prompt_length,
            'performance_score': performance_score,
        })

        print(f"Simulated performance: {performance_score:.3f}")
        print("-" * 40)

    print("=" * 60)

# Summarize what the example database currently holds
analytics = icl_system.get_learning_analytics()
_difficulty_stats = analytics['difficulty_distribution']
_performance_stats = analytics['performance_distribution']

print("\n=== IN-CONTEXT LEARNING ANALYTICS ===")
for _label, _key in (
    ("Total examples", 'total_examples'),
    ("Task types", 'task_types'),
    ("Examples per task", 'examples_per_task'),
):
    print(f"{_label}: {analytics[_key]}")
print(f"Average difficulty: {_difficulty_stats['mean']:.3f}")
print(f"Average performance: {_performance_stats['mean']:.3f}")

# Visualize results
# NOTE: performance_score values come from random.uniform in the strategy
# loop, so these charts demonstrate the plotting, not real strategy quality.
df_icl = pd.DataFrame(icl_results)

# One figure with a 2x3 grid; each plt.subplot call below selects the panel
# that the following pyplot calls draw into, so statement order matters.
plt.figure(figsize=(15, 10))

# Panel 1: mean performance score per selection strategy
plt.subplot(2, 3, 1)
strategy_performance = df_icl.groupby('strategy')['performance_score'].mean()
plt.bar(strategy_performance.index, strategy_performance.values, color='lightblue')
plt.xlabel('Selection Strategy')
plt.ylabel('Average Performance')
plt.title('Performance by Selection Strategy')
plt.xticks(rotation=45)

# Panel 2: mean performance score per task type
plt.subplot(2, 3, 2)
task_performance = df_icl.groupby('task_type')['performance_score'].mean()
plt.bar(task_performance.index, task_performance.values, color='lightgreen')
plt.xlabel('Task Type')
plt.ylabel('Average Performance')
plt.title('Performance by Task Type')

# Panel 3: histogram of prompt lengths (characters) over all runs
plt.subplot(2, 3, 3)
plt.hist(df_icl['prompt_length'], bins=10, alpha=0.7, color='salmon')
plt.xlabel('Prompt Length (characters)')
plt.ylabel('Frequency')
plt.title('Prompt Length Distribution')

# Panel 4: task type (rows) x strategy (columns) grid of mean scores,
# drawn as an image with the tick labels taken from the pivot table
plt.subplot(2, 3, 4)
pivot_data = df_icl.pivot_table(values='performance_score', index='task_type', columns='strategy', aggfunc='mean')
im = plt.imshow(pivot_data.values, cmap='RdYlGn', aspect='auto')
plt.colorbar(im)
plt.xlabel('Strategy')
plt.ylabel('Task Type')
plt.title('Strategy Effectiveness Heatmap')
plt.xticks(range(len(pivot_data.columns)), pivot_data.columns, rotation=45)
plt.yticks(range(len(pivot_data.index)), pivot_data.index)

# Panel 5: share of stored examples belonging to each task type
plt.subplot(2, 3, 5)
task_counts = list(analytics['examples_per_task'].values())
task_names = list(analytics['examples_per_task'].keys())
plt.pie(task_counts, labels=task_names, autopct='%1.1f%%')
plt.title('Examples per Task Distribution')

# Panel 6: prompt length against performance score
plt.subplot(2, 3, 6)
plt.scatter(df_icl['prompt_length'], df_icl['performance_score'], alpha=0.6, s=50)
plt.xlabel('Prompt Length')
plt.ylabel('Performance Score')
plt.title('Performance vs Prompt Length')

plt.tight_layout()
plt.show()

# Text summary built from the per-strategy and per-task means computed above
print(f"\n=== STRATEGY ANALYSIS ===")
best_strategy = strategy_performance.idxmax()
best_performance = strategy_performance.max()
print(f"Best performing strategy: {best_strategy} ({best_performance:.3f})")
print(f"Strategy performance range: {strategy_performance.min():.3f} - {strategy_performance.max():.3f}")
print(f"Average prompt length: {df_icl['prompt_length'].mean():.0f} characters")
print(f"Most effective task type: {task_performance.idxmax()} ({task_performance.max():.3f})")