# 12. Error Analysis

Deep dive into model errors to understand failure modes and identify improvement opportunities.

## Contents
1. [Setup](#1-setup)
2. [Error Pattern Analysis](#2-error-pattern-analysis)
3. [Hard Example Mining](#3-hard-example-mining)
4. [Failure Mode Taxonomy](#4-failure-mode-taxonomy)
5. [Per-Operation Error Analysis](#5-per-operation-error-analysis)
6. [Confusion Pattern Mining](#6-confusion-pattern-mining)
7. [Dataset Bias Analysis](#7-dataset-bias-analysis)
8. [Actionable Recommendations](#8-actionable-recommendations)

---

## 1. Setup

In [None]:
import sys
from pathlib import Path

# Add src to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / 'src'))

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from typing import Dict, List, Tuple, Optional
import json
from tqdm.notebook import tqdm

# Report the interpreter, the PyTorch build and the preferred accelerator.
print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")
if torch.backends.mps.is_available():
    print("Device: MPS")
elif torch.cuda.is_available():
    print("Device: CUDA")
else:
    print("Device: CPU")

# Seed both RNGs used in this notebook so reruns produce the same numbers.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Shared matplotlib defaults for every figure below.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

In [None]:
# Load model and vocabulary
from miracle.model.backbone import MMDTAELSTMBackbone
from miracle.model.multihead_lm import MultiHeadGCodeLM

# Paths
VOCAB_PATH = project_root / 'data' / 'gcode_vocab_v2.json'
CHECKPOINT_PATH = project_root / 'outputs' / 'final_model' / 'checkpoint_best.pt'
DATA_DIR = project_root / 'outputs' / 'processed_v2'

# Load vocabulary. The encoding is pinned so that decoding the JSON file does
# not depend on the platform's locale default.
with open(VOCAB_PATH, encoding='utf-8') as f:
    vocab = json.load(f)

token_to_id = vocab['token_to_id']
# Reverse lookup used later to turn predicted/target ids back into tokens.
id_to_token = {v: k for k, v in token_to_id.items()}

print(f"Vocabulary size: {len(token_to_id)}")

# Load checkpoint
device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')

if CHECKPOINT_PATH.exists():
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only point CHECKPOINT_PATH at checkpoints from a trusted source.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=device, weights_only=False)
    # `or {}` also covers a checkpoint that stored `config: None`, which would
    # otherwise break the `config.get(...)` calls used to build the models.
    config = checkpoint.get('config') or {}
    print(f"Loaded checkpoint from epoch {checkpoint.get('epoch', 'unknown')}")
else:
    print(f"Checkpoint not found at {CHECKPOINT_PATH}")
    # Default architecture used when no trained checkpoint is available.
    config = {'hidden_dim': 256, 'num_layers': 4, 'num_heads': 8}

In [None]:
# Build the backbone and the multi-head LM. Every architecture setting comes
# from the checkpoint config, with the literal defaults below as fallback.
default_head_sizes = {'type': 10, 'command': 50, 'param_type': 30, 'param_value': 100}

backbone = MMDTAELSTMBackbone(
    continuous_dim=155,
    categorical_dims=[10, 10, 50, 50],
    d_model=config.get('hidden_dim', 256),
    num_layers=config.get('num_layers', 4),
    num_heads=config.get('num_heads', 8),
    dropout=config.get('dropout', 0.1),
).to(device)

lm = MultiHeadGCodeLM(
    d_model=config.get('hidden_dim', 256),
    vocab_sizes=vocab.get('head_vocab_sizes', default_head_sizes),
).to(device)

# Restore trained weights only when the checkpoint file is present.
if CHECKPOINT_PATH.exists():
    for module, state_key in ((backbone, 'backbone_state_dict'), (lm, 'lm_state_dict')):
        module.load_state_dict(checkpoint[state_key])
    print("Model weights loaded")

# Switch both modules to evaluation mode for the analysis below.
for module in (backbone, lm):
    module.eval()
print(f"Models loaded on {device}")

## 2. Error Pattern Analysis

Systematically analyze error patterns across different prediction heads.

In [None]:
class ErrorAnalyzer:
    """Collect and summarize prediction errors of a multi-head G-code model.

    Errors are accumulated per head in ``self.errors`` (a ``defaultdict`` of
    lists). Each error record is a dict with the keys ``batch_idx``,
    ``seq_idx``, ``pred_id``, ``target_id``, ``confidence``, ``target_prob``
    and ``sample_idx``.
    """

    # Heads inspected by analyze_batch, in reporting order.
    HEADS = ('type', 'command', 'param_type', 'param_value')

    def __init__(self, backbone, lm, vocab, device):
        """Store the models and start with an empty error log.

        Args:
            backbone: Callable mapping ``(continuous, categorical)`` to hidden states.
            lm: Callable mapping hidden states to a dict of per-head logits.
            vocab: Vocabulary object; stored but not read by this class.
            device: Device that input tensors are moved to before the forward pass.
        """
        self.backbone = backbone
        self.lm = lm
        self.vocab = vocab
        self.device = device
        self.errors = defaultdict(list)
        self.predictions = []
        # Number of samples seen so far; turns a batch-local row index into an
        # index that stays unique across successive analyze_batch calls.
        self._samples_seen = 0

    def analyze_batch(self, continuous, categorical, targets):
        """Run the model on one batch and record every mispredicted position.

        Args:
            continuous: Continuous input tensor; its first dimension is the batch.
            categorical: Categorical input tensor.
            targets: Dict mapping head name to a 2-D tensor of target ids.
                Heads absent from ``targets`` or from the model output are skipped.

        Returns:
            List of the error records found in this batch, each a copy of the
            stored record with an extra ``'head'`` key.
        """
        with torch.no_grad():
            hidden = self.backbone(continuous.to(self.device), categorical.to(self.device))
            preds = self.lm(hidden)

        sample_offset = self._samples_seen
        self._samples_seen += continuous.shape[0]

        results = []
        for head in self.HEADS:
            if head not in preds or head not in targets:
                continue

            logits = preds[head]
            pred_ids = logits.argmax(dim=-1).cpu()
            target_ids = targets[head].cpu()
            # One softmax per head instead of one per error.
            probs = torch.softmax(logits, dim=-1).cpu()

            # Indexed as [batch, seq], so both id tensors are expected to be 2-D.
            mismatches = torch.nonzero(pred_ids != target_ids).tolist()
            for batch_idx, seq_idx in mismatches:
                pred_id = pred_ids[batch_idx, seq_idx].item()
                target_id = target_ids[batch_idx, seq_idx].item()
                position_probs = probs[batch_idx, seq_idx]

                record = {
                    'batch_idx': batch_idx,
                    'seq_idx': seq_idx,
                    'pred_id': pred_id,
                    'target_id': target_id,
                    'confidence': position_probs[pred_id].item(),
                    'target_prob': position_probs[target_id].item(),
                    # batch_idx restarts at 0 for every batch; sample_idx does not.
                    'sample_idx': sample_offset + batch_idx,
                }
                self.errors[head].append(record)
                results.append({'head': head, **record})

        return results

    def get_error_summary(self):
        """Return per-head error statistics for heads with at least one error.

        Each entry holds ``total_errors``, ``avg_confidence``,
        ``avg_target_prob``, ``high_conf_errors`` (confidence > 0.8) and
        ``low_conf_errors`` (confidence < 0.3).
        """
        summary = {}
        for head, errors in self.errors.items():
            if not errors:
                continue
            confidences = [e['confidence'] for e in errors]
            target_probs = [e['target_prob'] for e in errors]
            summary[head] = {
                'total_errors': len(errors),
                'avg_confidence': np.mean(confidences),
                'avg_target_prob': np.mean(target_probs),
                'high_conf_errors': sum(1 for c in confidences if c > 0.8),
                'low_conf_errors': sum(1 for c in confidences if c < 0.3),
            }
        return summary

    def get_confusion_pairs(self, head, top_k=10):
        """Return the ``top_k`` most frequent ``((target_id, pred_id), count)`` pairs.

        Uses ``.get`` so that asking about a head with no errors does not add
        an empty entry to ``self.errors``.
        """
        pairs = Counter(
            (error['target_id'], error['pred_id'])
            for error in self.errors.get(head, ())
        )
        return pairs.most_common(top_k)

# Single analyzer instance shared by every analysis cell below.
analyzer = ErrorAnalyzer(backbone=backbone, lm=lm, vocab=vocab, device=device)
print("ErrorAnalyzer initialized")

In [None]:
# Load test data and run analysis
from torch.utils.data import DataLoader, TensorDataset

# Load test set
test_path = DATA_DIR / 'test.pt'
HEAD_NAMES = ['type', 'command', 'param_type', 'param_value']

# Start from an empty error log so re-running this cell does not double-count.
analyzer.errors.clear()

if test_path.exists():
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only load test files from a trusted source.
    test_data = torch.load(test_path, weights_only=False)
    num_samples = len(test_data['continuous'])
    print(f"Test data loaded: {num_samples} samples")

    # Only evaluate heads whose targets are actually stored. Substituting an
    # all-zero placeholder would report an "error" wherever the model predicts
    # a non-zero class.
    available_heads = [h for h in HEAD_NAMES if f'{h}_targets' in test_data]
    missing_heads = [h for h in HEAD_NAMES if h not in available_heads]
    if missing_heads:
        print(f"No targets found for heads: {missing_heads} (skipped)")

    # as_tensor accepts arrays, lists and existing tensors without the
    # copy-construct warning torch.tensor emits for tensor inputs.
    test_dataset = TensorDataset(
        torch.as_tensor(test_data['continuous'], dtype=torch.float32),
        torch.as_tensor(test_data['categorical'], dtype=torch.long),
        *[
            torch.as_tensor(test_data[f'{h}_targets'], dtype=torch.long)
            for h in available_heads
        ],
    )
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    # Run analysis
    for batch in tqdm(test_loader, desc="Analyzing errors"):
        continuous, categorical, *head_targets = batch
        targets = dict(zip(available_heads, head_targets))
        analyzer.analyze_batch(continuous, categorical, targets)

    # Print summary
    summary = analyzer.get_error_summary()
    for head, stats in summary.items():
        print(f"\n{head.upper()} Head:")
        for key, value in stats.items():
            print(f"  {key}: {value:.4f}" if isinstance(value, float) else f"  {key}: {value}")
else:
    print(f"Test data not found at {test_path}")
    print("Using synthetic data for demonstration...")

    # Create synthetic errors for demo
    for head in HEAD_NAMES:
        for _ in range(100):
            target_id = np.random.randint(0, 20)
            # Shift by 1..19 modulo 20 so the prediction always differs from
            # the target; an "error" with pred == target would be meaningless.
            pred_id = (target_id + np.random.randint(1, 20)) % 20
            analyzer.errors[head].append({
                'batch_idx': np.random.randint(0, 10),
                'seq_idx': np.random.randint(0, 64),
                'pred_id': pred_id,
                'target_id': target_id,
                'confidence': np.random.random(),
                'target_prob': np.random.random() * 0.3
            })

In [None]:
# Visualize error distribution by confidence: one scatter panel per head,
# predicted-class probability on x, target-class probability on y.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, head in zip(axes.flat, ['type', 'command', 'param_type', 'param_value']):
    # .get avoids inserting empty entries into the analyzer's defaultdict.
    head_errors = analyzer.errors.get(head)
    ax.set_title(f'{head.upper()} Head Error Distribution')
    if head_errors:
        confidences = [e['confidence'] for e in head_errors]
        target_probs = [e['target_prob'] for e in head_errors]

        ax.scatter(confidences, target_probs, alpha=0.5, s=20)
        ax.axhline(y=0.5, color='r', linestyle='--', alpha=0.5, label='50% target prob')
        ax.axvline(x=0.5, color='g', linestyle='--', alpha=0.5, label='50% confidence')
        ax.set_xlabel('Prediction Confidence')
        ax.set_ylabel('Target Probability')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.legend()
    else:
        ax.text(0.5, 0.5, 'No errors recorded', ha='center', va='center')

plt.tight_layout()
# savefig does not create missing directories, and this is the first cell
# that writes into reports/.
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'error_confidence_scatter.png', dpi=150, bbox_inches='tight')
plt.show()

# Confidence and target probability belong to two different classes of the
# same softmax, so they sum to at most 1 and can never both exceed 0.5.
# Near misses are therefore points where the two values are close.
print("\nError Type Classification:")
print("- High confidence + Low target prob: Model confidently wrong (concerning)")
print("- Low confidence + Low target prob: Model uncertain (expected)")
print("- Confidence close to target prob: Near-miss errors (close alternatives)")

## 3. Hard Example Mining

Identify and analyze the hardest examples for the model.

In [None]:
class HardExampleMiner:
    """Mine hard examples from per-head error records.

    ``errors_by_head`` maps a head name to a list of error dicts carrying at
    least ``batch_idx``, ``seq_idx``, ``pred_id``, ``target_id`` and
    ``confidence``. When a record also has ``sample_idx`` it is used to
    identify the sample, because ``batch_idx`` alone repeats across batches.
    """

    def __init__(self, errors_by_head):
        self.errors = errors_by_head

    def _errors_for(self, head):
        """Return the error list for ``head`` without mutating the mapping."""
        return self.errors.get(head, [])

    @staticmethod
    def _sample_key(error):
        """Return the best available sample identifier for an error record."""
        return error.get('sample_idx', error['batch_idx'])

    def get_high_confidence_errors(self, head, threshold=0.9, top_k=20):
        """Return up to ``top_k`` errors with confidence above ``threshold``.

        Results are ordered from most to least confident.
        """
        high_conf = [e for e in self._errors_for(head) if e['confidence'] > threshold]
        high_conf.sort(key=lambda e: e['confidence'], reverse=True)
        return high_conf[:top_k]

    def get_repeated_errors(self, head, min_count=3):
        """Return ``((target_id, pred_id), count)`` pairs seen at least ``min_count`` times."""
        pattern_counts = Counter(
            (e['target_id'], e['pred_id']) for e in self._errors_for(head)
        )
        return [(p, c) for p, c in pattern_counts.items() if c >= min_count]

    def get_sequential_errors(self, head, window=3):
        """Find runs of ``window`` consecutive sequence positions that all failed.

        Returns:
            Up to 20 ``(sample_key, positions)`` tuples, where ``positions`` is
            the list of ``window`` consecutive indices.
        """
        if window < 1:
            return []

        # A set per sample: the same position must not count more than once.
        by_sample = defaultdict(set)
        for e in self._errors_for(head):
            by_sample[self._sample_key(e)].add(e['seq_idx'])

        sequential_cases = []
        for sample_key, positions in by_sample.items():
            ordered = sorted(positions)
            for i in range(len(ordered) - window + 1):
                run = ordered[i:i + window]
                # `window` distinct sorted positions are consecutive exactly
                # when they span window - 1.
                if run[-1] - run[0] == window - 1:
                    sequential_cases.append((sample_key, run))

        return sequential_cases[:20]

    def get_cross_head_errors(self):
        """Find positions where at least two heads are wrong.

        Returns:
            Up to 20 ``((sample_key, seq_idx), heads)`` tuples, sorted by the
            number of failing heads (most first).
        """
        position_errors = defaultdict(set)
        for head, errors in self.errors.items():
            for e in errors:
                position_errors[(self._sample_key(e), e['seq_idx'])].add(head)

        multi_head_failures = [
            (pos, heads) for pos, heads in position_errors.items()
            if len(heads) >= 2
        ]
        multi_head_failures.sort(key=lambda item: len(item[1]), reverse=True)
        return multi_head_failures[:20]

# Mine hard examples from the errors collected by the analyzer.
miner = HardExampleMiner(analyzer.errors)

# List, per head, how many errors exceeded 90% confidence and show the
# three most confident ones.
print("High-Confidence Errors (>90% confidence but wrong):")
print("="*60)
for head in ('type', 'command', 'param_type', 'param_value'):
    high_conf = miner.get_high_confidence_errors(head, threshold=0.9)
    print(f"\n{head.upper()}: {len(high_conf)} high-confidence errors")
    for e in high_conf[:3]:
        print(f"  Predicted: {e['pred_id']} vs Target: {e['target_id']} (conf: {e['confidence']:.3f})")

In [None]:
# Visualize hard examples
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Cross-head failures.
# NOTE: get_cross_head_errors returns at most 20 positions, so this
# histogram describes the 20 worst positions, not the whole test set.
cross_head = miner.get_cross_head_errors()
if cross_head:
    failure_counts = Counter(len(heads) for _, heads in cross_head)
    axes[0].bar(list(failure_counts.keys()), list(failure_counts.values()), color='coral')
    axes[0].set_xlabel('Number of Heads Failing')
    axes[0].set_ylabel('Count')
    axes[0].set_title('Cross-Head Failure Distribution')
    axes[0].set_xticks(list(failure_counts.keys()))
else:
    axes[0].text(0.5, 0.5, 'No cross-head failures found', ha='center', va='center')
    axes[0].set_title('Cross-Head Failure Distribution')

# Sequential errors.
# NOTE: get_sequential_errors also returns at most 20 cases, so each bar
# saturates at 20.
seq_errors = {
    head: len(miner.get_sequential_errors(head))
    for head in ['type', 'command', 'param_type', 'param_value']
}

axes[1].bar(list(seq_errors.keys()), list(seq_errors.values()), color='steelblue')
axes[1].set_xlabel('Prediction Head')
axes[1].set_ylabel('Sequential Error Clusters')
axes[1].set_title('Sequential Error Patterns by Head')

plt.tight_layout()
# savefig does not create missing directories.
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'hard_examples_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Failure Mode Taxonomy

Categorize errors into distinct failure modes for targeted improvement.

In [None]:
class FailureModeTaxonomy:
    """Classify error records into (possibly overlapping) failure modes.

    ``errors`` maps head name to a list of error dicts with ``confidence``,
    ``target_prob``, ``seq_idx`` and ``target_id``. ``class_frequencies``
    optionally maps head name to ``{class_id: relative_frequency}`` and
    enables the ``rare_class`` mode.
    """

    MODES = {
        'confident_wrong': 'High confidence (>0.8) but incorrect',
        'uncertain_wrong': 'Low confidence (<0.3) and incorrect',
        'near_miss': 'Target was 2nd or 3rd choice (target_prob > 0.2)',
        'complete_miss': 'Target probability very low (<0.05)',
        'boundary_error': 'Error at sequence boundaries (first/last 5 tokens)',
        'rare_class': 'Target is a rare class (appears <1% in training)',
        # Emitted by _get_modes when nothing else matches; listed here so that
        # reports can describe it instead of showing it as unknown.
        'unclassified': 'No other failure mode applies',
    }

    def __init__(self, errors, class_frequencies=None):
        self.errors = errors
        self.class_freq = class_frequencies or {}
        self.classified = defaultdict(lambda: defaultdict(list))

    def classify_errors(self, seq_length=64):
        """Assign every error to its failure modes.

        The classification is rebuilt from scratch on every call, so calling
        this method repeatedly does not inflate the counts.

        Args:
            seq_length: Sequence length used to decide which positions count
                as the last five tokens.

        Returns:
            ``{head: {mode: [error, ...]}}`` (nested defaultdicts).
        """
        self.classified = defaultdict(lambda: defaultdict(list))
        for head, error_list in self.errors.items():
            for error in error_list:
                for mode in self._get_modes(error, seq_length, head):
                    self.classified[head][mode].append(error)
        return self.classified

    def _get_modes(self, error, seq_length, head):
        """Return the list of failure modes that apply to one error record."""
        modes = []

        # The confidence bands are mutually exclusive.
        if error['confidence'] > 0.8:
            modes.append('confident_wrong')
        elif error['confidence'] < 0.3:
            modes.append('uncertain_wrong')

        # So are the target-probability bands.
        if error['target_prob'] > 0.2:
            modes.append('near_miss')
        elif error['target_prob'] < 0.05:
            modes.append('complete_miss')

        if error['seq_idx'] < 5 or error['seq_idx'] >= seq_length - 5:
            modes.append('boundary_error')

        # Only evaluated when frequencies for this head and class were supplied.
        head_freq = self.class_freq.get(head, {})
        if error['target_id'] in head_freq and head_freq[error['target_id']] < 0.01:
            modes.append('rare_class')

        return modes if modes else ['unclassified']

    def get_summary(self):
        """Return ``{head: {mode: error_count}}`` for the last classification."""
        return {
            head: {mode: len(errors) for mode, errors in modes.items()}
            for head, modes in self.classified.items()
        }

# Run the taxonomy over every recorded error and keep the per-head counts.
taxonomy = FailureModeTaxonomy(analyzer.errors)
taxonomy.classify_errors()
failure_summary = taxonomy.get_summary()

print("Failure Mode Summary:")
print("="*60)
for head, modes in failure_summary.items():
    print(f"\n{head.upper()}:")
    # Most frequent mode first; ties keep their original order.
    for mode in sorted(modes, key=modes.get, reverse=True):
        count = modes[mode]
        desc = taxonomy.MODES.get(mode, 'Unknown mode')
        print(f"  {mode}: {count} ({desc})")

In [None]:
# Visualize failure mode distribution as grouped bars (one group per head).
fig, ax = plt.subplots(figsize=(14, 6))

heads = list(failure_summary.keys())
modes = list(taxonomy.MODES.keys())
# Also chart modes that appear in the summary but are not listed in MODES,
# so that no counted error is silently left out of the figure.
for head_modes in failure_summary.values():
    for mode in head_modes:
        if mode not in modes:
            modes.append(mode)

x = np.arange(len(heads))
# Keep each group narrower than one x unit however many modes there are.
width = min(0.12, 0.8 / max(len(modes), 1))
colors = plt.cm.Set3(np.linspace(0, 1, len(modes)))

for i, mode in enumerate(modes):
    values = [failure_summary.get(head, {}).get(mode, 0) for head in heads]
    ax.bar(x + i * width, values, width, label=mode, color=colors[i])

ax.set_xlabel('Prediction Head')
ax.set_ylabel('Error Count')
ax.set_title('Failure Mode Distribution by Head')
# Center each tick under its group of bars.
ax.set_xticks(x + width * (len(modes) - 1) / 2)
ax.set_xticklabels([h.upper() for h in heads])
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')

plt.tight_layout()
# savefig does not create missing directories.
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'failure_modes.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Per-Operation Error Analysis

Analyze errors by G-code operation type (G0, G1, G2, M commands, etc.).

In [None]:
def _operation_key(token):
    """Return the G/M operation code of ``token`` (e.g. ``'G1'``, ``'M106'``).

    The key is the leading ``G`` or ``M`` plus all digits that directly
    follow it. Tokens that are not a G/M letter followed by a number are
    grouped under ``'OTHER'``.
    """
    if not token.startswith(('G', 'M')):
        return 'OTHER'

    digits = []
    for ch in token[1:]:
        if not ch.isdigit():
            break
        digits.append(ch)

    # Keep the whole number: truncating to three characters would merge
    # distinct operations such as M106 and M107.
    return token[0] + ''.join(digits) if digits else 'OTHER'


def analyze_errors_by_operation(errors, id_to_token):
    """Group the command-head errors by the operation of their target token.

    Args:
        errors: Mapping of head name to error records; only ``'command'`` is read.
        id_to_token: Mapping from target id to token string.

    Returns:
        ``defaultdict(list)`` keyed by operation code, ``'OTHER'`` for tokens
        that are not G/M operations, and ``'UNKNOWN'`` for ids missing from
        ``id_to_token``.
    """
    operation_errors = defaultdict(list)

    for error in errors.get('command', []):
        target_id = error['target_id']
        if target_id not in id_to_token:
            operation_errors['UNKNOWN'].append(error)
            continue
        operation_errors[_operation_key(id_to_token[target_id])].append(error)

    return operation_errors

# Bucket the command-head errors by G-code operation.
op_errors = analyze_errors_by_operation(analyzer.errors, id_to_token)

print("Errors by Operation Type:")
print("="*60)
# Operations with the most errors first; ties keep insertion order.
for op in sorted(op_errors, key=lambda name: len(op_errors[name]), reverse=True):
    errors = op_errors[op]
    if errors:
        avg_conf = np.mean([e['confidence'] for e in errors])
    else:
        avg_conf = 0
    print(f"{op:10} | {len(errors):5} errors | avg confidence: {avg_conf:.3f}")

In [None]:
# Visualize operation-specific errors
if op_errors:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Error count by operation. Rank by error count first: slicing the dict
    # keys directly would pick the first ten operations encountered, not the
    # ten with the most errors.
    ranked_ops = sorted(op_errors, key=lambda op: len(op_errors[op]), reverse=True)
    ops = ranked_ops[:10]
    counts = [len(op_errors[op]) for op in ops]
    axes[0].barh(ops, counts, color='steelblue')
    axes[0].set_xlabel('Error Count')
    axes[0].set_title('Errors by G-code Operation')
    axes[0].invert_yaxis()  # largest bar on top

    # Confidence distribution for the five most error-prone operations.
    conf_data = []
    labels = []
    for op in ops[:5]:
        if op_errors[op]:
            conf_data.append([e['confidence'] for e in op_errors[op]])
            labels.append(op)

    if conf_data:
        # Label the boxes through the axes rather than boxplot's `labels`
        # keyword, which newer matplotlib releases deprecate.
        axes[1].boxplot(conf_data)
        axes[1].set_xticks(range(1, len(labels) + 1))
        axes[1].set_xticklabels(labels)
        axes[1].set_ylabel('Prediction Confidence')
        axes[1].set_title('Error Confidence by Operation')

    plt.tight_layout()
    # savefig does not create missing directories.
    reports_dir = project_root / 'reports'
    reports_dir.mkdir(parents=True, exist_ok=True)
    plt.savefig(reports_dir / 'operation_errors.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("No operation-specific errors to visualize")

## 6. Confusion Pattern Mining

Identify systematic confusion patterns between token classes.

In [None]:
def mine_confusion_patterns(errors, id_to_token, top_k=15):
    """Rank the (target_id, pred_id) confusions of each prediction head.

    Only heads present in ``errors`` appear in the result. ``id_to_token`` is
    accepted for interface symmetry and is not read here.

    Returns:
        ``{head: [((target_id, pred_id), count), ...]}`` with at most
        ``top_k`` entries per head, most frequent first.
    """
    ranked = {}
    for head_name in ('type', 'command', 'param_type', 'param_value'):
        if head_name not in errors:
            continue
        tally = Counter(
            (record['target_id'], record['pred_id'])
            for record in errors[head_name]
        )
        ranked[head_name] = tally.most_common(top_k)
    return ranked

# Rank the confusions of every head once; later cells reuse the result.
confusion_patterns = mine_confusion_patterns(analyzer.errors, id_to_token)

print("Top Confusion Patterns:")
print("="*60)
for head, patterns in confusion_patterns.items():
    print(f"\n{head.upper()} Head:")
    # Show the five most frequent confusions, falling back to the raw id
    # when it has no entry in the vocabulary.
    for (target, pred), count in patterns[:5]:
        target_name = id_to_token[target] if target in id_to_token else f"ID:{target}"
        pred_name = id_to_token[pred] if pred in id_to_token else f"ID:{pred}"
        print(f"  {target_name:15} → {pred_name:15} ({count} times)")

In [None]:
# Build the matrix behind one confusion heatmap
def create_confusion_heatmap(patterns, head, top_k=10):
    """Build a square confusion-count matrix for one head.

    The matrix axes are the classes that occur in the first ``top_k``
    patterns of ``head``; every pattern whose target and prediction both
    belong to that set contributes its count.

    Returns:
        ``(matrix, classes)`` with rows indexed by target class and columns
        by predicted class, or ``None`` when ``head`` has no patterns.
    """
    ranked = patterns[head] if head in patterns else None
    if not ranked:
        return None

    # Classes that take part in the strongest confusions, in ascending order.
    classes = sorted({
        class_id
        for (tgt, prd), _ in ranked[:top_k]
        for class_id in (tgt, prd)
    })
    slot = {class_id: idx for idx, class_id in enumerate(classes)}

    grid = np.zeros((len(classes), len(classes)))
    for (tgt, prd), n in ranked:
        if tgt in slot and prd in slot:
            grid[slot[tgt], slot[prd]] = n

    return grid, classes

# Plot heatmaps: one panel per head, rows = target class, columns = prediction.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

for ax, head in zip(axes.flat, ['type', 'command', 'param_type', 'param_value']):
    result = create_confusion_heatmap(confusion_patterns, head)
    if result:
        matrix, classes = result
        # Tick labels are truncated to 8 characters to keep the axes readable.
        class_labels = [id_to_token.get(c, str(c))[:8] for c in classes]
        sns.heatmap(matrix, ax=ax, cmap='YlOrRd',
                    xticklabels=class_labels, yticklabels=class_labels,
                    annot=True, fmt='.0f', cbar_kws={'label': 'Error Count'})
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Target')
        ax.set_title(f'{head.upper()} Confusion Patterns')
    else:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax.set_title(f'{head.upper()} Confusion Patterns')

plt.tight_layout()
# savefig does not create missing directories.
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'confusion_patterns.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Dataset Bias Analysis

Analyze whether errors correlate with dataset characteristics such as sequence position and target class.

In [None]:
class DatasetBiasAnalyzer:
    """Relate recorded errors to simple properties of the dataset.

    ``errors`` maps head name to a list of error dicts carrying ``seq_idx``
    and ``target_id``. ``data_stats`` is stored for callers and not read by
    the methods below.
    """

    def __init__(self, errors, data_stats=None):
        self.errors = errors
        self.data_stats = data_stats if data_stats else {}

    @staticmethod
    def _position_bin(pos, seq_length):
        """Return the label of the position bin that ``pos`` falls into."""
        if pos < 5:
            return 'start (0-4)'
        if pos < seq_length // 2:
            return 'early (5-31)'
        if pos < seq_length - 5:
            return 'late (32-58)'
        return 'end (59-63)'

    def analyze_position_bias(self, seq_length=64):
        """Count errors per head in four coarse sequence-position bins.

        The bin labels are fixed strings written for the default
        ``seq_length`` of 64; the bin boundaries follow ``seq_length``.

        Returns:
            Nested defaultdict ``{head: {bin_label: count}}``. A head only
            appears once it has at least one error.
        """
        tally = defaultdict(lambda: defaultdict(int))
        for head_name, records in self.errors.items():
            for record in records:
                label = self._position_bin(record['seq_idx'], seq_length)
                tally[head_name][label] += 1
        return tally

    def analyze_class_balance(self):
        """Count errors per head and target class.

        Returns:
            Nested defaultdict ``{head: {target_id: count}}``.
        """
        tally = defaultdict(lambda: defaultdict(int))
        for head_name, records in self.errors.items():
            for record in records:
                tally[head_name][record['target_id']] += 1
        return tally

# Analyze biases
bias_analyzer = DatasetBiasAnalyzer(analyzer.errors)

# Position bias
position_bias = bias_analyzer.analyze_position_bias()
print("Position Bias Analysis:")
print("="*60)
# Print the bins in sequence order. Sorting the labels alphabetically would
# give early / end / late / start, which hides the positional trend.
bin_order = ['start (0-4)', 'early (5-31)', 'late (32-58)', 'end (59-63)']
for head, positions in position_bias.items():
    print(f"\n{head.upper()}:")
    ordered_bins = [b for b in bin_order if b in positions]
    # Any label outside the known set is still reported, after the known ones.
    ordered_bins += sorted(b for b in positions if b not in bin_order)
    for pos in ordered_bins:
        print(f"  {pos}: {positions[pos]} errors")

In [None]:
# Visualize position bias
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Position bias bar chart: one group per position bin, one bar per head.
pos_order = ['start (0-4)', 'early (5-31)', 'late (32-58)', 'end (59-63)']
heads = list(position_bias.keys())
x = np.arange(len(pos_order))
width = 0.2

for i, head in enumerate(heads):
    values = [position_bias[head].get(pos, 0) for pos in pos_order]
    axes[0].bar(x + i * width, values, width, label=head.upper())

axes[0].set_xlabel('Sequence Position')
axes[0].set_ylabel('Error Count')
axes[0].set_title('Errors by Sequence Position')
# Center each tick under its group of bars.
axes[0].set_xticks(x + width * (len(heads) - 1) / 2)
axes[0].set_xticklabels(pos_order, rotation=15)
if heads:
    axes[0].legend()

# Class frequency vs error rate (if we had class frequencies)
# For now, show error distribution by target class
class_errors = bias_analyzer.analyze_class_balance()
head = 'command'  # Focus on command head
if head in class_errors:
    top_classes = sorted(class_errors[head].items(), key=lambda x: -x[1])[:20]
    if top_classes:
        classes, counts = zip(*top_classes)
        bar_slots = range(len(classes))
        axes[1].bar(bar_slots, counts, color='coral')
        # The bars sit at rank positions, so label each one with its class id;
        # otherwise the axis would show ranks under a "Class ID" label.
        axes[1].set_xticks(list(bar_slots))
        axes[1].set_xticklabels([str(c) for c in classes], rotation=90)
    axes[1].set_xlabel('Target Class ID')
    axes[1].set_ylabel('Error Count')
    axes[1].set_title(f'Top 20 Error-Prone Classes ({head.upper()} Head)')

plt.tight_layout()
# savefig does not create missing directories.
reports_dir = project_root / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'dataset_bias.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Actionable Recommendations

Based on error analysis, generate specific improvement recommendations.

In [None]:
def generate_recommendations(error_summary, failure_modes, confusion_patterns, position_bias):
    """Turn the analysis results into a prioritized list of recommendations.

    Args:
        error_summary: ``{head: stats}``; reads ``high_conf_errors``.
        failure_modes: ``{head: {mode: count}}``; reads ``boundary_error``
            and ``rare_class``.
        confusion_patterns: ``{head: [((target, pred), count), ...]}``,
            most frequent pattern first.
        position_bias: ``{head: {bin_label: count}}``.

    Returns:
        List of dicts with ``priority``, ``category``, ``issue``,
        ``recommendation`` and ``expected_impact``, HIGH priority first.
        Within one priority the rule order below is preserved.
    """
    found = []

    def note(priority, category, issue, recommendation, expected_impact):
        found.append({
            'priority': priority,
            'category': category,
            'issue': issue,
            'recommendation': recommendation,
            'expected_impact': expected_impact,
        })

    # Rule 1: more than 10 high-confidence errors points at poor calibration.
    for head, stats in error_summary.items():
        if stats.get('high_conf_errors', 0) <= 10:
            continue
        note(
            'HIGH',
            'Model Confidence',
            f"{head.upper()} head has {stats['high_conf_errors']} high-confidence errors",
            'Consider label smoothing, temperature scaling, or focal loss to improve calibration',
            'Reduce overconfident wrong predictions',
        )

    # Rule 2: failure-mode counts above their thresholds.
    for head, modes in failure_modes.items():
        if modes.get('boundary_error', 0) > 20:
            note(
                'MEDIUM',
                'Sequence Modeling',
                f"{head.upper()} has {modes['boundary_error']} boundary errors",
                'Add positional encoding or use bidirectional context for sequence boundaries',
                'Improve predictions at sequence start/end',
            )
        if modes.get('rare_class', 0) > 15:
            note(
                'HIGH',
                'Data Imbalance',
                f"{head.upper()} struggles with rare classes ({modes['rare_class']} errors)",
                'Increase class weights for rare classes or oversample rare examples',
                'Better handling of long-tail distribution',
            )

    # Rule 3: the single most frequent confusion occurs more than 15 times.
    for head, patterns in confusion_patterns.items():
        if not patterns:
            continue
        (target, pred), count = patterns[0]
        if count > 15:
            note(
                'MEDIUM',
                'Feature Engineering',
                f"{head.upper()}: Frequent confusion between class {target} and {pred} ({count} times)",
                'Add discriminative features to distinguish these classes, or merge if semantically similar',
                'Reduce systematic confusion',
            )

    # Rule 4: boundary bins hold more than half as many errors as the middle bins.
    for head, positions in position_bias.items():
        boundary_total = positions.get('start (0-4)', 0) + positions.get('end (59-63)', 0)
        middle_total = positions.get('early (5-31)', 0) + positions.get('late (32-58)', 0)
        if boundary_total > middle_total * 0.5:
            note(
                'MEDIUM',
                'Architecture',
                f"{head.upper()} has disproportionate boundary errors",
                'Consider adding special boundary tokens or using different padding strategy',
                'More consistent predictions across sequence',
            )

    rank = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2}
    found.sort(key=lambda entry: rank[entry['priority']])
    return found

# Combine all four analyses into one prioritized list.
recommendations = generate_recommendations(
    error_summary=analyzer.get_error_summary(),
    failure_modes=taxonomy.get_summary(),
    confusion_patterns=confusion_patterns,
    position_bias=position_bias,
)

banner = "="*80
print("\n" + banner)
print("ACTIONABLE RECOMMENDATIONS")
print(banner)

# One five-line entry per recommendation, numbered from 1.
for i, rec in enumerate(recommendations, 1):
    print(
        f"\n[{rec['priority']}] Recommendation #{i}",
        f"Category: {rec['category']}",
        f"Issue: {rec['issue']}",
        f"Action: {rec['recommendation']}",
        f"Expected Impact: {rec['expected_impact']}",
        sep="\n",
    )

In [None]:
# Assemble everything worth persisting from this notebook into one mapping.
report = dict(
    error_summary=analyzer.get_error_summary(),
    failure_modes=taxonomy.get_summary(),
    # Keep only the five strongest confusions per head.
    top_confusion_patterns={
        head_name: ranked[:5] for head_name, ranked in confusion_patterns.items()
    },
    position_bias=dict(position_bias),
    recommendations=recommendations,
)

# Destination of the JSON report; the directory is created on demand.
report_path = project_root / 'reports' / 'error_analysis_report.json'
report_path.parent.mkdir(parents=True, exist_ok=True)

# Convert defaultdicts, NumPy values and tuples to plain JSON-compatible types
def convert_to_serializable(obj):
    """Recursively convert ``obj`` into something ``json.dump`` accepts.

    - Any dict (including ``defaultdict``) becomes a plain ``dict`` whose
      values are converted recursively, at every nesting level.
    - Dict keys that are NumPy scalars become Python scalars; tuple keys,
      which JSON cannot represent, become their ``str()`` form.
    - Lists and tuples become lists; NumPy arrays become nested lists.
    - NumPy scalars become the matching Python scalar.
    - Everything else is returned unchanged.
    """
    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            if isinstance(key, np.generic):
                key = key.item()
            elif isinstance(key, tuple):
                key = str(key)
            converted[key] = convert_to_serializable(value)
        return converted
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, np.ndarray):
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj

# Persist the report as indented JSON and tell the user where it went.
with open(report_path, 'w') as report_file:
    report_file.write(json.dumps(convert_to_serializable(report), indent=2))

print(f"\nError analysis report saved to: {report_path}")

---

## Summary

This notebook provides comprehensive error analysis:

1. **Error Pattern Analysis**: Systematic breakdown of errors by head and confidence
2. **Hard Example Mining**: Identifies the most challenging cases for the model
3. **Failure Mode Taxonomy**: Categorizes errors into actionable failure types
4. **Per-Operation Analysis**: Shows which G-code operations are most error-prone
5. **Confusion Mining**: Reveals systematic confusion patterns between classes
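6. **Dataset Bias**: Identifies position-related bias and the target classes with the most errors (true class-frequency analysis requires training-set class frequencies, which this notebook does not load)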
7. **Recommendations**: Generates prioritized, actionable improvement suggestions

---

**Navigation:**
← [Previous: 11_model_interpretability](11_model_interpretability.ipynb) |
[Next: 13_deployment_guide](13_deployment_guide.ipynb) →