# In [None]:
# ARC-AGI Model Analysis & Performance Evaluation

# This notebook analyzes model performance, training dynamics, and solution quality:
# - Training loss curves and convergence analysis
# - Solution quality metrics and scoring
# - Error analysis and failure modes
# - Model architecture impact studies
# - Comparison with baseline approaches

import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import defaultdict, Counter
import torch
from typing import Dict, List, Tuple, Optional
import os
import sys

# Put the parent directory (relative to the working directory) on the import path.
sys.path.append('../')

# Plot appearance: seaborn-flavoured matplotlib style, "husl" colour palette,
# and larger default figure/font sizes applied in a single rcParams update.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

print("Analysis environment setup complete")

# Load performance data and metrics
def _read_json_if_present(path):
    """Return the parsed JSON document stored at *path*, or None if the file is missing."""
    try:
        # JSON text is UTF-8; be explicit so the result does not depend on the
        # platform's locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return None


def load_performance_data():
    """Load model performance data from the known result files.

    Each source is optional: a missing file is reported with a warning line
    and its entry in the result is left as None.  Malformed JSON is *not*
    swallowed and still raises ``json.JSONDecodeError``.

    Returns:
        dict with the keys 'metrics', 'solutions', 'ground_truth' and
        'training_logs' (in that order), each holding the parsed JSON content
        or None when the corresponding file does not exist.
    """
    # (result key, path, message on success, message when the file is missing)
    # A "{count}" placeholder in the success message is filled with len(content).
    sources = [
        ('metrics', '../results/detailed_metrics.json',
         "✓ Loaded metrics for {count} tasks", "⚠ No detailed metrics found"),
        ('solutions', '../results/submission.json',
         "✓ Loaded solutions for {count} tasks", "⚠ No submission file found"),
        ('ground_truth', '../data/arc-agi_evaluation_solutions.json',
         "✓ Loaded ground truth for {count} tasks", "⚠ No ground truth found"),
        ('training_logs', '../results/training_logs.json',
         "✓ Loaded training logs", "⚠ No training logs found"),
    ]

    data = {}
    for key, path, loaded_msg, missing_msg in sources:
        content = _read_json_if_present(path)
        data[key] = content
        if content is None:
            print(missing_msg)
        elif '{count}' in loaded_msg:
            print(loaded_msg.format(count=len(content)))
        else:
            print(loaded_msg)

    return data

# Create sample data if no real data available
def create_sample_data(n_tasks=50, seed=42):
    """Create synthetic performance data for demonstration.

    Args:
        n_tasks: Number of fake tasks to generate (default 50).
        seed: Seed for the random stream (default 42), so repeated calls
            return identical data.

    Returns:
        dict shaped like the output of ``load_performance_data``: 'metrics'
        maps task ids to per-task metric dicts, 'solutions' maps task ids to a
        one-element list holding two random 3x3 attempts, and 'ground_truth'
        and 'training_logs' are None.
    """
    # A private legacy RandomState yields the same stream that
    # np.random.seed(seed) followed by np.random.* calls would, but leaves
    # NumPy's global generator untouched for the rest of the session.
    rng = np.random.RandomState(seed)

    # Sample metrics
    metrics = {}
    for i in range(n_tasks):
        task_id = f"task_{i:03d}"
        metrics[task_id] = {
            'steps_completed': rng.randint(100, 1000),
            'final_loss': rng.exponential(2.0),
            # None marks a task that never converged (drawn with ~30% probability).
            'convergence_step': rng.randint(50, 800) if rng.random_sample() > 0.3 else None,
            'memory_used': rng.uniform(0.5, 4.0),
            'elapsed_time': rng.uniform(10, 60)
        }

    # Sample solutions: one solution set per task, two 3x3 grids with cell values 0-9.
    solutions = {}
    for i in range(n_tasks):
        task_id = f"task_{i:03d}"
        solutions[task_id] = [{
            'attempt_1': [[rng.randint(0, 10) for _ in range(3)] for _ in range(3)],
            'attempt_2': [[rng.randint(0, 10) for _ in range(3)] for _ in range(3)]
        }]

    return {
        'metrics': metrics,
        'solutions': solutions,
        'ground_truth': None,
        'training_logs': None
    }

# Prefer real results from disk; fall back to synthetic data only when
# neither metrics nor solutions could be loaded.
print("Loading performance data...")
perf_data = load_performance_data()

if all(perf_data[key] is None for key in ('metrics', 'solutions')):
    print("No performance data found. Creating sample data for demonstration.")
    perf_data = create_sample_data()

print("Data loading complete")

# Analyze training dynamics
def analyze_training_metrics(metrics_data):
    """Summarise per-task training metrics.

    Args:
        metrics_data: Mapping of task id -> metrics dict.  A task counts as
            failed when its dict contains an 'error' key.

    Returns:
        None when ``metrics_data`` is empty/None, otherwise a dict with task
        counts, convergence rate, mean steps/time/memory and the raw loss,
        time and convergence-step lists.  Missing 'steps_completed',
        'elapsed_time' and 'memory_used' values are taken as 0, so failed
        tasks without those fields still enter the averages.
    """
    if not metrics_data:
        return None

    records = list(metrics_data.values())
    task_count = len(records)
    failure_count = sum(1 for rec in records if 'error' in rec)

    step_counts = [rec.get('steps_completed', 0) for rec in records]
    durations = [rec.get('elapsed_time', 0) for rec in records]
    memory_usage = [rec.get('memory_used', 0) for rec in records]

    # Only tasks that actually report a value contribute to these lists.
    final_losses = [rec['final_loss'] for rec in records
                    if rec.get('final_loss') is not None]
    converged_at = [rec['convergence_step'] for rec in records
                    if rec.get('convergence_step') is not None]

    return {
        'total_tasks': task_count,
        'successful_tasks': task_count - failure_count,
        'failed_tasks': failure_count,
        'convergence_rate': len(converged_at) / task_count,
        'avg_steps': np.mean(step_counts),
        'avg_time': np.mean(durations),
        'avg_memory': np.mean(memory_usage),
        'loss_distribution': final_losses,
        'time_distribution': durations,
        'convergence_steps': converged_at,
    }

training_analysis = analyze_training_metrics(perf_data['metrics'])

# Print a textual summary of the training metrics (or a notice when there are none).
if not training_analysis:
    print("No training metrics available for analysis")
else:
    task_total = training_analysis['total_tasks']
    succeeded = training_analysis['successful_tasks']
    failed = training_analysis['failed_tasks']

    print("📊 TRAINING PERFORMANCE ANALYSIS")
    print("=" * 50)
    print(f"Total tasks: {task_total}")
    print(f"Successful: {succeeded} ({succeeded/task_total*100:.1f}%)")
    print(f"Failed: {failed} ({failed/task_total*100:.1f}%)")
    print(f"Convergence rate: {training_analysis['convergence_rate']*100:.1f}%")
    print(f"Average steps: {training_analysis['avg_steps']:.0f}")
    print(f"Average time: {training_analysis['avg_time']:.1f}s")
    print(f"Average memory: {training_analysis['avg_memory']:.2f}GB")

# Visualize training performance as a 2x3 dashboard
if not (training_analysis and training_analysis['total_tasks'] > 0):
    print("Insufficient data for training performance visualization")
else:
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Training Performance Analysis', fontsize=16, fontweight='bold')
    (ax_outcome, ax_steps, ax_loss), (ax_time, ax_memory, ax_converge) = axes

    # Per-task metric records; empty when no metrics are available.
    task_records = list(perf_data['metrics'].values()) if perf_data['metrics'] else []

    # Panel 1: share of successful vs failed tasks
    ax_outcome.pie(
        [training_analysis['successful_tasks'], training_analysis['failed_tasks']],
        labels=['Successful', 'Failed'],
        colors=['lightgreen', 'lightcoral'],
        autopct='%1.1f%%',
        startangle=90,
    )
    ax_outcome.set_title('Task Success Rate')

    # Panel 2: histogram of training steps with the mean marked
    if task_records:
        step_counts = [rec.get('steps_completed', 0) for rec in task_records]
        mean_steps = np.mean(step_counts)
        ax_steps.hist(step_counts, bins=20, alpha=0.7, edgecolor='black')
        ax_steps.set_xlabel('Training Steps')
        ax_steps.set_ylabel('Frequency')
        ax_steps.set_title('Training Steps Distribution')
        ax_steps.grid(True, alpha=0.3)
        ax_steps.axvline(mean_steps, color='red', linestyle='--', label=f'Mean: {mean_steps:.0f}')
        ax_steps.legend()

    # Panel 3: histogram of final losses (log-scaled counts)
    final_losses = training_analysis['loss_distribution']
    if final_losses:
        ax_loss.hist(final_losses, bins=20, alpha=0.7, edgecolor='black')
        ax_loss.set_xlabel('Final Loss')
        ax_loss.set_ylabel('Frequency')
        ax_loss.set_title('Final Loss Distribution')
        ax_loss.grid(True, alpha=0.3)
        ax_loss.set_yscale('log')

    # Panel 4: elapsed time against training steps, coloured by step count
    if task_records:
        durations = [rec.get('elapsed_time', 0) for rec in task_records]
        step_counts = [rec.get('steps_completed', 0) for rec in task_records]
        time_scatter = ax_time.scatter(durations, step_counts, alpha=0.6, c=step_counts, cmap='viridis')
        ax_time.set_xlabel('Elapsed Time (s)')
        ax_time.set_ylabel('Training Steps')
        ax_time.set_title('Time vs Training Steps')
        ax_time.grid(True, alpha=0.3)
        plt.colorbar(time_scatter, ax=ax_time)

    # Panel 5: histogram of memory usage with the mean marked
    if task_records:
        memory_usage = [rec.get('memory_used', 0) for rec in task_records]
        mean_memory = np.mean(memory_usage)
        ax_memory.hist(memory_usage, bins=20, alpha=0.7, edgecolor='black')
        ax_memory.set_xlabel('Memory Usage (GB)')
        ax_memory.set_ylabel('Frequency')
        ax_memory.set_title('Memory Usage Distribution')
        ax_memory.grid(True, alpha=0.3)
        ax_memory.axvline(mean_memory, color='red', linestyle='--', label=f'Mean: {mean_memory:.2f}GB')
        ax_memory.legend()

    # Panel 6: histogram of convergence steps, or a placeholder when nothing converged
    converged_at = training_analysis['convergence_steps']
    if not converged_at:
        ax_converge.text(0.5, 0.5, 'No convergence\ndata available',
                         ha='center', va='center', transform=ax_converge.transAxes)
        ax_converge.set_title('Convergence Analysis')
    else:
        mean_conv = np.mean(converged_at)
        ax_converge.hist(converged_at, bins=15, alpha=0.7, edgecolor='black')
        ax_converge.set_xlabel('Convergence Step')
        ax_converge.set_ylabel('Frequency')
        ax_converge.set_title(f'Convergence Steps (Rate: {training_analysis["convergence_rate"]*100:.1f}%)')
        ax_converge.grid(True, alpha=0.3)
        ax_converge.axvline(mean_conv, color='red', linestyle='--', label=f'Mean: {mean_conv:.0f}')
        ax_converge.legend()

    plt.tight_layout()
    plt.show()

# Solution quality analysis
# Keys under which each solution set stores its two candidate grids.
_ATTEMPT_KEYS = ('attempt_1', 'attempt_2')


def _attempt_similarity(first, second):
    """Return the fraction of equal cells between two grids, or None if they are not comparable."""
    if not first or not second:
        return None
    try:
        grid_a = np.array(first)
        grid_b = np.array(second)
    except (ValueError, TypeError):
        # Ragged rows cannot be turned into a rectangular array.
        return None
    if grid_a.shape != grid_b.shape or grid_a.size == 0:
        return None
    return np.mean(grid_a == grid_b)


def _task_score(task_solutions, expected_outputs):
    """Return the fraction of expected outputs matched by either attempt of the solution set at the same index."""
    if not expected_outputs:
        return 0.0
    solved = sum(
        1
        for solution_set, expected in zip(task_solutions, expected_outputs)
        if any(solution_set.get(key) == expected for key in _ATTEMPT_KEYS)
    )
    return solved / len(expected_outputs)


def analyze_solution_quality(solutions_data, ground_truth_data=None):
    """Analyze the quality of generated solutions.

    Args:
        solutions_data: Mapping of task id -> list of solution sets, one per
            test output.  Each solution set is a dict that may hold grids
            under 'attempt_1' and 'attempt_2'.
        ground_truth_data: Optional mapping of task id -> list of expected
            output grids, index-aligned with the task's solution sets.

    Returns:
        None when ``solutions_data`` is empty/None, otherwise a dict with:
        'total_tasks'; 'total_attempts' (attempt keys actually present);
        'solution_sizes' ((height, width) per non-empty attempt);
        'color_diversity' (distinct cell values per non-empty attempt);
        'attempt_similarity' (fraction of equal cells for same-shaped attempt
        pairs); and 'accuracy' (mean per-task score over the tasks that have
        ground truth, or None when there is nothing to compare).  A task's
        score is the fraction of its expected outputs matched by either
        attempt, so single-output tasks score 0 or 1 and accuracy stays
        within [0, 1].
    """
    if not solutions_data:
        return None

    analysis = {
        'total_tasks': len(solutions_data),
        'total_attempts': 0,
        'solution_sizes': [],
        'color_diversity': [],
        'attempt_similarity': [],
        'accuracy': None
    }

    for task_solutions in solutions_data.values():
        for solution_set in task_solutions:
            for attempt_key in _ATTEMPT_KEYS:
                # A missing attempt is skipped rather than raising KeyError.
                if attempt_key not in solution_set:
                    continue
                analysis['total_attempts'] += 1

                attempt = solution_set[attempt_key]
                if not attempt:
                    continue

                height = len(attempt)
                width = len(attempt[0]) if attempt[0] else 0
                analysis['solution_sizes'].append((height, width))
                analysis['color_diversity'].append(
                    len({cell for row in attempt for cell in row})
                )

            similarity = _attempt_similarity(
                solution_set.get('attempt_1'), solution_set.get('attempt_2')
            )
            if similarity is not None:
                analysis['attempt_similarity'].append(similarity)

    # Calculate accuracy if ground truth is available
    if ground_truth_data:
        task_scores = [
            _task_score(task_solutions, ground_truth_data[task_id])
            for task_id, task_solutions in solutions_data.items()
            if task_id in ground_truth_data
        ]
        if task_scores:
            analysis['accuracy'] = sum(task_scores) / len(task_scores)

    return analysis

solution_analysis = analyze_solution_quality(perf_data['solutions'], perf_data['ground_truth'])

# Print a textual summary of the solution statistics (or a notice when there are none).
if not solution_analysis:
    print("No solution data available for analysis")
else:
    print("\n🎯 SOLUTION QUALITY ANALYSIS")
    print("=" * 50)
    print(f"Total tasks: {solution_analysis['total_tasks']}")
    print(f"Total attempts: {solution_analysis['total_attempts']}")

    if solution_analysis['solution_sizes']:
        # Column 0 holds grid heights, column 1 grid widths.
        sizes = np.array(solution_analysis['solution_sizes'])
        heights, widths = sizes[:, 0], sizes[:, 1]
        print(f"Average solution size: {np.mean(heights):.1f} x {np.mean(widths):.1f}")
        print(f"Size range: {np.min(heights)}-{np.max(heights)} x {np.min(widths)}-{np.max(widths)}")

    if solution_analysis['color_diversity']:
        print(f"Average colors per solution: {np.mean(solution_analysis['color_diversity']):.1f}")

    if solution_analysis['attempt_similarity']:
        print(f"Average similarity between attempts: {np.mean(solution_analysis['attempt_similarity']):.3f}")

    if solution_analysis['accuracy'] is None:
        print("Accuracy: Cannot be calculated (no ground truth)")
    else:
        print(f"Accuracy: {solution_analysis['accuracy']*100:.1f}%")

# Visualize solution characteristics as a 2x2 figure
if solution_analysis and solution_analysis['solution_sizes']:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Solution Quality Analysis', fontsize=16, fontweight='bold')

    # 1. Solution size distribution (column 0 = height, column 1 = width)
    sizes = np.array(solution_analysis['solution_sizes'])
    axes[0, 0].scatter(sizes[:, 1], sizes[:, 0], alpha=0.6)
    axes[0, 0].set_xlabel('Width')
    axes[0, 0].set_ylabel('Height')
    axes[0, 0].set_title('Solution Size Distribution')
    axes[0, 0].grid(True, alpha=0.3)

    # 2. Color diversity distribution
    if solution_analysis['color_diversity']:
        # One bin per integer count 1..10, centred on the integer.  Integer
        # edges 1..10 would give only nine bins, and because the last
        # histogram bin is closed on both sides, counts of 9 and 10 would be
        # merged into a single bar.
        color_bins = np.arange(1, 12) - 0.5
        axes[0, 1].hist(solution_analysis['color_diversity'], bins=color_bins,
                        alpha=0.7, edgecolor='black')
        axes[0, 1].set_xticks(range(1, 11))
        axes[0, 1].set_xlabel('Number of Unique Colors')
        axes[0, 1].set_ylabel('Frequency')
        axes[0, 1].set_title('Color Diversity in Solutions')
        axes[0, 1].grid(True, alpha=0.3)

    # 3. Attempt similarity distribution with the mean marked
    if solution_analysis['attempt_similarity']:
        axes[1, 0].hist(solution_analysis['attempt_similarity'], bins=20,
                        alpha=0.7, edgecolor='black')
        axes[1, 0].set_xlabel('Similarity Score')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].set_title('Similarity Between Attempt 1 & 2')
        axes[1, 0].grid(True, alpha=0.3)
        mean_sim = np.mean(solution_analysis['attempt_similarity'])
        axes[1, 0].axvline(mean_sim, color='red', linestyle='--',
                           label=f'Mean: {mean_sim:.3f}')
        axes[1, 0].legend()

    # 4. Solution area vs color diversity
    if solution_analysis['color_diversity']:
        areas = sizes[:, 0] * sizes[:, 1]
        # Truncate to the shorter sequence so scatter() always gets equal lengths.
        min_len = min(len(areas), len(solution_analysis['color_diversity']))
        axes[1, 1].scatter(areas[:min_len], solution_analysis['color_diversity'][:min_len],
                           alpha=0.6)
        axes[1, 1].set_xlabel('Solution Area (pixels)')
        axes[1, 1].set_ylabel('Number of Colors')
        axes[1, 1].set_title('Area vs Color Diversity')
        axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("Insufficient solution data for visualization")

# Error analysis and failure modes
def analyze_failure_modes(metrics_data, solutions_data, timeout_step_threshold=700):
    """Analyze common failure patterns.

    Args:
        metrics_data: Mapping of task id -> metrics dict, or None/empty.
        solutions_data: Mapping of task id -> list of solution sets (dicts
            that may hold grids under 'attempt_1' / 'attempt_2'), or
            None/empty.
        timeout_step_threshold: A non-converged task that completed at least
            this many steps is counted as a timeout instead of a convergence
            failure (default 700, the previously hard-coded value).

    Returns:
        dict with the counters 'timeout_failures', 'convergence_failures',
        'error_failures' and 'low_quality_solutions', plus the
        'failed_task_characteristics' placeholder, which this function never
        fills in.  The two inputs are analysed independently, so solution
        quality is still checked when no metrics are available.
    """
    failure_analysis = {
        'timeout_failures': 0,
        'convergence_failures': 0,
        'error_failures': 0,
        'low_quality_solutions': 0,
        'failed_task_characteristics': {
            'avg_complexity': 0,
            'common_patterns': []
        }
    }

    # Categorize training failures
    for metrics in (metrics_data or {}).values():
        if 'error' in metrics:
            failure_analysis['error_failures'] += 1
        elif metrics.get('convergence_step') is None:
            if metrics.get('steps_completed', 0) >= timeout_step_threshold:
                failure_analysis['timeout_failures'] += 1
            else:
                failure_analysis['convergence_failures'] += 1

    # Count solution sets that contain at least one degenerate attempt
    for task_solutions in (solutions_data or {}).values():
        for solution_set in task_solutions:
            for attempt_key in ('attempt_1', 'attempt_2'):
                # A missing or empty attempt is skipped rather than raising KeyError.
                attempt = solution_set.get(attempt_key)
                if not attempt:
                    continue
                # At most one distinct cell value covers both "all zeros" and
                # "single colour" grids (and grids without any cells).
                if len({cell for row in attempt for cell in row}) <= 1:
                    failure_analysis['low_quality_solutions'] += 1
                    break  # count each solution set once

    return failure_analysis

failure_analysis = analyze_failure_modes(perf_data['metrics'], perf_data['solutions'])

print("\n🚨 FAILURE MODE ANALYSIS")
print("=" * 50)
print(f"Timeout failures: {failure_analysis['timeout_failures']}")
print(f"Convergence failures: {failure_analysis['convergence_failures']}")
print(f"Error failures: {failure_analysis['error_failures']}")
print(f"Low quality solutions: {failure_analysis['low_quality_solutions']}")

# Visualize failure modes: bar label -> count, in display order
failure_counts_by_type = {
    'Timeout': failure_analysis['timeout_failures'],
    'Convergence': failure_analysis['convergence_failures'],
    'Error': failure_analysis['error_failures'],
    'Low Quality': failure_analysis['low_quality_solutions'],
}

if not any(count > 0 for count in failure_counts_by_type.values()):
    print("No significant failure modes detected")
else:
    failure_types = list(failure_counts_by_type)
    failure_counts = list(failure_counts_by_type.values())

    plt.figure(figsize=(10, 6))
    bars = plt.bar(failure_types, failure_counts,
                   color=['orange', 'red', 'darkred', 'gray'], alpha=0.7)
    plt.xlabel('Failure Type')
    plt.ylabel('Count')
    plt.title('Distribution of Failure Modes')
    plt.grid(True, alpha=0.3)

    # Write each count just above its bar, horizontally centred.
    for bar, count in zip(bars, failure_counts):
        label_x = bar.get_x() + bar.get_width()/2.
        label_y = bar.get_height() + 0.1
        plt.text(label_x, label_y, f'{count}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# Performance correlation analysis
def correlation_analysis(metrics_data):
    """Analyze correlations between different performance metrics.

    Only tasks without an 'error' entry are used.

    Args:
        metrics_data: Mapping of task id -> metrics dict, or None/empty.

    Returns:
        None when there is no input or fewer than two error-free tasks,
        otherwise a tuple ``(df, correlation_matrix)`` where ``df`` has one
        row per error-free task with the columns 'steps_completed',
        'elapsed_time', 'memory_used', 'final_loss', 'converged' and
        'convergence_step', and ``correlation_matrix`` is ``df.corr()``.
    """
    if not metrics_data:
        return None

    def _value_or_nan(value):
        # dict.get's default only covers a *missing* key; a key present with
        # None (e.g. a task that never converged) must be mapped to NaN too.
        return np.nan if value is None else value

    data_for_correlation = [
        {
            'steps_completed': metrics.get('steps_completed', 0),
            'elapsed_time': metrics.get('elapsed_time', 0),
            'memory_used': metrics.get('memory_used', 0),
            'final_loss': _value_or_nan(metrics.get('final_loss')),
            'converged': 1 if metrics.get('convergence_step') is not None else 0,
            'convergence_step': _value_or_nan(metrics.get('convergence_step')),
        }
        for metrics in metrics_data.values()
        if 'error' not in metrics  # Only successful tasks
    ]

    # A correlation needs at least two observations.
    if len(data_for_correlation) < 2:
        return None

    df = pd.DataFrame(data_for_correlation)
    correlation_matrix = df.corr()

    return df, correlation_matrix

correlation_result = correlation_analysis(perf_data['metrics'])

if not correlation_result:
    print("Insufficient data for correlation analysis")
else:
    df, corr_matrix = correlation_result

    print("\n📈 PERFORMANCE CORRELATION ANALYSIS")
    print("=" * 50)

    # List every distinct metric pair whose correlation is defined and
    # stronger than 0.3 in absolute value.
    print("Key correlations:")
    metric_names = list(corr_matrix.columns)
    for i, col1 in enumerate(metric_names):
        for j, col2 in enumerate(metric_names[i + 1:], start=i + 1):
            corr_val = corr_matrix.iloc[i, j]
            if np.isnan(corr_val) or abs(corr_val) <= 0.3:
                continue
            print(f"{col1} vs {col2}: {corr_val:.3f}")

    # Heatmap of the correlation matrix; the upper triangle (diagonal
    # included) is masked out.
    plt.figure(figsize=(10, 8))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(
        corr_matrix,
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        fmt='.2f',
        cbar_kws={"shrink": .5},
    )
    plt.title('Performance Metrics Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Statistical summary
    print("\nStatistical Summary:")
    print(df.describe())

# Model architecture impact analysis
def architecture_impact_analysis():
    """Print a qualitative description of each architecture component and plot complexity against estimated impact.

    The descriptions and scores are hard-coded in this function; nothing is
    derived from the loaded performance data.
    """

    print("\n🏗️ ARCHITECTURE IMPACT ANALYSIS")
    print("=" * 50)

    # Theoretical analysis based on architecture design
    architecture_components = {
        'Multi-tensor System': {
            'purpose': 'Handle variable dimensionality efficiently',
            'impact': 'Enables processing of different grid sizes and patterns',
            'complexity': 'High'
        },
        'VAE Decoder': {
            'purpose': 'Learn compressed representations with KL regularization',
            'impact': 'Encourages meaningful latent representations',
            'complexity': 'Medium'
        },
        'Directional Layers': {
            'purpose': 'Spatial reasoning with cummax and shift operations',
            'impact': 'Captures spatial patterns and transformations',
            'complexity': 'High'
        },
        'Residual Connections': {
            'purpose': 'Enable deep networks and gradient flow',
            'impact': 'Improves training stability and convergence',
            'complexity': 'Low'
        },
        'Symmetrization': {
            'purpose': 'Ensure equivariance to x/y dimension swapping',
            'impact': 'Reduces overfitting to specific orientations',
            'complexity': 'Medium'
        }
    }

    # Estimated impact scores (subjective), keyed by component name:
    # high impact for multi-tensor and directional layers.
    estimated_impact = {
        'Multi-tensor System': 3,
        'VAE Decoder': 2,
        'Directional Layers': 3,
        'Residual Connections': 2,
        'Symmetrization': 2,
    }

    # Display analysis
    for name, info in architecture_components.items():
        print(f"\n{name}:")
        print(f"  Purpose: {info['purpose']}")
        print(f"  Impact: {info['impact']}")
        print(f"  Complexity: {info['complexity']}")

    # Visualize architecture complexity vs impact
    complexity_scores = {'Low': 1, 'Medium': 2, 'High': 3}
    marker_color = {1: 'green', 2: 'orange', 3: 'red'}

    names = list(architecture_components)
    xs = [complexity_scores[architecture_components[name]['complexity']] for name in names]
    ys = [estimated_impact[name] for name in names]

    plt.figure(figsize=(10, 6))
    plt.scatter(xs, ys, c=[marker_color[x] for x in xs], s=100, alpha=0.7)

    # Label every point with its component name, slightly offset from the marker.
    for name, x, y in zip(names, xs, ys):
        plt.annotate(name, (x, y),
                     xytext=(5, 5), textcoords='offset points', fontsize=9)

    plt.xlabel('Implementation Complexity')
    plt.ylabel('Estimated Impact')
    plt.title('Architecture Components: Complexity vs Impact')
    level_ticks = [1, 2, 3]
    level_names = ['Low', 'Medium', 'High']
    plt.xticks(level_ticks, level_names)
    plt.yticks(level_ticks, level_names)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

architecture_impact_analysis()

# Generate comprehensive analysis report
def generate_analysis_report(training_analysis, solution_analysis, failure_analysis):
    """Build the plain-text performance report.

    Args:
        training_analysis: Result of ``analyze_training_metrics`` or None;
            its sections are omitted when it is falsy.
        solution_analysis: Result of ``analyze_solution_quality`` or None;
            its sections are omitted when it is falsy.
        failure_analysis: Dict with the four failure counters; always used.

    Returns:
        The report as a single string, stamped with the current local time.
    """
    timestamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

    # Text fragments are collected in order and joined once at the end.
    parts = [f"""
ARC-AGI Model Performance Analysis Report
Generated: {timestamp}
================================================

🎯 EXECUTIVE SUMMARY:
"""]

    if training_analysis:
        task_total = training_analysis['total_tasks']
        success_rate = training_analysis['successful_tasks'] / task_total * 100
        parts.append(f"""
• Tasks processed: {task_total}
• Success rate: {success_rate:.1f}%
• Average training time: {training_analysis['avg_time']:.1f}s
• Convergence rate: {training_analysis['convergence_rate']*100:.1f}%
""")

    if solution_analysis and solution_analysis['accuracy'] is not None:
        parts.append(f"• Solution accuracy: {solution_analysis['accuracy']*100:.1f}%\n")

    parts.append("""

📊 DETAILED PERFORMANCE METRICS:
""")

    if training_analysis:
        failed = training_analysis['failed_tasks']
        failed_pct = failed / training_analysis['total_tasks'] * 100
        parts.append(f"""
Training Performance:
• Average steps per task: {training_analysis['avg_steps']:.0f}
• Average memory usage: {training_analysis['avg_memory']:.2f} GB
• Failed tasks: {failed} ({failed_pct:.1f}%)
""")

    if solution_analysis:
        parts.append(f"""
Solution Quality:
• Total solution attempts: {solution_analysis['total_attempts']}
""")
        if solution_analysis['solution_sizes']:
            # Column 0 holds heights, column 1 widths.
            sizes = np.array(solution_analysis['solution_sizes'])
            mean_height = np.mean(sizes[:, 0])
            mean_width = np.mean(sizes[:, 1])
            parts.append(f"• Average solution size: {mean_height:.1f} x {mean_width:.1f}\n")

        if solution_analysis['color_diversity']:
            mean_colors = np.mean(solution_analysis['color_diversity'])
            parts.append(f"• Average colors per solution: {mean_colors:.1f}\n")

        if solution_analysis['attempt_similarity']:
            mean_similarity = np.mean(solution_analysis['attempt_similarity'])
            parts.append(f"• Average similarity between attempts: {mean_similarity:.3f}\n")

    parts.append(f"""

🚨 FAILURE ANALYSIS:
• Timeout failures: {failure_analysis['timeout_failures']}
• Convergence failures: {failure_analysis['convergence_failures']}
• Error failures: {failure_analysis['error_failures']}
• Low quality solutions: {failure_analysis['low_quality_solutions']}

💡 KEY INSIGHTS:
""")

    # Data-driven insights, each emitted as one bullet line.
    insights = []

    if training_analysis:
        convergence_rate = training_analysis['convergence_rate']
        if convergence_rate > 0.7:
            insights.append("High convergence rate indicates effective optimization")
        elif convergence_rate < 0.3:
            insights.append("Low convergence rate suggests need for hyperparameter tuning")

    if solution_analysis and solution_analysis['attempt_similarity']:
        avg_sim = np.mean(solution_analysis['attempt_similarity'])
        if avg_sim > 0.8:
            insights.append("High similarity between attempts indicates low diversity")
        elif avg_sim < 0.3:
            insights.append("Low similarity between attempts shows good exploration")

    if failure_analysis['timeout_failures'] > failure_analysis['convergence_failures']:
        insights.append("Many timeout failures suggest increasing iteration limits")

    if failure_analysis['low_quality_solutions'] > 0:
        insights.append("Low quality solutions indicate need for better regularization")

    parts.extend(f"• {insight}\n" for insight in insights)

    # Static closing sections.
    parts.append("""

🔧 RECOMMENDATIONS:
• Consider early stopping mechanisms to reduce timeout failures
• Implement solution quality filtering to improve output
• Monitor memory usage for large-scale deployments
• Analyze failed tasks for common patterns
• Consider ensemble methods to improve accuracy

📈 ARCHITECTURE STRENGTHS:
• Multi-tensor system handles variable grid sizes effectively
• VAE approach provides principled uncertainty quantification
• Directional operations capture spatial reasoning patterns
• Residual connections improve training stability

⚠️ POTENTIAL IMPROVEMENTS:
• Adaptive training schedules based on task complexity
• Better initialization strategies for faster convergence
• Task-specific architecture modifications
• Enhanced solution selection criteria
""")

    return "".join(parts)

# Generate and display report
analysis_report = generate_analysis_report(training_analysis, solution_analysis, failure_analysis)
print(analysis_report)

# Save report to file (best effort: a failure is reported, not raised)
try:
    os.makedirs('../reports', exist_ok=True)
    # The report contains emoji and bullet characters, so write it as UTF-8
    # explicitly; the locale default encoding (e.g. cp1252) cannot encode
    # them and the save would fail.
    with open('../reports/performance_analysis_report.txt', 'w', encoding='utf-8') as f:
        f.write(analysis_report)
    print("\n💾 Analysis report saved to ../reports/performance_analysis_report.txt")
except OSError as e:
    print(f"\n⚠ Could not save report: {e}")

print("\n🎉 Performance analysis complete!")