# # Análisis de Resultados del Modelo
# 
# Este notebook analiza los resultados del entrenamiento y evaluación del modelo de transcripción de emergencias.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from jiwer import wer
from src.evaluation.metrics import calculate_detailed_metrics
from src.utils.config_loader import load_config


In [None]:
# Configuration: load the project settings through the project's config loader.
# `languages_config` is read later by plot_language_confusion(), which expects
# a 'languages' mapping whose entries may carry an 'emergency_phrases' list.
# NOTE(review): `training_config` is not referenced in the visible cells —
# presumably kept for ad-hoc inspection; confirm before removing.
languages_config = load_config("languages")
training_config = load_config("training_config")

In [None]:
# Load the evaluation results written by the full evaluation run.
# When the file is missing, `results` becomes an empty dict so that every later
# cell guarded by `if results` is skipped instead of failing.
try:
    # Decode explicitly as UTF-8 (this notebook also writes its own JSON output
    # as UTF-8) so reading does not depend on the platform's default encoding.
    with open('evaluation_results.json', 'r', encoding='utf-8') as f:
        results = json.load(f)
except FileNotFoundError:
    print("Primero ejecuta la evaluación completa")
    results = {}
else:
    # Only reached when the file was opened and parsed successfully.
    print("Resultados de evaluación cargados")

In [None]:
# Load the training logs if they exist.
# When the CSV is missing, `training_logs` is an empty DataFrame, which the
# plotting cell below checks via `training_logs.empty`.
try:
    # NOTE(review): the original author assumed logs come from TensorBoard or a
    # JSON dump; this cell reads a CSV export — confirm the actual log format.
    training_logs = pd.read_csv('training_logs.csv')
except FileNotFoundError:
    # The built-in exception is FileNotFoundError; the previous name
    # `FileNotFound` does not exist and raised NameError on a missing file.
    training_logs = pd.DataFrame()
    print("No se encontraron logs de entrenamiento")
else:
    print("Logs de entrenamiento cargados")

In [None]:
# Plot training progress: one panel per logged metric, drawn against 'step'.
if not training_logs.empty:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # (log column, panel title, y-axis label) for each panel, left to right.
    panel_specs = (
        ('loss', 'Pérdida de Entrenamiento', 'Loss'),
        ('eval_wer', 'WER de Validación', 'WER'),
    )
    for ax, (column, title, y_label) in zip(axes, panel_specs):
        # A metric that is absent from the log leaves its panel empty.
        if column not in training_logs.columns:
            continue
        ax.plot(training_logs['step'], training_logs[column])
        ax.set_title(title)
        ax.set_xlabel('Step')
        ax.set_ylabel(y_label)
        ax.grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
# Per-language analysis: WER and number of samples for each language.
# `.get()` plus truthiness skips the cell both when the key is absent and when
# it maps to an empty dict; max() over an empty list would raise ValueError.
if results and results.get('by_language'):
    language_results = results['by_language']

    languages = list(language_results)
    wers = [language_results[lang]['wer'] for lang in languages]
    samples = [language_results[lang]['samples'] for lang in languages]

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    wer_ax, samples_ax = axes

    # Left panel: WER per language.
    bars = wer_ax.bar(languages, wers, color='skyblue', alpha=0.7)
    wer_ax.set_title('WER por Idioma')
    wer_ax.set_ylabel('Word Error Rate (WER)')
    # Leave 10% headroom above the tallest bar. If every WER is 0 the upper
    # limit would equal the lower one, so fall back to a unit-height axis.
    y_top = max(wers) * 1.1
    wer_ax.set_ylim(0, y_top if y_top > 0 else 1.0)

    # Print each value just above its bar.
    for bar, wer_val in zip(bars, wers):
        wer_ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                    f'{wer_val:.3f}', ha='center', va='bottom')

    # Right panel: number of samples per language.
    samples_ax.bar(languages, samples, color='lightcoral', alpha=0.7)
    samples_ax.set_title('Muestras por Idioma')
    samples_ax.set_ylabel('Número de Muestras')

    plt.tight_layout()
    plt.show()

In [None]:
# Noise-level analysis: one bar per noise condition found in the results.
if results and 'by_noise_level' in results:
    noise_results = results['by_noise_level']

    noise_levels = list(noise_results)
    wers_noise = [stats['wer'] for stats in noise_results.values()]

    # Bar colors run from green to dark red.
    # NOTE(review): presumably the levels are stored from cleanest to noisiest
    # so the colors line up with severity — confirm against the evaluation output.
    severity_palette = ['green', 'yellow', 'orange', 'red', 'darkred']

    plt.figure(figsize=(10, 6))
    bars = plt.bar(noise_levels, wers_noise, color=severity_palette)
    plt.title('WER por Nivel de Ruido')
    plt.xlabel('Nivel de Ruido')
    plt.ylabel('Word Error Rate (WER)')
    plt.xticks(rotation=45)

    # Print each value just above its bar.
    for bar, wer_val in zip(bars, wers_noise):
        label_x = bar.get_x() + bar.get_width() / 2
        label_y = bar.get_height() + 0.01
        plt.text(label_x, label_y, f'{wer_val:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()


In [None]:
# Emergency-phrase analysis: horizontal bars, one per phrase, ordered by WER.
if results and 'by_emergency_phrase' in results:
    phrase_results = results['by_emergency_phrase']

    # An empty mapping produces no figure at all.
    if phrase_results:
        phrases = list(phrase_results)
        phrase_wers = [phrase_results[phrase] for phrase in phrases]

        # argsort yields the positions that put the WER values in ascending order;
        # apply that same order to both the phrases and their WERs.
        sorted_indices = np.argsort(phrase_wers)
        sorted_phrases, sorted_wers = [], []
        for position in sorted_indices:
            sorted_phrases.append(phrases[position])
            sorted_wers.append(phrase_wers[position])

        plt.figure(figsize=(12, 8))
        bars = plt.barh(sorted_phrases, sorted_wers, color='lightblue')
        plt.title('WER por Frase de Emergencia')
        plt.ylabel('Frase de Emergencia')
        plt.xlabel('Word Error Rate (WER)')

        # Print each value just to the right of its bar, vertically centered.
        for bar, wer_val in zip(bars, sorted_wers):
            label_x = bar.get_width() + 0.01
            label_y = bar.get_y() + bar.get_height() / 2
            plt.text(label_x, label_y, f'{wer_val:.3f}', ha='left', va='center')

        plt.tight_layout()
        plt.show()

In [None]:
# Language confusion matrix (simplified example)
def plot_language_confusion(transcriptions, predictions, languages):
    """Plot a heatmap of reference-language vs. predicted-language counts.

    The language of a text is guessed by looking for any of the configured
    emergency phrases (case-insensitive substring match) from the module-level
    ``languages_config['languages']`` mapping; the first language with a
    matching phrase wins, and texts with no match are labelled ``'unknown'``.
    Pairs where either side is ``'unknown'`` are dropped. If no pair remains,
    nothing is plotted.

    Args:
        transcriptions: Reference texts (strings).
        predictions: Model outputs (strings), paired positionally with
            ``transcriptions``.
        languages: Currently unused. The matrix labels always come from
            ``languages_config['languages']``; the parameter is kept so
            existing callers keep working.
            NOTE(review): decide whether it should override the label set.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    from sklearn.metrics import confusion_matrix

    language_specs = languages_config['languages']
    # Computed once and reused for the matrix rows/columns and both tick axes.
    labels = list(language_specs)

    def detect_language(text):
        """Return the first configured language whose phrase occurs in text."""
        text_lower = text.lower()
        for lang, config in language_specs.items():
            phrases = config.get('emergency_phrases', [])
            if any(phrase.lower() in text_lower for phrase in phrases):
                return lang
        return 'unknown'

    true_langs = [detect_language(text) for text in transcriptions]
    pred_langs = [detect_language(text) for text in predictions]

    # Keep only the pairs where both sides were recognised.
    known_pairs = [
        (true_lang, pred_lang)
        for true_lang, pred_lang in zip(true_langs, pred_langs)
        if true_lang != 'unknown' and pred_lang != 'unknown'
    ]
    if not known_pairs:
        return

    true_langs_filtered, pred_langs_filtered = map(list, zip(*known_pairs))

    cm = confusion_matrix(true_langs_filtered, pred_langs_filtered, labels=labels)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.title('Matriz de Confusión entre Idiomas')
    plt.xlabel('Predicho')
    plt.ylabel('Real')
    plt.tight_layout()
    plt.show()

In [None]:
# Common error analysis
def analyze_error_patterns(transcriptions, predictions, n_common=10):
    """Summarise and plot the samples whose prediction differs from the reference.

    A sample counts as an error when the reference string and the predicted
    string are not exactly equal. Two panels are drawn: a histogram of the
    absolute character-length difference, and a bar chart of the most frequent
    words. The word counts cover every word of the erroneous *references*, not
    only the words that were actually mis-transcribed.

    Args:
        transcriptions: Reference texts (strings).
        predictions: Model outputs (strings), paired positionally with
            ``transcriptions``.
        n_common: Number of most frequent words to show in the second panel.

    Returns:
        A DataFrame with columns ``true``, ``pred`` and ``length_diff``, one row
        per erroneous sample. An empty DataFrame (and no figure) when there are
        no errors.
    """
    from collections import Counter

    errors = [
        {'true': true, 'pred': pred, 'length_diff': abs(len(true) - len(pred))}
        for true, pred in zip(transcriptions, predictions)
        if true != pred
    ]
    if not errors:
        return pd.DataFrame()

    error_df = pd.DataFrame(errors)

    plt.figure(figsize=(12, 4))

    # Left panel: distribution of length differences.
    plt.subplot(1, 2, 1)
    error_df['length_diff'].hist(bins=20, alpha=0.7)
    plt.title('Distribución de Diferencias de Longitud')
    plt.xlabel('Diferencia de Longitud')
    plt.ylabel('Frecuencia')

    # Right panel: most frequent words in the erroneous references (simplified).
    plt.subplot(1, 2, 2)
    word_counts = Counter(
        word for error in errors for word in error['true'].split()
    ).most_common(n_common)

    # There may be no words at all (every erroneous reference is empty, or
    # n_common <= 0). Unpacking zip(*[]) would raise ValueError, so the bars
    # are only drawn when there is something to show.
    if word_counts:
        words, counts = zip(*word_counts)
        positions = range(len(words))
        plt.bar(positions, counts)
        plt.xticks(positions, words, rotation=45)

    plt.title(f'Top {n_common} Palabras en Errores')
    plt.xlabel('Palabra')
    plt.ylabel('Frecuencia')

    plt.tight_layout()
    plt.show()

    return error_df

In [None]:
# Load the test-set metadata for detailed analysis.
# Both `test_df` and `sample_df` are always defined after this cell: when the
# CSV is missing they are empty DataFrames, so later cells can check `.empty`
# instead of hitting a NameError on `sample_df`.
try:
    test_df = pd.read_csv('data/generated/test_metadata.csv')
except FileNotFoundError:
    print("No se encontró dataset de test")
    test_df = pd.DataFrame()
    sample_df = pd.DataFrame()
else:
    print(f"Dataset de test: {len(test_df)} muestras")

    # Random subset of at most 50 rows for example analyses.
    sample_df = test_df.sample(min(50, len(test_df)))

In [None]:
# Build the final performance report
def generate_performance_report(results):
    """Condense the evaluation results into a summary report.

    Args:
        results: Mapping with the optional keys ``'overall'`` (dict with
            ``'wer'``), ``'by_language'`` and ``'by_noise_level'`` (each a
            mapping of name -> dict with ``'wer'``).

    Returns:
        A dict with four keys:

        * ``'overall_performance'``: ``results['overall']`` or ``{}``.
        * ``'language_performance'``: best and worst language by WER and the
          gap between them; ``{}`` when ``'by_language'`` is missing or empty.
        * ``'robustness_analysis'``: clean WER, worst WER across noise levels
          and their difference; ``{}`` when ``'by_noise_level'`` is missing or
          empty.
        * ``'recommendations'``: a single message chosen from the overall WER
          (< 0.1, < 0.2, otherwise). A missing overall WER is treated as 1.0.
    """
    overall = results.get('overall', {})
    report = {
        'overall_performance': overall,
        'language_performance': {},
        'robustness_analysis': {},
        'recommendations': []
    }

    def wer_of(item):
        """Sort key: the WER of a (name, stats) pair."""
        return item[1]['wer']

    # Per-language analysis. The truthiness check also skips an empty mapping,
    # for which min()/max() would raise ValueError.
    lang_results = results.get('by_language')
    if lang_results:
        best_name, best_stats = min(lang_results.items(), key=wer_of)
        worst_name, worst_stats = max(lang_results.items(), key=wer_of)

        report['language_performance'] = {
            'best_performing': {'language': best_name, 'wer': best_stats['wer']},
            'worst_performing': {'language': worst_name, 'wer': worst_stats['wer']},
            'performance_gap': worst_stats['wer'] - best_stats['wer']
        }

    # Robustness analysis (same empty-mapping guard as above).
    noise_results = results.get('by_noise_level')
    if noise_results:
        # Without a 'clean' entry the baseline defaults to 1.0, in which case
        # the reported degradation can be zero or negative.
        clean_wer = noise_results.get('clean', {}).get('wer', 1.0)
        worst_noise_wer = max(stats['wer'] for stats in noise_results.values())

        report['robustness_analysis'] = {
            'clean_performance': clean_wer,
            'worst_case_performance': worst_noise_wer,
            'performance_degradation': worst_noise_wer - clean_wer
        }

    # Recommendation based on the overall WER.
    overall_wer = overall.get('wer', 1.0)
    if overall_wer < 0.1:
        report['recommendations'].append("✅ Excelente rendimiento general")
    elif overall_wer < 0.2:
        report['recommendations'].append("⚠️ Buen rendimiento, considerar mejoras en idiomas problemáticos")
    else:
        report['recommendations'].append("❌ Rendimiento necesita mejora, considerar más datos de entrenamiento")

    return report


In [None]:
# Generate and display the report
if results:
    performance_report = generate_performance_report(results)

    separator = "=" * 50
    print(separator)
    print("REPORTE FINAL DE RENDIMIENTO")
    print(separator)

    # Apply the float format only when a numeric WER is present; formatting
    # the 'N/A' placeholder string with ':.3f' raises ValueError.
    overall_wer = performance_report['overall_performance'].get('wer')
    if isinstance(overall_wer, (int, float)):
        overall_text = f"{overall_wer:.3f}"
    else:
        overall_text = "N/A"
    print(f"\nRendimiento General: WER = {overall_text}")

    # Per-language section (empty dict when the results had no language data).
    print("\nAnálisis por Idioma:")
    lang_perf = performance_report['language_performance']
    if lang_perf:
        print(f"  Mejor: {lang_perf['best_performing']['language']} (WER: {lang_perf['best_performing']['wer']:.3f})")
        print(f"  Peor: {lang_perf['worst_performing']['language']} (WER: {lang_perf['worst_performing']['wer']:.3f})")

    # Robustness section (empty dict when the results had no noise-level data).
    print("\nAnálisis de Robustez:")
    robustness = performance_report['robustness_analysis']
    if robustness:
        print(f"  Condiciones limpias: WER = {robustness['clean_performance']:.3f}")
        print(f"  Peor caso: WER = {robustness['worst_case_performance']:.3f}")
        print(f"  Degradación: {robustness['performance_degradation']:.3f}")

    print("\nRecomendaciones:")
    for rec in performance_report['recommendations']:
        print(f"  {rec}")



In [None]:
# Save the complete analysis
if results:
    analysis_output = {
        'performance_report': generate_performance_report(results),
        'detailed_results': results,
        # Each flag records whether the matching section exists in `results`,
        # i.e. whether the corresponding plotting cell had data to draw. The
        # language and noise flags used to be hard-coded to True.
        'visualizations': {
            'language_analysis': 'by_language' in results,
            'noise_analysis': 'by_noise_level' in results,
            'emergency_phrases_analysis': 'by_emergency_phrase' in results
        }
    }

    # ensure_ascii=False keeps accented characters and emoji readable in the
    # file, which is why the file is opened explicitly as UTF-8.
    with open('results_analysis.json', 'w', encoding='utf-8') as f:
        json.dump(analysis_output, f, indent=2, ensure_ascii=False)

    print("\nAnálisis guardado en results_analysis.json")