In [22]:
"""
Multimodal Evaluation Metrics for LSTMABAR
Implements STS, Spectral Centroid Error, and MFCC Similarity
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import numpy as np
import librosa
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import ttest_rel
import pandas as pd
from typing import Dict, List, Tuple, Optional
from pathlib import Path

from lstmabar_model import LSTMABAR
from improved_text_encoders import ImprovedTextEncoder, HFBackboneTextEncoder

In [23]:
class LSTMABAREvaluator:
    """
    Complete evaluation framework for LSTMABAR model
    Implements text-side and audio-side metrics
    """
    
    def __init__(
        self,
        model: LSTMABAR,
        text_model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
        sample_rate: int = 44100
    ):
        self.model = model
        self.sample_rate = sample_rate
        
        # Text encoder for STS
        self.text_encoder = SentenceTransformer(text_model_name)
        
        # Archetype descriptors for mapping
        self.archetype_descriptors = {
            'sine': 'smooth pure warm mellow soft gentle flowing',
            'square': 'digital harsh buzzy retro electronic robotic',
            'sawtooth': 'bright cutting metallic sharp aggressive',
            'triangle': 'hollow woody muted filtered organic',
            'noise': 'rough textured grainy distorted chaotic'
        }
        
        # Expected spectral centroids for common descriptors (Hz)
        self.centroid_map = {
            'bright': 3500, 'dark': 800, 'warm': 1000, 'harsh': 4000,
            'mellow': 1200, 'sharp': 5000, 'smooth': 1500, 'soft': 1100,
            'aggressive': 3800, 'gentle': 1300, 'cutting': 4200
        }
    
    # ==================== TEXT-SIDE EVALUATION (STS) ====================
    
    def compute_sts(
        self,
        input_description: str,
        predicted_archetype_weights: np.ndarray
    ) -> float:
        """
        Semantic Textual Similarity: Does model understand description?
        
        Args:
            input_description: Original user description
            predicted_archetype_weights: Model's predicted archetype weights (5,)
        
        Returns:
            STS score (0-1, higher is better)
        """
        # Convert archetype weights to descriptive text
        predicted_description = self._archetypes_to_text(predicted_archetype_weights)
        
        # Encode both descriptions
        emb_input = self.text_encoder.encode(input_description, convert_to_tensor=True)
        emb_predicted = self.text_encoder.encode(predicted_description, convert_to_tensor=True)
        
        # Compute cosine similarity
        sts_score = util.cos_sim(emb_input, emb_predicted).item()
        
        return sts_score
    
    def _archetypes_to_text(
        self,
        archetype_weights: np.ndarray,
        threshold: float = 0.1
    ) -> str:
        """Convert archetype mixture to natural language"""
        archetype_names = ['sine', 'square', 'sawtooth', 'triangle', 'noise']
        
        # Get top contributing archetypes
        descriptions = []
        for i, (name, weight) in enumerate(zip(archetype_names, archetype_weights)):
            if weight > threshold:
                descriptions.append(self.archetype_descriptors[name])
        
        return ' '.join(descriptions) if descriptions else 'neutral sound'
    
    # ==================== AUDIO-SIDE EVALUATION ====================
    
    def spectral_centroid_error(
        self,
        output_audio: np.ndarray,
        target_description: str
    ) -> float:
        """
        Measures brightness accuracy (Hz error)
        
        Args:
            output_audio: Generated audio array
            target_description: Target description
        
        Returns:
            Absolute error in Hz (lower is better)
        """
        # Compute actual spectral centroid
        centroid = librosa.feature.spectral_centroid(
            y=output_audio,
            sr=self.sample_rate
        )
        actual_centroid = np.mean(centroid)
        
        # Determine expected centroid from description
        expected_centroid = self._description_to_centroid(target_description)
        
        # Calculate error
        error_hz = abs(actual_centroid - expected_centroid)
        
        return error_hz
    
    def _description_to_centroid(self, description: str) -> float:
        """Map description to expected spectral centroid"""
        words = description.lower().split()
        expected_centroids = []
        
        for word in words:
            if word in self.centroid_map:
                expected_centroids.append(self.centroid_map[word])
        
        if expected_centroids:
            return np.mean(expected_centroids)
        
        return 2000  # Neutral default
    
    def mfcc_similarity(
        self,
        output_audio: np.ndarray,
        reference_audio: np.ndarray,
        n_mfcc: int = 13
    ) -> float:
        """
        Measures timbral similarity (0-1, higher is better)
        
        Args:
            output_audio: Generated audio array
            reference_audio: Target/reference audio array
            n_mfcc: Number of MFCC coefficients
        
        Returns:
            Cosine similarity (1 is perfect match, >0.8 is good)
        """
        # Extract MFCCs from both signals
        mfcc_output = librosa.feature.mfcc(
            y=output_audio,
            sr=self.sample_rate,
            n_mfcc=n_mfcc
        )
        mfcc_reference = librosa.feature.mfcc(
            y=reference_audio,
            sr=self.sample_rate,
            n_mfcc=n_mfcc
        )
        
        # Average across time
        mfcc_output_mean = np.mean(mfcc_output, axis=1).reshape(1, -1)
        mfcc_reference_mean = np.mean(mfcc_reference, axis=1).reshape(1, -1)
        
        # Compute cosine similarity
        similarity = cosine_similarity(mfcc_output_mean, mfcc_reference_mean)[0][0]
        
        return similarity
    
    # ==================== COMPREHENSIVE EVALUATION ====================
    
    def evaluate_single_transformation(
        self,
        input_audio: np.ndarray,
        input_description: str,
        output_audio: np.ndarray,
        reference_audio: np.ndarray,
        predicted_weights: np.ndarray
    ) -> Dict[str, float]:
        """
        Complete evaluation of a single transformation
        
        Returns:
            Dict with all metrics
        """
        results = {
            # Text-side (NLP understanding)
            'sts_score': self.compute_sts(input_description, predicted_weights),
            
            # Audio-side (transformation quality)
            'spectral_centroid_error_hz': self.spectral_centroid_error(
                output_audio, input_description
            ),
            'mfcc_similarity': self.mfcc_similarity(output_audio, reference_audio)
        }
        
        return results
    
    def evaluate_model(
        self,
        test_samples: List[Dict],
        generate_reference: bool = True
    ) -> pd.DataFrame:
        """
        Evaluate model on test set
        
        Args:
            test_samples: List of dicts with 'audio', 'description', 'target_weights'
            generate_reference: If True, generate reference audio from target weights
        
        Returns:
            DataFrame with results for each sample
        """
        self.model.eval()
        results = []
        
        print(f"Evaluating model on {len(test_samples)} samples...")
        
        for i, sample in enumerate(test_samples):
            input_audio = sample['audio']
            description = sample['description']
            target_weights = sample['target_weights']
            
            # Convert to tensor
            audio_tensor = torch.from_numpy(input_audio).unsqueeze(0).float().to(self.model.device)
            
            # Model inference
            with torch.no_grad():
                transformed, metadata = self.model.inference([description], audio_tensor)
            
            # Convert back to numpy
            output_audio = transformed[0].cpu().numpy()
            predicted_weights = metadata['predicted_weights'][0]
            
            # Generate reference audio if needed
            if generate_reference:
                # Use archetype generator to create ideal sound
                reference_audio = self._generate_reference_audio(target_weights)
            else:
                reference_audio = input_audio  # Fallback
            
            # Evaluate
            metrics = self.evaluate_single_transformation(
                input_audio,
                description,
                output_audio,
                reference_audio,
                predicted_weights
            )
            
            # Add metadata
            metrics['description'] = description
            metrics['sample_idx'] = i
            
            results.append(metrics)
            
            if (i + 1) % 10 == 0:
                print(f"  Processed {i+1}/{len(test_samples)} samples")
        
        # Convert to DataFrame
        df = pd.DataFrame(results)
        
        return df
    
    def _generate_reference_audio(
        self,
        archetype_weights: np.ndarray,
        duration: float = 2.0
    ) -> np.ndarray:
        """Generate reference audio from archetype mixture"""
        n_samples = int(self.sample_rate * duration)
        t = np.linspace(0, duration, n_samples, endpoint=False)
        frequency = 440  # A4
        
        audio = np.zeros(n_samples)
        
        # Generate each archetype component
        archetypes = {
            'sine': np.sin(2 * np.pi * frequency * t),
            'square': np.sign(np.sin(2 * np.pi * frequency * t)),
            'sawtooth': 2 * (t * frequency - np.floor(0.5 + t * frequency)),
            'triangle': 2 * np.abs(2 * (t * frequency - np.floor(t * frequency + 0.5))) - 1,
            'noise': np.random.randn(n_samples) * 0.3
        }
        
        archetype_names = ['sine', 'square', 'sawtooth', 'triangle', 'noise']
        
        # Mix according to weights
        for i, name in enumerate(archetype_names):
            audio += archetype_weights[i] * archetypes[name]
        
        # Normalize
        audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.95
        
        return audio
    
    def compare_with_baseline(
        self,
        test_samples: List[Dict],
        baseline_model: Optional[object] = None
    ) -> pd.DataFrame:
        """
        Compare LSTMABAR with baseline model
        
        Args:
            test_samples: Test samples
            baseline_model: Baseline model (if None, uses simple keyword matching)
        
        Returns:
            Comparison DataFrame
        """
        # Evaluate LSTMABAR
        lstmabar_results = self.evaluate_model(test_samples)
        
        # Evaluate baseline
        if baseline_model is None:
            baseline_results = self._evaluate_keyword_baseline(test_samples)
        else:
            baseline_results = baseline_model.evaluate(test_samples)
        
        # Compute statistics
        comparison = self._compute_comparison_stats(lstmabar_results, baseline_results)
        
        return comparison
    
    def _evaluate_keyword_baseline(self, test_samples: List[Dict]) -> pd.DataFrame:
        """Evaluate simple keyword-matching baseline"""
        results = []
        
        # Simple keyword ‚Üí archetype mapping
        keyword_map = {
            'bright': [0.1, 0.1, 0.6, 0.1, 0.1],  # Mostly sawtooth
            'warm': [0.6, 0.1, 0.1, 0.2, 0.0],    # Mostly sine
            'harsh': [0.1, 0.5, 0.2, 0.1, 0.1],   # Mostly square
            'smooth': [0.6, 0.1, 0.1, 0.2, 0.0],  # Mostly sine
            'distorted': [0.1, 0.2, 0.2, 0.1, 0.4] # High noise
        }
        
        for i, sample in enumerate(test_samples):
            description = sample['description'].lower()
            
            # Find matching keywords
            predicted_weights = np.array([0.2, 0.2, 0.2, 0.2, 0.2])  # Uniform default
            for keyword, weights in keyword_map.items():
                if keyword in description:
                    predicted_weights = np.array(weights)
                    break
            
            # Simple transformation (just apply gain based on brightness)
            output_audio = sample['audio'].copy()
            if 'bright' in description:
                # Boost high frequencies (simplified)
                output_audio = output_audio * 1.2
            
            reference_audio = self._generate_reference_audio(sample['target_weights'])
            
            # Evaluate
            metrics = self.evaluate_single_transformation(
                sample['audio'],
                description,
                output_audio,
                reference_audio,
                predicted_weights
            )
            
            metrics['description'] = description
            metrics['sample_idx'] = i
            results.append(metrics)
        
        return pd.DataFrame(results)
    
    def _compute_comparison_stats(
        self,
        lstmabar_results: pd.DataFrame,
        baseline_results: pd.DataFrame
    ) -> pd.DataFrame:
        """Compute statistical comparison"""
        metrics = ['sts_score', 'spectral_centroid_error_hz', 'mfcc_similarity']
        
        comparison = {
            'Metric': [],
            'Baseline_Mean': [],
            'Baseline_Std': [],
            'LSTMABAR_Mean': [],
            'LSTMABAR_Std': [],
            'Improvement': [],
            'p_value': []
        }
        
        for metric in metrics:
            baseline_vals = baseline_results[metric].values
            lstmabar_vals = lstmabar_results[metric].values
            
            baseline_mean = np.mean(baseline_vals)
            baseline_std = np.std(baseline_vals)
            lstmabar_mean = np.mean(lstmabar_vals)
            lstmabar_std = np.std(lstmabar_vals)
            
            comparison['Metric'].append(metric)
            comparison['Baseline_Mean'].append(f"{baseline_mean:.4f}")
            comparison['Baseline_Std'].append(f"{baseline_std:.4f}")
            comparison['LSTMABAR_Mean'].append(f"{lstmabar_mean:.4f}")
            comparison['LSTMABAR_Std'].append(f"{lstmabar_std:.4f}")
            
            # Calculate improvement
            if 'error' in metric.lower():
                # Lower is better
                improvement = ((baseline_mean - lstmabar_mean) / baseline_mean) * 100
            else:
                # Higher is better
                improvement = ((lstmabar_mean - baseline_mean) / baseline_mean) * 100
            
            comparison['Improvement'].append(f"{improvement:.2f}%")
            
            # Paired t-test
            _, p_val = ttest_rel(baseline_vals, lstmabar_vals)
            comparison['p_value'].append(f"{p_val:.4f}")
        
        return pd.DataFrame(comparison)
    
    def generate_evaluation_report(
        self,
        test_samples: List[Dict],
        save_path: Optional[str] = None
    ):
        """Generate comprehensive evaluation report"""
        print("\n" + "="*80)
        print("LSTMABAR MODEL EVALUATION REPORT")
        print("="*80)
        
        # Evaluate model
        results = self.evaluate_model(test_samples)
        
        # Compare with baseline
        comparison = self.compare_with_baseline(test_samples)
        
        print("\n" + "="*80)
        print("COMPARISON WITH BASELINE")
        print("="*80)
        print(comparison.to_string(index=False))
        
        print("\n" + "="*80)
        print("INTERPRETATION")
        print("="*80)
        
        # STS interpretation
        sts_mean = results['sts_score'].mean()
        print(f"\nüìù TEXT UNDERSTANDING (STS Score):")
        print(f"   Mean: {sts_mean:.3f}")
        if sts_mean > 0.75:
            print(f"   ‚úì Excellent semantic understanding!")
        elif sts_mean > 0.60:
            print(f"   ‚Üí Good semantic understanding")
        else:
            print(f"   ‚ö† Needs improvement in NLP comprehension")
        
        # Spectral centroid interpretation
        sc_mean = results['spectral_centroid_error_hz'].mean()
        print(f"\nüéµ BRIGHTNESS ACCURACY (Spectral Centroid Error):")
        print(f"   Mean: {sc_mean:.0f} Hz")
        if sc_mean < 200:
            print(f"   ‚úì Excellent brightness targeting!")
        elif sc_mean < 500:
            print(f"   ‚Üí Good brightness control")
        else:
            print(f"   ‚ö† Brightness targeting needs improvement")
        
        # MFCC interpretation
        mfcc_mean = results['mfcc_similarity'].mean()
        print(f"\nüé∏ TIMBRE QUALITY (MFCC Similarity):")
        print(f"   Mean: {mfcc_mean:.3f}")
        if mfcc_mean > 0.80:
            print(f"   ‚úì Excellent timbral matching!")
        elif mfcc_mean > 0.65:
            print(f"   ‚Üí Good timbral similarity")
        else:
            print(f"   ‚ö† Timbre quality needs improvement")
        
        print("\n" + "="*80)
        
        # Save detailed results
        if save_path:
            results.to_csv(save_path, index=False)
            comparison.to_csv(save_path.replace('.csv', '_comparison.csv'), index=False)
            print(f"\nDetailed results saved to {save_path}")

    def evaluate_model_on_test_set(
        self,
        test_data_path: str,
        max_samples: Optional[int] = None,
        save_results: bool = True,
        results_path: str = 'test_results.csv'
    ) -> pd.DataFrame:
        """
        Evaluate model on test set from MusicCaps
        
        Args:
            test_data_path: Path to test .npz file
            max_samples: Max samples to evaluate (None = all)
            save_results: Whether to save results
            results_path: Path to save results
        
        Returns:
            DataFrame with evaluation results
        """
        print(f"\n{'='*80}")
        print("EVALUATING ON TEST SET")
        print(f"{'='*80}\n")
        
        # Load test data
        print(f"Loading test data from {test_data_path}...")
        data = np.load(test_data_path, allow_pickle=True)
        
        vectors = data['archetype_vectors']
        descriptions = data['descriptions'].tolist()
        audio_paths = data['audio_paths'].tolist()
        
        n_samples = min(len(descriptions), max_samples) if max_samples else len(descriptions)
        print(f"Evaluating on {n_samples} test samples\n")
        
        # Prepare test samples
        test_samples = []
        for i in range(n_samples):
            audio_path = audio_paths[i]
            
            # Check if audio exists
            if not Path(audio_path).exists():
                print(f"Skipping {i}: audio file not found")
                continue
            
            # Load audio
            try:
                audio, sr = librosa.load(audio_path, sr=self.sample_rate, duration=2.0)
                
                # Pad or trim
                target_length = int(self.sample_rate * 2.0)
                if len(audio) < target_length:
                    audio = np.pad(audio, (0, target_length - len(audio)))
                else:
                    audio = audio[:target_length]
                
                test_samples.append({
                    'audio': audio,
                    'description': descriptions[i],
                    'target_weights': vectors[i]
                })
            except Exception as e:
                print(f"Error loading {audio_path}: {e}")
                continue
        
        print(f"Successfully loaded {len(test_samples)} test samples\n")
        
        # Evaluate
        results_df = self.evaluate_model(test_samples, generate_reference=True)
        
        # Save results
        if save_results:
            results_df.to_csv(results_path, index=False)
            print(f"\n‚úì Results saved to {results_path}")
        
        return results_df

In [24]:
### Helper: load with prefix-compat (for FineTune_B)

# --- compatibility remap for old text-encoder prefixes (FineTune_B) ---
def _compat_remap_text_encoder_keys(sd: dict) -> dict:
    if not any(k.startswith("text_encoder.backbone.") for k in sd.keys()):
        return sd
    m = [
        ("text_encoder.backbone.embeddings.", "text_encoder.sentence_model.0.auto_model.embeddings."),
        ("text_encoder.backbone.encoder.",    "text_encoder.sentence_model.0.auto_model.encoder."),
        ("text_encoder.backbone.pooler.",     "text_encoder.sentence_model.0.auto_model.pooler."),
    ]
    out = {}
    for k, v in sd.items():
        newk = k
        for old, new in m:
            if k.startswith(old):
                newk = new + k[len(old):]
                break
        out[newk] = v
    return out

def load_lstmabar_checkpoint(path: str, device: str = None):
    import torch
    from lstmabar_model import LSTMABAR
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    model = LSTMABAR(embedding_dim=768, audio_architecture='resnet', sample_rate=44100, device=device)
    ckpt = torch.load(path, map_location=device)
    sd = ckpt.get("model_state_dict", ckpt)
    sd = _compat_remap_text_encoder_keys(sd)
    incompat = model.load_state_dict(sd, strict=False)
    print(f"Loaded {path} (epoch={ckpt.get('epoch','?')}) | missing={len(incompat.missing_keys)} unexpected={len(incompat.unexpected_keys)}")
    return model

def load_improved_c_checkpoint(path: str, device: str = None):
    import torch
    from lstmabar_model import LSTMABAR
    
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Create model with same architecture
    model = LSTMABAR(
        embedding_dim=768, 
        audio_architecture='resnet', 
        sample_rate=44100, 
        device=device
    )
    
    # Replace text encoder with improved version
    model.text_encoder = ImprovedTextEncoder(
        model_name='sentence-transformers/all-mpnet-base-v2',
        embedding_dim=768,
        projection_depth='deep',
        device=device
    )
    
    # Load checkpoint
    ckpt = torch.load(path, map_location=device)
    sd = ckpt.get("model_state_dict", ckpt)
    
    # Load with strict=False in case of minor key mismatches
    incompat = model.load_state_dict(sd, strict=False)
    print(f"Loaded {path} (epoch={ckpt.get('epoch','?')})")
    print(f"  Missing keys: {len(incompat.missing_keys)}")
    print(f"  Unexpected keys: {len(incompat.unexpected_keys)}")
    
    return model

def load_approach_d_checkpoint(path: str, device: str = None):
    """
    Load Approach D model (HFBackbone with 4 unfrozen layers) from checkpoint
    """
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Create model with same architecture
    model = LSTMABAR(
        embedding_dim=768,
        audio_architecture='resnet',
        sample_rate=44100,
        device=device
    )
    
    # Replace text encoder with HFBackbone version
    model.text_encoder = HFBackboneTextEncoder(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        embedding_dim=768,
        device=device
    )
    
    # Configure fine-tuning (same as training)
    model.text_encoder.configure_fine_tuning(num_unfrozen_layers=4)
    
    # Load checkpoint
    ckpt = torch.load(path, map_location=device)
    sd = ckpt.get("model_state_dict", ckpt)
    
    # Load with strict=False in case of minor key mismatches
    incompat = model.load_state_dict(sd, strict=False)
    
    print(f"Loaded {path} (epoch={ckpt.get('epoch','?')})")
    print(f"  Missing keys: {len(incompat.missing_keys)}")
    print(f"  Unexpected keys: {len(incompat.unexpected_keys)}")
    
    if len(incompat.missing_keys) > 0:
        print(f"  ‚ö†Ô∏è  First few missing: {incompat.missing_keys[:3]}")
    if len(incompat.unexpected_keys) > 0:
        print(f"  ‚ö†Ô∏è  First few unexpected: {incompat.unexpected_keys[:3]}")
    
    return model

In [25]:
### Build a small shared evaluation set (prefer real val clips, fallback to 3 synthetic)

def build_shared_eval_samples(val_npz_path: str, k: int = 73, sample_rate: int = 44100, duration: float = 2.0):
    D = np.load(val_npz_path, allow_pickle=True)
    descs = D['descriptions'].tolist()
    paths = D['audio_paths'].tolist()
    vecs  = D['archetype_vectors']

    tgt_len = int(sample_rate * duration)
    samples = []
    for i, p in enumerate(paths):
        if Path(p).exists():
            try:
                y, sr = librosa.load(p, sr=sample_rate, duration=duration)
                if len(y) < tgt_len:
                    y = np.pad(y, (0, tgt_len - len(y)))
                else:
                    y = y[:tgt_len]
                samples.append({'audio': y, 'description': descs[i], 'target_weights': vecs[i]})
                if len(samples) >= k:
                    break
            except Exception:
                pass

    # Fallback (if nothing loads)
    if not samples:
        rng = np.random.default_rng(42)
        for desc, vec in [
            ("bright and cutting guitar with metallic tone", np.array([0.1,0.1,0.6,0.1,0.1])),
            ("warm smooth piano melody with gentle sustain", np.array([0.6,0.05,0.1,0.2,0.05])),
            ("harsh digital synth with buzzy retro sound",   np.array([0.1,0.55,0.15,0.1,0.1])),
        ]:
            y = rng.standard_normal(tgt_len).astype(np.float32)
            samples.append({'audio': y, 'description': desc, 'target_weights': vec})
    print(f"Shared eval set size: {len(samples)}")
    return samples

In [26]:
#### Model-vs-Model comparison on the shared set

import itertools
from scipy.stats import ttest_rel

def evaluate_side_by_side(models: list, samples: list, sample_rate: int = 44100):
    """
    models: list of (label, model)
    samples: list of dicts with 'audio', 'description', 'target_weights'
    """
    results_per_model = {}
    for label, mdl in models:
        print(f"\n[Eval] {label}")
        ev = LSTMABAREvaluator(mdl, sample_rate=sample_rate)
        # Turn numpy audios into the structure expected by evaluate_model
        df = ev.evaluate_model(samples, generate_reference=True)
        results_per_model[label] = df

    # Summaries
    def summarize(df):
        return {
            "sts_mean": df["sts_score"].mean(),
            "sts_std":  df["sts_score"].std(),
            "centroid_err_mean": df["spectral_centroid_error_hz"].mean(),
            "centroid_err_std":  df["spectral_centroid_error_hz"].std(),
            "mfcc_mean": df["mfcc_similarity"].mean(),
            "mfcc_std":  df["mfcc_similarity"].std(),
            "n": len(df),
        }

    summary_rows = []
    for label, df in results_per_model.items():
        s = summarize(df)
        s["model"] = label
        summary_rows.append(s)
    means_df = pd.DataFrame(summary_rows).set_index("model").round(4)

    # Pairwise deltas + paired t-tests
    pairs = []
    labels = list(results_per_model.keys())
    metrics = [
        ("sts_score", "higher_better"),
        ("spectral_centroid_error_hz", "lower_better"),
        ("mfcc_similarity", "higher_better"),
    ]
    for a, b in itertools.combinations(labels, 2):
        A = results_per_model[a]
        B = results_per_model[b]
        # align by sample_idx (just in case order changed)
        A2 = A.sort_values("sample_idx").reset_index(drop=True)
        B2 = B.sort_values("sample_idx").reset_index(drop=True)
        for metric, direction in metrics:
            a_vals = A2[metric].values
            b_vals = B2[metric].values
            # improvement relative to b (positive is better)
            if direction == "higher_better":
                delta = a_vals - b_vals
            else:
                delta = b_vals - a_vals  # lower error => better
            tstat, p = ttest_rel(a_vals, b_vals)
            pairs.append({
                "A": a, "B": b, "Metric": metric,
                "Œî_mean(A_vs_B)": np.mean(delta),
                "p_value": p
            })

    pairwise_df = pd.DataFrame(pairs)
    # Sort for readability
    pairwise_df = pairwise_df.sort_values(["Metric", "A", "B"]).reset_index(drop=True)
    return means_df, pairwise_df, results_per_model

In [27]:
### Test-set evaluation (your original path)

def eval_on_test_set(model, test_npz, max_samples=None, sample_rate=44100):
    ev = LSTMABAREvaluator(model, sample_rate=sample_rate)
    df = ev.evaluate_model_on_test_set(
        test_data_path=test_npz,
        max_samples=max_samples,
        save_results=False
    )
    return {
        "sts_mean": df["sts_score"].mean(),
        "centroid_err_mean": df["spectral_centroid_error_hz"].mean(),
        "mfcc_mean": df["mfcc_similarity"].mean(),
        "n": len(df)
    }, df

In [36]:
### Run the eval metrics:

# --- Small helper to summarize a results DF (same metrics everywhere) ---
def _summarize_df(df: pd.DataFrame) -> dict:
    return {
        "sts_mean": float(df["sts_score"].mean()),
        "sts_std":  float(df["sts_score"].std()),
        "centroid_err_mean": float(df["spectral_centroid_error_hz"].mean()),
        "centroid_err_std":  float(df["spectral_centroid_error_hz"].std()),
        "mfcc_mean": float(df["mfcc_similarity"].mean()),
        "mfcc_std":  float(df["mfcc_similarity"].std()),
        "n": int(len(df)),
    }

# --- helper to (re)build pairwise stats from a dict of label->df ---
def _pairwise_from_frames(frames: dict) -> pd.DataFrame:
    rows = []
    labels = list(frames.keys())
    metrics = [
        ("sts_score", "higher_better"),
        ("spectral_centroid_error_hz", "lower_better"),
        ("mfcc_similarity", "higher_better"),
    ]
    for a, b in itertools.combinations(labels, 2):
        A = frames[a].sort_values("sample_idx").reset_index(drop=True)
        B = frames[b].sort_values("sample_idx").reset_index(drop=True)
        for metric, direction in metrics:
            a_vals = A[metric].values
            b_vals = B[metric].values
            # improvement relative to B (positive means A better)
            if direction == "higher_better":
                delta = a_vals - b_vals
            else:
                delta = b_vals - a_vals  # lower error => better
            _, p = ttest_rel(a_vals, b_vals)
            rows.append({
                "A": a, "B": b, "Metric": metric,
                "Œî_mean(A_vs_B)": float(np.mean(delta)),
                "p_value": float(p)
            })
    return pd.DataFrame(rows).sort_values(["Metric","A","B"]).reset_index(drop=True)

# --- helper to load raw samples from an NPZ into a list for baselines ---
def _load_samples_from_npz(npz_path: str, max_samples=None, sample_rate=44100, duration=2.0):
    D = np.load(npz_path, allow_pickle=True)
    descs = D["descriptions"].tolist()
    paths = D["audio_paths"].tolist()
    vecs  = D["archetype_vectors"]
    tgt_len = int(sample_rate * duration)

    samples = []
    count = 0
    for i, p in enumerate(paths):
        if max_samples is not None and count >= max_samples:
            break
        if Path(p).exists():
            try:
                y, sr = librosa.load(p, sr=sample_rate, duration=duration)
                if len(y) < tgt_len:
                    y = np.pad(y, (0, tgt_len - len(y)))
                else:
                    y = y[:tgt_len]
                samples.append({
                    "audio": y,
                    "description": descs[i],
                    "target_weights": vecs[i],
                    "sample_idx": i
                })
                count += 1
            except Exception:
                pass
    return samples

# --- your existing paths ---
BASELINE_CKPT   = "checkpoints/best_model.pth"
FINETUNE_A_CKPT = "checkpoints/fine_tune_A/best_model.pth"
FINETUNE_B_CKPT = "checkpoints/fine_tune_B/best_model.pth"
FINETUNE_C_CKPT = "checkpoints/improved_approach_c/best_model.pth"
FINETUNE_D_CKPT = "checkpoints/approach_d_proper/best_model.pth"

VAL_NPZ  = "musiccaps_training_data_val.npz"
TEST_NPZ = "musiccaps_training_data_test.npz"

MAX_SAMPLES_SHARED = 73   # shared head-to-head set (val)
MAX_SAMPLES_TEST   = None # full test or cap with an int

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 1) Load models (with compat loader)
print("Loading models...")
m_base = load_lstmabar_checkpoint(BASELINE_CKPT, device=device)
m_A    = load_lstmabar_checkpoint(FINETUNE_A_CKPT, device=device)
m_B    = load_lstmabar_checkpoint(FINETUNE_B_CKPT, device=device)
m_C    = load_improved_c_checkpoint(FINETUNE_C_CKPT, device=device)
m_D    = load_approach_d_checkpoint(FINETUNE_D_CKPT, device=device)

# 2) Build shared eval set from VAL (or fallback synthetic inside the helper)
shared_samples = build_shared_eval_samples(VAL_NPZ, k=MAX_SAMPLES_SHARED, sample_rate=44100)

# 3) Model-vs-Model (LSTM-based models only, first pass)
models = [
    ("LSTMABAR_Baseline", m_base), 
    ("FineTune_A", m_A), 
    ("FineTune_B", m_B), 
    ("FineTune_C", m_C),
    ("FineTune_D", m_D)  # NEW
]

means_df, pairwise_df, per_model_frames = evaluate_side_by_side(models, shared_samples, sample_rate=44100)

# 4) Add the Keyword Baseline (non-LSTM) on the SAME shared samples
ev_tmp = LSTMABAREvaluator(m_base, sample_rate=44100)  # any LSTMABAR instance just to access evaluator
kb_shared_df = ev_tmp._evaluate_keyword_baseline(shared_samples)
# ensure sample_idx exists for alignment (added by evaluate_model; add here too)
if "sample_idx" not in kb_shared_df.columns:
    kb_shared_df["sample_idx"] = range(len(kb_shared_df))

per_model_frames["Keyword_Baseline"] = kb_shared_df

# 5) Recompute head-to-head means/pairwise including Keyword_Baseline
summary_rows = []
for label, df in per_model_frames.items():
    s = _summarize_df(df)
    s["model"] = label
    summary_rows.append(s)
means_df_all = pd.DataFrame(summary_rows).set_index("model").round(4)

pairwise_df_all = _pairwise_from_frames(per_model_frames)
pairwise_df_all["Œî_mean(A_vs_B)"] = pairwise_df_all["Œî_mean(A_vs_B)"].round(6)
pairwise_df_all["p_value"] = pairwise_df_all["p_value"].map(lambda x: float(x))

# 6) Save head-to-head (with keyword baseline)
Path("results").mkdir(exist_ok=True)
means_df_all.to_csv("results/model_means_shared.csv")          # overwrite with the extended table
pairwise_df_all.to_csv("results/pairwise_stats_shared.csv")    # overwrite with the extended table
print("Saved head-to-head (incl. keyword baseline): results/model_means_shared.csv, results/pairwise_stats_shared.csv")

# 7) Evaluate each model on the TEST set (LSTM models)
test_rows = []
test_frames = {}
for label, mdl in models:
    print(f"\n[Test] {label}")
    summ, df_test = eval_on_test_set(mdl, TEST_NPZ, max_samples=MAX_SAMPLES_TEST, sample_rate=44100)
    test_rows.append({"model": label,
                      "sts_mean": round(float(summ["sts_mean"]),4),
                      "centroid_err_mean": round(float(summ["centroid_err_mean"]),4),
                      "mfcc_mean": round(float(summ["mfcc_mean"]),4),
                      "n": int(summ["n"])})
    test_frames[label] = df_test

# 8) Keyword Baseline on the TEST set (non-LSTM)
print("\n[Test] Keyword_Baseline")
test_samples = _load_samples_from_npz(TEST_NPZ, max_samples=MAX_SAMPLES_TEST, sample_rate=44100, duration=2.0)
kb_test_df = ev_tmp._evaluate_keyword_baseline(test_samples)
test_frames["Keyword_Baseline"] = kb_test_df
kb_test_s = _summarize_df(kb_test_df)
test_rows.append({
    "model": "Keyword_Baseline",
    "sts_mean": round(kb_test_s["sts_mean"],4),
    "centroid_err_mean": round(kb_test_s["centroid_err_mean"],4),
    "mfcc_mean": round(kb_test_s["mfcc_mean"],4),
    "n": int(kb_test_s["n"])
})

test_df = pd.DataFrame(test_rows).set_index("model")
test_df.to_csv("results/model_means_test.csv")
print("Saved test means (incl. keyword baseline): results/model_means_test.csv")

# 9) Pretty displays
print("\n=== Head-to-Head (Shared Set) Means (incl. Keyword_Baseline) ===")
display(means_df_all)

print("\n=== Head-to-Head Pairwise Œî & p-values (incl. Keyword_Baseline) ===")
display(pairwise_df_all)

print("\n=== Test Set Means (incl. Keyword_Baseline) ===")
display(test_df)

# === 10) Human-readable evaluation report (presentation style) ===
print("\n" + "="*80)
print("MULTIMODAL EVALUATION SUMMARY")
print("="*80)

print("\nShared Validation Set (71 samples)")
print("="*80)
print("Comparing all models vs Keyword_Baseline\n")

base = means_df_all.loc["Keyword_Baseline"]
for model in ["LSTMABAR_Baseline", "FineTune_A", "FineTune_B", "FineTune_C", "FineTune_D"]:
    m = means_df_all.loc[model]
    print(f"\n[{model}]")
    print("-"*80)
    # % changes vs baseline (Keyword_Baseline)
    sts_delta = ((m.sts_mean - base.sts_mean) / base.sts_mean) * 100
    sc_delta  = ((base.centroid_err_mean - m.centroid_err_mean) / base.centroid_err_mean) * 100
    mfcc_delta= ((m.mfcc_mean - base.mfcc_mean) / base.mfcc_mean) * 100
    print(f"STS Mean: {m.sts_mean:.3f}  ({sts_delta:+.2f}% vs baseline)")
    print(f"Spectral Centroid Error: {m.centroid_err_mean:.0f} Hz  ({sc_delta:+.2f}% vs baseline)")
    print(f"MFCC Similarity: {m.mfcc_mean:.3f}  ({mfcc_delta:+.2f}% vs baseline)")
    print(f"n = {int(m.n)} samples")

print("\n" + "="*80)
print("PAIRWISE COMPARISONS (Shared Set)")
print("="*80)
print(pairwise_df_all.to_string(index=False))

print("\n" + "="*80)
print("TEST SET PERFORMANCE (73 samples)")
print("="*80)
print("All results compared against Keyword_Baseline\n")

tb = test_df.loc["Keyword_Baseline"]
for model in ["LSTMABAR_Baseline", "FineTune_A", "FineTune_B", "FineTune_C", "FineTune_D"]:
    tm = test_df.loc[model]
    sts_d = ((tm.sts_mean - tb.sts_mean) / tb.sts_mean) * 100
    sc_d  = ((tb.centroid_err_mean - tm.centroid_err_mean) / tb.centroid_err_mean) * 100
    mfcc_d= ((tm.mfcc_mean - tb.mfcc_mean) / tb.mfcc_mean) * 100
    print(f"\n[{model}]")
    print("-"*80)
    print(f"STS Mean: {tm.sts_mean:.3f} ({sts_d:+.2f}% vs baseline)")
    print(f"Spectral Centroid Error: {tm.centroid_err_mean:.0f} Hz ({sc_d:+.2f}% vs baseline)")
    print(f"MFCC Similarity: {tm.mfcc_mean:.3f} ({mfcc_d:+.2f}% vs baseline)")
    print(f"n = {int(tm.n)} samples")

print("\n" + "="*80)
print("INTERPRETATION")
print("="*80)
print("""
üìù TEXT UNDERSTANDING (STS)
 ‚Ä¢ ‚â• 0.75 ‚Üí Excellent
 ‚Ä¢ 0.60‚Äì0.75 ‚Üí Good
 ‚Ä¢ < 0.60 ‚Üí Needs improvement

üéµ BRIGHTNESS ACCURACY (Spectral Centroid Error)
 ‚Ä¢ < 200 Hz ‚Üí Excellent
 ‚Ä¢ < 500 Hz ‚Üí Good
 ‚Ä¢ > 500 Hz ‚Üí Needs improvement

üé∏ TIMBRE QUALITY (MFCC Similarity)
 ‚Ä¢ > 0.80 ‚Üí Excellent
 ‚Ä¢ > 0.65 ‚Üí Good
 ‚Ä¢ ‚â§ 0.65 ‚Üí Needs improvement
""")

print("="*80)
print("‚úì Detailed CSVs in /results :")
print("   ‚Ä¢ model_means_shared.csv")
print("   ‚Ä¢ pairwise_stats_shared.csv")
print("   ‚Ä¢ model_means_test.csv")
print("="*80)

Loading models...
Loading text encoder: sentence-transformers/all-MiniLM-L6-v2
Loaded checkpoints/best_model.pth (epoch=1) | missing=0 unexpected=0
Loading text encoder: sentence-transformers/all-MiniLM-L6-v2
Loaded checkpoints/fine_tune_A/best_model.pth (epoch=8) | missing=0 unexpected=0
Loading text encoder: sentence-transformers/all-MiniLM-L6-v2
Loaded checkpoints/fine_tune_B/best_model.pth (epoch=15) | missing=0 unexpected=0
Loading text encoder: sentence-transformers/all-MiniLM-L6-v2
Loading improved text encoder: sentence-transformers/all-mpnet-base-v2
Loaded checkpoints/improved_approach_c/best_model.pth (epoch=16)
  Missing keys: 0
  Unexpected keys: 0
Loading text encoder: sentence-transformers/all-MiniLM-L6-v2
Loading HF model: sentence-transformers/all-MiniLM-L6-v2
Unfroze 4 layers: 7,097,856 backbone params trainable
Unfroze 4 layers: 7,097,856 backbone params trainable
Loaded checkpoints/approach_d_proper/best_model.pth (epoch=18)
  Missing keys: 0
  Unexpected keys: 0
Sha

Unnamed: 0_level_0,sts_mean,sts_std,centroid_err_mean,centroid_err_std,mfcc_mean,mfcc_std,n
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LSTMABAR_Baseline,0.2272,0.0629,5096.1058,1927.5491,0.3765,0.4243,73
FineTune_A,0.2266,0.0641,5143.6208,2035.427,0.3611,0.3965,73
FineTune_B,0.2287,0.065,6402.9042,1561.7533,0.2703,0.4193,73
FineTune_C,0.2234,0.0627,5075.6175,1918.5905,0.393,0.4045,73
FineTune_D,0.2271,0.0641,5444.4381,1855.3751,0.3633,0.4066,73
Keyword_Baseline,0.2261,0.064,953.7882,855.162,0.5212,0.4161,73



=== Head-to-Head Pairwise Œî & p-values (incl. Keyword_Baseline) ===


Unnamed: 0,A,B,Metric,Œî_mean(A_vs_B),p_value
0,FineTune_A,FineTune_B,mfcc_similarity,0.090802,0.002192595
1,FineTune_A,FineTune_C,mfcc_similarity,-0.031917,0.02116701
2,FineTune_A,FineTune_D,mfcc_similarity,-0.002214,0.9190897
3,FineTune_A,Keyword_Baseline,mfcc_similarity,-0.160098,1.996997e-06
4,FineTune_B,FineTune_C,mfcc_similarity,-0.122719,1.873376e-05
5,FineTune_B,FineTune_D,mfcc_similarity,-0.093016,4.0244e-05
6,FineTune_B,Keyword_Baseline,mfcc_similarity,-0.2509,4.005433e-07
7,FineTune_C,FineTune_D,mfcc_similarity,0.029703,0.1540868
8,FineTune_C,Keyword_Baseline,mfcc_similarity,-0.128181,1.840986e-05
9,FineTune_D,Keyword_Baseline,mfcc_similarity,-0.157885,0.0002111803



=== Test Set Means (incl. Keyword_Baseline) ===


Unnamed: 0_level_0,sts_mean,centroid_err_mean,mfcc_mean,n
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LSTMABAR_Baseline,0.2199,5149.0363,0.3901,753
FineTune_A,0.2174,5138.9973,0.389,753
FineTune_B,0.2216,6242.5751,0.3091,753
FineTune_C,0.2178,5214.1119,0.3837,753
FineTune_D,0.2179,5496.3787,0.3657,753
Keyword_Baseline,0.2152,903.4466,0.5079,753



MULTIMODAL EVALUATION SUMMARY

Shared Validation Set (71 samples)
Comparing all models vs Keyword_Baseline


[LSTMABAR_Baseline]
--------------------------------------------------------------------------------
STS Mean: 0.227  (+0.49% vs baseline)
Spectral Centroid Error: 5096 Hz  (-434.30% vs baseline)
MFCC Similarity: 0.377  (-27.76% vs baseline)
n = 73 samples

[FineTune_A]
--------------------------------------------------------------------------------
STS Mean: 0.227  (+0.22% vs baseline)
Spectral Centroid Error: 5144 Hz  (-439.28% vs baseline)
MFCC Similarity: 0.361  (-30.72% vs baseline)
n = 73 samples

[FineTune_B]
--------------------------------------------------------------------------------
STS Mean: 0.229  (+1.15% vs baseline)
Spectral Centroid Error: 6403 Hz  (-571.31% vs baseline)
MFCC Similarity: 0.270  (-48.14% vs baseline)
n = 73 samples

[FineTune_C]
--------------------------------------------------------------------------------
STS Mean: 0.223  (-1.19% vs baseline