# 🔊 TTS Evaluation - Text-to-Speech Quality Analysis

**Complete TTS (Text-to-Speech) Evaluation Pipeline**

## Test Data:
- **Input**: `llm.txt` (Wikipedia LLM text)
- **Comparison**: `llm_recording_pranay.m4a` (Your original reading)
- **Goal**: Synthesize speech from text and compare with original recording

## Evaluation Metrics:
- **Audio Similarity**: Spectral and timing comparison
- **Naturalness**: Voice quality and prosody analysis
- **Timing**: Reading speed and pause patterns
- **Characteristics**: Voice timbre and intonation comparison

In [None]:
# Cell 1: Setup and Configuration
import torch
import torchaudio
import numpy as np
from pathlib import Path
import time
import json
from datetime import datetime

print('üîä TTS Evaluation: LLM Text Synthesis Analysis')
print('=' * 60)

# Device setup
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f'Device: {device.upper()}')

# Test files
TEXT_FILE = Path('data/text/PRIMARY/llm.txt')
ORIGINAL_AUDIO = Path('data/audio/PRIMARY/llm_recording_pranay.m4a')

print(f'Text: {TEXT_FILE.name}')
print(f'Original Audio: {ORIGINAL_AUDIO.name}')
print('‚úÖ Setup complete')

In [None]:
# Cell 2: Load LFM Model
from liquid_audio import LFM2AudioModel, LFM2AudioProcessor, ChatState

print('üì¶ Loading LFM components...')

HF_REPO = 'LiquidAI/LFM2.5-Audio-1.5B'
load_start = time.time()

processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval()
model = LFM2AudioModel.from_pretrained(HF_REPO).eval()

if device != 'cpu':
    model = model.to(device)

load_time = time.time() - load_start
print(f'‚úÖ Model loaded: {load_time:.2f}s')
print(f'   Parameters: {sum(p.numel() for p in model.parameters()):,}')

In [None]:
# Cell 3: Load Test Files
def load_audio_for_comparison(audio_path, target_sr=24000):
    """Load an audio file as a single-channel waveform at a fixed sample rate.

    Args:
        audio_path: Location of the audio file (``str`` or ``Path``); it is
            converted to ``str`` and read with ``torchaudio.load``.
        target_sr: Sample rate, in Hz, of the returned waveform. Defaults to
            24000, the rate the rest of this notebook uses for the
            synthesized audio.

    Returns:
        A ``(waveform, sample_rate)`` tuple. ``waveform`` keeps a leading
        channel dimension of size 1 and ``sample_rate`` equals ``target_sr``.

    Raises:
        ValueError: If ``target_sr`` is not a positive number.
    """
    if target_sr <= 0:
        raise ValueError(f'target_sr must be positive, got {target_sr}')

    waveform, sr = torchaudio.load(str(audio_path))

    # Average across the channel dimension so multi-channel files become mono;
    # keepdim preserves the (channel, samples) layout for later indexing.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the file's native rate differs from the requested one.
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(sr, target_sr)(waveform)
        sr = target_sr

    return waveform, sr

# Load the text to be synthesized. The encoding is explicit so non-ASCII
# characters in the article are read the same way on every platform.
print(f'üìù Loading text: {TEXT_FILE.name}')
with open(TEXT_FILE, 'r', encoding='utf-8') as f:
    llm_text = f.read().strip()
print(f'‚úÖ Text loaded: {len(llm_text)} characters')
print(f'   Words: {len(llm_text.split())}')

# Load the reference recording (mono, resampled by the helper).
print(f'üéµ Loading original audio: {ORIGINAL_AUDIO.name}')
original_waveform, original_sr = load_audio_for_comparison(ORIGINAL_AUDIO)
print(f'‚úÖ Original audio loaded: {original_waveform.shape}')
print(f'   Duration: {original_waveform.shape[1]/original_sr:.1f}s ({original_waveform.shape[1]/original_sr/60:.1f} minutes)')

# Reading statistics. The value is reported as words per minute, so the
# duration must be expressed in minutes (the earlier version divided by
# seconds and therefore printed words per second under a WPM label).
original_minutes = original_waveform.shape[1] / original_sr / 60
# Guard against an empty recording so the division cannot fail.
reading_speed = len(llm_text.split()) / max(original_minutes, 1e-6)
print(f'   Reading speed: {reading_speed:.1f} words per minute')

In [None]:
# Cell 4: Test Different Voices
print('üé≠ Testing Different LFM Voices')
print('=' * 50)

# Available voices in LFM
voices = [
    'US male',
    'US female', 
    'UK male',
    'UK female'
]

print(f'Available voices: {len(voices)}')
for i, voice in enumerate(voices, 1):
    print(f'   {i}. {voice}')

# Select voice for synthesis (you can change this)
selected_voice = voices[0]  # Start with US male
print(f'\nSelected voice: {selected_voice}')

In [None]:
# Cell 5: TTS Synthesis
print('üîä Starting TTS Synthesis...')
print(f'Voice: {selected_voice}')
print('This may take several minutes for the full text...')
print()

# Create ChatState for TTS
chat_tts = ChatState(processor)

# System prompt for TTS with voice selection
chat_tts.new_turn("system")
chat_tts.add_text(f"Perform TTS. Use the {selected_voice} voice.")
chat_tts.end_turn()

# Add text input
chat_tts.new_turn("user")
chat_tts.add_text(llm_text)
chat_tts.end_turn()

chat_tts.new_turn("assistant")

print('üîä Synthesizing speech...')
start_time = time.time()

audio_out = []
token_count = 0

for t in model.generate_sequential(**chat_tts, max_new_tokens=2048, audio_temperature=0.8, audio_top_k=64):
    if t.numel() > 1:  # Audio token
        audio_out.append(t)
        token_count += 1
        
        # Progress indicator
        if token_count % 50 == 0:
            print(f'   Audio tokens: {token_count}')

synthesis_time = time.time() - start_time

print(f'\n‚úÖ Synthesis complete!')
print(f'   Processing time: {synthesis_time:.1f}s ({synthesis_time/60:.1f} minutes)')
print(f'   Audio tokens generated: {token_count}')

# Check if we got audio
if len(audio_out) == 0:
    print('‚ùå No audio generated!')
else:
    print(f'‚úÖ Audio segments: {len(audio_out)}')

In [None]:
# Cell 6: Detokenize and Save Synthesized Audio
# At least two entries are required: the last entry holds the "end-of-audio"
# codes and is dropped, so a single entry would leave nothing to stack.
if len(audio_out) > 1:
    print('üéµ Detokenizing synthesized audio...')

    # Detokenize audio (remove last "end-of-audio" codes)
    audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
    # Decoding is inference only; the result is moved to the CPU once so the
    # analysis cells can combine it with CPU tensors.
    with torch.no_grad():
        synthesized_waveform = processor.decode(audio_codes).cpu()

    print(f'‚úÖ Audio detokenized: {synthesized_waveform.shape}')
    print(f'   Duration: {synthesized_waveform.shape[1]/24000:.1f}s ({synthesized_waveform.shape[1]/24000/60:.1f} minutes)')

    # Save synthesized audio
    results_path = Path('results')
    results_path.mkdir(exist_ok=True)

    synthesized_file = results_path / f'tts_synthesized_{selected_voice.replace(" ", "_")}.wav'
    torchaudio.save(str(synthesized_file), synthesized_waveform, 24000)

    print(f'‚úÖ Synthesized audio saved: {synthesized_file}')

else:
    print('‚ùå No audio to detokenize')
    # Use a zero-length waveform rather than placeholder silence: the later
    # cells test `synthesized_waveform.shape[1] > 0`, so an empty tensor makes
    # them report the failure instead of analysing silence as real output.
    synthesized_waveform = torch.zeros(1, 0)
    synthesized_file = None

In [None]:
# Cell 7: Audio Comparison Analysis
def _rms_level(waveform):
    """Return the root-mean-square amplitude of ``waveform`` (0.0 when empty)."""
    if waveform.numel() == 0:
        return 0.0
    return torch.sqrt(torch.mean(waveform ** 2)).item()


def _spectral_centroid_hz(signal, sr):
    """Return the magnitude-weighted mean frequency of a 1-D signal, in Hz."""
    n_samples = signal.shape[-1]
    if n_samples == 0:
        return 0.0

    # One-sided spectrum of the real signal: bins 0 .. n_samples // 2.
    magnitudes = torch.fft.rfft(signal).abs()
    total = magnitudes.sum()
    # A silent signal has no spectral energy; report 0 Hz instead of 0/0 = NaN.
    if total.item() <= 0:
        return 0.0

    # Bin k corresponds to k * sr / n_samples Hz. The frequency axis is built
    # on the signal's own device so non-CPU tensors do not raise a mismatch.
    bin_hz = torch.arange(
        magnitudes.shape[-1], device=magnitudes.device, dtype=magnitudes.dtype
    ) * (sr / n_samples)
    return (torch.sum(magnitudes * bin_hz) / total).item()


def compare_audio_characteristics(original, synthesized, sr=24000):
    """Compare level, duration and spectral centroid of two waveforms.

    Args:
        original: Reference waveform, indexed as ``(channel, samples)``; only
            channel 0 is used for the spectral analysis.
        synthesized: Waveform to compare, same layout as ``original``. The
            two inputs may differ in length.
        sr: Sample rate in Hz shared by both waveforms.

    Returns:
        A dict of Python floats with the keys ``original_rms``,
        ``synthesized_rms``, ``rms_ratio``, ``original_duration``,
        ``synthesized_duration``, ``duration_ratio`` (durations in seconds,
        ratios are synthesized / original), ``original_centroid`` and
        ``synthesized_centroid`` (both in Hz).
    """
    original_rms = _rms_level(original)
    synthesized_rms = _rms_level(synthesized)

    original_duration = original.shape[1] / sr
    synthesized_duration = synthesized.shape[1] / sr

    return {
        'original_rms': original_rms,
        'synthesized_rms': synthesized_rms,
        # The 1e-6 floor keeps the ratios finite for silent or empty references.
        'rms_ratio': synthesized_rms / max(original_rms, 1e-6),
        'original_duration': original_duration,
        'synthesized_duration': synthesized_duration,
        'duration_ratio': synthesized_duration / max(original_duration, 1e-6),
        'original_centroid': _spectral_centroid_hz(original[0], sr),
        'synthesized_centroid': _spectral_centroid_hz(synthesized[0], sr),
    }

# Compare the reference recording with the synthesized audio.
if synthesized_waveform.shape[1] > 0:
    print('üìä AUDIO COMPARISON ANALYSIS:')
    print('=' * 50)

    # Ensure both are same sample rate for comparison
    if original_sr != 24000:
        resampler = torchaudio.transforms.Resample(original_sr, 24000)
        original_for_comparison = resampler(original_waveform)
    else:
        original_for_comparison = original_waveform

    # Level and spectral statistics are computed over the same time span, so
    # both signals are cut to the shorter of the two.
    min_length = min(original_for_comparison.shape[1], synthesized_waveform.shape[1])
    original_trimmed = original_for_comparison[:, :min_length]
    synthesized_trimmed = synthesized_waveform[:, :min_length]

    comparison = compare_audio_characteristics(original_trimmed, synthesized_trimmed)

    # Durations must describe the full signals: measured on the trimmed
    # copies they are always identical and the ratio is always 1.0.
    full_original_duration = original_for_comparison.shape[1] / 24000
    full_synthesized_duration = synthesized_waveform.shape[1] / 24000
    comparison['original_duration'] = full_original_duration
    comparison['synthesized_duration'] = full_synthesized_duration
    comparison['duration_ratio'] = full_synthesized_duration / max(full_original_duration, 1e-6)

    print(f'Original Audio:')
    print(f'   Duration: {comparison["original_duration"]:.1f}s')
    print(f'   RMS Level: {comparison["original_rms"]:.4f}')
    print(f'   Spectral Centroid: {comparison["original_centroid"]:.1f} Hz')

    print(f'\nSynthesized Audio ({selected_voice}):')
    print(f'   Duration: {comparison["synthesized_duration"]:.1f}s')
    print(f'   RMS Level: {comparison["synthesized_rms"]:.4f}')
    print(f'   Spectral Centroid: {comparison["synthesized_centroid"]:.1f} Hz')

    print(f'\nComparison:')
    print(f'   Duration Ratio: {comparison["duration_ratio"]:.2f}x')
    print(f'   RMS Ratio: {comparison["rms_ratio"]:.2f}x')
    print(f'   Centroid Difference: {abs(comparison["original_centroid"] - comparison["synthesized_centroid"]):.1f} Hz')
else:
    print('‚ùå Cannot compare - no synthesized audio')

In [None]:
# Cell 8: Timing and Pacing Analysis
print('‚è±Ô∏è  TIMING AND PACING ANALYSIS:')
print('=' * 50)

# Size of the source text.
word_count = len(llm_text.split())
char_count = len(llm_text)

# Pace of the reference recording, in words per minute.
original_duration = original_waveform.shape[1] / original_sr
original_reading_speed = word_count / (original_duration / 60)

if synthesized_waveform.shape[1] <= 0:
    print('‚ùå No synthesized audio for timing analysis')
else:
    # Pace of the synthesized audio (24 kHz), measured against the same word count.
    synthesized_duration = synthesized_waveform.shape[1] / 24000
    synthesized_reading_speed = word_count / (synthesized_duration / 60)

    # Signed differences: positive means the synthesized audio is faster / longer.
    speed_difference = synthesized_reading_speed - original_reading_speed
    duration_delta = synthesized_duration - original_duration

    print(f'Text Statistics:')
    print(f'   Words: {word_count}')
    print(f'   Characters: {char_count}')

    print(f'\nOriginal Recording (Your Voice):')
    print(f'   Duration: {original_duration:.1f}s ({original_duration/60:.1f} minutes)')
    print(f'   Reading Speed: {original_reading_speed:.1f} words per minute')

    print(f'\nSynthesized Audio ({selected_voice}):')
    print(f'   Duration: {synthesized_duration:.1f}s ({synthesized_duration/60:.1f} minutes)')
    print(f'   Reading Speed: {synthesized_reading_speed:.1f} words per minute')

    print(f'\nComparison:')
    print(f'   Speed Difference: {speed_difference:+.1f} WPM')
    print(f'   Duration Difference: {duration_delta:+.1f}s ({duration_delta/original_duration*100:+.1f}%)')

    # Classify the synthesized pace against the 130-170 WPM band.
    print(f'\nüéØ Reading Speed Assessment:')
    if synthesized_reading_speed < 130:
        print('   ‚ö†Ô∏è  SLOW - Deliberate pacing')
    elif synthesized_reading_speed <= 170:
        print('   ‚úÖ NORMAL - Typical conversational speed')
    else:
        print('   ‚ö†Ô∏è  FAST - Rapid delivery')

In [None]:
# Cell 9: Naturalness Assessment
print('üé≠ NATURALNESS ASSESSMENT:')
print('=' * 40)

if synthesized_waveform.shape[1] > 0:
    # Calculate naturalness metrics
    
    # Dynamic range (amplitude variation)
    synthesized_max = synthesized_waveform.abs().max().item()
    synthesized_mean = synthesized_waveform.abs().mean().item()
    dynamic_range = synthesized_max / max(synthesized_mean, 1e-6)
    
    # Energy distribution
    energy_frames = torch.chunk(synthesized_waveform[0], 100)
    energy_variation = torch.std(torch.stack([frame.abs().mean() for frame in energy_frames])).item()
    
    print(f'Dynamic Range: {dynamic_range:.2f}x')
    print(f'Energy Variation: {energy_variation:.6f}')
    
    # Naturalness indicators
    print(f'\nüéØ Quality Indicators:')
    
    if dynamic_range > 3.0:
        print('   ‚úÖ GOOD dynamic range - Natural speech variation')
    elif dynamic_range > 1.5:
        print('   ‚ö†Ô∏è  MODERATE dynamic range - Some variation present')
    else:
        print('   ‚ùå LOW dynamic range - Monotonic delivery')
    
    if energy_variation > 0.01:
        print('   ‚úÖ GOOD energy variation - Natural prosody')
    elif energy_variation > 0.005:
        print('   ‚ö†Ô∏è  MODERATE energy variation - Some prosody present')
    else:
        print('   ‚ùå LOW energy variation - Flat delivery')
    
    # Overall quality assessment
    print(f'\nüèÜ Overall Quality:')
    if dynamic_range > 3.0 and energy_variation > 0.01:
        print('   ‚úÖ EXCELLENT - Natural, expressive speech')
    elif dynamic_range > 2.0 and energy_variation > 0.008:
        print('   ‚úÖ GOOD - Acceptable speech quality')
    else:
        print('   ‚ö†Ô∏è  NEEDS IMPROVEMENT - Robotic or monotonic')
else:
    print('‚ùå Cannot assess naturalness - no synthesized audio')

In [None]:
# Cell 10: Save Results and Create Comparison Report
def save_tts_results(results_dict, filename='tts_evaluation_results.json'):
    """Write the evaluation results as indented JSON under ``results/``.

    Args:
        results_dict: JSON-serializable mapping with the evaluation results.
        filename: Name of the file created inside the ``results`` directory
            (relative to the current working directory).

    Returns:
        ``Path`` of the file that was written.

    Raises:
        TypeError: If ``results_dict`` contains a value ``json`` cannot
            serialize. In that case no output file is created.
    """
    results_path = Path('results')
    results_path.mkdir(exist_ok=True)

    output_file = results_path / filename
    # Serialize before opening the file: if serialization fails, nothing is
    # written and no truncated JSON file is left behind.
    payload = json.dumps(results_dict, indent=2)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(payload)

    print(f'‚úÖ Results saved: {output_file}')
    return output_file

# Compile the values computed by the earlier cells into one report and save it.
if synthesized_waveform.shape[1] > 0:
    # Use the same tier rules as the naturalness assessment (Cell 9): each
    # tier requires both the dynamic range and the energy variation to pass,
    # so the saved label cannot disagree with the printed verdict.
    if dynamic_range > 3.0 and energy_variation > 0.01:
        overall_quality = 'EXCELLENT'
    elif dynamic_range > 2.0 and energy_variation > 0.008:
        overall_quality = 'GOOD'
    else:
        overall_quality = 'NEEDS_IMPROVEMENT'

    results = {
        'test_info': {
            'model': HF_REPO,
            'device': device,
            'timestamp': datetime.now().isoformat(),
            'test_type': 'TTS_Evaluation',
        },
        'tts_info': {
            'voice_used': selected_voice,
            'text_file': str(TEXT_FILE),
            'word_count': word_count,
            'character_count': char_count,
            'synthesis_time_seconds': synthesis_time,
        },
        'comparison_info': {
            'original_recording': str(ORIGINAL_AUDIO),
            'synthesized_file': str(synthesized_file) if synthesized_file else None,
            'original_duration_seconds': original_duration,
            'synthesized_duration_seconds': synthesized_duration,
            # Floor the denominator so an empty reference cannot raise.
            'duration_ratio': synthesized_duration / max(original_duration, 1e-6),
        },
        'timing_analysis': {
            'original_reading_speed_wpm': original_reading_speed,
            'synthesized_reading_speed_wpm': synthesized_reading_speed,
            'speed_difference_wpm': speed_difference,
        },
        'audio_characteristics': comparison,
        'quality_assessment': {
            'dynamic_range': dynamic_range,
            'energy_variation': energy_variation,
            'overall_quality': overall_quality,
        },
    }

    save_tts_results(results)
else:
    print('‚ùå Cannot save results - synthesis failed')

In [None]:
# Cell 11: Summary and Next Steps
print('üéØ TTS EVALUATION SUMMARY')
print('=' * 50)
print()
print('‚úÖ COMPLETED TTS EVALUATION:')
print(f'   Model: {HF_REPO}')
print(f'   Voice: {selected_voice}')
print(f'   Text: Wikipedia LLM article ({word_count} words)')
print()
print('üìä KEY RESULTS:')
if synthesized_waveform.shape[1] > 0:
    print(f'   Synthesis Time: {synthesis_time:.1f}s')
    print(f'   Original Duration: {original_duration:.1f}s')
    print(f'   Synthesized Duration: {synthesized_duration:.1f}s')
    print(f'   Reading Speed: {synthesized_reading_speed:.1f} WPM')
    print(f'   Quality: {"HIGH" if dynamic_range > 3.0 else "GOOD" if dynamic_range > 2.0 else "NEEDS WORK"}')
else:
    print('   ‚ùå Synthesis failed')
print()
print('üéØ Voice Quality Assessment:')
print('   Naturalness: Based on dynamic range and prosody')
print('   Timing: Compared to your original recording')
print('   Characteristics: Spectral and amplitude analysis')
print()
print('üìã NEXT STEPS:')
print('   1. Listen to synthesized audio')
print('   2. Compare with your original recording')
print('   3. Test other voices (US female, UK male/female)')
print('   4. Run conversation analysis on NotebookLM podcast')
print()
print('‚úÖ TTS evaluation complete!')