In [2]:
import os
import time
import json
import librosa
import numpy as np
from functools import partial
from tqdm import tqdm
from scipy.spatial.distance import cosine
from datasets import load_dataset
from pathlib import Path

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

In [3]:
########################################################
# Hugging Face login
#######################################################
import logging, warnings
from transformers import logging as hf_logging
from huggingface_hub import login
from dotenv import load_dotenv

try:
    # Only importable inside Google Colab; anywhere else we fall back to the
    # process environment / a local .env file further down.
    from google.colab import userdata
except ImportError:
    userdata = None

# Silence transformers/TRL logs early
hf_logging.set_verbosity_error()
logging.getLogger("trl").setLevel(logging.ERROR)

# Hide specific noisy warnings
warnings.filterwarnings(
    "ignore",
    message=r".*loss_type=None.*ForCausalLMLoss.*",
    category=UserWarning,
)
warnings.filterwarnings(
    "ignore",
    message=r".*cuDNN SDPA backward got grad_output\.strides\(\) != output\.strides\(\).*",
    category=UserWarning,
)
os.environ["TQDM_NOTEBOOK"] = "0"

# Token lookup order:
#   1. the Colab secret named HUGGINGFACE_API_KEY (when running in Colab)
#   2. the HUGGINGFACE_API_KEY environment variable, after loading .env if present
hf_key = None
if userdata is not None:
    try:
        hf_key = userdata.get('HUGGINGFACE_API_KEY')
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        # Colab raises instead of returning None when the secret is missing or
        # notebook access was not granted; treat both as "not configured here".
        hf_key = None

if not hf_key:
    load_dotenv()  # no-op when there is no .env file
    hf_key = os.environ.get("HUGGINGFACE_API_KEY")

if hf_key:
    login(hf_key)
else:
    raise EnvironmentError("HUGGINGFACE_API_KEY not found. Copy .env.template to .env and add your token. See Instruction.md")


In [21]:
#######################################################
# CONFIGURATION
#######################################################

# Keys of the TTS pipelines to benchmark. Each key is looked up in `models`,
# which the model-loading cells below populate.
#   "bark_small"   -> suno/bark-small          (expressive)
#   "speecht5_tts" -> microsoft/speecht5_tts   (optional 4th model)
# Candidates that are listed but not enabled:
#   facebook/fastspeech2-en-ljspeech           (stable, fast)
#   coqui/XTTS-v2                              (highest quality)
TTS_MODELS = ["bark_small", "speecht5_tts"]

# Registry of loaded pipelines, keyed by the names used in TTS_MODELS
models = dict()

# Generated audio and JSON reports are written below this directory
OUTPUT_DIR = Path("tts_results")
OUTPUT_DIR.mkdir(exist_ok=True)

# Dataset that supplies the text prompts
DATASET_NAME = "MikhailT/lj-speech"
# Only a small subset is used, for fast evaluation
NUM_SAMPLES = 10


---
## suno/bark-small

In [22]:
# Bark-small: load the checkpoint, synthesise one test sentence, save it as WAV
from transformers import AutoProcessor, AutoModelForTextToWaveform

# Use the GPU when one is available, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Processor and model both come from the same checkpoint
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = AutoModelForTextToWaveform.from_pretrained("suno/bark-small")
model = model.to(device)

# Register the pair under the key the benchmark uses
models['bark_small'] = dict(model=model, processor=processor)

# Tokenise the demo sentence and move the tensors next to the model
text = "Hello! This is a test of Bark-small text-to-speech."
inputs = processor(text=text, return_tensors="pt").to(device)

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    waveform = model.generate(**inputs)

# squeeze(0)/unsqueeze(0) turns a (1, N) or (N,) result into (1, N) before saving
# (assumes a single-prompt output - TODO confirm for batched input)
output_path = "bark_output.wav"
audio_to_save = waveform.squeeze(0).cpu().unsqueeze(0)
torchaudio.save(output_path, audio_to_save, sample_rate=24000)

print(f"Saved Bark-small output to {output_path}")

Saved Bark-small output to bark_output.wav


In [23]:
# Inline audio player for the file referenced by `output_path`
from IPython.display import Audio, display

bark_clip = Audio(output_path)
display(bark_clip)

---
## coqui/XTTS-v2

In [16]:
# !pip install TTS

In [17]:
# from TTS.api import TTS

# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
# tts.tts_to_file("Hello world!", file_path="xtts.wav")

## microsoft/speecht5_tts

In [24]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf

# Load the text processor, the SpeechT5 TTS model and the HiFi-GAN vocoder
# that is passed to generate_speech below.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Register the pipeline under the key the benchmark uses
models["speecht5_tts"] = {"processor": processor, "model": model, "vocoder": vocoder}

# Prepare input text
inputs = processor(text="Hello, this is a SpeechT5 text-to-speech test.", return_tensors="pt")

# Random 512-dimensional speaker embedding. It is drawn from a seeded generator so
# that re-running the cell reproduces the same voice instead of a new one each time.
# NOTE(review): a random vector is probably not a natural-sounding speaker;
# consider loading a real x-vector if voice quality matters.
speaker_embeddings = torch.randn(1, 512, generator=torch.Generator().manual_seed(0))

# Generate speech (inference only, so no gradient tracking)
with torch.no_grad():
    speech = model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=vocoder
    )

# Save output at 16 kHz
sf.write("speech.wav", speech.numpy(), 16000)

In [25]:
# Inline audio player for speech.wav
from IPython.display import Audio, display

speecht5_clip = Audio('speech.wav')
display(speecht5_clip)

---

In [None]:
!pip install torchcodec

In [52]:
#######################################################
# METRICS — Small, simple (expand as needed)
#######################################################

def audio_duration(path):
    """Return the length, in seconds, of the audio file at ``path``."""
    # sr=None keeps the file's own sample rate instead of resampling
    samples, sample_rate = librosa.load(path, sr=None)
    n_samples = len(samples)
    return n_samples / sample_rate

def mel_spectrogram_similarity(ref_path, gen_path):
    """
    Cosine similarity between the time-averaged mel spectra of two audio files.

    Simple similarity metric; not perfect, but useful for midterm presentation.

    Args:
        ref_path: Path of the reference recording.
        gen_path: Path of the generated recording.

    Returns:
        ``1 - cosine_distance`` of the two mean mel vectors
        (1 = identical, 0 = different).
    """
    # The reference keeps its native sample rate ...
    ref, sr = librosa.load(ref_path, sr=None)
    # ... and the generated clip is resampled to that rate. With two different
    # rates the mel filterbanks cover different frequency ranges, so band k of
    # one spectrogram would not correspond to band k of the other.
    gen, _ = librosa.load(gen_path, sr=sr)

    # The audio must be passed as `y=`: recent librosa releases only accept it
    # as a keyword argument and raise TypeError on a positional array.
    ref_mel = librosa.feature.melspectrogram(y=ref, sr=sr)
    gen_mel = librosa.feature.melspectrogram(y=gen, sr=sr)

    # Average over axis 1 to get one value per mel band
    ref_vec = np.mean(ref_mel, axis=1)
    gen_vec = np.mean(gen_mel, axis=1)

    return 1 - cosine(ref_vec, gen_vec)  # 1 = identical, 0 = different

#######################################################
# TTS INFERENCE WRAPPER
#######################################################

def load_tts_model(model_name):
    """Load a text-to-waveform checkpoint and return an inference function.

    Args:
        model_name: Checkpoint id passed to ``from_pretrained``
            (for example ``"suno/bark-small"``, which this notebook loads
            with the same classes).

    Returns:
        ``infer(text)``: a callable that synthesises ``text`` and returns the
        generated audio as a NumPy array with singleton dimensions removed.

    Notes:
        - The model is loaded with ``AutoModelForTextToWaveform``. The previous
          ``AutoModelForSpeechSeq2Seq`` is a speech-to-text class, and its
          token ``batch_decode`` step produces text rather than audio.
        - The device is chosen at call time, so the function also works on a
          CPU-only runtime.
        - NOTE(review): checkpoints that need extra inputs (SpeechT5 needs a
          speaker embedding and a vocoder) are not covered by this helper.
    """
    # Imported locally because the notebook's top import cell does not provide it
    from transformers import AutoModelForTextToWaveform

    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModelForTextToWaveform.from_pretrained(model_name).to(device)

    def infer(text):
        """Synthesise ``text`` and return the waveform as a NumPy array."""
        inputs = processor(text=text, return_tensors="pt").to(device)
        with torch.no_grad():
            audio = model.generate(**inputs)
        # Drop the batch dimension and hand back a CPU array ready for saving
        return audio.squeeze().cpu().numpy()

    return infer


#######################################################
# MAIN PIPELINE
#######################################################
import datasets
# Sample rate used when writing each pipeline's audio (the same values the demo cells use)
_TTS_SAMPLE_RATES = {"bark_small": 24000, "speecht5_tts": 16000}


def _synthesize(model_name, text, speaker_embeddings=None):
    """Generate speech for ``text`` with the already-loaded pipeline ``model_name``.

    Args:
        model_name: Key into the global ``models`` registry.
        text: Prompt to synthesise.
        speaker_embeddings: Speaker embedding tensor; only used by ``speecht5_tts``.

    Returns:
        ``(waveform, sample_rate)`` where ``waveform`` is a NumPy array with
        singleton dimensions removed.

    Raises:
        ValueError: ``model_name`` has no inference branch here.
        KeyError: the pipeline has not been loaded into ``models`` yet.
    """
    if model_name not in _TTS_SAMPLE_RATES:
        raise ValueError(f"No inference routine implemented for model '{model_name}'")

    bundle = models[model_name]
    tts_model = bundle['model']
    device = tts_model.device  # keep inputs on whatever device the model lives on

    if model_name == 'bark_small':
        inputs = bundle['processor'](text=text, return_tensors="pt").to(device)
        with torch.no_grad():
            waveform = tts_model.generate(**inputs)
    else:  # 'speecht5_tts'
        inputs = bundle['processor'](text=text, return_tensors="pt")
        # assumes the vocoder sits on the same device as the model - TODO confirm
        with torch.no_grad():
            waveform = tts_model.generate_speech(
                inputs["input_ids"].to(device),
                speaker_embeddings.to(device),
                vocoder=bundle['vocoder']
            )

    # .cpu() also waits for pending GPU work, so callers that time this call
    # measure the full generation.
    return waveform.squeeze().cpu().numpy(), _TTS_SAMPLE_RATES[model_name]


def run_tts_benchmark():
    """Run every model in ``TTS_MODELS`` over the first ``NUM_SAMPLES`` prompts.

    For each model the generated clips are written to
    ``OUTPUT_DIR/<model>/sample_<i>.wav`` together with a ``results.json``;
    a cross-model ``summary.json`` is written to ``OUTPUT_DIR``.

    Returns:
        dict mapping model name to its average inference time and average
        clip duration (``None`` for both when no sample was processed).

    Raises:
        ValueError: a name in ``TTS_MODELS`` has no inference routine.
        KeyError: a model in ``TTS_MODELS`` was not loaded into ``models``.
    """
    # Load dataset
    dataset = load_dataset(DATASET_NAME, split="full", streaming=False).select(range(NUM_SAMPLES))
    # decode=False: the reference audio is not decoded, only its path is read below
    dataset = dataset.cast_column("audio", datasets.features.Audio(decode=False))

    # One seeded speaker embedding shared by all SpeechT5 samples, so that runs
    # are reproducible and every clip uses the same voice.
    speaker_embeddings = torch.randn(1, 512, generator=torch.Generator().manual_seed(0))

    results_summary = {}

    for model_name in TTS_MODELS:
        print(f"\n### Running inference for: {model_name}")
        model_dir = OUTPUT_DIR / model_name.replace("/", "_")
        model_dir.mkdir(exist_ok=True)

        model_results = []

        for i, sample in enumerate(tqdm(dataset)):
            text = sample["spoken_text"]
            reference_audio_path = sample["audio"]["path"]

            # ----- Inference -----
            t0 = time.perf_counter()
            generated_audio, sample_rate = _synthesize(model_name, text, speaker_embeddings)
            t1 = time.perf_counter()

            # Save generated audio
            output_audio_path = model_dir / f"sample_{i}.wav"
            sf.write(str(output_audio_path), generated_audio, sample_rate)

            # Metrics (mel similarity against the reference is not computed here)
            duration = audio_duration(output_audio_path)

            model_results.append({
                "text": text,
                "reference": reference_audio_path,
                "generated": str(output_audio_path),
                "inference_time": t1 - t0,
                "duration": duration,
            })

        # Save model results
        with open(model_dir / "results.json", "w") as f:
            json.dump(model_results, f, indent=2)

        # Summary per model; guard the empty case so no NaN ends up in the JSON
        if model_results:
            avg_time = float(np.mean([r["inference_time"] for r in model_results]))
            avg_len = float(np.mean([r["duration"] for r in model_results]))
        else:
            avg_time = None
            avg_len = None

        results_summary[model_name] = {
            "avg_inference_time": avg_time,
            "avg_duration_s": avg_len
        }

    # Save cross-model summary
    with open(OUTPUT_DIR / "summary.json", "w") as f:
        json.dump(results_summary, f, indent=2)

    print("\nDone! Summary saved in tts_results/summary.json")
    return results_summary

In [53]:
# Benchmark every model listed in TTS_MODELS; writes a results.json per model
# and a summary.json under OUTPUT_DIR.
run_tts_benchmark()

Output hidden; open in https://colab.research.google.com to view.