In [1]:
import os
import time
import json
import librosa
import numpy as np
from functools import partial
from tqdm import tqdm
from scipy.spatial.distance import cosine
from datasets import load_dataset
from pathlib import Path

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

In [2]:
########################################################
# Hugging Face login
#######################################################
import logging, warnings
from transformers import logging as hf_logging
from huggingface_hub import login

# Silence transformers/TRL logs early
hf_logging.set_verbosity_error()
logging.getLogger("trl").setLevel(logging.ERROR)

# Hide specific noisy warnings
warnings.filterwarnings(
    "ignore",
    message=r".*loss_type=None.*ForCausalLMLoss.*",
    category=UserWarning,
)
warnings.filterwarnings(
    "ignore",
    message=r".*cuDNN SDPA backward got grad_output\.strides\(\) != output\.strides\(\).*",
    category=UserWarning,
)
os.environ["TQDM_NOTEBOOK"] = "0"


def _read_hf_token(name="HUGGINGFACE_API_KEY"):
    """Return the Hugging Face token, or None when it is not configured.

    Lookup order:
      1. The Google Colab secret called `name` (only when running in Colab).
      2. The environment variable called `name` (e.g. populated from a .env file).

    The Colab value is no longer overwritten by the environment lookup, so a
    token stored only as a Colab secret is actually used.
    """
    try:
        from google.colab import userdata
    except ImportError:
        # Not running inside Colab; fall through to the environment variable.
        userdata = None

    if userdata is not None:
        try:
            token = userdata.get(name)
        except Exception:
            # Secret missing or notebook access not granted: treat it as
            # "not set" and fall back to the environment instead of crashing.
            token = None
        if token:
            return token

    # # Load .env file (if present)
    # from dotenv import load_dotenv
    # load_dotenv()
    return os.environ.get(name)


hf_key = _read_hf_token()
if hf_key:
    login(hf_key)
else:
    raise EnvironmentError("HUGGINGFACE_API_KEY not found. Copy .env.template to .env and add your token. See Instruction.md")


In [3]:
#######################################################
# CONFIGURATION
#######################################################

# Keys of the TTS systems to benchmark. Each key must match an entry that a
# model-loading cell stores in `models`.
#   bark_small   -> suno/bark-small (expressive)
#   speecht5_tts -> microsoft/speecht5_tts
# Candidates not enabled yet:
#   facebook/fastspeech2-en-ljspeech (stable, fast)
#   coqui/XTTS-v2 (highest quality)
TTS_MODELS = ["bark_small", "speecht5_tts"]

# Registry of loaded models, filled in by the per-model cells.
models = dict()

# Text prompts come from this dataset; only a small subset is used so the
# evaluation stays fast.
DATASET_NAME = "MikhailT/lj-speech"
NUM_SAMPLES = 10

# Generated audio and result files are written below this directory.
OUTPUT_DIR = Path("tts_results")
OUTPUT_DIR.mkdir(exist_ok=True)


---
## suno/bark-small

In [4]:
# Load Bark-small, register it for the benchmark, and synthesize one test utterance.
from transformers import AutoProcessor, AutoModelForTextToWaveform

# Also read by the benchmark cell; falls back to CPU when no GPU is present.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load processor and model
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = AutoModelForTextToWaveform.from_pretrained("suno/bark-small").to(device)
models['bark_small'] = {"model": model, "processor": processor}

# Example text
text = "Hello! This is a test of Bark-small text-to-speech."

# Preprocess input
inputs = processor(text=text, return_tensors="pt").to(device)

# Generate waveform
with torch.no_grad():
    waveform = model.generate(**inputs)

# Take the sample rate from the model's generation config rather than
# hard-coding it; 24000 Hz is kept as the fallback.
bark_sample_rate = getattr(model.generation_config, "sample_rate", 24000)

# torchaudio.save is given a 2-D tensor here: drop the leading batch
# dimension, move to CPU, then add a single channel dimension back.
output_path = "bark_output.wav"
torchaudio.save(output_path, waveform.squeeze(0).cpu().unsqueeze(0), sample_rate=bark_sample_rate)

print(f"Saved Bark-small output to {output_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/353 [00:00<?, ?B/s]

speaker_embeddings_path.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

Saved Bark-small output to bark_output.wav


  s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)


In [6]:
# Inline player for the Bark-small test clip written to `output_path`.
from IPython.display import Audio
from IPython.display import display

display(Audio(output_path))

---
## coqui/XTTS-v2

In [None]:
# !pip install TTS

In [None]:
# from TTS.api import TTS

# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")
# tts.tts_to_file("Hello world!", file_path="xtts.wav")

## microsoft/speecht5_tts

In [5]:
# Load SpeechT5 (acoustic model + HiFi-GAN vocoder), register it for the
# benchmark, and synthesize one test utterance.
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import soundfile as sf

# Load models
# NOTE: this rebinds the notebook-level `processor` and `model` names.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Prepare input text
inputs = processor(text="Hello, this is a SpeechT5 text-to-speech test.", return_tensors="pt")

models["speecht5_tts"] = {"processor": processor, "model": model, "vocoder": vocoder}

# Use a random speaker embedding (512 dimensions). It is drawn from a seeded
# generator so re-running the cell reproduces the same voice.
speaker_rng = torch.Generator().manual_seed(0)
speaker_embeddings = torch.randn(1, 512, generator=speaker_rng)

# Generate speech
with torch.no_grad():
    speech = model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=vocoder
    )

# Save output as a 16 kHz WAV file
sf.write("speech.wav", speech.numpy(), 16000)

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/585M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/50.6M [00:00<?, ?B/s]

In [7]:
# Inline player for the SpeechT5 test clip saved as speech.wav.
from IPython.display import Audio
from IPython.display import display

display(Audio('speech.wav'))

---

In [None]:
# !pip install torchcodec

In [10]:
#######################################################
# METRICS — Small, simple (expand as needed)
#######################################################

def audio_duration(path):
    """Length of the audio file at `path`, in seconds.

    The file is loaded at its native sample rate (sr=None), and the sample
    count is divided by that rate.
    """
    samples, sample_rate = librosa.load(path, sr=None)
    n_samples = len(samples)
    return n_samples / sample_rate

def mel_spectrogram_similarity(ref_path, gen_path):
    """
    Cosine similarity between the time-averaged mel spectra of two audio files.

    A simple, coarse metric: each file is reduced to one vector (the mean over
    time of its mel spectrogram) and the two vectors are compared.

    Args:
        ref_path: path to the reference recording.
        gen_path: path to the generated recording.

    Returns:
        float: 1 = identical average spectrum, 0 = different. Returns 0.0 when
        either average spectrum is all zeros (e.g. silence), where cosine
        similarity is undefined.
    """
    # Load the reference at its native rate and resample the generated audio
    # to that same rate, so both spectrograms are computed at one sample rate
    # and their mel bins are comparable.
    ref, sr = librosa.load(ref_path, sr=None)
    gen, _ = librosa.load(gen_path, sr=sr)

    # The audio must be passed as the `y=` keyword: recent librosa releases
    # reject a positional audio argument.
    ref_mel = librosa.feature.melspectrogram(y=ref, sr=sr)
    gen_mel = librosa.feature.melspectrogram(y=gen, sr=sr)

    # Average over the time axis to get one value per mel band.
    ref_vec = np.mean(ref_mel, axis=1)
    gen_vec = np.mean(gen_mel, axis=1)

    # Guard the zero-vector case, which would otherwise produce NaN.
    if not np.any(ref_vec) or not np.any(gen_vec):
        return 0.0

    return float(1 - cosine(ref_vec, gen_vec))  # 1 = identical, 0 = different

#######################################################
# TTS INFERENCE WRAPPER
#######################################################

def load_tts_model(model_name):
    """Load a text-to-waveform model and return an inference function.

    Intended for Hub checkpoints that `AutoModelForTextToWaveform` can load
    and that produce audio through `generate()`, as the Bark-small cell in
    this notebook does.

    Args:
        model_name: Hugging Face Hub model id, e.g. "suno/bark-small".

    Returns:
        A function `infer(text)` that returns the generated waveform for
        `text` as a NumPy array (first item of the generated batch).
    """
    # Local import so the function does not depend on another cell having
    # imported the text-to-waveform auto-class first.
    from transformers import AutoModelForTextToWaveform

    # Use the GPU when present instead of assuming CUDA is available.
    run_device = "cuda" if torch.cuda.is_available() else "cpu"

    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModelForTextToWaveform.from_pretrained(model_name).to(run_device)

    def infer(text):
        inputs = processor(text=text, return_tensors="pt").to(run_device)
        with torch.no_grad():
            audio = model.generate(**inputs)
        # The output is audio, not token ids, so it is returned as samples
        # rather than being passed through the tokenizer's batch_decode.
        return audio[0].cpu().numpy()

    return infer


#######################################################
# MAIN PIPELINE
#######################################################
import datasets

# Models the benchmark knows how to drive, mapped to the sample rate (Hz) used
# when writing each one's waveform to disk.
_BENCHMARK_SAMPLE_RATES = {"speecht5_tts": 16000, "bark_small": 24000}


def _synthesize(model_name, text, speaker_rng=None):
    """Generate a waveform tensor for `text` using the loaded entry `models[model_name]`."""
    bundle = models[model_name]
    tts_model = bundle["model"]

    if model_name == "speecht5_tts":
        inputs = bundle["processor"](text=text, return_tensors="pt")
        # Random 512-dimensional speaker embedding; `speaker_rng` makes the
        # draw reproducible when it is a seeded generator.
        speaker_embeddings = torch.randn(1, 512, generator=speaker_rng)
        with torch.no_grad():
            return tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=bundle["vocoder"],
            )

    # bark_small: place the inputs on whatever device the model lives on, so
    # this does not rely on a notebook-level `device` variable.
    inputs = bundle["processor"](text=text, return_tensors="pt").to(tts_model.device)
    with torch.no_grad():
        return tts_model.generate(**inputs)


def _save_waveform(model_name, waveform, path):
    """Write `waveform` to `path` as a WAV file at the model's sample rate."""
    sample_rate = _BENCHMARK_SAMPLE_RATES[model_name]
    if model_name == "speecht5_tts":
        sf.write(str(path), waveform.cpu().numpy(), sample_rate)
    else:
        # Drop the batch dimension, move to CPU, add a channel dimension.
        torchaudio.save(str(path), waveform.squeeze(0).cpu().unsqueeze(0), sample_rate=sample_rate)


def run_tts_benchmark(speaker_seed=0):
    """Synthesize NUM_SAMPLES dataset prompts with every model in TTS_MODELS.

    For each model, the generated audio and a `results.json` (text, reference
    file, generated file, inference time, duration) are written under
    `OUTPUT_DIR/<model>/`. A cross-model `summary.json` with the average
    inference time and average duration is written to `OUTPUT_DIR`.

    Models that are not supported or not loaded into `models` are skipped with
    a warning; the remaining models still run and the summary is still written.

    Args:
        speaker_seed: seed for the random SpeechT5 speaker embeddings, so that
            reruns are reproducible. Pass None to use the global RNG.
    """
    import warnings

    # Load dataset; audio decoding is disabled because only the reference
    # path is recorded.
    dataset = load_dataset(DATASET_NAME, split="full", streaming=False).select(range(NUM_SAMPLES))
    dataset = dataset.cast_column("audio", datasets.features.Audio(decode=False))

    results_summary = {}

    for model_name in TTS_MODELS:
        if model_name not in _BENCHMARK_SAMPLE_RATES:
            warnings.warn(f"Model not implemented yet! Skipping {model_name!r}.")
            continue
        if model_name not in models:
            warnings.warn(f"Model {model_name!r} has not been loaded into `models`; skipping.")
            continue

        print(f"\n### Running inference for: {model_name}")
        model_dir = OUTPUT_DIR / model_name.replace("/", "_")
        model_dir.mkdir(parents=True, exist_ok=True)

        speaker_rng = None
        if speaker_seed is not None:
            speaker_rng = torch.Generator().manual_seed(speaker_seed)

        model_results = []

        for i, sample in enumerate(tqdm(dataset, desc=model_name)):
            text = sample["spoken_text"]
            reference_audio_path = sample["audio"]["path"]

            # ----- Inference -----
            t0 = time.perf_counter()
            generated_audio = _synthesize(model_name, text, speaker_rng)
            if torch.cuda.is_available():
                # Wait for queued GPU work so the timing covers the full generation.
                torch.cuda.synchronize()
            t1 = time.perf_counter()

            # Save generated audio
            output_audio_path = model_dir / f"sample_{i}.wav"
            _save_waveform(model_name, generated_audio, output_audio_path)

            # Metrics
            duration = audio_duration(output_audio_path)
            # similarity = mel_spectrogram_similarity(reference_audio_path,output_audio_path)

            model_results.append({
                "text": text,
                "reference": reference_audio_path,
                "generated": str(output_audio_path),
                "inference_time": t1 - t0,
                "duration": duration,
                # "mel_similarity": float(similarity)
            })

        # Save model results
        with open(model_dir / "results.json", "w") as f:
            json.dump(model_results, f, indent=2)

        if not model_results:
            # Averaging an empty list would put NaN into the summary.
            warnings.warn(f"No samples were processed for {model_name!r}; no summary entry written.")
            continue

        # Summary per model
        avg_time = np.mean([r["inference_time"] for r in model_results])
        # avg_sim = np.mean([r["mel_similarity"] for r in model_results])
        avg_len = np.mean([r["duration"] for r in model_results])

        results_summary[model_name] = {
            "avg_inference_time": float(avg_time),
            # "avg_mel_similarity": float(avg_sim),
            "avg_duration_s": float(avg_len)
        }

    # Save cross-model summary
    summary_path = OUTPUT_DIR / "summary.json"
    with open(summary_path, "w") as f:
        json.dump(results_summary, f, indent=2)

    print(f"\nDone! Summary saved in {summary_path}")

In [11]:
# Run the benchmark for every model listed in TTS_MODELS; per-model
# results.json files and summary.json are written under OUTPUT_DIR.
run_tts_benchmark()

Output hidden; open in https://colab.research.google.com to view.

In [17]:
import pandas as pd

# Read the files written by run_tts_benchmark(). Paths are built from
# OUTPUT_DIR so this cell follows the configuration cell instead of repeating
# the directory name.
df_summary = pd.read_json(OUTPUT_DIR / "summary.json")
df_bark = pd.read_json(OUTPUT_DIR / "bark_small" / "results.json")
df_speecht5_tts = pd.read_json(OUTPUT_DIR / "speecht5_tts" / "results.json")

In [18]:
# Cross-model averages loaded from summary.json (one column per model).
df_summary

Unnamed: 0,bark_small,speecht5_tts
avg_inference_time,18.434163,6.078891
avg_duration_s,7.98,4.1568


In [19]:
# Per-sample Bark-small results loaded from bark_small/results.json.
df_bark

Unnamed: 0,text,reference,generated,inference_time,duration
0,"Printing, in the only sense with which we are ...",LJ001-0001.wav,tts_results/bark_small/sample_0.wav,28.635739,11.853333
1,in being comparatively modern.,LJ001-0002.wav,tts_results/bark_small/sample_1.wav,5.30149,2.0
2,For although the Chinese took impressions from...,LJ001-0003.wav,tts_results/bark_small/sample_2.wav,16.798028,7.453333
3,"produced the block books, which were the immed...",LJ001-0004.wav,tts_results/bark_small/sample_3.wav,20.187009,8.746667
4,the invention of movable metal letters in the ...,LJ001-0005.wav,tts_results/bark_small/sample_4.wav,22.575554,9.906667
5,"And it is worth mention in passing that, as an...",LJ001-0006.wav,tts_results/bark_small/sample_5.wav,17.416585,7.746667
6,"the earliest book printed with movable types, ...",LJ001-0007.wav,tts_results/bark_small/sample_6.wav,21.91059,9.586667
7,has never been surpassed.,LJ001-0008.wav,tts_results/bark_small/sample_7.wav,4.99095,2.16
8,"Printing, then, for our purpose, may be consid...",LJ001-0009.wav,tts_results/bark_small/sample_8.wav,22.167809,9.626667
9,"Now, as all books not primarily intended as pi...",LJ001-0010.wav,tts_results/bark_small/sample_9.wav,24.357879,10.72


In [20]:
# Per-sample SpeechT5 results loaded from speecht5_tts/results.json.
df_speecht5_tts

Unnamed: 0,text,reference,generated,inference_time,duration
0,"Printing, in the only sense with which we are ...",LJ001-0001.wav,tts_results/speecht5_tts/sample_0.wav,11.027159,7.168
1,in being comparatively modern.,LJ001-0002.wav,tts_results/speecht5_tts/sample_1.wav,1.877389,1.408
2,For although the Chinese took impressions from...,LJ001-0003.wav,tts_results/speecht5_tts/sample_2.wav,9.024492,6.112
3,"produced the block books, which were the immed...",LJ001-0004.wav,tts_results/speecht5_tts/sample_3.wav,4.563891,3.264
4,the invention of movable metal letters in the ...,LJ001-0005.wav,tts_results/speecht5_tts/sample_4.wav,8.955588,6.08
5,"And it is worth mention in passing that, as an...",LJ001-0006.wav,tts_results/speecht5_tts/sample_5.wav,4.668909,3.264
6,"the earliest book printed with movable types, ...",LJ001-0007.wav,tts_results/speecht5_tts/sample_6.wav,5.773606,3.936
7,has never been surpassed.,LJ001-0008.wav,tts_results/speecht5_tts/sample_7.wav,1.839281,1.344
8,"Printing, then, for our purpose, may be consid...",LJ001-0009.wav,tts_results/speecht5_tts/sample_8.wav,6.843023,4.416
9,"Now, as all books not primarily intended as pi...",LJ001-0010.wav,tts_results/speecht5_tts/sample_9.wav,6.215573,4.576
