<a href="https://colab.research.google.com/gist/erogol/97516ad65b44dbddb8cd694953187c5b/tts_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hands-on example for 🐸 [Coqui TTS](https://github.com/coqui-ai/TTS)

This notebook trains Tacotron model on LJSpeech dataset.

In [2]:
import urllib.request

url = "http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
filename = "LJSpeech-1.1.tar.bz2"
urllib.request.urlretrieve(url, filename)


('LJSpeech-1.1.tar.bz2', <http.client.HTTPMessage at 0x20804075a10>)

In [3]:
import tarfile

with tarfile.open("LJSpeech-1.1.tar.bz2", "r:bz2") as tar:
    tar.extractall()


## Download LJSpeech

In [4]:
# download LJSpeech dataset
!wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
# decompress
!tar -xjf LJSpeech-1.1.tar.bz2

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [5]:
# create train-val splits
!shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
!head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
!tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv

'shuf' is not recognized as an internal or external command,
operable program or batch file.
'head' is not recognized as an internal or external command,
operable program or batch file.
'tail' is not recognized as an internal or external command,
operable program or batch file.


## Setup environment

In [6]:
!pip install TTS 

Defaulting to user installation because normal site-packages is not writeable
Collecting TTS
  Downloading TTS-0.22.0.tar.gz (1.7 MB)
     ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
     ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
     ---------------------------------------- 1.7/1.7 MB 9.9 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting cython>=0.29.30 (from TTS)
  Downloading Cython-3.0.11-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
Collecting scipy>=1.11.2 (from TTS)
  Downloading scipy-1.14.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting torch>=2.1 (from T

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.
s3fs 2023.4.0 requires fsspec==2023.4.0, but you have fsspec 2024.10.0 which is incompatible.


In [9]:
# install espeak backend if you like to use phonemes instead of raw characters
!apt-get install espeak-ng

'apt-get' is not recognized as an internal or external command,
operable program or batch file.


## Train Tacotron DCA

In [3]:
import os

from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.datasets import LJSpeechDatasetConfig  # Updated import
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = "./"

# Use the dataset-specific config class
dataset_config = LJSpeechDatasetConfig(
    meta_file_train="metadata.csv",
    path=os.path.join(output_path, "content/LJSpeech-1.1")
)

audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = Tacotron2Config(
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    ga_alpha=0.0,
    decoder_loss_alpha=0.25,
    postnet_loss_alpha=0.25,
    postnet_diff_spec_alpha=0,
    decoder_diff_spec_alpha=0,
    decoder_ssim_alpha=0,
    postnet_ssim_alpha=0,
    r=2,
    attention_type="dynamic_convolution",
    double_decoder_consistency=False,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    output_path=output_path,
    datasets=[dataset_config],
)

ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

model = Tacotron2(config, ap, tokenizer)

trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples
)

trainer.fit()


ModuleNotFoundError: No module named 'TTS.tts.configs.datasets'

In [4]:
import TTS
print(TTS.__version__)


0.22.0


In [7]:
import TTS.tts.datasets
print(dir(TTS.tts.datasets))


['AudioProcessor', 'Callable', 'Counter', 'Dataset', 'Dict', 'ET', 'EnergyDataset', 'F0Dataset', 'List', 'Path', 'PhonemeDataset', 'TTSDataset', 'Tuple', 'Union', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_get_formatter_by_name', 'add_extra_keys', 'baker', 'base64', 'bel_tts_formatter', 'brspeech', 'calculate_energy', 'cml_tts', 'collections', 'common_voice', 'coqui', 'css10', 'custom_turkish', 'dataset', 'emotion', 'find_unique_chars', 'formatters', 'glob', 'kokoro', 'kss', 'libri_tts', 'ljspeech', 'ljspeech_test', 'load_attention_mask_meta_data', 'load_tts_samples', 'mailabs', 'mls', 'mozilla', 'mozilla_de', 'nancy', 'noise_augment_audio', 'np', 'open_bible', 'os', 'pd', 'prepare_data', 'prepare_stop_target', 'prepare_tensor', 'random', 're', 'ruslan', 'sam_accenture', 'split_dataset', 'string2filename', 'synpaflex', 'sys', 'thorsten', 'torch', 'tqdm', 'tweb', 'vctk', 'vctk_old', 'voxceleb1', 'voxceleb2']


In [8]:
from TTS.tts.datasets import ljspeech


In [9]:
import TTS.tts.datasets.ljspeech as ljspeech_module
print(dir(ljspeech_module))


ModuleNotFoundError: No module named 'TTS.tts.datasets.ljspeech'

In [10]:
from TTS.tts.datasets.ljspeech import LJSpeech  # Adjust the class name if different


ModuleNotFoundError: No module named 'TTS.tts.datasets.ljspeech'

In [14]:
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples, ljspeech  # Directly import ljspeech formatter
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.configs.shared_configs import BaseDatasetConfig

# Optional: Verify TTS Library Version
import TTS
print(f"TTS Library Version: {TTS.__version__}")

output_path = "./"

# Define the dataset configuration without the 'name' parameter
dataset_config = BaseDatasetConfig(
    meta_file_train="metadata.csv",
    path=os.path.join(output_path, "content/LJSpeech-1.1")
)

# Configure audio processing parameters
audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

# Configure the Tacotron2 model
config = Tacotron2Config(
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    ga_alpha=0.0,
    decoder_loss_alpha=0.25,
    postnet_loss_alpha=0.25,
    postnet_diff_spec_alpha=0,
    decoder_diff_spec_alpha=0,
    decoder_ssim_alpha=0,
    postnet_ssim_alpha=0,
    r=2,
    attention_type="dynamic_convolution",
    double_decoder_consistency=False,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    output_path=output_path,
    datasets=[dataset_config],
)

# Initialize Audio Processor and Tokenizer
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Assign the LJSpeech Formatter
formatter = ljspeech  # Directly use the imported ljspeech function

# Load Training and Evaluation Samples
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
    formatter=formatter  # Pass the formatter function directly
)

# Initialize the Tacotron2 Model
model = Tacotron2(config, ap, tokenizer)

# Initialize the Trainer
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples
)

# Start Training
trainer.fit()


TTS Library Version: 0.22.0
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024


FileNotFoundError: [Errno 2] No such file or directory: './content/LJSpeech-1.1\\metadata.csv'

In [16]:
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples, ljspeech  # Directly import ljspeech formatter
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.configs.shared_configs import BaseDatasetConfig

# Optional: Verify TTS Library Version
import TTS
print(f"TTS Library Version: {TTS.__version__}")

output_path = os.path.abspath("./")  # Use absolute path
print(f"Output Path: {output_path}")

# Define the dataset configuration
dataset_config = BaseDatasetConfig(
    meta_file_train="metadata.csv",
    path=os.path.join(output_path, "content", "LJSpeech-1.1")
)

# Verify that the dataset path exists
print(f"Dataset Path: {dataset_config.path}")
print(f"Does dataset path exist? {os.path.exists(dataset_config.path)}")

# Define the path to metadata.csv
metadata_path = os.path.join(dataset_config.path, dataset_config.meta_file_train)
print(f"Metadata Path: {metadata_path}")
print(f"Does metadata.csv exist? {os.path.exists(metadata_path)}")

# Configure audio processing parameters
audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

# Configure the Tacotron2 model
config = Tacotron2Config(
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    ga_alpha=0.0,
    decoder_loss_alpha=0.25,
    postnet_loss_alpha=0.25,
    postnet_diff_spec_alpha=0,
    decoder_diff_spec_alpha=0,
    decoder_ssim_alpha=0,
    postnet_ssim_alpha=0,
    r=2,
    attention_type="dynamic_convolution",
    double_decoder_consistency=False,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=True,
    mixed_precision=False,
    output_path=output_path,
    datasets=[dataset_config],
)

# Initialize Audio Processor and Tokenizer
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Assign the LJSpeech Formatter
formatter = ljspeech  # Directly use the imported ljspeech function

# Load Training and Evaluation Samples
try:
    train_samples, eval_samples = load_tts_samples(
        dataset_config,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
        formatter=formatter  # Pass the formatter function directly
    )
    print(f"Number of training samples: {len(train_samples)}")
    print(f"Number of evaluation samples: {len(eval_samples)}")
except FileNotFoundError as e:
    print(f"Error loading samples: {e}")
    # Optionally, exit or handle the error appropriately
    import sys
    sys.exit(1)

# Initialize the Tacotron2 Model
model = Tacotron2(config, ap, tokenizer)

# Initialize the Trainer
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples
)

# Start Training
trainer.fit()


TTS Library Version: 0.22.0
Output Path: C:\Users\Pratheek\Desktop\nlp\tts
Dataset Path: C:\Users\Pratheek\Desktop\nlp\tts\content\LJSpeech-1.1
Does dataset path exist? False
Metadata Path: C:\Users\Pratheek\Desktop\nlp\tts\content\LJSpeech-1.1\metadata.csv
Does metadata.csv exist? False
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.71828182845

AttributeError: 'tuple' object has no attribute 'tb_frame'