In [None]:
import os
import time

# Trainer: Where the ✨️ happens.
# TrainingArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs

# GlowTTSConfig: all model related values for training, validating and testing.
from configs.glow_tts_config import GlowTTSConfig

# BaseDatasetConfig: defines name, formatter and path of the dataset.
from configs.shared_configs import BaseDatasetConfig
from datasets import load_tts_samples
from glow_tts import GlowTTS
from utils.text.tokenizer import TTSTokenizer
from utils.audio import AudioProcessor

In [None]:
# Dataset formatter for LJSpeech (passed to load_tts_samples as `formatter`).

def ljspeech(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    """Normalize the LJSpeech metadata file into a list of TTS sample dicts.

    Dataset: https://keithito.com/LJ-Speech-Dataset/

    Every non-blank line of the metadata file is split on ``|``. The first
    column is used as the wav file stem and the third column as the text.
    Blank lines are skipped, and the line terminator is removed so it does
    not end up inside the text of the last column.

    Args:
        root_path: Dataset root directory; wav files are addressed as
            ``<root_path>/wavs/<first column>.wav``.
        meta_file: Metadata file name, joined onto ``root_path``.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        list[dict]: One dict per metadata row with the keys ``"text"``,
        ``"audio_file"`` and ``"speaker_name"`` (always ``"ljspeech"``).

    Raises:
        IndexError: If a non-blank row has fewer than three columns.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "ljspeech"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            # The text is the last column, so an unstripped line would carry "\n" into it.
            line = line.rstrip("\r\n")
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line) instead of raising IndexError.
                continue
            cols = line.split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
            text = cols[2]
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
    return items


In [None]:
# Base directory for this notebook: the parent of the current working directory.
output_path = os.path.dirname(os.getcwd())

# Dataset description: LJSpeech, with metadata.csv as the training metadata file
# and the dataset folder resolved relative to the base directory.
dataset_config = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    path=os.path.join(output_path, "text-to-speech/datasets/LJSpeech-1.1/"),
)

In [None]:
# Training folder: the parent of the current working directory.
output_path = os.path.dirname(os.getcwd())

# TRAINING CONFIGURATION
# Configure the model. Every config class inherits the BaseTTSConfig.
# NOTE(review): the run recorded in this notebook printed "Using CUDA:  False" and hit a
# shared-memory (shm) bus error in a loader worker; the worker counts and mixed_precision
# below may need revisiting for that environment - confirm before re-running.
config = GlowTTSConfig(
    # Where to read from / write to.
    output_path=output_path,
    datasets=[dataset_config],
    # Batching and data loading.
    batch_size=48,
    eval_batch_size=32,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # Schedule.
    epochs=2,
    run_eval=True,
    test_delay_epochs=-1,
    mixed_precision=True,
    # Text front-end.
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # Logging.
    print_step=50,
    print_eval=False,
)

In [7]:
# AUDIO PROCESSOR
# Used for feature extraction and audio I/O; it mainly serves the dataloader
# and the training loggers.
ap = AudioProcessor.init_from_config(config)

# TOKENIZER
# Converts text to sequences of token IDs. The call also hands back a config, which
# replaces ours: if characters are not defined in the config, default characters are
# passed to the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# DATA SAMPLES
# Loaded through our own `ljspeech` formatter, which builds one dict per sample
# with the keys "text", "audio_file" and "speaker_name".
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    formatter=ljspeech,
)

# MODEL
# Takes the config, the audio processor and the tokenizer. The speaker manager is
# only used by multi-speaker models, so none is passed here.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

# TRAINER
# Generic training API for the 🐸TTS models (mixed-precision training,
# distributed training, etc.).
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Start training.
trainer.fit()

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 | > Found 13100 files in /opt/app-root/src/text-to-speech/datasets/LJSpeech-1.1
 > Using CUDA:  False
 > Number of GPUs:  0

 > Model has 28610257 parameters

[4m[1m > EPOCH: 0/2[0m
 --> /opt/app-root/src/run-March-17-2022_06+13PM-eae328f


> DataLoader 

  y_lengths = (y_lengths // self.num_squeeze) * self.num_squeeze



[1m   --> STEP: 0/270 -- GLOBAL_STEP: 0[0m
     | > current_lr: 0.00000 
     | > step_time: 2.96900  (2.96902)
     | > loader_time: 1.28270  (1.28268)


[1m   --> STEP: 50/270 -- GLOBAL_STEP: 50[0m
     | > loss: 3.68945  (3.64530)
     | > log_mle: 0.73425  (0.72657)
     | > loss_dur: 2.95519  (2.91873)
     | > grad_norm: 9.82102  (9.71756)
     | > current_lr: 0.00000 
     | > step_time: 7.42660  (5.37296)
     | > loader_time: 0.00270  (0.00253)



ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
 Traceback (most recent call last):
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 1403, in fit
    self._fit()
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 1387, in _fit
    self.train_epoch()
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 1167, in train_epoch
    _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time)
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 1001, in train_step
    outputs, loss_dict_new, step_time = self._optimize(
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 890, in _optimize
    outputs, loss_dict = self._model_train_step(batch, model, criterion)
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 846, in _model_train_step
    return model.train_step

 ! Run is removed from /opt/app-root/src/run-March-17-2022_06+13PM-eae328f
Traceback (most recent call last):
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 1403, in fit
    self._fit()
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 1387, in _fit
    self.train_epoch()
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 1167, in train_epoch
    _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time)
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 1001, in train_step
    outputs, loss_dict_new, step_time = self._optimize(
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 890, in _optimize
    outputs, loss_dict = self._model_train_step(batch, model, criterion)
  File "/opt/app-root/lib64/python3.8/site-packages/trainer/trainer.py", line 846, in _model_train_step
    return model.train_step(*input_args)
  File "/opt/app-root

TypeError: object of type 'NoneType' has no len()

In [None]:
def tts(model, text, CONFIG, p):
    """Synthesize ``text`` to a waveform, print timing figures and play the audio inline.

    The function relies on names that must already exist in the notebook
    namespace: ``synthesis``, ``vocoder_model``, ``ap``, ``use_cuda``,
    ``speaker_id``, ``torch``, ``time`` and ``IPython``.

    Args:
        model: TTS model, passed through to ``synthesis``.
        text: Input text to synthesize.
        CONFIG: Config object; ``CONFIG.enable_eos_bos_chars`` and
            ``CONFIG.audio['sample_rate']`` are read.
        p: Unused; kept so existing call sites keep working.

    Returns:
        tuple: ``(alignment, mel_postnet_spec, stop_tokens, waveform)`` where
        ``waveform`` is the vocoder output after ``.numpy()[0, 0]``.
    """
    # Monotonic clock: an interval measurement must not be affected by wall-clock adjustments.
    t_start = time.perf_counter()
    # Only the alignment, post-net mel and stop tokens are used; the waveform comes from the vocoder.
    _, alignment, _, mel_postnet_spec, stop_tokens, _ = synthesis(
        model,
        text,
        CONFIG,
        use_cuda,
        ap,
        speaker_id,
        style_wav=None,
        truncated=False,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
        backend='tf',
    )
    waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
    waveform = waveform.numpy()[0, 0]
    # Sample the clock once so run-time, real-time factor and time-per-step all
    # describe the same interval (and exclude the time spent printing).
    elapsed = time.perf_counter() - t_start
    rtf = elapsed / (len(waveform) / ap.sample_rate)
    tps = elapsed / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(elapsed))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))
    return alignment, mel_postnet_spec, stop_tokens, waveform

In [None]:
# Run the `tts` command-line tool on a test sentence with the named Glow-TTS model and
# fullband-MelGAN vocoder, writing the result to tts_output.wav.
# NOTE(review): presumably the named models are fetched on first use - confirm.
!tts --text "My sprinkler goes like this ststststststststststst and comes back like ttttttttttttttttttttttttttttttttttttttte" \
--model_name "tts_models/en/ljspeech/glow-tts" \
--vocoder_name "vocoder_models/universal/libri-tts/fullband-melgan" \
--out_path tts_output.wav

In [None]:
import IPython.display as ipd

# Inline audio player for tts_output.wav (the --out_path of the `tts` command).
ipd.Audio(filename="tts_output.wav")

In [None]:
# Optional: train a HiFi-GAN vocoder (only needed if we want to include our own vocoder).

import os

from trainer import Trainer, TrainerArgs

from TTS.utils.audio import AudioProcessor
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN

# Output folder: the directory of this file when run as a script. Notebooks do not
# define ``__file__`` (NameError), so fall back to the current working directory there.
try:
    output_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    output_path = os.getcwd()

# HiFi-GAN training configuration; the wavs are read from ../LJSpeech-1.1/wavs/
# relative to the output folder.
config = HifiganConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=5,
    epochs=1000,
    seq_len=8192,
    pad_short=2000,
    use_noise_augment=True,
    eval_split_size=10,
    print_step=25,
    print_eval=False,
    mixed_precision=False,
    lr_gen=1e-4,
    lr_disc=1e-4,
    data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
    output_path=output_path,
)

# init audio processor from the audio section of the config
ap = AudioProcessor(**config.audio.to_dict())

# load training samples
# NOTE(review): assumes load_wav_data returns (eval, train) in that order - confirm.
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# init model
model = GAN(config)

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)
trainer.fit()