In [1]:
import os
# Trainer: Where the ✨️ happens.
# TrainerArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs

# GlowTTSConfig: all model related values for training, validating and testing.
from configs.glow_tts_config import GlowTTSConfig

# BaseDatasetConfig: defines name, formatter and path of the dataset.
from configs.shared_configs import BaseDatasetConfig
from datasets import load_tts_samples
from models.glow_tts import GlowTTS
from utils.text.tokenizer import TTSTokenizer
from utils.audio import AudioProcessor

In [2]:
#formatter

def ljspeech(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    """Normalize the LJSpeech metadata file to the TTS sample format.

    Dataset: https://keithito.com/LJ-Speech-Dataset/

    Every non-blank line of the metadata file is split on ``|``. The first
    column is used as the wav file stem and the third column as the text.

    Args:
        root_path (str): Dataset root folder; wav files are resolved as
            ``<root_path>/wavs/<first column>.wav``.
        meta_file (str): Metadata file name, relative to ``root_path``.
        **kwargs: Unused; accepted so extra keyword arguments do not raise.

    Returns:
        list[dict]: One dict per non-blank metadata line with the keys
        ``"text"``, ``"audio_file"`` and ``"speaker_name"``.

    Raises:
        IndexError: If a non-blank line has fewer than three columns.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "ljspeech"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            # Remove the line terminator first; otherwise it stays attached to
            # the last column and ends up inside the sample text.
            line = line.rstrip("\r\n")
            # Blank lines (e.g. a trailing empty line) carry no sample.
            if not line.strip():
                continue
            cols = line.split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
            text = cols[2]
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
    return items


In [3]:
# DATASET CONFIGURATION
# The LJSpeech-1.1 folder is resolved relative to the parent of the current working directory.
# A simple Dict can also be used to define the dataset, e.g. together with a custom formatter.

output_path = os.path.dirname(os.getcwd())
dataset_config = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    path=os.path.join(output_path, "text-to-speech/datasets/LJSpeech-1.1/"),
)

In [4]:
# Training output goes to the parent of the current working directory.
output_path = os.path.dirname(os.getcwd())

# TRAINING CONFIGURATION
# GlowTTSConfig holds all model related values for training, validating and testing.
config = GlowTTSConfig(
    # batching and data loading
    batch_size=16,
    eval_batch_size=8,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # evaluation / schedule (a single epoch)
    run_eval=True,
    test_delay_epochs=0,
    epochs=1,
    # text processing: phoneme input, cached under <output_path>/phoneme_cache
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # console logging
    print_step=400,
    print_eval=False,
    mixed_precision=True,
    # output folder and the dataset defined in the previous cell
    output_path=output_path,
    datasets=[dataset_config],
)

In [5]:
# AUDIO PROCESSOR
# Handles feature extraction and audio I/O; it mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# TOKENIZER
# Converts text to sequences of token IDs. The call also hands back the config, which is rebound here:
# if characters are not defined in the config, the default characters are passed to it.
tokenizer, config = TTSTokenizer.init_from_config(config)

# DATA SAMPLES
# The `ljspeech` formatter defined above is used for the dataset; it yields one dict per sample
# with the keys "text", "audio_file" and "speaker_name". A train/eval split is requested.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=ljspeech)

# MODEL
# GlowTTS is built from the config, the audio processor and the tokenizer.
# The config defines the details of the model like the number of layers, the size of the embedding, etc.
# No speaker manager is passed here; it is used by multi-speaker models.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

# TRAINER
# Generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 | > Found 13100 files in /opt/app-root/src/text-to-speech/datasets/LJSpeech-1.1
 > Using CUDA: False
 > Number of GPUs: 0

 > Model has 28610257 parameters


In [6]:

# AND... 3,2,1... 🚀
# Start training with the trainer built in the previous cell.
trainer.fit()


[4m[1m > EPOCH: 0/1[0m
 --> /opt/app-root/src/run-March-17-2022_11+00PM-9828128


> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
| > Number of instances : 12969
 | > Preprocessing samples
 | > Max text length: 188
 | > Min text length: 13
 | > Avg text length: 100.90014650319993
 | 
 | > Max audio length: 222643.0
 | > Min audio length: 24499.0
 | > Avg audio length: 144984.29755570978
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.

[1m > TRAINING (2022-03-17 23:01:00) [0m


  y_lengths = (y_lengths // self.num_squeeze) * self.num_squeeze



[1m   --> STEP: 0/811 -- GLOBAL_STEP: 0[0m
     | > current_lr: 0.00000 
     | > step_time: 2.58290  (2.58294)
     | > loader_time: 0.86400  (0.86400)


[1m   --> STEP: 400/811 -- GLOBAL_STEP: 400[0m
     | > loss: 3.40052  (3.51605)
     | > log_mle: 0.79997  (0.80014)
     | > loss_dur: 2.60055  (2.71591)
     | > grad_norm: 9.84974  (10.27138)
     | > current_lr: 0.00000 
     | > step_time: 3.29220  (2.56434)
     | > loader_time: 0.00340  (0.00252)


[1m   --> STEP: 800/811 -- GLOBAL_STEP: 800[0m
     | > loss: 3.41369  (3.46281)
     | > log_mle: 0.78252  (0.79664)
     | > loss_dur: 2.63118  (2.66616)
     | > grad_norm: 9.25854  (9.91014)
     | > current_lr: 0.00000 
     | > step_time: 4.32730  (3.21377)
     | > loader_time: 0.00310  (0.00274)



> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
| > Number of instances : 131
 | 

In [7]:
# Synthesize a test sentence with the `tts` command-line tool and write it to output/tts_output.wav.
# NOTE(review): the model and vocoder are selected by name (--model_name / --vocoder_name), not by a
# path to the checkpoint produced by trainer.fit() above — confirm that this is intended.
!tts --text "My sprinkler goes like this ststststststststststst and comes back like tttttttttttttttttttttttttttttttttttttttttttttttttte" \
--model_name "tts_models/en/ljspeech/glow-tts" \
--vocoder_name "vocoder_models/universal/libri-tts/fullband-melgan" \
--out_path output/tts_output.wav

 > tts_models/en/ljspeech/glow-tts is already downloaded.
 > vocoder_models/universal/libri-tts/fullband-melgan is already downloaded.
 > Using model: glow_tts
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.1
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Vocoder Model: fullband_melgan
 > Setting up Audio Processor...
 | > sample_rate:24000
 | >

In [8]:
# Display an audio player for the wav file written by the `tts` command in the previous cell.
# NOTE(review): this import sits apart from the other imports in the first cell — consider moving it there.
import IPython.display as ipd
ipd.Audio(filename='output/tts_output.wav')