In [None]:
import os

from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
# Base directory: outputs are written here and the dataset directory is resolved
# relative to it.
output_path = '.'

# Dataset selection: directory name, metadata file and formatter name.
# LJSpeech alternative:
#   root, meta, formatter = 'LJSpeech-1.1/', 'metadata.csv', 'ljspeech'
root = 'hungarian-single-speaker-tts'
meta = 'transcript.txt'
formatter = 'hungarian_tts'

dataset_config = BaseDatasetConfig(
    path=os.path.join(output_path, root),
    meta_file_train=meta,
    formatter=formatter,
)

In [None]:
# Glow-TTS configuration for the dataset described by `dataset_config`.

# Language passed to the phonemizer (previous experiment used 'en-us').
phoneme_language = 'hu'  # 'en-us'

# Name the phoneme cache after the dataset directory and the phoneme language.
# The directory used to be hard-coded as 'phoneme_cache-ljspeech' (a leftover from
# the LJSpeech setup) although the Hungarian corpus is loaded, so switching the
# dataset or the language kept pointing at one shared cache directory.
phoneme_cache_dir = 'phoneme_cache-{}-{}'.format(
    os.path.basename(os.path.normpath(dataset_config.path)),
    phoneme_language,
)

config = GlowTTSConfig(
    # NOTE(review): batch_size=1 looks chosen for the single-item inspection cells
    # in this notebook; presumably it should be raised for real training - confirm.
    batch_size=1,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,  # ~50 epochs should be enough, use subset of dataset first!
    text_cleaner='phoneme_cleaners',
    use_phonemes=True,
    phoneme_language=phoneme_language,
    phoneme_cache_path=os.path.join(output_path, phoneme_cache_dir),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
)

In [None]:
# Peek at the DEF_LANG_TO_PHONEMIZER mapping: how many entries it has and what it
# holds for the two language codes used in this notebook ('en-us' and 'hu').
(
    len(DEF_LANG_TO_PHONEMIZER),
    DEF_LANG_TO_PHONEMIZER['en-us'],
    DEF_LANG_TO_PHONEMIZER['hu'],
)

In [None]:
# Show a few config values that were not set explicitly above, to see which
# values the config object ended up with.
(
    config.log_model_step,
    config.save_step,
    config.plot_step,
    config.characters,
    config.phonemizer,
)

In [None]:
# Build the audio processor from the training config; it is handed to the model below.
ap = AudioProcessor.init_from_config(config)

In [None]:
# Build the tokenizer from the config. The call returns a pair, and `config` is
# rebound to the second element, so every later cell uses the returned config.
tokenizer, config = TTSTokenizer.init_from_config(config)

In [None]:
# Dump the tokenizer's instance attributes (its __dict__) for inspection.
vars(tokenizer)

In [None]:
# Load the samples described by `dataset_config` and split them into a training
# and an evaluation list; the split sizing is taken from the training config.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_size=config.eval_split_size,
    eval_split_max_size=config.eval_split_max_size,
)

In [None]:
# Sanity check: number of training and evaluation samples after the split.
len(train_samples), len(eval_samples)

In [None]:
# Look at one raw training sample as produced by the dataset formatter.
train_samples[0]

In [None]:
# Instantiate Glow-TTS with the config, audio processor and tokenizer built above.
# No speaker manager is passed (speaker_manager=None).
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

In [None]:
# Ask the model for a data loader over the training samples.
# NOTE(review): the positional arguments are presumably
# (config, assets, is_eval, samples, verbose, num_gpus) - confirm against the
# installed get_data_loader signature.
loader = model.get_data_loader(config, {}, False, train_samples, True, 1)

In [None]:
# Keep a handle on the loader's underlying dataset so single items can be
# inspected directly instead of iterating over batches.
dataset = loader.dataset

In [None]:
# Display the first dataset item in full.
dataset[0]

In [None]:
# Plot the 'wav' entry of the first dataset item on a wide, short figure.
first_wav = dataset[0]['wav']
plt.figure(figsize=(16, 3))
plt.plot(first_wav)

In [None]:
# Run the dataset's own collate function on a one-element list to see what a
# collated batch looks like.
items = dataset.collate_fn([dataset[0]])

In [None]:
# List the field names present in the collated batch.
items.keys()

In [None]:
# Print selected fields of the collated batch, one per line, with the field name
# left-aligned in a 20-character column. 'mel' is not in the list; it is shown as
# an image in the following cell.
for key in (
    'token_id', 'token_id_lengths', 'speaker_names', 'linear',
    'mel_lengths', 'stop_targets', 'item_idxs', 'd_vectors',
    'speaker_ids', 'attns', 'waveform', 'raw_text',
    'pitch', 'energy', 'language_ids', 'audio_unique_names',
):
    print(f"{key:<20}", items[key])

In [None]:
# Show the first 'mel' entry of the collated batch as an image, with a colour
# scale attached to that image.
mel_image = plt.matshow(items['mel'][0])
plt.colorbar(mel_image)

In [None]:
# Display the first entry of the dataset's `phoneme_dataset` attribute.
dataset.phoneme_dataset[0]

In [None]:
# Show the tokenizer's repr.
tokenizer

In [None]:
# Take the 'text' field of the first phoneme-dataset entry as the working example
# for the tokenizer experiments below, and display it.
text = dataset.phoneme_dataset[0]['text']
text

In [None]:
# Encode the example text into token ids.
# NOTE(review): with use_phonemes=True this presumably phonemizes first - confirm.
tokenizer.text_to_ids(text)

In [None]:
# Round trip: encode the text to ids and decode the ids back to text, to see what
# the tokenizer actually feeds the model.
tokenizer.ids_to_text(tokenizer.text_to_ids(text))

In [None]:
# Character set held by the tokenizer's `characters` object.
tokenizer.characters.characters

In [None]:
# Punctuation set held by the tokenizer's `characters` object.
tokenizer.characters.punctuations

In [None]:
# Show which phonemizer the tokenizer was configured with.
tokenizer.phonemizer

In [None]:
# Apply only the tokenizer's text cleaner to the example text.
tokenizer.text_cleaner(text)

In [None]:
# Clean the example text, then phonemize the cleaned text with an empty separator
# string, to inspect the phonemizer output on its own.
tokenizer.phonemizer.phonemize(tokenizer.text_cleaner(text), separator="")