In [1]:
import os

In [2]:
# Trainer: where stuff actually happens
# TrainingArgs: defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs

In [3]:
# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

In [4]:
# BaseDatasetConfig: defines name, formatter and path of the dataset
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

In [5]:
# using the same path as the notebook in the gen2 folder
output_path = os.path.dirname(os.path.abspath("/output"))

In [6]:
#DEFINE DATASET CONFIG
# set my dataset as the target and define the path
dataset_config = BaseDatasetConfig(
    formatter="ljspeech", meta_file_train=r"C:\Users\rishi\Documents\GitHub\K.A.I\gen2\GidSpeechset\metadata.csv", path=os.path.join(output_path, r"C:\Users\rishi\Documents\GitHub\K.A.I\gen2\GidSpeechset"))

In [7]:
# INITIALIZE THE TRAINING CONFIGURATION
# configure the model. Every config class inherits the BaseTTSConfig
config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
)

In [8]:
#INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [9]:
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)

In [10]:
# LOAD DATA SAMPLES
# Each sample is a list of '''[text, audio_file_path, speaker_name]'''
# You can define your custom sample loader returning the list of samples.
# Or define your own custom formatter and pass it to the 'load_tts_samples'.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=100,
)

 | > Found 58 files in C:\Users\rishi\Documents\GitHub\K.A.I\gen2\output


In [11]:
# INITIALIZE THE MODEL
# Models take a config object and speaker manager as input
# Config define the details of the model like the number of layers, the size of the embedding, etc.
# Speaker manager is used by multi-speaker models.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

In [12]:
# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all TTS models with all perks like mixed-precision training,
# Distributed training, etc.
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

# AND... 3,2,1...
trainer.fit()

 > Training Environment:
 | > Num. of CPUs: 12
 | > Num. of Torch Threads: 6
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 > Start Tensorboard: tensorboard --logdir=C:\run-May-04-2023_03+08PM-53fb302b

 > Model has 28610257 parameters

[4m[1m > EPOCH: 0/1000[0m
 --> C:\run-May-04-2023_03+08PM-53fb302b


[*] Pre-computing phonemes...


0it [00:00, ?it/s]




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
| > Number of instances : 0


PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:/run-May-04-2023_03+08PM-53fb302b\\events.out.tfevents.1683209283.58ZPW2'