In [1]:
!mkdir ~/data
!git -C ~/data clone https://github.com/nii-yamagishilab/VCC2020-database.git

mkdir: cannot create directory ‘/home/jasonjjl1999/data’: File exists
fatal: destination path 'VCC2020-database' already exists and is not an empty directory.


In [None]:
!rm -rf ~/data/VCC2020-database/extract
!mkdir ~/data/VCC2020-database/extract

!unzip ~/data/VCC2020-database/vcc2020_database_training_target_task1.zip -d ~/data/VCC2020-database/extract/
!unzip ~/data/VCC2020-database/vcc2020_database_transcriptions.zip -d ~/data/VCC2020-database/extract/

In [None]:
# !rm -rf capstone
# !git clone -b tts-finetune https://github.com/renrichard/capstone.git

In [None]:
# !pip install pytorch_lightning
# !apt-get update && apt-get install -y libsndfile1 ffmpeg
# !pip install Cython
# !pip install nemo_toolkit['all']

# !git clone https://github.com/NVIDIA/apex
# %cd apex
# !pip install -v --disable-pip-version-check --no-cache-dir ./
# %cd ..

# !wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_e2e_fastpitchhifigan/versions/1.0.0/zip -O tts_en_e2e_fastpitchhifigan_1.0.0.zip -P ~/checkpoints

# !unzip ~/content/tts_en_e2e_fastpitchhifigan_1.0.0.zip -d ~/checkpoint

# !gdown https://drive.google.com/uc?id=15FoehxQEZN8OSoIg8am7Wq-7vLS7QIgu

# !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/tts/conf/fastpitch_align.yaml

# !git clone https://github.com/kaldi-asr/kaldi.git kaldi --origin upstream

# !pip install pytorch-lightning

In [None]:
# import sys
# sys.path.append('capstone')

# from capstone.preprocess.json.create_vcc_2020_json import create_vcc_2020_json
# create_vcc_2020_json()

# Finetuning FastPitch for a new speaker

In this tutorial, we will finetune a single-speaker FastPitch (with alignment) model on a limited amount of a new speaker's data. We cover two finetuning techniques:
1. We finetune the model parameters only on new speaker's text and speech pairs; 
2. We add a learnable speaker embedding layer to the model, and finetune on a mixture of original speaker's and new speaker's data.

We will first prepare filelists containing the audio paths and text of the samples on which we wish to finetune the model, then generate and run a training command to finetune FastPitch on 5 minutes of data, and finally synthesize audio from the trained checkpoint.

## Creating filelists for training

In [None]:
import random
import os
import json
import torch

import IPython.display as ipd
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt

from capstone.preprocess.json.vcc_2020_paths import data_dir, filelist_dir, exp_base_dir

def make_sub_file_list(speaker_id, num_samples, total_duration_mins, seed=42):
    """
    Creates a shuffled subset of a speaker's metadata records and saves it as a filelist.

    Reads "<data_dir>/<speaker_id>_metadata.json" (one JSON object per line), shuffles the
    records deterministically with `seed`, selects a subset and writes it, one JSON object
    per line, to "<filelist_dir>/<speaker_id>_metadata_<suffix>_local.json". In the written
    records 'audio_filepath' is joined onto data_dir. Specify either num_samples or
    total_duration_mins; if both are None every record is kept (suffix "ns_all").

    Arguments:
    speaker_id -- speaker id used in the metadata and filelist file names
    num_samples -- Number of samples of the speaker (set None if specifying total_duration_mins)
    total_duration_mins -- Total duration of the speaker's data in minutes; records are added until
                           the next one would exceed this budget (set None if specifying num_samples).
                           Each record's 'duration' is compared against total_duration_mins * 60.
    seed -- seed for the deterministic shuffle

    Raises:
    NotImplementedError -- if both num_samples and total_duration_mins are given
    """
    if num_samples is not None and total_duration_mins is not None:
        # Fail fast, before touching the filesystem, with an actionable message.
        raise NotImplementedError(
            "Specify either num_samples or total_duration_mins, not both"
        )

    file_list_name = "{}_metadata.json".format(speaker_id)
    with open(os.path.join(data_dir, file_list_name), 'r') as f:
        # Skip blank and whitespace-only lines so stray spaces or a trailing newline
        # do not make json.loads fail.
        all_records = [json.loads(line) for line in f if line.strip()]

    # A private Random instance yields the same permutation as random.seed(seed) followed by
    # random.shuffle(...), but leaves the global RNG state untouched for the rest of the notebook.
    random.Random(seed).shuffle(all_records)

    if num_samples is not None:
        sub_records = all_records[:num_samples]
        fname_extension = "ns_{}".format(num_samples)
    elif total_duration_mins is not None:
        max_duration = total_duration_mins * 60.0
        sub_record_duration = 0.0
        sub_records = []
        for r in all_records:
            sub_record_duration += r['duration']
            if sub_record_duration > max_duration:
                # The record that would exceed the budget is not included.
                print("Duration reached {} mins using {} records".format(total_duration_mins, len(sub_records)))
                break
            sub_records.append(r)
        fname_extension = "dur_{}_mins".format(int(round(total_duration_mins)))
    else:
        sub_records = all_records
        fname_extension = "ns_all"
    print("num sub records", len(sub_records))

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(filelist_dir, exist_ok=True)

    target_fp = os.path.join(filelist_dir, "{}_metadata_{}_local.json".format(speaker_id, fname_extension))
    with open(target_fp, 'w') as f:
        for record in sub_records:
            # Write a modified copy so the parsed records themselves are never mutated.
            local_record = dict(record, audio_filepath=os.path.join(data_dir, record['audio_filepath']))
            f.write(json.dumps(local_record) + "\n")

In [None]:
# Build a filelist holding roughly 5 minutes of speaker TEF1's data.
make_sub_file_list('TEF1', num_samples=None, total_duration_mins=5)

## Finetuning the model on filelists

In [None]:
# pitch statistics of the new speakers
# These can be computed from the pitch contours extracted using librosa yin
# Finetuning can still work without these, but we get better results using speaker specific pitch stats
# pitch_stats = {
#     92 : {
#         'mean' : 214.5, # female speaker
#         'std' : 30.9,
#         'fmin' : 80,
#         'fmax' : 512
#     },
#     6097 : {
#         'mean' : 121.9, # male speaker
#         'std' : 23.1,
#         'fmin' : 30,
#         'fmax' : 512
#     }
# }


def generate_training_command(new_speaker_id, duration_mins, mixing_enabled, original_speaker_id, ckpt, use_new_pitch_stats=False,
                              max_epochs=None,
                              script_path="/content/capstone/finetune/fastpitch2_finetune.py",
                              config_name="fastpitch_align_44100.yaml"):
    """
    Generates the training command string for the finetuning script. Assumes we have created the
    finetuning filelists using the instructions given above.

    Arguments:
    new_speaker_id -- speaker id of the new speaker
    duration_mins -- total minutes of the new speaker data (same as that used for creating the filelists)
    mixing_enabled -- True or False depending on whether we want to mix the original speaker data or not
    original_speaker_id -- speaker id of the original speaker
    ckpt -- Path to pretrained FastPitch checkpoint; a path ending in ".nemo" is passed as
            init_from_nemo_model, anything else as init_from_ptl_ckpt
    use_new_pitch_stats -- currently ignored: the pitch statistic overrides are commented out below
    max_epochs -- explicit number of epochs; when None it is estimated from duration_mins, which
                  is only possible for 5, 30 and 60 minutes
    script_path -- path of the finetuning script placed after "python" in the command
    config_name -- value passed as --config-name

    Returns:
    Training command string

    Raises:
    ValueError -- if max_epochs is None and duration_mins has no epoch estimate
    """
    # duration (mins) -> (estimated epochs without mixing, divisor applied when mixing is enabled)
    epoch_estimates = {5: (1000, 50), 30: (300, 10), 60: (150, 5)}

    if max_epochs is None:
        if duration_mins not in epoch_estimates:
            # Previously this surfaced as an UnboundLocalError deep inside the helper.
            raise ValueError(
                "No epoch estimate for duration_mins={!r}; supported values are {} "
                "(or pass max_epochs explicitly)".format(duration_mins, sorted(epoch_estimates))
            )
        base_epochs, mixing_divisor = epoch_estimates[duration_mins]
        # With mixing each epoch also covers the original speaker's data, so fewer epochs are used.
        max_epochs = base_epochs // mixing_divisor + 1 if mixing_enabled else base_epochs

    if ckpt.endswith(".nemo"):
        ckpt_arg_name = "init_from_nemo_model"
    else:
        ckpt_arg_name = "init_from_ptl_ckpt"

    # NOTE(review): the "mainifest" file names below are not produced by make_sub_file_list in this
    # notebook (it writes "<id>_metadata_<suffix>_local.json"); confirm those manifests exist.
    if not mixing_enabled:
        train_dataset = "{}_metadata_dur_{}_mins_local.json".format(new_speaker_id, duration_mins)
        val_dataset = "{}_mainifest_dev_ns_all_local.json".format(new_speaker_id)
        prior_folder = os.path.join(data_dir, "Priors{}".format(new_speaker_id))
        exp_dir = "{}_to_{}_no_mixing_{}_mins".format(original_speaker_id, new_speaker_id, duration_mins)
        n_speakers = 1
    else:
        train_dataset = "{}_mainifest_train_dur_{}_mins_local_mix_{}.json".format(new_speaker_id, duration_mins, original_speaker_id)
        val_dataset = "{}_mainifest_dev_ns_all_local.json".format(new_speaker_id)
        prior_folder = os.path.join(data_dir, "Priors_{}_mix_{}".format(new_speaker_id, original_speaker_id))
        exp_dir = "{}_to_{}_mixing_{}_mins".format(original_speaker_id, new_speaker_id, duration_mins)
        n_speakers = 2
    train_dataset = os.path.join(filelist_dir, train_dataset)
    val_dataset = os.path.join(filelist_dir, val_dataset)
    exp_dir = os.path.join(exp_base_dir, exp_dir)

    command_parts = [
        "python",
        script_path,
        "--config-name={}".format(config_name),
        "train_dataset={}".format(train_dataset),
        "validation_datasets={}".format(val_dataset),
        "+{}={}".format(ckpt_arg_name, ckpt),
        "trainer.max_epochs={}".format(max_epochs),
        "trainer.check_val_every_n_epoch=1",
        "prior_folder={}".format(prior_folder),
        "model.train_ds.dataloader_params.batch_size=24",
        "model.validation_ds.dataloader_params.batch_size=24",
        "exp_manager.exp_dir={}".format(exp_dir),
        "model.n_speakers={}".format(n_speakers),
    ]
    # if use_new_pitch_stats:
    #     command_parts += [
    #         "model.pitch_avg={}".format(pitch_stats[new_speaker_id]['mean']),
    #         "model.pitch_std={}".format(pitch_stats[new_speaker_id]['std']),
    #         "model.pitch_fmin={}".format(pitch_stats[new_speaker_id]['fmin']),
    #         "model.pitch_fmax={}".format(pitch_stats[new_speaker_id]['fmax']),
    #     ]
    # Fixed learning rate; "~model.optim.sched" is a Hydra override that deletes the scheduler entry.
    command_parts += ["model.optim.lr=2e-4", "~model.optim.sched"]
    return " ".join(command_parts)
    

In [None]:
# Settings for this finetuning run.
new_speaker_id = "TEF1"
duration_mins = 5
mixing = False
original_speaker_id = "TEM1"
# This path does not end in ".nemo", so generate_training_command passes it as init_from_ptl_ckpt.
ckpt_path = "/root/checkpoint"

# Print the command so it can be copied into a shell cell.
print(
    generate_training_command(
        new_speaker_id=new_speaker_id,
        duration_mins=duration_mins,
        mixing_enabled=mixing,
        original_speaker_id=original_speaker_id,
        ckpt=ckpt_path,
        use_new_pitch_stats=True,
    )
)

In [None]:
# import torch
# torch.cuda.is_available()
# torch.cuda.get_device_name(torch.cuda.current_device())

In [None]:
# Runs the finetuning script directly from the shell.
# - Every data/checkpoint path is hard-coded under /home/ryan; adjust them for your machine.
# - The same manifest (TEF1_metadata_dur_5_mins_local.json) is passed as both the training
#   and the validation dataset.
# - Uses --config-name=fastpitch_align.yaml and trainer.max_epochs=5000, which differ from the
#   values generate_training_command puts into the command it builds.
!python ~/capstone/capstone/finetune/fastpitch2_finetune.py --config-name=fastpitch_align.yaml train_dataset=/home/ryan/data/VCC2020-database/extract/target_task1/filelist/TEF1_metadata_dur_5_mins_local.json validation_datasets=/home/ryan/data/VCC2020-database/extract/target_task1/filelist/TEF1_metadata_dur_5_mins_local.json +init_from_nemo_model=/home/ryan/capstone/FastPitch-Align-LJSpeech.nemo trainer.max_epochs=5000 trainer.check_val_every_n_epoch=1 prior_folder=/home/ryan/data/VCC2020-database/extract/target_task1/json/PriorsTEF1 model.train_ds.dataloader_params.batch_size=24 model.validation_ds.dataloader_params.batch_size=24 exp_manager.exp_dir=/home/ryan/data/VCC2020-database/extract/target_task1/exp_base/TEM1_to_TEF1_no_mixing_5_mins model.n_speakers=1 model.optim.lr=2e-4 ~model.optim.sched

## Synthesize samples from finetuned checkpoints

Once we have finetuned our FastPitch model, we can synthesize the audio samples for given text using the following inference steps. We use a HiFiGAN vocoder trained on multiple speakers, get the trained checkpoint path for our trained model and synthesize audio for a given text as follows.

In [None]:
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import FastPitchModel

# Local HiFiGAN checkpoint path; only referenced by the commented-out loader below.
hifigan_ckpt_path =  "/root/checkpoint"
# vocoder = HifiGanModel.load_from_checkpoint(hifigan_ckpt_path)
# Load the pretrained model named "tts_hifigan" instead of the local checkpoint.
vocoder = HifiGanModel.from_pretrained("tts_hifigan")
# Switch to eval mode and move the vocoder to the GPU (requires CUDA).
vocoder.eval().cuda()

In [None]:
from nemo.collections.tts.models import FastPitchModel, HifiGanModel 
import IPython.display as ipd
import torch

# Quick end-to-end check: text -> spectrogram (FastPitch) -> waveform (HiFiGAN).
# NOTE(review): the original comment called this .nemo file "the file shared above", but no such
# file is referenced earlier in this notebook -- confirm where it comes from.
fp = FastPitchModel.restore_from("./FastPitch-Align-LJSpeech_5k.nemo")
hf = HifiGanModel.from_pretrained("tts_hifigan")  # This will fetch the publicly available model from the cloud

# The parsed tokens are moved to the GPU, so this cell requires CUDA.
tokens = fp.parse("Hello world! I am a generated speaker.").cuda()
with torch.no_grad():
    spectrogram = fp.generate_spectrogram(tokens=tokens)
    # Calls the vocoder directly and drops dimension 1 of its output if that dimension has size 1.
    audio = hf(spec=spectrogram).squeeze(1)
# Playback assumes a 22050 Hz sample rate.
ipd.display(ipd.Audio(audio.cpu().numpy(), rate=22050))

In [None]:
def infer(spec_gen_model, vocoder_model, str_input, speaker = None):
    """
    Synthesizes spectrogram and audio from a text string given a spectrogram synthesis and vocoder model.

    Arguments:
    spec_gen_model -- Instance of FastPitch model (must provide parse and generate_spectrogram)
    vocoder_model -- Instance of a vocoder model (HiFiGAN in our case; must provide convert_spectrogram_to_audio)
    str_input -- Text input for the synthesis
    speaker -- Speaker number (in the case of a multi-speaker model -- in the mixing case)

    Returns:
    spectrogram, waveform of the synthesized audio. Tensors are moved to the CPU and converted to
    numpy arrays; a 3-dimensional spectrogram is reduced to its first element.
    """
    with torch.no_grad():
        parsed = spec_gen_model.parse(str_input)
        if speaker is not None:
            # Put the speaker id on the same device as the parsed tokens instead of forcing CUDA,
            # so inference also works when the model lives on the CPU.
            if isinstance(parsed, torch.Tensor):
                device = parsed.device
            else:
                device = "cuda" if torch.cuda.is_available() else "cpu"
            speaker = torch.tensor([speaker], dtype=torch.long, device=device)
        spectrogram = spec_gen_model.generate_spectrogram(tokens=parsed, speaker=speaker)
        audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    if spectrogram is not None:
        if isinstance(spectrogram, torch.Tensor):
            spectrogram = spectrogram.to('cpu').numpy()
        # Drop the leading dimension so the result can be handed straight to imshow.
        if len(spectrogram.shape) == 3:
            spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

def get_best_ckpt(experiment_base_dir, new_speaker_id, duration_mins, mixing_enabled, original_speaker_id):
    """
    Gives the model checkpoint paths of an experiment we ran.

    Walks "<experiment_base_dir>/<original>_to_<new>_[no_]mixing_<duration>_mins" recursively and
    collects every ".ckpt" file whose name contains "v_loss=<value>".

    Arguments:
    experiment_base_dir -- Base experiment directory
    new_speaker_id -- Speaker id of the new speaker we finetuned FastPitch on
    duration_mins -- total minutes of the new speaker data
    mixing_enabled -- True or False depending on whether we want to mix the original speaker data or not
    original_speaker_id -- speaker id of the original speaker

    Returns:
    List of (validation error, checkpoint path) tuples sorted by validation error, and the path of
    the most recently modified checkpoint with "last" in its file name (None if there is none).
    The list is empty when the experiment directory does not exist.
    """
    mixing_tag = "mixing" if mixing_enabled else "no_mixing"
    exp_dir = "{}/{}_to_{}_{}_{}_mins".format(
        experiment_base_dir, original_speaker_id, new_speaker_id, mixing_tag, duration_mins)

    ckpt_candidates = []
    last_ckpts = []
    for root, _dirs, files in os.walk(exp_dir):
        for file in files:
            if not file.endswith(".ckpt"):
                continue
            ckpt_path = os.path.join(root, file)
            if "last" in file:
                last_ckpts.append(ckpt_path)
            try:
                val_error = float(file.split("v_loss=")[1].split("-epoch")[0])
            except (IndexError, ValueError):
                # The name carries no parsable "v_loss=<value>": it cannot be ranked, so leave it
                # out of the candidates instead of aborting the whole search.
                continue
            ckpt_candidates.append((val_error, ckpt_path))
    ckpt_candidates.sort()

    # Several runs can each leave a "last" checkpoint. Pick the newest one, with the path as a
    # tie-breaker, so the result does not depend on the order in which os.walk visits directories.
    last_ckpt = None
    if last_ckpts:
        last_ckpt = max(last_ckpts, key=lambda path: (os.path.getmtime(path), path))

    return ckpt_candidates, last_ckpt

Specify the speaker id, duration mins and mixing variable to find the relevant checkpoint from the exp_base_dir and compare the synthesized audio with validation samples of the new speaker.

In [None]:
new_speaker_id = "TEF1"
duration_mins = 5
mixing = False
original_speaker_id = "TEM1"

# Only the "last" checkpoint is needed here. Index the result instead of unpacking into `_`,
# which would shadow IPython's last-output variable for the rest of the session.
last_ckpt = get_best_ckpt(exp_base_dir, new_speaker_id, duration_mins, mixing, original_speaker_id)[1]
if last_ckpt is None:
    # Stop here with a clear message rather than failing later while loading the checkpoint.
    raise FileNotFoundError(
        "No checkpoint with 'last' in its name found under {} for {} -> {} ({} mins, mixing={})".format(
            exp_base_dir, original_speaker_id, new_speaker_id, duration_mins, mixing))
print(last_ckpt)

cfg = {'name': 'FastPitch', 'sample_rate': 44100, 'train_dataset': '/root/data/VCC2020-database/extract/target_task1/filelist/TEF1_metadata_dur_5_mins_local.json', 'validation_datasets': '/root/data/VCC2020-database/extract/target_task1/filelist/TEF1_metadata_dur_5_mins_local.json', 'prior_folder': '/root/data/VCC2020-database/extract/target_task1/json/PriorsTEF1', 'model': {'learn_alignment': True, 'n_speakers': 1, 'symbols_embedding_dim': 384, 'max_token_duration': 75, 'n_mel_channels': 80, 'pitch_embedding_kernel_size': 3, 'n_window_size': 2048, 'n_window_stride': 512, 'fmax': None, 'pitch_fmin': 80, 'pitch_fmax': 640, 'pitch_avg': 211.27540199742586, 'pitch_std': 52.1851002822779, 'train_ds': {'dataset': {'_target_': 'nemo.collections.asr.data.audio_to_text.AudioToCharWithPriorAndPitchDataset', 'manifest_filepath': '${train_dataset}', 'max_duration': None, 'min_duration': 0.1, 'int_values': False, 'normalize': True, 'sample_rate': '${sample_rate}', 'trim': False, 'sup_data_path': '${prior_folder}', 'n_window_stride': '${model.n_window_stride}', 'n_window_size': '${model.n_window_size}', 'pitch_fmin': '${model.pitch_fmin}', 'pitch_fmax': '${model.pitch_fmax}', 'pitch_avg': '${model.pitch_avg}', 'pitch_std': '${model.pitch_std}', 'vocab': {'notation': 'phonemes', 'punct': True, 'spaces': True, 'stresses': True, 'add_blank_at': 'None', 'pad_with_space': True, 'chars': True, 'improved_version_g2p': True}}, 'dataloader_params': {'drop_last': False, 'shuffle': True, 'batch_size': 24, 'num_workers': 12}}, 'validation_ds': {'dataset': {'_target_': 'nemo.collections.asr.data.audio_to_text.AudioToCharWithPriorAndPitchDataset', 'manifest_filepath': '${validation_datasets}', 'max_duration': None, 'min_duration': None, 'int_values': False, 'normalize': True, 'sample_rate': '${sample_rate}', 'trim': False, 'sup_data_path': '${prior_folder}', 'n_window_stride': '${model.n_window_stride}', 'n_window_size': '${model.n_window_size}', 'pitch_fmin': '${model.pitch_fmin}', 
'pitch_fmax': '${model.pitch_fmax}', 'pitch_avg': '${model.pitch_avg}', 'pitch_std': '${model.pitch_std}', 'vocab': {'notation': 'phonemes', 'punct': True, 'spaces': True, 'stresses': True, 'add_blank_at': 'None', 'pad_with_space': True, 'chars': True, 'improved_version_g2p': True}}, 'dataloader_params': {'drop_last': False, 'shuffle': False, 'batch_size': 24, 'num_workers': 8}}, 'preprocessor': {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'dither': 0.0, 'features': '${model.n_mel_channels}', 'frame_splicing': 1, 'highfreq': None, 'log': True, 'log_zero_guard_type': 'add', 'log_zero_guard_value': 1e-05, 'lowfreq': 0, 'mag_power': 1.0, 'n_fft': '${model.n_window_size}', 'n_window_size': '${model.n_window_size}', 'n_window_stride': '${model.n_window_stride}', 'normalize': None, 'pad_to': 1, 'pad_value': 0, 'preemph': None, 'sample_rate': '${sample_rate}', 'window': 'hann', 'window_size': None, 'window_stride': None}, 'input_fft': {'_target_': 'nemo.collections.tts.modules.transformer.FFTransformerEncoder', 'n_layer': 6, 'n_head': 1, 'd_model': '${model.symbols_embedding_dim}', 'd_head': 64, 'd_inner': 1536, 'kernel_size': 3, 'dropout': 0.1, 'dropatt': 0.1, 'dropemb': 0.0, 'd_embed': '${model.symbols_embedding_dim}'}, 'output_fft': {'_target_': 'nemo.collections.tts.modules.transformer.FFTransformerDecoder', 'n_layer': 6, 'n_head': 1, 'd_model': '${model.symbols_embedding_dim}', 'd_head': 64, 'd_inner': 1536, 'kernel_size': 3, 'dropout': 0.1, 'dropatt': 0.1, 'dropemb': 0.0}, 'alignment_module': {'_target_': 'nemo.collections.tts.modules.aligner.AlignmentEncoder', 'n_text_channels': '${model.symbols_embedding_dim}'}, 'duration_predictor': {'_target_': 'nemo.collections.tts.modules.fastpitch.TemporalPredictor', 'input_size': '${model.symbols_embedding_dim}', 'kernel_size': 3, 'filter_size': 256, 'dropout': 0.1, 'n_layers': 2}, 'pitch_predictor': {'_target_': 'nemo.collections.tts.modules.fastpitch.TemporalPredictor', 'input_size': 
'${model.symbols_embedding_dim}', 'kernel_size': 3, 'filter_size': 256, 'dropout': 0.1, 'n_layers': 2}, 'optim': {'name': 'adam', 'lr': 0.0002, 'betas': [0.9, 0.98], 'weight_decay': 1e-06}}, 'trainer': {'gpus': -1, 'max_epochs': 10, 'num_nodes': 1, 'accelerator': 'ddp', 'accumulate_grad_batches': 1, 'checkpoint_callback': False, 'logger': False, 'gradient_clip_val': 1000.0, 'flush_logs_every_n_steps': 1000, 'log_every_n_steps': 100, 'check_val_every_n_epoch': 1}, 'exp_manager': {'exp_dir': '/root/data/VCC2020-database/extract/target_task1/exp_base/TEM1_to_TEF1_no_mixing_5_mins', 'name': '${name}', 'create_tensorboard_logger': True, 'create_checkpoint_callback': True, 'checkpoint_callback_params': {'monitor': 'v_loss'}}, 'init_from_ptl_ckpt': '/root/data/VCC2020-database/extract/target_task1/exp_base/TEM1_to_TEF1_no_mixing_5_mins/FastPitch/2021-09-14_19-16-49/checkpoints/FastPitch--v_loss=17.86-epoch=9-last.ckpt'}
spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)#, cfg={'name': 'FastPitch', 'sample_rate': 44100, 'train_dataset': '/root/data/VCC2020-database/extract/target_task1/filelist/TEF1_metadata_dur_5_mins_local.json', 'validation_datasets': '/root/data/VCC2020-database/extract/target_task1/filelist/TEF1_metadata_dur_5_mins_local.json', 'prior_folder': '/root/data/VCC2020-database/extract/target_task1/json/PriorsTEF1', 'model': {'learn_alignment': True, 'n_speakers': 1, 'symbols_embedding_dim': 384, 'max_token_duration': 75, 'n_mel_channels': 80, 'pitch_embedding_kernel_size': 3, 'n_window_size': 2048, 'n_window_stride': 512, 'fmax': None, 'pitch_fmin': 80, 'pitch_fmax': 640, 'pitch_avg': 211.27540199742586, 'pitch_std': 52.1851002822779, 'train_ds': {'dataset': {'_target_': 'nemo.collections.asr.data.audio_to_text.AudioToCharWithPriorAndPitchDataset', 'manifest_filepath': '${train_dataset}', 'max_duration': None, 'min_duration': 0.1, 'int_values': False, 'normalize': True, 'sample_rate': '${sample_rate}', 'trim': False, 'sup_data_path': '${prior_folder}', 'n_window_stride': '${model.n_window_stride}', 'n_window_size': '${model.n_window_size}', 'pitch_fmin': '${model.pitch_fmin}', 'pitch_fmax': '${model.pitch_fmax}', 'pitch_avg': '${model.pitch_avg}', 'pitch_std': '${model.pitch_std}', 'vocab': {'notation': 'phonemes', 'punct': True, 'spaces': True, 'stresses': True, 'add_blank_at': 'None', 'pad_with_space': True, 'chars': True, 'improved_version_g2p': True}}, 'dataloader_params': {'drop_last': False, 'shuffle': True, 'batch_size': 24, 'num_workers': 12}}, 'validation_ds': {'dataset': {'_target_': 'nemo.collections.asr.data.audio_to_text.AudioToCharWithPriorAndPitchDataset', 'manifest_filepath': '${validation_datasets}', 'max_duration': None, 'min_duration': None, 'int_values': False, 'normalize': True, 'sample_rate': '${sample_rate}', 'trim': False, 'sup_data_path': '${prior_folder}', 'n_window_stride': '${model.n_window_stride}', 'n_window_size': 
'${model.n_window_size}', 'pitch_fmin': '${model.pitch_fmin}', 'pitch_fmax': '${model.pitch_fmax}', 'pitch_avg': '${model.pitch_avg}', 'pitch_std': '${model.pitch_std}', 'vocab': {'notation': 'phonemes', 'punct': True, 'spaces': True, 'stresses': True, 'add_blank_at': 'None', 'pad_with_space': True, 'chars': True, 'improved_version_g2p': True}}, 'dataloader_params': {'drop_last': False, 'shuffle': False, 'batch_size': 24, 'num_workers': 8}}, 'preprocessor': {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'dither': 0.0, 'features': '${model.n_mel_channels}', 'frame_splicing': 1, 'highfreq': None, 'log': True, 'log_zero_guard_type': 'add', 'log_zero_guard_value': 1e-05, 'lowfreq': 0, 'mag_power': 1.0, 'n_fft': '${model.n_window_size}', 'n_window_size': '${model.n_window_size}', 'n_window_stride': '${model.n_window_stride}', 'normalize': None, 'pad_to': 1, 'pad_value': 0, 'preemph': None, 'sample_rate': '${sample_rate}', 'window': 'hann', 'window_size': None, 'window_stride': None}, 'input_fft': {'_target_': 'nemo.collections.tts.modules.transformer.FFTransformerEncoder', 'n_layer': 6, 'n_head': 1, 'd_model': '${model.symbols_embedding_dim}', 'd_head': 64, 'd_inner': 1536, 'kernel_size': 3, 'dropout': 0.1, 'dropatt': 0.1, 'dropemb': 0.0, 'd_embed': '${model.symbols_embedding_dim}'}, 'output_fft': {'_target_': 'nemo.collections.tts.modules.transformer.FFTransformerDecoder', 'n_layer': 6, 'n_head': 1, 'd_model': '${model.symbols_embedding_dim}', 'd_head': 64, 'd_inner': 1536, 'kernel_size': 3, 'dropout': 0.1, 'dropatt': 0.1, 'dropemb': 0.0}, 'alignment_module': {'_target_': 'nemo.collections.tts.modules.aligner.AlignmentEncoder', 'n_text_channels': '${model.symbols_embedding_dim}'}, 'duration_predictor': {'_target_': 'nemo.collections.tts.modules.fastpitch.TemporalPredictor', 'input_size': '${model.symbols_embedding_dim}', 'kernel_size': 3, 'filter_size': 256, 'dropout': 0.1, 'n_layers': 2}, 'pitch_predictor': {'_target_': 
'nemo.collections.tts.modules.fastpitch.TemporalPredictor', 'input_size': '${model.symbols_embedding_dim}', 'kernel_size': 3, 'filter_size': 256, 'dropout': 0.1, 'n_layers': 2}, 'optim': {'name': 'adam', 'lr': 0.0002, 'betas': [0.9, 0.98], 'weight_decay': 1e-06}}, 'trainer': {'gpus': -1, 'max_epochs': 10, 'num_nodes': 1, 'accelerator': 'ddp', 'accumulate_grad_batches': 1, 'checkpoint_callback': False, 'logger': False, 'gradient_clip_val': 1000.0, 'flush_logs_every_n_steps': 1000, 'log_every_n_steps': 100, 'check_val_every_n_epoch': 1}, 'exp_manager': {'exp_dir': '/root/data/VCC2020-database/extract/target_task1/exp_base/TEM1_to_TEF1_no_mixing_5_mins', 'name': '${name}', 'create_tensorboard_logger': True, 'create_checkpoint_callback': True, 'checkpoint_callback_params': {'monitor': 'v_loss'}}, 'init_from_ptl_ckpt': '/root/data/VCC2020-database/extract/target_task1/exp_base/TEM1_to_TEF1_no_mixing_5_mins/FastPitch/2021-09-14_19-16-49/checkpoints/FastPitch--v_loss=17.86-epoch=9-last.ckpt'})
spec_model.eval().cuda()
_speaker=None
if mixing:
    _speaker = 1

num_val = 2

manifest_path = os.path.join(filelist_dir, "{}_mainifest_dev_ns_all_local.json".format(new_speaker_id))
val_records = []
with open(manifest_path, "r") as f:
    for i, line in enumerate(f):
        val_records.append( json.loads(line) )
        if len(val_records) >= num_val:
            break
            
for val_record in val_records:
    print ("Real validation audio")
    ipd.display(ipd.Audio(val_record['audio_filepath'], rate=44100))
    print ("SYNTHESIZED FOR -- Speaker: {} | Dataset size: {} mins | Mixing:{} | Text: {}".format(new_speaker_id, duration_mins, mixing, val_record['text']))
    spec, audio = infer(spec_model, vocoder, val_record['text'], speaker = _speaker)
    ipd.display(ipd.Audio(audio, rate=44100))
    %matplotlib inline
    #if spec is not None:
    imshow(spec, origin="lower", aspect = "auto")
    plt.show()