In [None]:
import os

# Pin the visible GPU *before* torch / NeMo are imported. CUDA reads these
# variables when it is first initialised, so setting them after an import that
# has already touched CUDA would silently have no effect.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import soundfile as sf
import torch
from IPython.display import display, Audio
from omegaconf.omegaconf import OmegaConf, open_dict

from nemo.collections.tts.models import T5TTS_Model
from nemo.collections.tts.data.text_to_speech_dataset import T5TTSDataset, DatasetSample

### Checkpoint Paths

In [None]:
# Model artefacts: hyper-parameter YAML, checkpoint, output directory and codec model.
# NOTE: the next cell reassigns hparams_file, checkpoint_file, out_dir and
# codecmodel_path before reading them, so when the notebook is run top to bottom
# the values below are overridden.
hparams_file = "/data/t5_new_cp/configs/unnormalizedLalign005_singleencoder_kernel3_hparams.yaml"
checkpoint_file = "/data/t5_new_cp/checkpoints/unnormalizedLalign005_singleencoder_kernel3_epoch_20.ckpt" #"/datap/misc/continuouscheckpoints/edresson_epoch21.ckpt"
# Directory that generated audio is written to.
out_dir = "/datap/misc/t5tts_inference_notebook_samples"
#codecmodel_path = "/datap/misc/checkpoints/AudioCodec_21Hz_no_eliz.nemo"
codecmodel_path = "/data/codec_checkpoints/codecs-no-eliz/AudioCodec_21Hz_no_eliz_without_wavlm_disc.nemo"

In [None]:
# Alternative checkpoints that have been used with this notebook (kept for reference):
#hparams_file = "yt_weight_0.25_plus18k__dim1536__enc3_fixes_hparams.yaml"
#checkpoint_file = "yt_weight_0.25_plus18k__dim1536__enc3_fixes_val_loss_5.1870_epoch_25.ckpt"

#hparams_file = "/data/t5_new_cp/configs/unnormalizedLalign005_singleencoder_kernel3_hparams.yaml"
#checkpoint_file = "/data/t5_new_cp/checkpoints/unnormalizedLalign005_singleencoder_kernel3_epoch_20.ckpt" #"/datap/misc/continuouscheckpoints/edresson_epoch21.ckpt"
hparams_file = "/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_hparams.yaml"
checkpoint_file ="/datap/misc/continuouscheckpoints/yt_weight0.25_plus_18k_single_stage_decoder_context_kernel1_fixes_epoch_61.ckpt"
#hparams_file = "/datap/misc/continuouscheckpoints/decoder_context_large_hparams.yaml"
#checkpoint_file ="/datap/misc/continuouscheckpoints/decoder_context_large_epoch_14.ckpt"

# Directory that generated audio is written to.
out_dir = "inference_output__adi_prompt_v2"
#codecmodel_path = "/datap/misc/checkpoints/AudioCodec_21Hz_no_eliz.nemo"
codecmodel_path = "/data/codec_checkpoints/codecs-no-eliz/AudioCodec_21Hz_no_eliz_without_wavlm_disc.nemo"

# The hparams YAML stores the model configuration under its top-level `cfg` key.
model_cfg = OmegaConf.load(hparams_file).cfg

# open_dict allows keys to be added/overwritten on the loaded config.
with open_dict(model_cfg):
    model_cfg.codecmodel_path = codecmodel_path
    if hasattr(model_cfg, 'text_tokenizer'):
        # Backward compatibility for models trained with absolute paths in text_tokenizer
        model_cfg.text_tokenizer.g2p.phoneme_dict = "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
        model_cfg.text_tokenizer.g2p.heteronyms = "scripts/tts_dataset_files/heteronyms-052722"
        model_cfg.text_tokenizer.g2p.phoneme_probability = 1.0
    # Inference only: no training / validation datasets are configured.
    model_cfg.train_ds = None
    model_cfg.validation_ds = None


model = T5TTS_Model(cfg=model_cfg)
print("Loading weights from checkpoint")
# Deserialize onto the CPU; the weights are copied into `model`, which is moved
# to the GPU below. This avoids holding a second copy of the weights on the GPU.
ckpt = torch.load(checkpoint_file, map_location="cpu")
model.load_state_dict(ckpt['state_dict'])
print("Loaded weights.")

decoder_cfg = model_cfg.t5_decoder
if decoder_cfg.pos_emb == "learnable":
    # The KV cache is only enabled when neither self-attention nor cross-attention
    # uses the flash implementation. The original condition tested the
    # self-attention flag twice and never looked at the cross-attention flag.
    # `.get` is used for the cross-attention flag so that a config without the key
    # conservatively leaves the KV cache disabled instead of raising.
    flash_self_attention_off = decoder_cfg.use_flash_self_attention is False
    flash_cross_attention_off = decoder_cfg.get("use_flash_x_attention") is False
    if flash_self_attention_off and flash_cross_attention_off:
        print("Using kv cache for inference.")
        model.use_kv_cache_for_inference = True

model.cuda()
model.eval()

### Initialize Dataset class and helper functions

In [None]:
# Arguments for an (initially empty) test-mode dataset. Values are taken from the
# loaded model and its config so that the dataset matches the model's setup.
dataset_kwargs = dict(
    dataset_meta={},
    sample_rate=model_cfg.sample_rate,
    min_duration=0.5,
    max_duration=20,
    codec_model_downsample_factor=model_cfg.codec_model_downsample_factor,
    bos_id=model.bos_id,
    eos_id=model.eos_id,
    context_audio_bos_id=model.context_audio_bos_id,
    context_audio_eos_id=model.context_audio_eos_id,
    audio_bos_id=model.audio_bos_id,
    audio_eos_id=model.audio_eos_id,
    num_audio_codebooks=model_cfg.num_audio_codebooks,
    prior_scaling_factor=None,
    load_cached_codes_if_available=True,
    dataset_type='test',
    tokenizer_config=None,
    load_16khz_audio=(model.model_type == 'single_encoder_sv_tts'),
    use_text_conditioning_tokenizer=model.use_text_conditioning_encoder,
    pad_context_text_to_max_duration=model.pad_context_text_to_max_duration,
    context_duration_min=model.cfg.get('context_duration_min', 5.0),
    context_duration_max=model.cfg.get('context_duration_max', 5.0),
)
test_dataset = T5TTSDataset(**dataset_kwargs)

# The dataset was created with tokenizer_config=None; attach the tokenizers that
# the model builds from its own config.
_text_tokenizer, _text_conditioning_tokenizer = model._setup_tokenizers(model.cfg, mode='test')
test_dataset.text_tokenizer = _text_tokenizer
test_dataset.text_conditioning_tokenizer = _text_conditioning_tokenizer



def get_audio_duration(file_path):
    """Return the duration, in seconds, of the audio file at ``file_path``.

    The file is opened with ``soundfile.SoundFile``; the duration is its length
    in frames divided by its sample rate.
    """
    with sf.SoundFile(file_path) as handle:
        num_frames = len(handle)
        sample_rate = handle.samplerate
    return num_frames / sample_rate

def create_record(text, context_audio_filepath=None, context_text=None):
    """Build a manifest-style record for synthesizing ``text``.

    Exactly one of ``context_audio_filepath`` and ``context_text`` must be given.
    A 3-second silent WAV is written to ``<out_dir>/dummy_audio.wav`` and used as
    the record's ``audio_filepath``; ``out_dir`` is created if it does not exist.

    Args:
        text: Transcript to store under the ``'text'`` key.
        context_audio_filepath: Path of the context audio. Its duration is read
            with ``get_audio_duration`` and stored alongside the path.
        context_text: Context text to use instead of context audio.

    Returns:
        dict with keys ``audio_filepath``, ``duration``, ``text``, ``speaker`` and
        either ``context_text`` or ``context_audio_filepath`` plus
        ``context_audio_duration``.

    Raises:
        AssertionError: if both or neither of the two context arguments are given.
    """
    # Validate before touching the filesystem so a bad call leaves nothing behind.
    if context_text is not None:
        assert context_audio_filepath is None, \
            "Pass either context_text or context_audio_filepath, not both."
    else:
        assert context_audio_filepath is not None, \
            "One of context_text or context_audio_filepath is required."

    # sf.write fails if the target directory is missing.
    os.makedirs(out_dir, exist_ok=True)
    dummy_audio_fp = os.path.join(out_dir, "dummy_audio.wav")
    sf.write(dummy_audio_fp, np.zeros(22050 * 3), 22050)  # 3 seconds of silence

    record = {
        'audio_filepath' : dummy_audio_fp,
        'duration': 3.0,
        'text': text,
        'speaker': "dummy",
    }
    if context_text is not None:
        record['context_text'] = context_text
    else:
        record['context_audio_filepath'] = context_audio_filepath
        record['context_audio_duration'] = get_audio_duration(context_audio_filepath)

    return record

### Set transcript and context pairs to test

In [64]:
# NOTE(review): `usg_cfg` looks like a misspelling of `use_cfg` (the name passed to
# model.infer_batch in the inference cell). It is kept because a later cell displays it.
usg_cfg = True
# Passed as `cfg_scale` to model.infer_batch by the inference cell.
cfg_scale = 1.8
# Directory containing the prompt recording referenced below.
audio_dir = "/home/rfejgin/kb-snippets"
# Transcripts to synthesize; one dataset sample is created per transcript.
texts = ["NVIDIA's Riva is a powerful speech AI toolkit that offers state-of-the-art ASR and TTS capabilities.",
 'The platform supports multiple languages and provides enterprise-grade speech technology through GPU-accelerated SDKs and APIs.',
 'What makes Riva unique is its ability to be customized for specific use cases while maintaining high performance and accuracy.',
 'The platform supports both cloud and edge deployment, making it versatile for various enterprise applications.',
 'When comparing Heavenly and Northstar ski resorts in Lake Tahoe, each offers unique advantages.',
 'Heavenly boasts more vertical feet of skiing and spectacular lake views, with 4,800 acres of skiable terrain across two states.',
 'Northstar offers a more intimate, luxury experience with excellent grooming, a charming village atmosphere, and some of the best tree skiing in Tahoe.',
 'Heavenly might be better for advanced skiers seeking variety, while Northstar excels for families and intermediate skiers looking for a refined experience.',
 "Being a Product Manager at NVIDIA is unique because you're at the forefront of AI innovation, working with cutting-edge technology that shapes the future of computing.",
 'The role combines technical depth with market strategy, requiring understanding of both deep learning models and enterprise customer needs.',
 'You get to collaborate with world-class researchers and engineers while driving products that enable breakthrough AI applications across industries.',
 'The boy was there when the sun rose.',
 'A rod is used to catch pink salmon.',
 'The source of the huge river is the clear spring.',
 'Kick the ball straight and follow through.',
 'Help the woman get back to her feet.',
 'A pot of tea helps to pass the evening.',
 'Smoky fires lack flame and heat.',
 "The soft cushion broke the man's fall.",
 'The salt breeze came across from the sea.',
 'The girl at the booth sold fifty bonds.']

# Every transcript uses the same recording as both `audio_filepath` and context
# audio; the file name and duration are defined once so they cannot drift apart.
prompt_audio_file = "adi-snippet1.wav"
prompt_audio_duration = 4.89
entries = [
    {
        "audio_filepath": prompt_audio_file,
        "duration": prompt_audio_duration,
        "text": text,
        "speaker": "dummy",
        "context_audio_filepath": prompt_audio_file,
        "context_audio_duration": prompt_audio_duration,
    }
    for text in texts
]
# A later cell reads `entry['context_audio_filepath']`; keep `entry` bound to the
# last manifest entry, as the original loop left it.
entry = entries[-1]

data_samples = [
    DatasetSample(
        dataset_name="sample",
        manifest_entry=manifest_entry,
        audio_dir=audio_dir,
        feature_dir=audio_dir,
        text=manifest_entry['text'],
        speaker=None,
        speaker_index=0,
    )
    for manifest_entry in entries
]
# NOTE(review): `num_repeat` is not used in this cell (each transcript appears once).
num_repeat = 5
test_dataset.data_samples = data_samples

test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=6,
    collate_fn=test_dataset.collate_fn,
    num_workers=0,
    shuffle=False
)

In [65]:
# Disabled experiment: because of the `if False:` guard nothing below executes.
# When enabled, it picks one transcript (`texts[text_index]`), pairs it with one
# of several hard-coded prompt recordings, repeats that single sample
# `num_repeat` times and rebuilds `test_data_loader` from it.
if False:

    # Index into `texts` selecting the transcript to synthesize.
    text_index = 0
    texts = ["Welcome to the conversational AI face-to-face meeting. I hope you're enjoying it and learning a lot. By the way, I never said any of this." ,
            # text generated by ChatGPT
            "Our GPUs aren’t just processors; they are engines for discovery, powering breakthroughs in everything from self-driving cars to disease research.",
            
            # text from t5-tts paper
            "Experiments demonstrate that our alignment learning procedure improves the reliability of TTS synthesis, especially for challenging text inputs and outperforms prior LLM-based TTS models on both intelligibility and naturalness.", # from t5-tts
            
            # from KB interview
            "The book is about seeing different things and finding similarities. Each kid in the book looks a little bit different, but also a little bit the same.",

            # from NVIDIA website
            "Speech AI lets people converse with devices, machines, and computers to simplify and augment their lives.", # from Nvidia website
            
            # text from CES keynote
            "And today, computing is revolutionized in every single layer, from hand coding, instructions that run on CPUs to create software tools that humans use.", # text from CES keynote

            # text from CES keynote
            "And so we have models for vision, for understanding languages, for speech, for animation, for digital biology, and we have some new exciting models coming for physical AI.",
            #  "One amazing AI world foundation model, the world's first physical AI foundation model is open, available to activate the world's industries of robotics and such, and three robots working on agentic AI, humanoid robots, and self-driving cars.", # text from CES keynote
            #  "That's all going to be either highly autonomous or fully autonomous coming up. And so this is going to be a very large industry." # text from CES keynote
            ]

    # Prompt selection flags. `kb` is tested first, then `sqam`; `do_jhsd` is only
    # consulted when both are False.
    do_jhsd = True
    use_cfg = True
    cfg_scale = 1.8
    kb=True
    sqam=True
    if kb:
        audio_dir = "/home/rfejgin/kb-snippets"
        #audio_dir = "/datap/misc/RodneyLindy44/Lindy/44khz/WIZWIKI"
        entry = {
                #"audio_filepath": "Lindy/22khz/CMU_HAPPY/LINDY_CMU_HAPPY_000567.wav",
                #"audio_filepath": "LINDY_WIZWIKI_007592.wav",#"kb-snippet-bahamas.wav",
                #"audio_filepath":"boris-snippet-denoised.wav",
                "audio_filepath":"adi-snippet1.wav",
                "duration": 9.54,
                #"text": "Speech AI lets people converse with devices, machines, and computers to simplify and augment their lives.",
                "text": texts[text_index],
                "speaker": "dummy",
            #     "context_text": "Speaker and Emotion: | Language:en Dataset:Riva Speaker:Lindy_CMU_FEARFUL |",
                "context_audio_filepath": "adi-snippet1.wav", #"LINDY_WIZWIKI_007592.wav",#"siddhesh.wav", #"Rodney/22khz/DROP/RODNEY_DROP_000060.wav",
                "context_audio_duration": 4.89,
            }
    elif sqam:
        audio_dir = "/home/rfejgin/t5-util/mos"
        entry = {
                #"audio_filepath": "Lindy/22khz/CMU_HAPPY/LINDY_CMU_HAPPY_000567.wav",
                "audio_filepath": "sqam_cd_49_short_mono.wav",
                "duration": 6.44,
                "text": texts[text_index],
                "speaker": "dummy",
            #     "context_text": "Speaker and Emotion: | Language:en Dataset:Riva Speaker:Lindy_CMU_FEARFUL |",
                "context_audio_filepath": "sqam_cd_49_short_mono.wav", #"Rodney/22khz/DROP/RODNEY_DROP_000060.wav",
                "context_audio_duration": 6.44,
            }  
    else:
        if not do_jhsd:
            audio_dir =  "/datap/misc/RodneyLindy44" #"/datap/misc/Datasets/riva"
            entry = {
                #"audio_filepath": "Lindy/22khz/CMU_HAPPY/LINDY_CMU_HAPPY_000567.wav",
                "audio_filepath": "Lindy/44khz/CMU_HAPPY/LINDY_CMU_HAPPY_000567.wav",
                "duration": 6.275604,
                "text": texts[text_index],
                "speaker": "dummy",
            #     "context_text": "Speaker and Emotion: | Language:en Dataset:Riva Speaker:Lindy_CMU_FEARFUL |",
                "context_audio_filepath": "Rodney/44khz/DROP/RODNEY_DROP_000060.wav", #"Rodney/22khz/DROP/RODNEY_DROP_000060.wav",
                "context_audio_duration": 8.0,
            }
        else:

            # Jensen data
            #audio_dir =  "/data/NV-RESTRICTED/JHSD/22khz"
            audio_dir =  "/data/NV-RESTRICTED/JHSD/22khz_denoised"
            entry = {
                #"audio_filepath": "Lindy/22khz/CMU_HAPPY/LINDY_CMU_HAPPY_000567.wav",
                #"audio_filepath": "GTC_FALL_2021_KEYNOTE_V0Only-44khz-16bit-mono_CH07_0042.wav",
                "audio_filepath": "GTC20_SPRING_KEYNOTE-VOOnly-44khz-16bit-mono_327.wav",
                "duration": 4.84,
                "text": texts[text_index],
                #"text": "It was simply amazing to watch. I've never seen collaboration on such scale.",
                "speaker": "not used",
            #     "context_text": "Speaker and Emotion: | Language:en Dataset:Riva Speaker:Lindy_CMU_FEARFUL |",
                "context_audio_filepath": "GTC_FALL_2021_KEYNOTE_V0Only-44khz-16bit-mono_CH07_0042.wav",
                #"context_audio_filepath": "GTC20_FALL_KEYNOTE-VOOnly-44khz-16bit-mono_221.wav",
                #"context_audio_filepath": "roy2_22050.wav",#"AMP20_KEYNOTE-VOOnly-44khz-16bit-mono_6.wav", #"Rodney/22khz/DROP/RODNEY_DROP_000060.wav",
                #"context_audio_duration": 5.35,# 8.02,
                "context_audio_duration": 6.24,
            }
    # Wrap the chosen manifest entry in a DatasetSample; `audio_dir` is used for
    # both the audio and the feature directory.
    data_sample = DatasetSample(
        dataset_name="sample",
        manifest_entry=entry,
        audio_dir=audio_dir,
        feature_dir=audio_dir,
        text=entry['text'],
        speaker=None,
        speaker_index=0
    )
    # The same sample is repeated `num_repeat` times in the dataset.
    num_repeat = 5
    test_dataset.data_samples = [data_sample for _ in range(num_repeat)]

    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=6,
        collate_fn=test_dataset.collate_fn,
        num_workers=0,
        shuffle=False
    )


In [None]:
import time

# Running index used to name the output files across all batches.
item_idx = 0
# Whether classifier-free guidance is used; forwarded to model.infer_batch
# together with `cfg_scale` (defined in the transcript cell).
use_cfg = True

# sf.write fails if the output directory does not exist yet.
os.makedirs(out_dir, exist_ok=True)

num_batches = len(test_data_loader)
for bidx, batch in enumerate(test_data_loader):
    print("Processing batch {} out of {}".format(bidx + 1, num_batches))
    # NOTE(review): the decoder cache is reset and enabled unconditionally here,
    # independent of `model.use_kv_cache_for_inference` set at load time - confirm
    # this is intended.
    model.t5_decoder.reset_cache(use_cache=True)

    # Move tensor entries to the GPU; everything else is passed through as-is.
    batch_cuda = {
        key: (batch[key].cuda() if isinstance(batch[key], torch.Tensor) else batch[key])
        for key in batch
    }

    st = time.perf_counter()
    predicted_audio, predicted_audio_lens, _, _ = model.infer_batch(
        batch_cuda,
        max_decoder_steps=500,
        temperature=0.5,
        topk=80,
        use_cfg=use_cfg,
        cfg_scale=cfg_scale,
    )
    print("generation time", time.perf_counter() - st)

    for idx in range(predicted_audio.size(0)):
        # Trim each item to its own predicted length before writing it out.
        num_samples = int(predicted_audio_lens[idx])
        predicted_audio_np = predicted_audio[idx].float().detach().cpu().numpy()[:num_samples]
        audio_path = os.path.join(out_dir, f"predicted_audio_{item_idx}.wav")
        print(f"Writing {audio_path}")
        sf.write(audio_path, predicted_audio_np, model.cfg.sample_rate)
        display(Audio(audio_path))
        item_idx += 1

In [None]:
# Report which checkpoint produced the samples, then play the context (prompt)
# recording of the current `entry` for comparison.
print(f"Checkpoint: {checkpoint_file}")
context_rel_path = entry['context_audio_filepath']
context_filepath = os.path.join(audio_dir, context_rel_path)
context_player = Audio(context_filepath)
display(context_player)


In [None]:
# Debug output: show the context-audio path (joined onto `audio_dir` in the
# previous cell) stored in the current `entry`.
entry['context_audio_filepath']

In [None]:
# Debug output: show the current value of `usg_cfg`.
# NOTE(review): `usg_cfg` looks like a misspelling of `use_cfg`, which is the name
# actually passed to model.infer_batch - confirm which one was meant.
usg_cfg