In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import json
from omegaconf.omegaconf import OmegaConf, open_dict
import shutil

from nemo.collections.tts.models.speechllm.megatron_t5_speechllm_model import MegatronT5SpeechLMModel
from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder
from nemo.collections.asr.parts.preprocessing.segment import AudioSegment
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
from IPython.display import Audio, display
import torchaudio

# CHANGE THIS TO A LOCAL DIRECTORY
EXP_DIR = "/Data/NotebookInference"

if not os.path.exists(EXP_DIR):
    os.makedirs(EXP_DIR)

## Save a dummy manifest to setup Model Test Step

In [2]:
def write_records(fp, records):
    with open(fp, "w") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

dummy_codes = torch.ones(8, 300).cpu().type(torch.int16)
dummy_codes_fp = os.path.join(EXP_DIR, "dummy_codes.pt")
torch.save(dummy_codes, dummy_codes_fp)


dummy_record = {
    "question" : "Phoneme TTS Sample Text",
    "answer" : dummy_codes_fp,
    "context" : dummy_codes_fp,
    "context_type" : "REFSPEAKERCODEC",
    "question_type" : "TEXT",
    "answer_type" : "AUDIOCODEC",
    "context_duration" : 5.0,
    "answer_duration" : 5.0,
    "taskname" : "squad"
}

dummy_val_file = os.path.join(EXP_DIR, "dummy_val.json")

write_records(dummy_val_file, [dummy_record])

## Load and setup the model

In [5]:
# CHANGE THESE PATHS TO RELEVANT MOUNTED PATHS IN DOCKER
config_path = "/home/shehzeenh/Code/NeMo/examples/tts/speechllm/conf/megatron_t5_speechllm_inference_multiencoder.yaml"
# checkpoint_path = "/datap/misc/temp_checkpoints_new/desta_less_sophia_highLR_step159600.ckpt"
# checkpoint_path = "/Data/Checkpoints/multiencoder21Hz2k_nodesta_onlyIPA_step298k.ckpt"
checkpoint_path = "/Data/dpo_checkpoints/dpo_onlyipa_LR2e-7_corruptionProb_00_beta0.01_bf16_step4216.ckpt"
# checkpoint_path = "/Data/21_hz_release_candidates/no_yt/megatron_t5_speechllm_tts--val_loss=41.662-step=162478.ckpt"
# codecmodel_path = "/Data/Checkpoints/AudioCodec_21Hz-2k-codes_updated.nemo"
codecmodel_path = "/Data/Checkpoints/AudioCodec_21Hz_no_eliz.nemo"
vocab_file = "/Data/Checkpoints/9a77f10c2793465e8e8a3fa5fcbef8b0_vocab.txt"

cfg = OmegaConf.load(config_path)

if "gradient_as_bucket_view" not in cfg.model:
    with open_dict(cfg):
        cfg.model.gradient_as_bucket_view=False

trainer = MegatronTrainerBuilder(cfg).create_trainer()
exp_manager(trainer, cfg.exp_manager)

with open_dict(cfg):
    cfg.exp_manager.exp_dir=EXP_DIR
    cfg.checkpoint_path = checkpoint_path
    cfg.model.data.sup_data_path="/datap/misc/speechllm_codecdatasets/"
    cfg.model.global_batch_size=1
    cfg.model.micro_batch_size=1
    cfg.model.lm_vocab_size=30000
    cfg.model.data.add_special_tokens_to_only_first_codebook=True
    cfg.model.data.train_task="all"
    cfg.model.freeze_model=False
    cfg.model.data.max_seq_length=512
    cfg.model.max_inference_timesteps=510
    cfg.model.data.context_duration_min=20.0
    cfg.model.data.context_duration_max=20.0
    cfg.model.top_k=80
    cfg.model.temperature=0.85
    cfg.model.data.speech_offset=30128
    cfg.model.data.speech_codebook_size=2048
    cfg.model.codecmodel_path=codecmodel_path
    cfg.trainer.devices=1
    cfg.trainer.precision="bf16"
    cfg.model.precision = cfg.trainer.precision
    cfg.model.override_tokenizer_vocab_file=vocab_file
    cfg.model.english_only_model=True
    cfg.model.asr_model_name="stt_en_conformer_transducer_large"
    cfg.model.frozen_model.decoder.layer_type=[1,1,1,2,2,2,2,2,2,2,1,1]
    cfg.model.alignment_decoder_layerids=[0,1,2,3,4]
    cfg.model.enc_output_to_layers=[[8,9],[3,4,5,6,7]]
    cfg.model.data.test_ds=[dummy_val_file]
    cfg.model.data.num_workers = 0
    cfg.model.data.use_ipa=True
    cfg.model.data.codebook_fps=21


checkpoint_path = cfg.get('checkpoint_path', None)
assert checkpoint_path is not None, "checkpoint path needs to be valid"

model = MegatronT5SpeechLMModel.load_from_checkpoint(
        checkpoint_path=checkpoint_path, trainer=trainer, cfg=cfg.model
    )
model.eval()
model = model.cuda()

codec_model = model.additional_models['codec']
trainer.test(model)


[NeMo I 2024-11-18 20:52:43 megatron_trainer_builder:51] Detected interactive environment, using NLPDDPStrategyNotebook


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[NeMo W 2024-11-18 20:52:43 exp_manager:780] No version folders would be created under the log folder as 'resume_if_exists' is enabled.
[NeMo W 2024-11-18 20:52:43 exp_manager:636] There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :/home/shehzeenh/Code/NeMo/nemo_experiments/megatron_t5_speechllm/checkpoints. Training from scratch.


[NeMo I 2024-11-18 20:52:43 exp_manager:402] Experiments will be logged at /home/shehzeenh/Code/NeMo/nemo_experiments/megatron_t5_speechllm
[NeMo I 2024-11-18 20:52:43 exp_manager:863] TensorboardLogger has been set up


[NeMo W 2024-11-18 20:52:43 exp_manager:973] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to 250000. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.
[NeMo W 2024-11-18 20:52:43 nemo_model_checkpoint:58] Found save_best_model is True and save_nemo_on_train_end is False. Set save_nemo_on_train_end to True to automatically save the best model.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5SpeechLMModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5SpeechLMModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5SpeechLMMode

[NeMo I 2024-11-18 20:52:44 megatron_init:241] Rank 0 has data parallel group : [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:247] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:252] All data parallel group ranks with context parallel combined: [[0]]
[NeMo I 2024-11-18 20:52:44 megatron_init:255] Ranks 0 has data parallel rank: 0
[NeMo I 2024-11-18 20:52:44 megatron_init:272] Rank 0 has context parallel group: [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:275] All context parallel group ranks: [[0]]
[NeMo I 2024-11-18 20:52:44 megatron_init:276] Ranks 0 has context parallel rank: 0
[NeMo I 2024-11-18 20:52:44 megatron_init:287] Rank 0 has model parallel group: [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:288] All model parallel group ranks: [[0]]
[NeMo I 2024-11-18 20:52:44 megatron_init:298] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:302] All tensor model parallel group ranks: 

[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5SpeechLMModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5SpeechLMModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5SpeechLMModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5SpeechLMModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5SpeechLMModel() does 

[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does 

Training from scratch!
[NeMo I 2024-11-18 20:52:44 megatron_init:241] Rank 0 has data parallel group : [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:247] Rank 0 has combined group of data parallel and context parallel : [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:252] All data parallel group ranks with context parallel combined: [[0]]
[NeMo I 2024-11-18 20:52:44 megatron_init:255] Ranks 0 has data parallel rank: 0
[NeMo I 2024-11-18 20:52:44 megatron_init:272] Rank 0 has context parallel group: [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:275] All context parallel group ranks: [[0]]
[NeMo I 2024-11-18 20:52:44 megatron_init:276] Ranks 0 has context parallel rank: 0
[NeMo I 2024-11-18 20:52:44 megatron_init:287] Rank 0 has model parallel group: [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:288] All model parallel group ranks: [[0]]
[NeMo I 2024-11-18 20:52:44 megatron_init:298] Rank 0 has tensor model parallel group: [0]
[NeMo I 2024-11-18 20:52:44 megatron_init:302] All tensor model

[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.
[NeMo W 2024-11-18 20:52:44 megatron_base_model:1078] The model: MegatronT5OverrideModel() does 

[NeMo I 2024-11-18 20:52:44 tokenizer_utils:204] Getting Megatron tokenizer for pretrained model name: megatron-bert-345m-cased, custom vocab file: /Data/Checkpoints/9a77f10c2793465e8e8a3fa5fcbef8b0_vocab.txt, and merges file: None
[NeMo I 2024-11-18 20:52:44 tokenizer_utils:130] Getting HuggingFace AutoTokenizer with pretrained_model_name: bert-large-cased, vocab_file: /Data/Checkpoints/9a77f10c2793465e8e8a3fa5fcbef8b0_vocab.txt, merges_files: None, special_tokens_dict: {}, and use_fast: False


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

[NeMo I 2024-11-18 20:53:49 auto_tokenizer:173] 10128 special tokens added, resize your model accordingly.
[NeMo I 2024-11-18 20:53:49 megatron_base_model:520] Padded vocab_size: 39168, original vocab_size: 39124, dummy tokens: 44.
Number of parameters: 217198848
[NeMo I 2024-11-18 20:53:49 megatron_t5_speechllm_model:535] self.frozen_model MegatronT5OverrideModel(
      (enc_dec_model): MegatronTokenLevelEncoderDecoderSpeechLLMModule(
        (encoder_embedding): Embedding(
          (word_embeddings): VocabParallelEmbedding()
          (position_embeddings): Embedding(512, 768)
          (embedding_dropout): Dropout(p=0.1, inplace=False)
        )
        (decoder_embedding): Embedding(
          (word_embeddings): VocabParallelEmbedding()
          (position_embeddings): Embedding(512, 768)
          (embedding_dropout): Dropout(p=0.1, inplace=False)
        )
        (enc_dec_model): MegatronTransformerEncoderDecoderModule(
          (encoder): MultiMegatronTransformerEncoderModule

[NeMo W 2024-11-18 20:53:49 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      dataset_type: tarred_vocoder
      dataset_args:
        dataset_meta:
          mls_english:
            manifest_path: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/mls_english/filtered_22khz/tarred_audio/train_manifest.json
            tar_filepath: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/mls_english/filtered_22khz/tarred_audio/audio_{0..1999}.tar
          cv:
            manifest_path: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/commonvoice13/22khz/tarred_audio/train_manifest.json
            tar_filepath: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/commonvoice13/22khz/tarred_audio/audio_{0..279}.tar
    

[NeMo I 2024-11-18 20:53:50 audio_codec:93] Vector quantizer does not support commit loss.


config.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

      return self.fget.__get__(instance, owner)()
    
Some weights of the model checkpoint at microsoft/wavlm-base-plus were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-base-plus and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model o

[NeMo I 2024-11-18 20:53:55 features:289] PADDING: 1
[NeMo I 2024-11-18 20:53:58 features:289] PADDING: 1
[NeMo I 2024-11-18 20:53:58 features:289] PADDING: 1
[NeMo I 2024-11-18 20:53:58 features:289] PADDING: 1
[NeMo I 2024-11-18 20:53:58 features:289] PADDING: 1
[NeMo I 2024-11-18 20:53:58 features:289] PADDING: 1
[NeMo I 2024-11-18 20:53:58 features:289] PADDING: 1
[NeMo I 2024-11-18 20:53:59 save_restore_connector:263] Model AudioCodecModel was successfully restored from /Data/Checkpoints/AudioCodec_21Hz_no_eliz.nemo.


Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 1 processes
----------------------------------------------------------------------------------------------------

[NeMo W 2024-11-18 20:53:59 experimental:26] `<class 'nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p'>` is experimental and not ready for production yet. Use at your own risk.
[NeMo W 2024-11-18 20:54:00 i18n_ipa:124] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2024-11-18 20:54:00 experimental:26] `<class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'>` is experimental and not ready

[NeMo I 2024-11-18 20:54:00 base_prompt_learning_dataset:61] Loading and tokenizing dataset ... 
[NeMo I 2024-11-18 20:54:00 t5_speechllm_dataset:341] copy_dataset len === 1


  0%|          | 0/1 [00:00<?, ?it/s]

[NeMo I 2024-11-18 20:54:00 t5_speechllm_dataset:441] Skipped 0 sentences, sequence length too short or too long even after truncation
[NeMo I 2024-11-18 20:54:00 megatron_t5_speechllm_model:1572] build success: 1 ['/Data/NotebookInference/dummy_val.json']


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
      rank_zero_warn(
    


Testing: 0it [00:00, ?it/s]

[NeMo I 2024-11-18 20:54:02 megatron_t5_speechllm_model:1881] End detected for item 0 at timestep 36
All ends detected
[NeMo I 2024-11-18 20:54:02 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.23.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2024-11-18 20:54:02 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.23.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo
[NeMo I 2024-11-18 20:54:02 common:815] Instantiating model from pre-trained checkpoint


[NeMo W 2024-11-18 20:54:02 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2024-11-18 20:54:02 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method 

[NeMo I 2024-11-18 20:54:02 features:289] PADDING: 16
[NeMo I 2024-11-18 20:54:02 save_restore_connector:263] Model EncDecSpeakerLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.23.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2024-11-18 20:54:02 megatron_t5_speechllm_model:1985] Loaded SV Model: EncDecSpeakerLabelModel(
      (loss): AngularSoftmaxLoss()
      (eval_loss): AngularSoftmaxLoss()
      (_accuracy): TopKClassificationAccuracy()
      (preprocessor): AudioToMelSpectrogramPreprocessor(
        (featurizer): FilterbankFeatures()
      )
      (encoder): ConvASREncoder(
        (encoder): Sequential(
          (0): JasperBlock(
            (mconv): ModuleList(
              (0): MaskedConv1d(
                (conv): Conv1d(80, 80, kernel_size=(3,), stride=(1,), padding=(1,), groups=80, bias=False)
              )
              (1): MaskedConv1d(
                (conv): Conv1d(80, 1024, kernel_size=(1,), stride=(1,), bias=False

[NeMo I 2024-11-18 20:54:02 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.23.0rc0/stt_en_conformer_transducer_large/1919c44e1281bbcba59356c6091a6b94/stt_en_conformer_transducer_large.nemo.
[NeMo I 2024-11-18 20:54:02 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.23.0rc0/stt_en_conformer_transducer_large/1919c44e1281bbcba59356c6091a6b94/stt_en_conformer_transducer_large.nemo
[NeMo I 2024-11-18 20:54:02 common:815] Instantiating model from pre-trained checkpoint
[NeMo I 2024-11-18 20:54:03 mixins:170] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2024-11-18 20:54:04 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /data/NeMo_ASR_SET/English/v3.0/train_bucketed/bucket1/tarred_audio_manifest.json
    - - /data/NeMo_ASR_SET/English/v3.0/train_bucketed/bucket2/tarred_audio_manifest.json
    - - /data/NeMo_ASR_SET/English/v3.0/train_bucketed/bucket3/tarred_audio_manifest.json
    - - /data/NeMo_ASR_SET/English/v3.0/train_bucketed/bucket4/tarred_audio_manifest.json
    - - /data/NeMo_ASR_SET/English/v3.0/train_bucketed/bucket5/tarred_audio_manifest.json
    - - /data/NeMo_ASR_SET/English/v3.0/train_bucketed/bucket6/tarred_audio_manifest.json
    - - /data/NeMo_ASR_SET/English/v3.0/train_bucketed/bucket7/tarred_audio_manifest.json
    - - /data/NeMo_ASR_SET/English/v3.0/train_bucketed/bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    

[NeMo I 2024-11-18 20:54:04 features:289] PADDING: 0


    


[NeMo I 2024-11-18 20:54:05 rnnt_models:222] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2024-11-18 20:54:05 save_restore_connector:263] Model EncDecRNNTBPEModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.23.0rc0/stt_en_conformer_transducer_large/1919c44e1281bbcba59356c6091a6b94/stt_en_conformer_transducer_large.nemo.
[NeMo I 2024-11-18 20:54:05 megatron_t5_speechllm_model:2001] Loaded ASR Model: EncDecRNNTBPEModel(
      (preprocessor): AudioToMelSpectrogramPreprocessor(
        (featurizer): FilterbankFeatures()
      )
      (encoder): ConformerEncoder(
        (pre_encode): ConvSubsampling(
          (out): Linear(in_features=10240, out_features=512, bias=True)
          (conv): Sequential(
            (0): Conv2d(1, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            (1): ReLU(inplace=True)
            (2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/58.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/wavlm-base-plus-sv were not used when initializing WavLMForXVector: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForXVector from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForXVector from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForXVector were not initialized from the model checkpoint at microsoft/wavlm-base-plus-sv and are newly initialized: ['wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a d

[NeMo I 2024-11-18 20:54:11 megatron_t5_speechllm_model:2024] Loaded SV Model: WavLMForXVector(
      (wavlm): WavLMModel(
        (feature_extractor): WavLMFeatureEncoder(
          (conv_layers): ModuleList(
            (0): WavLMGroupNormConvLayer(
              (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
              (activation): GELUActivation()
              (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
            )
            (1-4): 4 x WavLMNoLayerNormConvLayer(
              (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
              (activation): GELUActivation()
            )
            (5-6): 2 x WavLMNoLayerNormConvLayer(
              (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
              (activation): GELUActivation()
            )
          )
        )
        (feature_projection): WavLMFeatureProjection(
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
    
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.

Transcribing:   0%|                                                                          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.57it/s][A

[NeMo I 2024-11-18 20:54:15 megatron_t5_speechllm_model:1480] Test titanet_avg_cossim: -0.10892503708600998
[NeMo I 2024-11-18 20:54:15 megatron_t5_speechllm_model:1480] Test titanet_avg_cossim_context_pred: -0.10892503708600998
[NeMo I 2024-11-18 20:54:15 megatron_t5_speechllm_model:1480] Test titanet_avg_cossim_context_gt: 1.0000001192092896
[NeMo I 2024-11-18 20:54:15 megatron_t5_speechllm_model:1480] Test wavlm_avg_cossim: 0.6574785709381104
[NeMo I 2024-11-18 20:54:15 megatron_t5_speechllm_model:1480] Test wavlm_avg_cossim_context_pred: 0.657564640045166
[NeMo I 2024-11-18 20:54:15 megatron_t5_speechllm_model:1480] Test wavlm_avg_cossim_context_gt: 1.0
[NeMo I 2024-11-18 20:54:15 megatron_t5_speechllm_model:1480] Test squim_mos_pred_GT: 3.5183446407318115
[NeMo I 2024-11-18 20:54:15 megatron_t5_speechllm_model:1480] Test squim_mos_GT_context: 3.9421274662017822
[NeMo I 2024-11-18 20:54:15 megatron_t5_speechllm_model:1480] Test squim_mos_pred_context: 3.4339842796325684
[NeMo I 202


    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    


[{'test_titanet_avg_cossim': -0.10892503708600998,
  'test_titanet_avg_cossim_context_pred': -0.10892503708600998,
  'test_titanet_avg_cossim_context_gt': 1.0000001192092896,
  'test_wavlm_avg_cossim': 0.6574785709381104,
  'test_wavlm_avg_cossim_context_pred': 0.657564640045166,
  'test_wavlm_avg_cossim_context_gt': 1.0,
  'test_squim_mos_pred_GT': 3.5183446407318115,
  'test_squim_mos_GT_context': 3.9421274662017822,
  'test_squim_mos_pred_context': 3.4339842796325684,
  'test_cer_transcript': 1.0,
  'test_wer_transcript': 1.0,
  'test_cer_phoneme': 1.0,
  'test_wer_phoneme': 1.0,
  'test_cer_tts': 0.0,
  'test_wer_tts': 0.0,
  'test_cer_transcript_gt': 0.0,
  'test_wer_transcript_gt': 0.0,
  'test_cer_phoneme_gt': 0.0,
  'test_wer_phoneme_gt': 0.0,
  'test_cer_tts_gt': 0.0,
  'test_wer_tts_gt': 0.0,
  'test_RTF': 0.9444248676300049}]

## Helper functions

In [6]:
out_dir = os.path.join( model.trainer.logger.save_dir, model.trainer.logger.name, model.trainer.logger.version, "Sample_Audios")
out_path = os.path.join(out_dir, 'predicted_wav_0.wav')


def encode(wav_path):
    # Convert an audio file to nemo codec codes
    features = AudioSegment.segment_from_file(
                    wav_path, target_sr=codec_model.sample_rate, n_segments=-1, trim=False,
                )
    audio_samples = features.samples
    audio = torch.tensor(audio_samples).cuda()
    audio_length = torch.tensor(audio.size(0)).long().cuda()
    print(f"audio {audio.size()} audio_length {audio_length}")
    print(f"audio {audio.device} audio_length {audio_length.device} codec_model {codec_model.device}")

    original_codec_codes, _ = codec_model.encode(audio=audio.unsqueeze(0), audio_len=audio_length.unsqueeze(0))
    original_codec_codes = original_codec_codes[0]
    print(f"original_codec_codes {original_codec_codes.size()} audio {audio.size()} audio_length {audio_length}")
    duration = original_codec_codes.size()[1] / 86
    
    target_codec_filepath = wav_path[:-4] + "_codes.pt"
    torch.save(original_codec_codes.cpu().type(torch.int16), target_codec_filepath)
    return original_codec_codes, target_codec_filepath, duration
    
    
    
def play_codec(codec_path):
    # Convert nemo codecs to audio and play it
    codec = torch.load(codec_path)
    codec = codec.to('cuda')
    codec = codec.unsqueeze(0)
    codec_lens = torch.Tensor([codec.shape[2]]).long().cuda()
    codec_decoded_audios, _ = codec_model.decode(tokens=codec.long(), tokens_len=codec_lens)
    codec_decoded_audio = codec_decoded_audios[0]
    temp_wav_path = os.path.join(EXP_DIR, "temp.wav")
    torchaudio.save(temp_wav_path, codec_decoded_audio[None].cpu(), 22050)
    display(Audio(temp_wav_path))

def generate_new_audio(
    text,
    context,
    context_duration=4.0,
    context_type="REFSPEAKERCODEC",
    temperature=0.85,
    top_k=80,
    text_task="Phoneme TTS "
    ):
    # Prepare data in speechllm format
    model.cfg.temperature = temperature
    model.cfg.top_k = top_k
    dummy_answer = dummy_codes_fp
    json_in = {}
    json_in["question"] = text_task + text
    json_in["question_type"] = "TEXT"
    json_in["answer"] = dummy_answer 
    json_in["context"] = context 
    json_in["answer_type"] = "AUDIOCODEC"
    json_in["context_type"] = context_type
    json_in["context_duration"] = context_duration
    json_in["answer_duration"] = 2.0
    json_in["taskname"] = "squad"
    json_in["lang"] = "en"
    json_in = [json_in]
    
    # Prepare dataloader
    model._test_ds.examples = []
    model._test_ds.examples = model._test_ds.load_data(json_in)
    
    sampler = torch.utils.data.distributed.DistributedSampler(
            model._test_ds, num_replicas=1, rank=0, shuffle=False, seed=1
        )

    model._test_dl = torch.utils.data.DataLoader(
        model._test_ds,
        collate_fn=model._test_ds.collate_fn,
        sampler=sampler,
        batch_size=1,
        drop_last=False,
        num_workers=1,
        pin_memory=False,
        persistent_workers=True
    )
    
    # Run inference
    model.cfg.data.test_ds = None
    trainer.test(model, model._test_dl)
    print("Out path:", out_path)
    print("Inference done")

In [7]:
text_contexts = [
    "TEXT CONTEXT: | Language:en Dataset:Riva Speaker:Lindy_WIZWIKI |",
    "TEXT CONTEXT: | Language:en Dataset:Riva Speaker:Lindy_CMU_FEARFUL |",
    "TEXT CONTEXT: | Language:en Dataset:Riva Speaker:Lindy_CMU_HAPPY |",
    "TEXT CONTEXT: | Language:en Dataset:Riva Speaker:Rodney_CMU_HAPPY |",
    "TEXT CONTEXT: | Language:en  Dataset:PromptTTS Gender:female SpeakingRate:2. Slow emotion:neutral Pitch:4. High SNR:5. Clean REVERB:5. Very close-sounding |"
]

## Generate audio from a text context

In [45]:
texts = [
    "Hi, how are you and how is your day going?",
    "My date of birth is May eleventh, nineteen eighty two.",
    "My phone number is eight one two three seven six nine one nine two. And I live, ummm, in California.",
    "It’s seven eight two, N R eight, no N R seven twenty one. Email I D is guru gram at g mail dot com, guru and then gram.",
    "Can I have reference number please?",
    "Gavlick, spelled as g a v l i c k. L as in London and I as in India",
    "The date is tweleve eleven nineteen ninety-one.",
    "The zip code is nine four one zero-zero seven.",
    "I am calling from, uhmmm, Walgreens pharmacy.",
    "Stop! What are you doing?",
    "Are you coming? I hope so!",
    "What a wonderful surprise!",
    "The H T T P protocol uses T C P or I P for data transmission.",
    "I wouldn't've done that if I'd known better.",
    "The multilayered complexity of environmental ecosystems demonstrates the interconnectedness of all living organisms."
    
]
# text = "As I closed my laptop for the night, my reflection in the screen continued to smile back at me."

import shutil

for tidx, text in enumerate(texts):
    if tidx != 13:
        continue
    text_task = "Phoneme TTS " # Can be "Text to speech this " (for sentence-piece tokenizer) or "Phoneme TTS " (for phoneme tokenizer)
    context = text_contexts[0] # Sample Text Context
    context_type = "TEXT" # Can be REFSPEAKERCODEC (for audio context), TEXT (for text context)
    generate_new_audio(
        text, 
        context, 
        context_type=context_type, 
        context_duration=5.0, # Does not matter, should just be > 3 so that dataset does not filter it out.
        top_k=80, # Can play around with this to check roubstness
        temperature=0.7, # Can play around with this. temperature < 0.85 can be more robust
        text_task=text_task
    )
    display(Audio(out_path))
    shutil.copyfile(out_path, "/Data/VikashGenerations/t5tts_{}.wav".format(tidx))

[NeMo I 2024-11-18 21:25:03 t5_speechllm_dataset:341] copy_dataset len === 1


  0%|          | 0/1 [00:00<?, ?it/s]

[NeMo I 2024-11-18 21:25:03 t5_speechllm_dataset:441] Skipped 0 sentences, sequence length too short or too long even after truncation


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
      rank_zero_warn(
    


Testing: 0it [00:00, ?it/s]

[NeMo I 2024-11-18 21:25:05 megatron_t5_speechllm_model:1881] End detected for item 0 at timestep 47
All ends detected
[NeMo I 2024-11-18 21:25:05 megatron_t5_speechllm_model:2112] Clipping until end index for audio 0


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
    

Transcribing:   0%|                                                                          | 0/1 [00:00<?, ?it/s][A
Transcribing: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.45it/s][A
      return _methods._mean(a, axis=axis, dtype=dtype,
    
      ret = ret.dtype.type(ret / rcount)
    


[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test titanet_avg_cossim: -0.0092331539546389
[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test titanet_avg_cossim_context_pred: nan
[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test titanet_avg_cossim_context_gt: nan
[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test wavlm_avg_cossim: 0.61204337818282
[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test wavlm_avg_cossim_context_pred: nan
[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test wavlm_avg_cossim_context_gt: nan
[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test squim_mos_pred_GT: 1.9439193759645734
[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test squim_mos_GT_context: nan
[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test squim_mos_pred_context: nan
[NeMo I 2024-11-18 21:25:06 megatron_t5_speechllm_model:1480] Test cer_transcript: 1.0
[Ne

Out path: /home/shehzeenh/Code/NeMo/nemo_experiments/megatron_t5_speechllm/Sample_Audios/predicted_wav_0.wav
Inference done


## Listen to some ground-truth context audios

In [None]:
context_paths = [
    "/datap/misc/speechllm_codecdatasets/codecs/RivattsAllLanguagesUpdated_train_nemo_codec_bw_6.0/target_codes_en_Lindy_44khz_CMU_HAPPY_LINDY_CMU_HAPPY_000570.pt",
]

for cidx, context_path in enumerate(context_paths):
    print(cidx, context_path)
    play_codec(context_path)

## Generate audio from an audio context

In [None]:
text = "As I closed my laptop for the night, my reflection in the screen continued to smile back at me."
text_task = "Text to speech this " # Can be "Text to speech this " (for sentence-piece tokenizer) or "Phoneme TTS " (for phoneme tokenizer)
context = context_paths[0] # Sample Text Context
context_type = "REFSPEAKERCODEC" # Can be REFSPEAKERCODEC (for audio context), TEXT (for text context)
generate_new_audio(
    text, 
    context, 
    context_type=context_type, 
    context_duration=5.0, # Does not matter, should just be > 3 so that dataset does not filter it out.
    temperature=0.8, # Can play around with this. temperature < 0.85 can be more robust
    text_task=text_task
)
display(Audio(out_path))