In [None]:
## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode

# ## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

## Install TorchAudio
!pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html

In [56]:
import os
import wget
import json
from tqdm import tqdm
from omegaconf import OmegaConf
from nemo.collections.asr.models.msdd_models import NeuralDiarizer

In [20]:
MODEL_CONFIG = 'diar_infer_telephonic.yaml'
if not os.path.exists(MODEL_CONFIG):
    config_url = "https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml"
    MODEL_CONFIG = wget.download(config_url)

config = OmegaConf.load(MODEL_CONFIG)
# print(OmegaConf.to_yaml(config))

In [61]:
config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [1.5,1.25,1.0,0.75,0.5]
config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [0.75,0.625,0.5,0.375,0.1]
config.diarizer.speaker_embeddings.parameters.multiscale_weights= [1,1,1,1,1]

config.diarizer.msdd_model.model_path = 'diar_msdd_telephonic' # Telephonic speaker diarization model
config.diarizer.msdd_model.parameters.sigmoid_threshold = [0.7, 1.0] # Evaluate with T=0.7 and T=1.0

config.num_workers = 1 # Workaround for multiprocessing hanging with ipython issue

config.diarizer.speaker_embeddings.model_path = 'titanet_large'
config.diarizer.oracle_vad = False # compute VAD provided with model_path to vad config
config.diarizer.clustering.parameters.oracle_num_speakers=False

config.diarizer.vad.model_path = 'vad_multilingual_marblenet'
config.diarizer.vad.parameters.onset = 0.8
config.diarizer.vad.parameters.offset = 0.6
config.diarizer.vad.parameters.pad_offset = -0.05

config.diarizer.manifest_filepath = 'input_manifest.json'
config.diarizer.out_dir = 'outputs'

In [62]:
def nemo_diarize(path):
    base_name = os.path.splitext(os.path.basename(path))[0]
    if os.path.isfile('input_manifest.json'):
        os.system('rm input_manifest.json')
    if os.path.isdir('outputs'):
        os.system('rm -r outputs')
    meta = {
        'audio_filepath': path,
        'offset': 0,
        'duration': None,
        'label': 'infer',
        'text': '-',
        'num_speakers': 2,
        'rttm_filepath': None,
        'uem_filepath' : None
    }
    with open('input_manifest.json','w') as fp:
        json.dump(meta, fp)
        fp.write('\n')

    config.diarizer.manifest_filepath = 'input_manifest.json'
    config.diarizer.out_dir = 'outputs'
    system_vad_msdd_model = NeuralDiarizer(cfg=config)
    system_vad_msdd_model.diarize()

    rttm = []
    with open(f"outputs/pred_rttms/{base_name}.rttm", 'r') as f:
        for line in f.readlines():
            line = line.strip().split()
            rttm.append((float(line[3]), float(line[3]) + float(line[4]), line[7]))
    return rttm

In [63]:
out_dir = "timestamps_nemo_pretrained"
os.mkdir(out_dir)

src_dir = "primock57/output/mixed_audio/"
for audio in tqdm(os.listdir(src_dir)):
    uri = audio[:-4]
    if not uri in ["day5_consultation07", "day5_consultation08", "day5_consultation09", "day5_consultation10", "day5_consultation11", "day5_consultation12"]:
        continue
    audio_path = os.path.join(src_dir, audio)
    diarization = nemo_diarize(audio_path)
    with open(os.path.join(out_dir, uri + ".txt"), 'w') as f:
        for st, en, spk_id in diarization:
            f.write(f"{st:.3f},{en:.3f},{spk_id}\n")


  0%|          | 0/57 [00:00<?, ?it/s]

[NeMo I 2024-01-15 18:45:42 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2024-01-15 18:45:42 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 18:45:42 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2024-01-15 18:45:42 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 18:45:43 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2024-01-15 18:45:43 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2024-01-15 18:45:43 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2024-01-15 18:45:43 features:289] PADDING: 16
[NeMo I 2024-01-15 18:45:43 features:289] PADDING: 16
[NeMo I 2024-01-15 18:45:44 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 18:45:44 features:289] PADDING: 16
[NeMo I 2024-01-15 18:45:44 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2024-01-15 18:45:44 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 18:45:44 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2024-01-15 18:45:44 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 18:45:44 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2024-01-15 18:45:44 features:289] PADDING: 16
[NeMo I 2024-01-15 18:45:44 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 18:45:44 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2024-01-15 18:45:44 msdd_models:865] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }
[NeMo I 2024-01-15 18:45:44 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 18:45:44 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue



splitting manifest: 100%|██████████| 1/1 [00:00<00:00, 25.01it/s]

[NeMo I 2024-01-15 18:45:44 classification_models:273] Perform streaming frame-level VAD
[NeMo I 2024-01-15 18:45:44 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:45:44 collections:446] Dataset loaded with 10 items, total duration of  0.14 hours.
[NeMo I 2024-01-15 18:45:44 collections:448] # 10 files loaded accounting to # 1 labels




vad:   0%|          | 0/10 [00:00<?, ?it/s][A
vad:  10%|█         | 1/10 [00:00<00:03,  2.28it/s][A
vad:  20%|██        | 2/10 [00:00<00:02,  3.00it/s][A
vad:  30%|███       | 3/10 [00:00<00:02,  3.17it/s][A
vad:  40%|████      | 4/10 [00:01<00:01,  3.30it/s][A
vad:  50%|█████     | 5/10 [00:01<00:01,  3.27it/s][A
vad:  60%|██████    | 6/10 [00:01<00:01,  3.27it/s][A
vad:  70%|███████   | 7/10 [00:02<00:00,  3.32it/s][A
vad:  80%|████████  | 8/10 [00:02<00:00,  3.45it/s][A
vad:  90%|█████████ | 9/10 [00:02<00:00,  3.72it/s][A
vad: 100%|██████████| 10/10 [00:02<00:00,  3.37it/s]

[NeMo I 2024-01-15 18:45:47 clustering_diarizer:250] Generating predictions with overlapping input segments




generating preds:   0%|          | 0/1 [00:00<?, ?it/s][A
generating preds: 100%|██████████| 1/1 [00:04<00:00,  4.31s/it][A
                                                               [A

[NeMo I 2024-01-15 18:45:52 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s][A
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  2.84it/s]

[NeMo I 2024-01-15 18:45:52 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, outputs/speaker_outputs/subsegments_scale0.json
[NeMo I 2024-01-15 18:45:52 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:45:52 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:45:52 collections:446] Dataset loaded with 410 items, total duration of  0.13 hours.
[NeMo I 2024-01-15 18:45:52 collections:448] # 410 files loaded accounting to # 1 labels




[1/5] extract embeddings:   0%|          | 0/7 [00:00<?, ?it/s][A
[1/5] extract embeddings:  14%|█▍        | 1/7 [00:10<01:03, 10.62s/it][A
[1/5] extract embeddings:  29%|██▊       | 2/7 [00:18<00:45,  9.09s/it][A
[1/5] extract embeddings:  43%|████▎     | 3/7 [00:28<00:36,  9.22s/it][A
[1/5] extract embeddings:  57%|█████▋    | 4/7 [00:37<00:27,  9.23s/it][A
[1/5] extract embeddings:  71%|███████▏  | 5/7 [00:45<00:17,  8.90s/it][A
[1/5] extract embeddings:  86%|████████▌ | 6/7 [00:54<00:09,  9.01s/it][A
[1/5] extract embeddings: 100%|██████████| 7/7 [00:57<00:00,  8.26s/it]

[NeMo I 2024-01-15 18:46:50 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 18:46:50 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, outputs/speaker_outputs/subsegments_scale1.json
[NeMo I 2024-01-15 18:46:50 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:46:50 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:46:50 collections:446] Dataset loaded with 499 items, total duration of  0.14 hours.
[NeMo I 2024-01-15 18:46:50 collections:448] # 499 files loaded accounting to # 1 labels




[2/5] extract embeddings:   0%|          | 0/8 [00:00<?, ?it/s][A
[2/5] extract embeddings:  12%|█▎        | 1/8 [00:08<00:57,  8.15s/it][A
[2/5] extract embeddings:  25%|██▌       | 2/8 [00:14<00:43,  7.33s/it][A
[2/5] extract embeddings:  38%|███▊      | 3/8 [00:22<00:38,  7.60s/it][A
[2/5] extract embeddings:  50%|█████     | 4/8 [00:29<00:29,  7.30s/it][A
[2/5] extract embeddings:  62%|██████▎   | 5/8 [00:37<00:22,  7.42s/it][A
[2/5] extract embeddings:  75%|███████▌  | 6/8 [00:44<00:14,  7.39s/it][A
[2/5] extract embeddings:  88%|████████▊ | 7/8 [00:51<00:07,  7.29s/it][A
[2/5] extract embeddings: 100%|██████████| 8/8 [00:57<00:00,  7.13s/it]

[NeMo I 2024-01-15 18:47:47 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 18:47:47 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, outputs/speaker_outputs/subsegments_scale2.json
[NeMo I 2024-01-15 18:47:47 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:47:47 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:47:47 collections:446] Dataset loaded with 597 items, total duration of  0.14 hours.
[NeMo I 2024-01-15 18:47:47 collections:448] # 597 files loaded accounting to # 1 labels




[3/5] extract embeddings:   0%|          | 0/10 [00:00<?, ?it/s][A
[3/5] extract embeddings:  10%|█         | 1/10 [00:06<00:58,  6.46s/it][A
[3/5] extract embeddings:  20%|██        | 2/10 [00:12<00:48,  6.05s/it][A
[3/5] extract embeddings:  30%|███       | 3/10 [00:18<00:44,  6.35s/it][A
[3/5] extract embeddings:  40%|████      | 4/10 [00:24<00:36,  6.00s/it][A
[3/5] extract embeddings:  50%|█████     | 5/10 [00:30<00:31,  6.21s/it][A
[3/5] extract embeddings:  60%|██████    | 6/10 [00:36<00:24,  6.01s/it][A
[3/5] extract embeddings:  70%|███████   | 7/10 [00:43<00:18,  6.18s/it][A
[3/5] extract embeddings:  80%|████████  | 8/10 [00:48<00:11,  5.96s/it][A
[3/5] extract embeddings:  90%|█████████ | 9/10 [00:54<00:06,  6.04s/it][A
[3/5] extract embeddings: 100%|██████████| 10/10 [00:56<00:00,  5.70s/it]

[NeMo I 2024-01-15 18:48:44 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 18:48:44 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, outputs/speaker_outputs/subsegments_scale3.json
[NeMo I 2024-01-15 18:48:44 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:48:44 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:48:44 collections:446] Dataset loaded with 801 items, total duration of  0.15 hours.
[NeMo I 2024-01-15 18:48:44 collections:448] # 801 files loaded accounting to # 1 labels




[4/5] extract embeddings:   0%|          | 0/13 [00:00<?, ?it/s][A
[4/5] extract embeddings:   8%|▊         | 1/13 [00:04<00:49,  4.09s/it][A
[4/5] extract embeddings:  15%|█▌        | 2/13 [00:07<00:42,  3.86s/it][A
[4/5] extract embeddings:  23%|██▎       | 3/13 [00:12<00:42,  4.22s/it][A
[4/5] extract embeddings:  31%|███       | 4/13 [00:16<00:36,  4.08s/it][A
[4/5] extract embeddings:  38%|███▊      | 5/13 [00:20<00:32,  4.02s/it][A
[4/5] extract embeddings:  46%|████▌     | 6/13 [00:24<00:28,  4.08s/it][A
[4/5] extract embeddings:  54%|█████▍    | 7/13 [00:28<00:24,  4.07s/it][A
[4/5] extract embeddings:  62%|██████▏   | 8/13 [00:32<00:19,  3.93s/it][A
[4/5] extract embeddings:  69%|██████▉   | 9/13 [00:35<00:15,  3.84s/it][A
[4/5] extract embeddings:  77%|███████▋  | 10/13 [00:40<00:12,  4.09s/it][A
[4/5] extract embeddings:  85%|████████▍ | 11/13 [00:44<00:07,  3.97s/it][A
[4/5] extract embeddings:  92%|█████████▏| 12/13 [00:47<00:03,  3.87s/it][A
[4/5] extract e

[NeMo I 2024-01-15 18:49:34 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 18:49:34 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, outputs/speaker_outputs/subsegments_scale4.json





[NeMo I 2024-01-15 18:49:34 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:49:34 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:49:34 collections:446] Dataset loaded with 2554 items, total duration of  0.35 hours.
[NeMo I 2024-01-15 18:49:34 collections:448] # 2554 files loaded accounting to # 1 labels



[5/5] extract embeddings:   0%|          | 0/40 [00:00<?, ?it/s][A
[5/5] extract embeddings:   2%|▎         | 1/40 [00:04<02:46,  4.26s/it][A
[5/5] extract embeddings:   5%|▌         | 2/40 [00:07<02:15,  3.56s/it][A
[5/5] extract embeddings:   8%|▊         | 3/40 [00:10<02:02,  3.30s/it][A
[5/5] extract embeddings:  10%|█         | 4/40 [00:13<01:57,  3.27s/it][A
[5/5] extract embeddings:  12%|█▎        | 5/40 [00:17<02:03,  3.52s/it][A
[5/5] extract embeddings:  15%|█▌        | 6/40 [00:20<01:52,  3.32s/it][A
[5/5] extract embeddings:  18%|█▊        | 7/40 [00:23<01:45,  3.20s/it][A
[5/5] extract embeddings:  20%|██        | 8/40 [00:26<01:40,  3.14s/it][A
[5/5] extract embeddings:  22%|██▎       | 9/40 [00:30<01:46,  3.45s/it][A
[5/5] extract embeddings:  25%|██▌       | 10/40 [00:33<01:39,  3.30s/it][A
[5/5] extract embeddings:  28%|██▊       | 11/40 [00:36<01:32,  3.20s/it][A
[5/5] extract embeddings:  30%|███       | 12/40 [00:39<01:28,  3.15s/it][A
[5/5] extract em

[NeMo I 2024-01-15 18:51:44 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings


[NeMo W 2024-01-15 18:51:44 speaker_utils:464] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.

clustering:   0%|          | 0/1 [00:00<?, ?it/s][A
clustering: 100%|██████████| 1/1 [00:04<00:00,  4.62s/it]

[NeMo I 2024-01-15 18:51:49 clustering_diarizer:464] Outputs are saved in /content/outputs directory



[NeMo W 2024-01-15 18:51:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:51:49 msdd_models:960] Loading embedding pickle file of scale:0 at outputs/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2024-01-15 18:51:49 msdd_models:960] Loading embedding pickle file of scale:1 at outputs/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2024-01-15 18:51:49 msdd_models:960] Loading embedding pickle file of scale:2 at outputs/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2024-01-15 18:51:49 msdd_models:960] Loading embedding pickle file of scale:3 at outputs/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2024-01-15 18:51:49 msdd_models:960] Loading embedding pickle file of scale:4 at outputs/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2024-01-15 18:51:49 msdd_models:938] Loading cluster label file from outputs/speaker_outputs/subsegments_scale4_cluster.label
[NeMo I 2024-01-15 18:51:49 collections:761] Filtered duration for loading c


100%|██████████| 1/1 [00:00<00:00, 10.28it/s]

[NeMo I 2024-01-15 18:51:49 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 18:51:49 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 18:51:49 speaker_utils:93] Number of files to diarize: 1



[NeMo W 2024-01-15 18:51:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:51:49 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:51:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:51:49 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:51:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:51:49 msdd_models:1431]   
    
[NeMo I 2024-01-15 18:51:49 msdd_models:1403]      [Threshold: 1.0000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 18:51:49 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 18:51:49 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:51:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:51:49 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:51:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:51:50 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:51:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:51:50 msdd_models:1431]   
    


  9%|▉         | 5/57 [06:08<1:03:47, 73.60s/it]

[NeMo I 2024-01-15 18:51:50 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2024-01-15 18:51:50 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 18:51:50 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2024-01-15 18:51:50 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 18:51:51 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2024-01-15 18:51:51 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2024-01-15 18:51:51 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2024-01-15 18:51:51 features:289] PADDING: 16
[NeMo I 2024-01-15 18:51:51 features:289] PADDING: 16
[NeMo I 2024-01-15 18:51:52 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 18:51:53 features:289] PADDING: 16
[NeMo I 2024-01-15 18:51:53 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2024-01-15 18:51:53 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 18:51:53 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2024-01-15 18:51:53 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 18:51:53 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2024-01-15 18:51:53 features:289] PADDING: 16
[NeMo I 2024-01-15 18:51:53 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 18:51:53 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2024-01-15 18:51:53 msdd_models:865] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }
[NeMo I 2024-01-15 18:51:53 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 18:51:53 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue



splitting manifest: 100%|██████████| 1/1 [00:00<00:00, 16.68it/s]

[NeMo I 2024-01-15 18:51:53 classification_models:273] Perform streaming frame-level VAD
[NeMo I 2024-01-15 18:51:53 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:51:53 collections:446] Dataset loaded with 12 items, total duration of  0.16 hours.
[NeMo I 2024-01-15 18:51:53 collections:448] # 12 files loaded accounting to # 1 labels




vad:   0%|          | 0/12 [00:00<?, ?it/s][A
vad:   8%|▊         | 1/12 [00:00<00:04,  2.41it/s][A
vad:  17%|█▋        | 2/12 [00:00<00:03,  3.20it/s][A
vad:  25%|██▌       | 3/12 [00:00<00:02,  3.64it/s][A
vad:  33%|███▎      | 4/12 [00:01<00:02,  3.94it/s][A
vad:  42%|████▏     | 5/12 [00:01<00:01,  4.19it/s][A
vad:  50%|█████     | 6/12 [00:01<00:01,  4.41it/s][A
vad:  58%|█████▊    | 7/12 [00:01<00:01,  4.55it/s][A
vad:  67%|██████▋   | 8/12 [00:01<00:00,  4.72it/s][A
vad:  75%|███████▌  | 9/12 [00:02<00:00,  4.81it/s][A
vad:  83%|████████▎ | 10/12 [00:02<00:00,  4.93it/s][A
vad:  92%|█████████▏| 11/12 [00:02<00:00,  5.10it/s][A
vad: 100%|██████████| 12/12 [00:02<00:00,  4.44it/s]

[NeMo I 2024-01-15 18:51:56 clustering_diarizer:250] Generating predictions with overlapping input segments




generating preds:   0%|          | 0/1 [00:00<?, ?it/s][A
generating preds: 100%|██████████| 1/1 [00:04<00:00,  4.96s/it][A
                                                               [A

[NeMo I 2024-01-15 18:52:01 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s][A
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  2.38it/s]

[NeMo I 2024-01-15 18:52:02 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, outputs/speaker_outputs/subsegments_scale0.json
[NeMo I 2024-01-15 18:52:02 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:52:02 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:52:02 collections:446] Dataset loaded with 485 items, total duration of  0.15 hours.
[NeMo I 2024-01-15 18:52:02 collections:448] # 485 files loaded accounting to # 1 labels




[1/5] extract embeddings:   0%|          | 0/8 [00:00<?, ?it/s][A
[1/5] extract embeddings:  12%|█▎        | 1/8 [00:10<01:13, 10.54s/it][A
[1/5] extract embeddings:  25%|██▌       | 2/8 [00:21<01:03, 10.52s/it][A
[1/5] extract embeddings:  38%|███▊      | 3/8 [00:31<00:51, 10.39s/it][A
[1/5] extract embeddings:  50%|█████     | 4/8 [00:40<00:40, 10.05s/it][A
[1/5] extract embeddings:  62%|██████▎   | 5/8 [00:51<00:30, 10.13s/it][A
[1/5] extract embeddings:  75%|███████▌  | 6/8 [01:01<00:20, 10.27s/it][A
[1/5] extract embeddings:  88%|████████▊ | 7/8 [01:12<00:10, 10.32s/it][A
[1/5] extract embeddings: 100%|██████████| 8/8 [01:16<00:00,  9.58s/it]

[NeMo I 2024-01-15 18:53:18 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 18:53:18 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, outputs/speaker_outputs/subsegments_scale1.json
[NeMo I 2024-01-15 18:53:18 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:53:18 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:53:18 collections:446] Dataset loaded with 574 items, total duration of  0.16 hours.
[NeMo I 2024-01-15 18:53:18 collections:448] # 574 files loaded accounting to # 1 labels




[2/5] extract embeddings:   0%|          | 0/9 [00:00<?, ?it/s][A
[2/5] extract embeddings:  11%|█         | 1/9 [00:07<00:58,  7.28s/it][A
[2/5] extract embeddings:  22%|██▏       | 2/9 [00:13<00:47,  6.74s/it][A
[2/5] extract embeddings:  33%|███▎      | 3/9 [00:20<00:40,  6.79s/it][A
[2/5] extract embeddings:  44%|████▍     | 4/9 [00:26<00:32,  6.57s/it][A
[2/5] extract embeddings:  56%|█████▌    | 5/9 [00:33<00:26,  6.55s/it][A
[2/5] extract embeddings:  67%|██████▋   | 6/9 [00:39<00:19,  6.59s/it][A
[2/5] extract embeddings:  78%|███████▊  | 7/9 [00:46<00:13,  6.58s/it][A
[2/5] extract embeddings:  89%|████████▉ | 8/9 [00:52<00:06,  6.56s/it][A
[2/5] extract embeddings: 100%|██████████| 9/9 [00:59<00:00,  6.57s/it]

[NeMo I 2024-01-15 18:54:17 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 18:54:17 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, outputs/speaker_outputs/subsegments_scale2.json
[NeMo I 2024-01-15 18:54:17 clustering_diarizer:343] Extracting embeddings for Diarization





[NeMo I 2024-01-15 18:54:17 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:54:17 collections:446] Dataset loaded with 700 items, total duration of  0.16 hours.
[NeMo I 2024-01-15 18:54:17 collections:448] # 700 files loaded accounting to # 1 labels



[3/5] extract embeddings:   0%|          | 0/11 [00:00<?, ?it/s][A
[3/5] extract embeddings:   9%|▉         | 1/11 [00:06<01:00,  6.02s/it][A
[3/5] extract embeddings:  18%|█▊        | 2/11 [00:11<00:49,  5.47s/it][A
[3/5] extract embeddings:  27%|██▋       | 3/11 [00:17<00:46,  5.80s/it][A
[3/5] extract embeddings:  36%|███▋      | 4/11 [00:22<00:38,  5.54s/it][A
[3/5] extract embeddings:  45%|████▌     | 5/11 [00:28<00:33,  5.60s/it][A
[3/5] extract embeddings:  55%|█████▍    | 6/11 [00:33<00:27,  5.48s/it][A
[3/5] extract embeddings:  64%|██████▎   | 7/11 [00:38<00:21,  5.32s/it][A
[3/5] extract embeddings:  73%|███████▎  | 8/11 [00:44<00:16,  5.60s/it][A
[3/5] extract embeddings:  82%|████████▏ | 9/11 [00:49<00:10,  5.44s/it][A
[3/5] extract embeddings:  91%|█████████ | 10/11 [00:55<00:05,  5.55s/it][A
[3/5] extract embeddings: 100%|██████████| 11/11 [01:00<00:00,  5.50s/it]

[NeMo I 2024-01-15 18:55:18 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 18:55:18 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, outputs/speaker_outputs/subsegments_scale3.json
[NeMo I 2024-01-15 18:55:18 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:55:18 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:55:18 collections:446] Dataset loaded with 921 items, total duration of  0.17 hours.
[NeMo I 2024-01-15 18:55:18 collections:448] # 921 files loaded accounting to # 1 labels




[4/5] extract embeddings:   0%|          | 0/15 [00:00<?, ?it/s][A
[4/5] extract embeddings:   7%|▋         | 1/15 [00:04<00:56,  4.01s/it][A
[4/5] extract embeddings:  13%|█▎        | 2/15 [00:08<00:54,  4.22s/it][A
[4/5] extract embeddings:  20%|██        | 3/15 [00:12<00:48,  4.01s/it][A
[4/5] extract embeddings:  27%|██▋       | 4/15 [00:15<00:42,  3.86s/it][A
[4/5] extract embeddings:  33%|███▎      | 5/15 [00:19<00:38,  3.81s/it][A
[4/5] extract embeddings:  40%|████      | 6/15 [00:24<00:36,  4.09s/it][A
[4/5] extract embeddings:  47%|████▋     | 7/15 [00:27<00:31,  3.94s/it][A
[4/5] extract embeddings:  53%|█████▎    | 8/15 [00:31<00:26,  3.83s/it][A
[4/5] extract embeddings:  60%|██████    | 9/15 [00:35<00:24,  4.04s/it][A
[4/5] extract embeddings:  67%|██████▋   | 10/15 [00:39<00:19,  3.94s/it][A
[4/5] extract embeddings:  73%|███████▎  | 11/15 [00:43<00:15,  3.86s/it][A
[4/5] extract embeddings:  80%|████████  | 12/15 [00:47<00:11,  3.90s/it][A
[4/5] extract e

[NeMo I 2024-01-15 18:56:15 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 18:56:15 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, outputs/speaker_outputs/subsegments_scale4.json
[NeMo I 2024-01-15 18:56:15 clustering_diarizer:343] Extracting embeddings for Diarization





[NeMo I 2024-01-15 18:56:15 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:56:15 collections:446] Dataset loaded with 2911 items, total duration of  0.40 hours.
[NeMo I 2024-01-15 18:56:15 collections:448] # 2911 files loaded accounting to # 1 labels



[5/5] extract embeddings:   0%|          | 0/46 [00:00<?, ?it/s][A
[5/5] extract embeddings:   2%|▏         | 1/46 [00:03<02:35,  3.46s/it][A
[5/5] extract embeddings:   4%|▍         | 2/46 [00:07<02:38,  3.61s/it][A
[5/5] extract embeddings:   7%|▋         | 3/46 [00:10<02:22,  3.31s/it][A
[5/5] extract embeddings:   9%|▊         | 4/46 [00:13<02:11,  3.14s/it][A
[5/5] extract embeddings:  11%|█         | 5/46 [00:15<02:04,  3.05s/it][A
[5/5] extract embeddings:  13%|█▎        | 6/46 [00:19<02:13,  3.34s/it][A
[5/5] extract embeddings:  15%|█▌        | 7/46 [00:22<02:04,  3.20s/it][A
[5/5] extract embeddings:  17%|█▋        | 8/46 [00:25<01:58,  3.11s/it][A
[5/5] extract embeddings:  20%|█▉        | 9/46 [00:28<01:53,  3.06s/it][A
[5/5] extract embeddings:  22%|██▏       | 10/46 [00:32<01:55,  3.20s/it][A
[5/5] extract embeddings:  24%|██▍       | 11/46 [00:35<01:52,  3.21s/it][A
[5/5] extract embeddings:  26%|██▌       | 12/46 [00:38<01:45,  3.11s/it][A
[5/5] extract em

[NeMo I 2024-01-15 18:58:39 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings


[NeMo W 2024-01-15 18:58:39 speaker_utils:464] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.

clustering:   0%|          | 0/1 [00:00<?, ?it/s][A
clustering: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]

[NeMo I 2024-01-15 18:58:45 clustering_diarizer:464] Outputs are saved in /content/outputs directory



[NeMo W 2024-01-15 18:58:45 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:58:45 msdd_models:960] Loading embedding pickle file of scale:0 at outputs/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2024-01-15 18:58:45 msdd_models:960] Loading embedding pickle file of scale:1 at outputs/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2024-01-15 18:58:45 msdd_models:960] Loading embedding pickle file of scale:2 at outputs/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2024-01-15 18:58:45 msdd_models:960] Loading embedding pickle file of scale:3 at outputs/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2024-01-15 18:58:45 msdd_models:960] Loading embedding pickle file of scale:4 at outputs/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2024-01-15 18:58:45 msdd_models:938] Loading cluster label file from outputs/speaker_outputs/subsegments_scale4_cluster.label
[NeMo I 2024-01-15 18:58:45 collections:761] Filtered duration for loading c


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  8.74it/s]

[NeMo I 2024-01-15 18:58:46 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 18:58:46 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 18:58:46 speaker_utils:93] Number of files to diarize: 1



[NeMo W 2024-01-15 18:58:46 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:58:46 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:58:46 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:58:46 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:58:46 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:58:46 msdd_models:1431]   
    
[NeMo I 2024-01-15 18:58:46 msdd_models:1403]      [Threshold: 1.0000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 18:58:46 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 18:58:46 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:58:46 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:58:46 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:58:46 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:58:46 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 18:58:46 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 18:58:46 msdd_models:1431]   
    


 12%|█▏        | 7/57 [13:04<1:42:09, 122.59s/it]

[NeMo I 2024-01-15 18:58:46 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2024-01-15 18:58:46 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 18:58:46 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2024-01-15 18:58:46 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 18:58:48 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2024-01-15 18:58:48 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2024-01-15 18:58:48 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2024-01-15 18:58:48 features:289] PADDING: 16
[NeMo I 2024-01-15 18:58:48 features:289] PADDING: 16
[NeMo I 2024-01-15 18:58:49 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 18:58:49 features:289] PADDING: 16
[NeMo I 2024-01-15 18:58:50 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2024-01-15 18:58:50 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 18:58:50 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2024-01-15 18:58:50 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 18:58:50 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2024-01-15 18:58:50 features:289] PADDING: 16
[NeMo I 2024-01-15 18:58:50 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 18:58:50 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2024-01-15 18:58:50 msdd_models:865] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }
[NeMo I 2024-01-15 18:58:50 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 18:58:50 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue



splitting manifest: 100%|██████████| 1/1 [00:00<00:00, 33.29it/s]

[NeMo I 2024-01-15 18:58:50 classification_models:273] Perform streaming frame-level VAD
[NeMo I 2024-01-15 18:58:50 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:58:50 collections:446] Dataset loaded with 7 items, total duration of  0.09 hours.
[NeMo I 2024-01-15 18:58:50 collections:448] # 7 files loaded accounting to # 1 labels




vad:   0%|          | 0/7 [00:00<?, ?it/s][A
vad:  14%|█▍        | 1/7 [00:00<00:02,  2.33it/s][A
vad:  29%|██▊       | 2/7 [00:00<00:01,  3.19it/s][A
vad:  43%|████▎     | 3/7 [00:00<00:01,  3.72it/s][A
vad:  57%|█████▋    | 4/7 [00:01<00:00,  4.10it/s][A
vad:  71%|███████▏  | 5/7 [00:01<00:00,  4.44it/s][A
vad:  86%|████████▌ | 6/7 [00:01<00:00,  4.67it/s][A
vad: 100%|██████████| 7/7 [00:01<00:00,  4.24it/s]

[NeMo I 2024-01-15 18:58:52 clustering_diarizer:250] Generating predictions with overlapping input segments




generating preds:   0%|          | 0/1 [00:00<?, ?it/s][A
generating preds: 100%|██████████| 1/1 [00:02<00:00,  2.62s/it][A
                                                               [A

[NeMo I 2024-01-15 18:58:55 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s][A
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  4.06it/s]

[NeMo I 2024-01-15 18:58:55 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, outputs/speaker_outputs/subsegments_scale0.json
[NeMo I 2024-01-15 18:58:55 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:58:55 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:58:55 collections:446] Dataset loaded with 262 items, total duration of  0.08 hours.
[NeMo I 2024-01-15 18:58:55 collections:448] # 262 files loaded accounting to # 1 labels




[1/5] extract embeddings:   0%|          | 0/5 [00:00<?, ?it/s][A
[1/5] extract embeddings:  20%|██        | 1/5 [00:10<00:42, 10.73s/it][A
[1/5] extract embeddings:  40%|████      | 2/5 [00:20<00:30, 10.13s/it][A
[1/5] extract embeddings:  60%|██████    | 3/5 [00:30<00:19,  9.90s/it][A
[1/5] extract embeddings:  80%|████████  | 4/5 [00:40<00:10, 10.03s/it][A
[1/5] extract embeddings: 100%|██████████| 5/5 [00:40<00:00,  8.20s/it]

[NeMo I 2024-01-15 18:59:36 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 18:59:36 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, outputs/speaker_outputs/subsegments_scale1.json
[NeMo I 2024-01-15 18:59:36 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 18:59:36 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 18:59:36 collections:446] Dataset loaded with 305 items, total duration of  0.08 hours.
[NeMo I 2024-01-15 18:59:36 collections:448] # 305 files loaded accounting to # 1 labels




[2/5] extract embeddings:   0%|          | 0/5 [00:00<?, ?it/s][A
[2/5] extract embeddings:  20%|██        | 1/5 [00:07<00:31,  7.78s/it][A
[2/5] extract embeddings:  40%|████      | 2/5 [00:13<00:20,  6.85s/it][A
[2/5] extract embeddings:  60%|██████    | 3/5 [00:21<00:13,  6.96s/it][A
[2/5] extract embeddings:  80%|████████  | 4/5 [00:27<00:06,  6.71s/it][A
[2/5] extract embeddings: 100%|██████████| 5/5 [00:32<00:00,  6.48s/it]

[NeMo I 2024-01-15 19:00:08 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:00:08 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, outputs/speaker_outputs/subsegments_scale2.json
[NeMo I 2024-01-15 19:00:08 clustering_diarizer:343] Extracting embeddings for Diarization





[NeMo I 2024-01-15 19:00:08 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:00:08 collections:446] Dataset loaded with 377 items, total duration of  0.09 hours.
[NeMo I 2024-01-15 19:00:08 collections:448] # 377 files loaded accounting to # 1 labels



[3/5] extract embeddings:   0%|          | 0/6 [00:00<?, ?it/s][A
[3/5] extract embeddings:  17%|█▋        | 1/6 [00:06<00:31,  6.23s/it][A
[3/5] extract embeddings:  33%|███▎      | 2/6 [00:11<00:22,  5.74s/it][A
[3/5] extract embeddings:  50%|█████     | 3/6 [00:18<00:18,  6.05s/it][A
[3/5] extract embeddings:  67%|██████▋   | 4/6 [00:23<00:11,  5.87s/it][A
[3/5] extract embeddings:  83%|████████▎ | 5/6 [00:30<00:06,  6.16s/it][A
[3/5] extract embeddings: 100%|██████████| 6/6 [00:35<00:00,  5.88s/it]

[NeMo I 2024-01-15 19:00:44 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:00:44 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, outputs/speaker_outputs/subsegments_scale3.json
[NeMo I 2024-01-15 19:00:44 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:00:44 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:00:44 collections:446] Dataset loaded with 488 items, total duration of  0.09 hours.
[NeMo I 2024-01-15 19:00:44 collections:448] # 488 files loaded accounting to # 1 labels




[4/5] extract embeddings:   0%|          | 0/8 [00:00<?, ?it/s][A
[4/5] extract embeddings:  12%|█▎        | 1/8 [00:04<00:30,  4.38s/it][A
[4/5] extract embeddings:  25%|██▌       | 2/8 [00:08<00:26,  4.41s/it][A
[4/5] extract embeddings:  38%|███▊      | 3/8 [00:12<00:20,  4.09s/it][A
[4/5] extract embeddings:  50%|█████     | 4/8 [00:16<00:15,  3.91s/it][A
[4/5] extract embeddings:  62%|██████▎   | 5/8 [00:21<00:12,  4.28s/it][A
[4/5] extract embeddings:  75%|███████▌  | 6/8 [00:24<00:08,  4.06s/it][A
[4/5] extract embeddings:  88%|████████▊ | 7/8 [00:28<00:03,  3.95s/it][A
[4/5] extract embeddings: 100%|██████████| 8/8 [00:30<00:00,  3.85s/it]

[NeMo I 2024-01-15 19:01:15 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:01:15 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, outputs/speaker_outputs/subsegments_scale4.json





[NeMo I 2024-01-15 19:01:15 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:01:15 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:01:15 collections:446] Dataset loaded with 1525 items, total duration of  0.21 hours.
[NeMo I 2024-01-15 19:01:15 collections:448] # 1525 files loaded accounting to # 1 labels



[5/5] extract embeddings:   0%|          | 0/24 [00:00<?, ?it/s][A
[5/5] extract embeddings:   4%|▍         | 1/24 [00:04<01:35,  4.15s/it][A
[5/5] extract embeddings:   8%|▊         | 2/24 [00:07<01:14,  3.40s/it][A
[5/5] extract embeddings:  12%|█▎        | 3/24 [00:09<01:07,  3.19s/it][A
[5/5] extract embeddings:  17%|█▋        | 4/24 [00:12<01:01,  3.07s/it][A
[5/5] extract embeddings:  21%|██        | 5/24 [00:16<01:03,  3.33s/it][A
[5/5] extract embeddings:  25%|██▌       | 6/24 [00:19<00:57,  3.18s/it][A
[5/5] extract embeddings:  29%|██▉       | 7/24 [00:22<00:52,  3.06s/it][A
[5/5] extract embeddings:  33%|███▎      | 8/24 [00:25<00:48,  3.00s/it][A
[5/5] extract embeddings:  38%|███▊      | 9/24 [00:28<00:46,  3.13s/it][A
[5/5] extract embeddings:  42%|████▏     | 10/24 [00:31<00:44,  3.18s/it][A
[5/5] extract embeddings:  46%|████▌     | 11/24 [00:34<00:39,  3.06s/it][A
[5/5] extract embeddings:  50%|█████     | 12/24 [00:37<00:35,  3.00s/it][A
[5/5] extract em

[NeMo I 2024-01-15 19:02:30 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings


[NeMo W 2024-01-15 19:02:30 speaker_utils:464] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.

clustering:   0%|          | 0/1 [00:00<?, ?it/s][A
clustering: 100%|██████████| 1/1 [00:03<00:00,  3.49s/it]

[NeMo I 2024-01-15 19:02:33 clustering_diarizer:464] Outputs are saved in /content/outputs directory



[NeMo W 2024-01-15 19:02:33 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:02:33 msdd_models:960] Loading embedding pickle file of scale:0 at outputs/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2024-01-15 19:02:33 msdd_models:960] Loading embedding pickle file of scale:1 at outputs/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2024-01-15 19:02:33 msdd_models:960] Loading embedding pickle file of scale:2 at outputs/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2024-01-15 19:02:33 msdd_models:960] Loading embedding pickle file of scale:3 at outputs/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2024-01-15 19:02:33 msdd_models:960] Loading embedding pickle file of scale:4 at outputs/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2024-01-15 19:02:33 msdd_models:938] Loading cluster label file from outputs/speaker_outputs/subsegments_scale4_cluster.label
[NeMo I 2024-01-15 19:02:33 collections:761] Filtered duration for loading c


100%|██████████| 1/1 [00:00<00:00, 19.66it/s]

[NeMo I 2024-01-15 19:02:33 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 19:02:33 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:02:33 speaker_utils:93] Number of files to diarize: 1



[NeMo W 2024-01-15 19:02:33 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:02:33 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:02:34 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:02:34 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:02:34 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:02:34 msdd_models:1431]   
    
[NeMo I 2024-01-15 19:02:34 msdd_models:1403]      [Threshold: 1.0000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 19:02:34 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:02:34 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:02:34 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:02:34 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:02:34 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:02:34 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:02:34 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:02:34 msdd_models:1431]   
    


 16%|█▌        | 9/57 [16:52<1:35:38, 119.55s/it]

[NeMo I 2024-01-15 19:02:34 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2024-01-15 19:02:34 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 19:02:34 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2024-01-15 19:02:34 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 19:02:35 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2024-01-15 19:02:35 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2024-01-15 19:02:35 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2024-01-15 19:02:35 features:289] PADDING: 16
[NeMo I 2024-01-15 19:02:35 features:289] PADDING: 16
[NeMo I 2024-01-15 19:02:36 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 19:02:36 features:289] PADDING: 16
[NeMo I 2024-01-15 19:02:37 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2024-01-15 19:02:37 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 19:02:37 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2024-01-15 19:02:37 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 19:02:37 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2024-01-15 19:02:37 features:289] PADDING: 16
[NeMo I 2024-01-15 19:02:37 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 19:02:37 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2024-01-15 19:02:37 msdd_models:865] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }
[NeMo I 2024-01-15 19:02:37 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:02:37 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue



splitting manifest: 100%|██████████| 1/1 [00:00<00:00, 12.54it/s]


[NeMo I 2024-01-15 19:02:37 classification_models:273] Perform streaming frame-level VAD
[NeMo I 2024-01-15 19:02:37 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:02:37 collections:446] Dataset loaded with 14 items, total duration of  0.19 hours.
[NeMo I 2024-01-15 19:02:37 collections:448] # 14 files loaded accounting to # 1 labels



vad:   0%|          | 0/14 [00:00<?, ?it/s][A
vad:   7%|▋         | 1/14 [00:00<00:08,  1.52it/s][A
vad:  14%|█▍        | 2/14 [00:00<00:05,  2.32it/s][A
vad:  21%|██▏       | 3/14 [00:01<00:03,  3.00it/s][A
vad:  29%|██▊       | 4/14 [00:01<00:02,  3.49it/s][A
vad:  36%|███▌      | 5/14 [00:01<00:02,  3.86it/s][A
vad:  43%|████▎     | 6/14 [00:01<00:01,  4.19it/s][A
vad:  50%|█████     | 7/14 [00:01<00:01,  4.38it/s][A
vad:  57%|█████▋    | 8/14 [00:02<00:01,  4.53it/s][A
vad:  64%|██████▍   | 9/14 [00:02<00:01,  4.54it/s][A
vad:  71%|███████▏  | 10/14 [00:02<00:00,  4.55it/s][A
vad:  79%|███████▊  | 11/14 [00:02<00:00,  4.68it/s][A
vad:  86%|████████▌ | 12/14 [00:03<00:00,  4.83it/s][A
vad: 100%|██████████| 14/14 [00:03<00:00,  4.16it/s]

[NeMo I 2024-01-15 19:02:41 clustering_diarizer:250] Generating predictions with overlapping input segments




generating preds:   0%|          | 0/1 [00:00<?, ?it/s][A
generating preds: 100%|██████████| 1/1 [00:05<00:00,  5.53s/it][A
                                                               [A

[NeMo I 2024-01-15 19:02:46 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s][A
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  2.15it/s]

[NeMo I 2024-01-15 19:02:47 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, outputs/speaker_outputs/subsegments_scale0.json
[NeMo I 2024-01-15 19:02:47 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:02:47 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:02:47 collections:446] Dataset loaded with 621 items, total duration of  0.20 hours.
[NeMo I 2024-01-15 19:02:47 collections:448] # 621 files loaded accounting to # 1 labels




[1/5] extract embeddings:   0%|          | 0/10 [00:00<?, ?it/s][A
[1/5] extract embeddings:  10%|█         | 1/10 [00:09<01:29,  9.93s/it][A
[1/5] extract embeddings:  20%|██        | 2/10 [00:19<01:17,  9.73s/it][A
[1/5] extract embeddings:  30%|███       | 3/10 [00:28<01:05,  9.29s/it][A
[1/5] extract embeddings:  40%|████      | 4/10 [00:37<00:55,  9.30s/it][A
[1/5] extract embeddings:  50%|█████     | 5/10 [00:47<00:46,  9.40s/it][A
[1/5] extract embeddings:  60%|██████    | 6/10 [00:55<00:36,  9.18s/it][A
[1/5] extract embeddings:  70%|███████   | 7/10 [01:05<00:27,  9.31s/it][A
[1/5] extract embeddings:  80%|████████  | 8/10 [01:15<00:19,  9.51s/it][A
[1/5] extract embeddings:  90%|█████████ | 9/10 [01:24<00:09,  9.43s/it][A
[1/5] extract embeddings: 100%|██████████| 10/10 [01:30<00:00,  9.04s/it]

[NeMo I 2024-01-15 19:04:17 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:04:17 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, outputs/speaker_outputs/subsegments_scale1.json
[NeMo I 2024-01-15 19:04:17 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:04:17 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:04:17 collections:446] Dataset loaded with 735 items, total duration of  0.21 hours.
[NeMo I 2024-01-15 19:04:17 collections:448] # 735 files loaded accounting to # 1 labels




[2/5] extract embeddings:   0%|          | 0/12 [00:00<?, ?it/s][A
[2/5] extract embeddings:   8%|▊         | 1/12 [00:08<01:33,  8.52s/it][A
[2/5] extract embeddings:  17%|█▋        | 2/12 [00:15<01:17,  7.78s/it][A
[2/5] extract embeddings:  25%|██▌       | 3/12 [00:24<01:12,  8.07s/it][A
[2/5] extract embeddings:  33%|███▎      | 4/12 [00:31<01:01,  7.74s/it][A
[2/5] extract embeddings:  42%|████▏     | 5/12 [00:39<00:55,  7.91s/it][A
[2/5] extract embeddings:  50%|█████     | 6/12 [00:48<00:49,  8.17s/it][A
[2/5] extract embeddings:  58%|█████▊    | 7/12 [00:55<00:39,  7.95s/it][A
[2/5] extract embeddings:  67%|██████▋   | 8/12 [01:04<00:32,  8.15s/it][A
[2/5] extract embeddings:  75%|███████▌  | 9/12 [01:11<00:23,  7.91s/it][A
[2/5] extract embeddings:  83%|████████▎ | 10/12 [01:20<00:16,  8.05s/it][A
[2/5] extract embeddings:  92%|█████████▏| 11/12 [01:28<00:08,  8.06s/it][A
[2/5] extract embeddings: 100%|██████████| 12/12 [01:31<00:00,  7.62s/it]

[NeMo I 2024-01-15 19:05:49 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:05:49 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, outputs/speaker_outputs/subsegments_scale2.json
[NeMo I 2024-01-15 19:05:49 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:05:49 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:05:49 collections:446] Dataset loaded with 903 items, total duration of  0.22 hours.
[NeMo I 2024-01-15 19:05:49 collections:448] # 903 files loaded accounting to # 1 labels




[3/5] extract embeddings:   0%|          | 0/15 [00:00<?, ?it/s][A
[3/5] extract embeddings:   7%|▋         | 1/15 [00:05<01:18,  5.60s/it][A
[3/5] extract embeddings:  13%|█▎        | 2/15 [00:11<01:17,  5.95s/it][A
[3/5] extract embeddings:  20%|██        | 3/15 [00:16<01:07,  5.59s/it][A
[3/5] extract embeddings:  27%|██▋       | 4/15 [00:22<01:01,  5.58s/it][A
[3/5] extract embeddings:  33%|███▎      | 5/15 [00:28<00:55,  5.56s/it][A
[3/5] extract embeddings:  40%|████      | 6/15 [00:33<00:48,  5.42s/it][A
[3/5] extract embeddings:  47%|████▋     | 7/15 [00:39<00:45,  5.71s/it][A
[3/5] extract embeddings:  53%|█████▎    | 8/15 [00:44<00:38,  5.49s/it][A
[3/5] extract embeddings:  60%|██████    | 9/15 [00:50<00:33,  5.54s/it][A
[3/5] extract embeddings:  67%|██████▋   | 10/15 [00:55<00:27,  5.48s/it][A
[3/5] extract embeddings:  73%|███████▎  | 11/15 [01:00<00:21,  5.36s/it][A
[3/5] extract embeddings:  80%|████████  | 12/15 [01:06<00:16,  5.65s/it][A
[3/5] extract e

[NeMo I 2024-01-15 19:07:08 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:07:08 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, outputs/speaker_outputs/subsegments_scale3.json
[NeMo I 2024-01-15 19:07:08 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:07:08 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:07:08 collections:446] Dataset loaded with 1198 items, total duration of  0.23 hours.
[NeMo I 2024-01-15 19:07:08 collections:448] # 1198 files loaded accounting to # 1 labels



[4/5] extract embeddings:   0%|          | 0/19 [00:00<?, ?it/s][A
[4/5] extract embeddings:   5%|▌         | 1/19 [00:04<01:16,  4.24s/it][A
[4/5] extract embeddings:  11%|█         | 2/19 [00:07<01:07,  3.95s/it][A
[4/5] extract embeddings:  16%|█▌        | 3/19 [00:12<01:06,  4.18s/it][A
[4/5] extract embeddings:  21%|██        | 4/19 [00:16<01:01,  4.09s/it][A
[4/5] extract embeddings:  26%|██▋       | 5/19 [00:20<00:55,  3.96s/it][A
[4/5] extract embeddings:  32%|███▏      | 6/19 [00:24<00:51,  3.93s/it][A
[4/5] extract embeddings:  37%|███▋      | 7/19 [00:28<00:49,  4.15s/it][A
[4/5] extract embeddings:  42%|████▏     | 8/19 [00:32<00:44,  4.03s/it][A
[4/5] extract embeddings:  47%|████▋     | 9/19 [00:36<00:39,  3.94s/it][A
[4/5] extract embeddings:  53%|█████▎    | 10/19 [00:40<00:38,  4.23s/it][A
[4/5] extract embeddings:  58%|█████▊    | 11/19 [00:44<00:32,  4.10s/it][A
[4/5] extract embeddings:  63%|██████▎   | 12/19 [00:48<00:27,  3.98s/it][A
[4/5] extract em

[NeMo I 2024-01-15 19:08:24 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:08:24 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, outputs/speaker_outputs/subsegments_scale4.json





[NeMo I 2024-01-15 19:08:24 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:08:24 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:08:24 collections:446] Dataset loaded with 3944 items, total duration of  0.54 hours.
[NeMo I 2024-01-15 19:08:24 collections:448] # 3944 files loaded accounting to # 1 labels



[5/5] extract embeddings:   0%|          | 0/62 [00:00<?, ?it/s][A
[5/5] extract embeddings:   2%|▏         | 1/62 [00:04<04:08,  4.07s/it][A
[5/5] extract embeddings:   3%|▎         | 2/62 [00:07<03:28,  3.48s/it][A
[5/5] extract embeddings:   5%|▍         | 3/62 [00:10<03:10,  3.24s/it][A
[5/5] extract embeddings:   6%|▋         | 4/62 [00:13<03:00,  3.12s/it][A
[5/5] extract embeddings:   8%|▊         | 5/62 [00:16<03:03,  3.21s/it][A
[5/5] extract embeddings:  10%|▉         | 6/62 [00:19<03:04,  3.30s/it][A
[5/5] extract embeddings:  11%|█▏        | 7/62 [00:22<02:53,  3.16s/it][A
[5/5] extract embeddings:  13%|█▎        | 8/62 [00:25<02:45,  3.07s/it][A
[5/5] extract embeddings:  15%|█▍        | 9/62 [00:28<02:41,  3.04s/it][A
[5/5] extract embeddings:  16%|█▌        | 10/62 [00:32<02:53,  3.34s/it][A
[5/5] extract embeddings:  18%|█▊        | 11/62 [00:35<02:43,  3.20s/it][A
[5/5] extract embeddings:  19%|█▉        | 12/62 [00:38<02:35,  3.11s/it][A
[5/5] extract em

[NeMo I 2024-01-15 19:11:37 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings


[NeMo W 2024-01-15 19:11:37 speaker_utils:464] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.

clustering:   0%|          | 0/1 [00:00<?, ?it/s][A
clustering: 100%|██████████| 1/1 [00:10<00:00, 10.51s/it]

[NeMo I 2024-01-15 19:11:48 clustering_diarizer:464] Outputs are saved in /content/outputs directory



[NeMo W 2024-01-15 19:11:48 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:11:48 msdd_models:960] Loading embedding pickle file of scale:0 at outputs/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2024-01-15 19:11:48 msdd_models:960] Loading embedding pickle file of scale:1 at outputs/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2024-01-15 19:11:48 msdd_models:960] Loading embedding pickle file of scale:2 at outputs/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2024-01-15 19:11:48 msdd_models:960] Loading embedding pickle file of scale:3 at outputs/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2024-01-15 19:11:48 msdd_models:960] Loading embedding pickle file of scale:4 at outputs/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2024-01-15 19:11:48 msdd_models:938] Loading cluster label file from outputs/speaker_outputs/subsegments_scale4_cluster.label
[NeMo I 2024-01-15 19:11:49 collections:761] Filtered duration for loading c


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  4.12it/s]

[NeMo I 2024-01-15 19:11:49 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 19:11:49 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:11:49 speaker_utils:93] Number of files to diarize: 1



[NeMo W 2024-01-15 19:11:49 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:11:49 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:11:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:11:50 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:11:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:11:50 msdd_models:1431]   
    
[NeMo I 2024-01-15 19:11:50 msdd_models:1403]      [Threshold: 1.0000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 19:11:50 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:11:50 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:11:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:11:50 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:11:50 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:11:50 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:11:51 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:11:51 msdd_models:1431]   
    


 23%|██▎       | 13/57 [26:08<1:34:47, 129.27s/it]

[NeMo I 2024-01-15 19:11:51 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2024-01-15 19:11:51 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 19:11:51 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2024-01-15 19:11:51 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 19:11:52 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2024-01-15 19:11:52 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2024-01-15 19:11:52 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2024-01-15 19:11:52 features:289] PADDING: 16
[NeMo I 2024-01-15 19:11:52 features:289] PADDING: 16
[NeMo I 2024-01-15 19:11:53 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 19:11:53 features:289] PADDING: 16
[NeMo I 2024-01-15 19:11:53 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2024-01-15 19:11:53 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 19:11:53 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2024-01-15 19:11:53 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 19:11:53 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2024-01-15 19:11:53 features:289] PADDING: 16
[NeMo I 2024-01-15 19:11:53 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 19:11:53 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2024-01-15 19:11:53 msdd_models:865] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }
[NeMo I 2024-01-15 19:11:53 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:11:53 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue



splitting manifest: 100%|██████████| 1/1 [00:00<00:00, 19.13it/s]

[NeMo I 2024-01-15 19:11:53 classification_models:273] Perform streaming frame-level VAD
[NeMo I 2024-01-15 19:11:53 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:11:53 collections:446] Dataset loaded with 12 items, total duration of  0.16 hours.
[NeMo I 2024-01-15 19:11:53 collections:448] # 12 files loaded accounting to # 1 labels




vad:   0%|          | 0/12 [00:00<?, ?it/s][A
vad:   8%|▊         | 1/12 [00:00<00:05,  2.17it/s][A
vad:  17%|█▋        | 2/12 [00:00<00:03,  3.00it/s][A
vad:  25%|██▌       | 3/12 [00:00<00:02,  3.57it/s][A
vad:  33%|███▎      | 4/12 [00:01<00:02,  3.94it/s][A
vad:  42%|████▏     | 5/12 [00:01<00:01,  4.17it/s][A
vad:  50%|█████     | 6/12 [00:01<00:01,  4.32it/s][A
vad:  58%|█████▊    | 7/12 [00:01<00:01,  4.51it/s][A
vad:  67%|██████▋   | 8/12 [00:01<00:00,  4.68it/s][A
vad:  75%|███████▌  | 9/12 [00:02<00:00,  4.77it/s][A
vad:  83%|████████▎ | 10/12 [00:02<00:00,  4.91it/s][A
vad:  92%|█████████▏| 11/12 [00:02<00:00,  5.09it/s][A
vad: 100%|██████████| 12/12 [00:02<00:00,  4.38it/s]


[NeMo I 2024-01-15 19:11:56 clustering_diarizer:250] Generating predictions with overlapping input segments



generating preds:   0%|          | 0/1 [00:00<?, ?it/s][A
generating preds: 100%|██████████| 1/1 [00:04<00:00,  4.78s/it][A
                                                               [A

[NeMo I 2024-01-15 19:12:01 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s][A
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  1.64it/s]

[NeMo I 2024-01-15 19:12:02 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, outputs/speaker_outputs/subsegments_scale0.json
[NeMo I 2024-01-15 19:12:02 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:12:02 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:12:02 collections:446] Dataset loaded with 477 items, total duration of  0.14 hours.
[NeMo I 2024-01-15 19:12:02 collections:448] # 477 files loaded accounting to # 1 labels




[1/5] extract embeddings:   0%|          | 0/8 [00:00<?, ?it/s][A
[1/5] extract embeddings:  12%|█▎        | 1/8 [00:09<01:06,  9.52s/it][A
[1/5] extract embeddings:  25%|██▌       | 2/8 [00:19<00:57,  9.61s/it][A
[1/5] extract embeddings:  38%|███▊      | 3/8 [00:28<00:47,  9.59s/it][A
[1/5] extract embeddings:  50%|█████     | 4/8 [00:37<00:37,  9.30s/it][A
[1/5] extract embeddings:  62%|██████▎   | 5/8 [00:47<00:28,  9.40s/it][A
[1/5] extract embeddings:  75%|███████▌  | 6/8 [00:56<00:18,  9.50s/it][A
[1/5] extract embeddings:  88%|████████▊ | 7/8 [01:05<00:09,  9.25s/it][A
[1/5] extract embeddings: 100%|██████████| 8/8 [01:09<00:00,  8.74s/it]

[NeMo I 2024-01-15 19:13:12 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:13:12 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, outputs/speaker_outputs/subsegments_scale1.json
[NeMo I 2024-01-15 19:13:12 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:13:12 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:13:12 collections:446] Dataset loaded with 563 items, total duration of  0.15 hours.
[NeMo I 2024-01-15 19:13:12 collections:448] # 563 files loaded accounting to # 1 labels




[2/5] extract embeddings:   0%|          | 0/9 [00:00<?, ?it/s][A
[2/5] extract embeddings:  11%|█         | 1/9 [00:07<00:58,  7.36s/it][A
[2/5] extract embeddings:  22%|██▏       | 2/9 [00:15<00:55,  7.86s/it][A
[2/5] extract embeddings:  33%|███▎      | 3/9 [00:22<00:45,  7.52s/it][A
[2/5] extract embeddings:  44%|████▍     | 4/9 [00:30<00:38,  7.66s/it][A
[2/5] extract embeddings:  56%|█████▌    | 5/9 [00:37<00:30,  7.53s/it][A
[2/5] extract embeddings:  67%|██████▋   | 6/9 [00:45<00:22,  7.46s/it][A
[2/5] extract embeddings:  78%|███████▊  | 7/9 [00:52<00:15,  7.56s/it][A
[2/5] extract embeddings:  89%|████████▉ | 8/9 [00:59<00:07,  7.33s/it][A
[2/5] extract embeddings: 100%|██████████| 9/9 [01:05<00:00,  7.29s/it]

[NeMo I 2024-01-15 19:14:17 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:14:17 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, outputs/speaker_outputs/subsegments_scale2.json
[NeMo I 2024-01-15 19:14:17 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:14:17 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:14:17 collections:446] Dataset loaded with 679 items, total duration of  0.16 hours.
[NeMo I 2024-01-15 19:14:17 collections:448] # 679 files loaded accounting to # 1 labels




[3/5] extract embeddings:   0%|          | 0/11 [00:00<?, ?it/s][A
[3/5] extract embeddings:   9%|▉         | 1/11 [00:05<00:53,  5.32s/it][A
[3/5] extract embeddings:  18%|█▊        | 2/11 [00:10<00:47,  5.27s/it][A
[3/5] extract embeddings:  27%|██▋       | 3/11 [00:16<00:43,  5.48s/it][A
[3/5] extract embeddings:  36%|███▋      | 4/11 [00:21<00:36,  5.26s/it][A
[3/5] extract embeddings:  45%|████▌     | 5/11 [00:27<00:32,  5.49s/it][A
[3/5] extract embeddings:  55%|█████▍    | 6/11 [00:31<00:26,  5.27s/it][A
[3/5] extract embeddings:  64%|██████▎   | 7/11 [00:37<00:20,  5.23s/it][A
[3/5] extract embeddings:  73%|███████▎  | 8/11 [00:42<00:16,  5.41s/it][A
[3/5] extract embeddings:  82%|████████▏ | 9/11 [00:47<00:10,  5.29s/it][A
[3/5] extract embeddings:  91%|█████████ | 10/11 [00:53<00:05,  5.50s/it][A
[3/5] extract embeddings: 100%|██████████| 11/11 [00:57<00:00,  5.19s/it]

[NeMo I 2024-01-15 19:15:14 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:15:14 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, outputs/speaker_outputs/subsegments_scale3.json
[NeMo I 2024-01-15 19:15:14 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:15:14 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:15:14 collections:446] Dataset loaded with 909 items, total duration of  0.17 hours.
[NeMo I 2024-01-15 19:15:14 collections:448] # 909 files loaded accounting to # 1 labels




[4/5] extract embeddings:   0%|          | 0/15 [00:00<?, ?it/s][A
[4/5] extract embeddings:   7%|▋         | 1/15 [00:04<00:57,  4.08s/it][A
[4/5] extract embeddings:  13%|█▎        | 2/15 [00:08<00:55,  4.27s/it][A
[4/5] extract embeddings:  20%|██        | 3/15 [00:12<00:49,  4.09s/it][A
[4/5] extract embeddings:  27%|██▋       | 4/15 [00:16<00:43,  3.93s/it][A
[4/5] extract embeddings:  33%|███▎      | 5/15 [00:19<00:38,  3.88s/it][A
[4/5] extract embeddings:  40%|████      | 6/15 [00:24<00:36,  4.10s/it][A
[4/5] extract embeddings:  47%|████▋     | 7/15 [00:28<00:31,  3.98s/it][A
[4/5] extract embeddings:  53%|█████▎    | 8/15 [00:31<00:27,  3.88s/it][A
[4/5] extract embeddings:  60%|██████    | 9/15 [00:36<00:24,  4.12s/it][A
[4/5] extract embeddings:  67%|██████▋   | 10/15 [00:40<00:19,  4.00s/it][A
[4/5] extract embeddings:  73%|███████▎  | 11/15 [00:43<00:15,  3.90s/it][A
[4/5] extract embeddings:  80%|████████  | 12/15 [00:47<00:11,  3.97s/it][A
[4/5] extract e

[NeMo I 2024-01-15 19:16:11 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:16:11 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, outputs/speaker_outputs/subsegments_scale4.json





[NeMo I 2024-01-15 19:16:11 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:16:11 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:16:11 collections:446] Dataset loaded with 2863 items, total duration of  0.39 hours.
[NeMo I 2024-01-15 19:16:11 collections:448] # 2863 files loaded accounting to # 1 labels



[5/5] extract embeddings:   0%|          | 0/45 [00:00<?, ?it/s][A
[5/5] extract embeddings:   2%|▏         | 1/45 [00:03<02:22,  3.23s/it][A
[5/5] extract embeddings:   4%|▍         | 2/45 [00:07<02:34,  3.58s/it][A
[5/5] extract embeddings:   7%|▋         | 3/45 [00:09<02:14,  3.21s/it][A
[5/5] extract embeddings:   9%|▉         | 4/45 [00:12<02:05,  3.06s/it][A
[5/5] extract embeddings:  11%|█         | 5/45 [00:15<01:58,  2.97s/it][A
[5/5] extract embeddings:  13%|█▎        | 6/45 [00:18<02:02,  3.15s/it][A
[5/5] extract embeddings:  16%|█▌        | 7/45 [00:22<01:59,  3.14s/it][A
[5/5] extract embeddings:  18%|█▊        | 8/45 [00:24<01:52,  3.04s/it][A
[5/5] extract embeddings:  20%|██        | 9/45 [00:27<01:46,  2.97s/it][A
[5/5] extract embeddings:  22%|██▏       | 10/45 [00:30<01:45,  3.03s/it][A
[5/5] extract embeddings:  24%|██▍       | 11/45 [00:34<01:48,  3.20s/it][A
[5/5] extract embeddings:  27%|██▋       | 12/45 [00:37<01:42,  3.10s/it][A
[5/5] extract em

[NeMo I 2024-01-15 19:18:28 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings


[NeMo W 2024-01-15 19:18:28 speaker_utils:464] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.

clustering:   0%|          | 0/1 [00:00<?, ?it/s][A
clustering: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]

[NeMo I 2024-01-15 19:18:34 clustering_diarizer:464] Outputs are saved in /content/outputs directory



[NeMo W 2024-01-15 19:18:34 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:18:34 msdd_models:960] Loading embedding pickle file of scale:0 at outputs/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2024-01-15 19:18:34 msdd_models:960] Loading embedding pickle file of scale:1 at outputs/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2024-01-15 19:18:34 msdd_models:960] Loading embedding pickle file of scale:2 at outputs/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2024-01-15 19:18:34 msdd_models:960] Loading embedding pickle file of scale:3 at outputs/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2024-01-15 19:18:34 msdd_models:960] Loading embedding pickle file of scale:4 at outputs/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2024-01-15 19:18:34 msdd_models:938] Loading cluster label file from outputs/speaker_outputs/subsegments_scale4_cluster.label
[NeMo I 2024-01-15 19:18:34 collections:761] Filtered duration for loading c


100%|██████████| 1/1 [00:00<00:00, 10.04it/s]

[NeMo I 2024-01-15 19:18:34 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 19:18:34 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:18:34 speaker_utils:93] Number of files to diarize: 1



[NeMo W 2024-01-15 19:18:34 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:18:34 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:18:35 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:18:35 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:18:35 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:18:35 msdd_models:1431]   
    
[NeMo I 2024-01-15 19:18:35 msdd_models:1403]      [Threshold: 1.0000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 19:18:35 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:18:35 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:18:35 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:18:35 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:18:35 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:18:35 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:18:35 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:18:35 msdd_models:1431]   
    


 56%|█████▌    | 32/57 [32:53<19:12, 46.10s/it]   

[NeMo I 2024-01-15 19:18:35 msdd_models:1092] Loading pretrained diar_msdd_telephonic model from NGC
[NeMo I 2024-01-15 19:18:35 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 19:18:35 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo
[NeMo I 2024-01-15 19:18:35 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 19:18:36 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: true
    
[NeMo W 2024-01-15 19:18:36 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    emb_dir: null
    sample_rate: 16000
    num_spks: 2
    soft_label_thres: 0.5
    labels: null
    batch_size: 15
    emb_batch_size: 0
    shuffle: false
    
[NeMo W 2024-01-15 19:18:36 modelPT:174] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple

[NeMo I 2024-01-15 19:18:36 features:289] PADDING: 16
[NeMo I 2024-01-15 19:18:37 features:289] PADDING: 16
[NeMo I 2024-01-15 19:18:37 save_restore_connector:249] Model EncDecDiarLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/diar_msdd_telephonic/3c3697a0a46f945574fa407149975a13/diar_msdd_telephonic.nemo.
[NeMo I 2024-01-15 19:18:37 features:289] PADDING: 16
[NeMo I 2024-01-15 19:18:38 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2024-01-15 19:18:38 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 19:18:38 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo
[NeMo I 2024-01-15 19:18:38 common:913] Instantiating model from pre-trained checkpoint


[NeMo W 2024-01-15 19:18:38 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2024-01-15 19:18:38 features:289] PADDING: 16
[NeMo I 2024-01-15 19:18:38 save_restore_connector:249] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_1.21.0rc0/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2024-01-15 19:18:38 msdd_models:864] Multiscale Weights: [1, 1, 1, 1, 1]
[NeMo I 2024-01-15 19:18:38 msdd_models:865] Clustering Parameters: {
        "oracle_num_speakers": false,
        "max_num_speakers": 8,
        "enhanced_count_thres": 80,
        "max_rp_threshold": 0.25,
        "sparse_search_volume": 30,
        "maj_vote_spk_count": false,
        "chunk_cluster_count": 50,
        "embeddings_per_chunk": 10000
    }
[NeMo I 2024-01-15 19:18:38 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:18:38 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue



splitting manifest: 100%|██████████| 1/1 [00:00<00:00, 19.72it/s]

[NeMo I 2024-01-15 19:18:38 classification_models:273] Perform streaming frame-level VAD
[NeMo I 2024-01-15 19:18:38 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:18:38 collections:446] Dataset loaded with 12 items, total duration of  0.16 hours.
[NeMo I 2024-01-15 19:18:38 collections:448] # 12 files loaded accounting to # 1 labels




vad:   0%|          | 0/12 [00:00<?, ?it/s][A
vad:   8%|▊         | 1/12 [00:00<00:05,  2.18it/s][A
vad:  17%|█▋        | 2/12 [00:00<00:03,  3.04it/s][A
vad:  25%|██▌       | 3/12 [00:00<00:02,  3.56it/s][A
vad:  33%|███▎      | 4/12 [00:01<00:02,  3.99it/s][A
vad:  42%|████▏     | 5/12 [00:01<00:01,  4.26it/s][A
vad:  50%|█████     | 6/12 [00:01<00:01,  4.42it/s][A
vad:  58%|█████▊    | 7/12 [00:01<00:01,  4.49it/s][A
vad:  67%|██████▋   | 8/12 [00:01<00:00,  4.64it/s][A
vad:  75%|███████▌  | 9/12 [00:02<00:00,  4.74it/s][A
vad:  83%|████████▎ | 10/12 [00:02<00:00,  4.91it/s][A
vad: 100%|██████████| 12/12 [00:02<00:00,  4.46it/s]

[NeMo I 2024-01-15 19:18:41 clustering_diarizer:250] Generating predictions with overlapping input segments




generating preds:   0%|          | 0/1 [00:00<?, ?it/s][A
generating preds: 100%|██████████| 1/1 [00:05<00:00,  5.46s/it][A
                                                               [A

[NeMo I 2024-01-15 19:18:46 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments:   0%|          | 0/1 [00:00<?, ?it/s][A
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  2.48it/s]

[NeMo I 2024-01-15 19:18:47 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, outputs/speaker_outputs/subsegments_scale0.json
[NeMo I 2024-01-15 19:18:47 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:18:47 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:18:47 collections:446] Dataset loaded with 454 items, total duration of  0.14 hours.
[NeMo I 2024-01-15 19:18:47 collections:448] # 454 files loaded accounting to # 1 labels




[1/5] extract embeddings:   0%|          | 0/8 [00:00<?, ?it/s][A
[1/5] extract embeddings:  12%|█▎        | 1/8 [00:09<01:04,  9.17s/it][A
[1/5] extract embeddings:  25%|██▌       | 2/8 [00:18<00:54,  9.06s/it][A
[1/5] extract embeddings:  38%|███▊      | 3/8 [00:27<00:46,  9.31s/it][A
[1/5] extract embeddings:  50%|█████     | 4/8 [00:36<00:36,  9.19s/it][A
[1/5] extract embeddings:  62%|██████▎   | 5/8 [00:45<00:27,  9.07s/it][A
[1/5] extract embeddings:  75%|███████▌  | 6/8 [00:55<00:18,  9.32s/it][A
[1/5] extract embeddings:  88%|████████▊ | 7/8 [01:04<00:09,  9.29s/it][A
[1/5] extract embeddings: 100%|██████████| 8/8 [01:05<00:00,  8.16s/it]

[NeMo I 2024-01-15 19:19:52 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:19:52 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, outputs/speaker_outputs/subsegments_scale1.json
[NeMo I 2024-01-15 19:19:52 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:19:52 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:19:52 collections:446] Dataset loaded with 542 items, total duration of  0.15 hours.
[NeMo I 2024-01-15 19:19:52 collections:448] # 542 files loaded accounting to # 1 labels




[2/5] extract embeddings:   0%|          | 0/9 [00:00<?, ?it/s][A
[2/5] extract embeddings:  11%|█         | 1/9 [00:07<00:58,  7.29s/it][A
[2/5] extract embeddings:  22%|██▏       | 2/9 [00:15<00:55,  7.91s/it][A
[2/5] extract embeddings:  33%|███▎      | 3/9 [00:22<00:44,  7.46s/it][A
[2/5] extract embeddings:  44%|████▍     | 4/9 [00:30<00:38,  7.63s/it][A
[2/5] extract embeddings:  56%|█████▌    | 5/9 [00:37<00:30,  7.53s/it][A
[2/5] extract embeddings:  67%|██████▋   | 6/9 [00:45<00:22,  7.58s/it][A
[2/5] extract embeddings:  78%|███████▊  | 7/9 [00:53<00:15,  7.74s/it][A
[2/5] extract embeddings:  89%|████████▉ | 8/9 [01:00<00:07,  7.48s/it][A
[2/5] extract embeddings: 100%|██████████| 9/9 [01:03<00:00,  7.06s/it]

[NeMo I 2024-01-15 19:20:56 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:20:56 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, outputs/speaker_outputs/subsegments_scale2.json





[NeMo I 2024-01-15 19:20:56 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:20:56 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:20:56 collections:446] Dataset loaded with 658 items, total duration of  0.15 hours.
[NeMo I 2024-01-15 19:20:56 collections:448] # 658 files loaded accounting to # 1 labels



[3/5] extract embeddings:   0%|          | 0/11 [00:00<?, ?it/s][A
[3/5] extract embeddings:   9%|▉         | 1/11 [00:05<00:58,  5.85s/it][A
[3/5] extract embeddings:  18%|█▊        | 2/11 [00:10<00:48,  5.36s/it][A
[3/5] extract embeddings:  27%|██▋       | 3/11 [00:16<00:45,  5.64s/it][A
[3/5] extract embeddings:  36%|███▋      | 4/11 [00:21<00:37,  5.40s/it][A
[3/5] extract embeddings:  45%|████▌     | 5/11 [00:27<00:32,  5.47s/it][A
[3/5] extract embeddings:  55%|█████▍    | 6/11 [00:32<00:27,  5.46s/it][A
[3/5] extract embeddings:  64%|██████▎   | 7/11 [00:37<00:21,  5.30s/it][A
[3/5] extract embeddings:  73%|███████▎  | 8/11 [00:44<00:16,  5.57s/it][A
[3/5] extract embeddings:  82%|████████▏ | 9/11 [00:49<00:10,  5.39s/it][A
[3/5] extract embeddings:  91%|█████████ | 10/11 [00:54<00:05,  5.45s/it][A
[3/5] extract embeddings: 100%|██████████| 11/11 [00:56<00:00,  5.13s/it]

[NeMo I 2024-01-15 19:21:52 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:21:52 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, outputs/speaker_outputs/subsegments_scale3.json
[NeMo I 2024-01-15 19:21:52 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:21:52 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:21:52 collections:446] Dataset loaded with 862 items, total duration of  0.16 hours.
[NeMo I 2024-01-15 19:21:52 collections:448] # 862 files loaded accounting to # 1 labels




[4/5] extract embeddings:   0%|          | 0/14 [00:00<?, ?it/s][A
[4/5] extract embeddings:   7%|▋         | 1/14 [00:04<00:52,  4.03s/it][A
[4/5] extract embeddings:  14%|█▍        | 2/14 [00:07<00:45,  3.77s/it][A
[4/5] extract embeddings:  21%|██▏       | 3/14 [00:11<00:44,  4.04s/it][A
[4/5] extract embeddings:  29%|██▊       | 4/14 [00:15<00:39,  3.96s/it][A
[4/5] extract embeddings:  36%|███▌      | 5/14 [00:19<00:34,  3.79s/it][A
[4/5] extract embeddings:  43%|████▎     | 6/14 [00:22<00:29,  3.74s/it][A
[4/5] extract embeddings:  50%|█████     | 7/14 [00:27<00:27,  3.96s/it][A
[4/5] extract embeddings:  57%|█████▋    | 8/14 [00:30<00:22,  3.83s/it][A
[4/5] extract embeddings:  64%|██████▍   | 9/14 [00:34<00:18,  3.72s/it][A
[4/5] extract embeddings:  71%|███████▏  | 10/14 [00:38<00:15,  3.87s/it][A
[4/5] extract embeddings:  79%|███████▊  | 11/14 [00:42<00:11,  3.85s/it][A
[4/5] extract embeddings:  86%|████████▌ | 12/14 [00:46<00:07,  3.81s/it][A
[4/5] extract e

[NeMo I 2024-01-15 19:22:45 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings
[NeMo I 2024-01-15 19:22:45 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, outputs/speaker_outputs/subsegments_scale4.json
[NeMo I 2024-01-15 19:22:45 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-01-15 19:22:45 collections:445] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-01-15 19:22:45 collections:446] Dataset loaded with 2761 items, total duration of  0.38 hours.
[NeMo I 2024-01-15 19:22:45 collections:448] # 2761 files loaded accounting to # 1 labels



[5/5] extract embeddings:   0%|          | 0/44 [00:00<?, ?it/s][A
[5/5] extract embeddings:   2%|▏         | 1/44 [00:03<02:27,  3.43s/it][A
[5/5] extract embeddings:   5%|▍         | 2/44 [00:06<02:08,  3.05s/it][A
[5/5] extract embeddings:   7%|▋         | 3/44 [00:08<01:59,  2.92s/it][A
[5/5] extract embeddings:   9%|▉         | 4/44 [00:12<02:03,  3.09s/it][A
[5/5] extract embeddings:  11%|█▏        | 5/44 [00:15<02:02,  3.14s/it][A
[5/5] extract embeddings:  14%|█▎        | 6/44 [00:18<01:54,  3.01s/it][A
[5/5] extract embeddings:  16%|█▌        | 7/44 [00:21<01:48,  2.93s/it][A
[5/5] extract embeddings:  18%|█▊        | 8/44 [00:23<01:44,  2.91s/it][A
[5/5] extract embeddings:  20%|██        | 9/44 [00:27<01:50,  3.16s/it][A
[5/5] extract embeddings:  23%|██▎       | 10/44 [00:30<01:42,  3.03s/it][A
[5/5] extract embeddings:  25%|██▌       | 11/44 [00:33<01:37,  2.96s/it][A
[5/5] extract embeddings:  27%|██▋       | 12/44 [00:35<01:32,  2.89s/it][A
[5/5] extract em

[NeMo I 2024-01-15 19:24:54 clustering_diarizer:389] Saved embedding files to outputs/speaker_outputs/embeddings


[NeMo W 2024-01-15 19:24:54 speaker_utils:464] cuda=False, using CPU for eigen decomposition. This might slow down the clustering process.

clustering:   0%|          | 0/1 [00:00<?, ?it/s][A
clustering: 100%|██████████| 1/1 [00:05<00:00,  5.30s/it]

[NeMo I 2024-01-15 19:25:00 clustering_diarizer:464] Outputs are saved in /content/outputs directory



[NeMo W 2024-01-15 19:25:00 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:25:00 msdd_models:960] Loading embedding pickle file of scale:0 at outputs/speaker_outputs/embeddings/subsegments_scale0_embeddings.pkl
[NeMo I 2024-01-15 19:25:00 msdd_models:960] Loading embedding pickle file of scale:1 at outputs/speaker_outputs/embeddings/subsegments_scale1_embeddings.pkl
[NeMo I 2024-01-15 19:25:00 msdd_models:960] Loading embedding pickle file of scale:2 at outputs/speaker_outputs/embeddings/subsegments_scale2_embeddings.pkl
[NeMo I 2024-01-15 19:25:00 msdd_models:960] Loading embedding pickle file of scale:3 at outputs/speaker_outputs/embeddings/subsegments_scale3_embeddings.pkl
[NeMo I 2024-01-15 19:25:00 msdd_models:960] Loading embedding pickle file of scale:4 at outputs/speaker_outputs/embeddings/subsegments_scale4_embeddings.pkl
[NeMo I 2024-01-15 19:25:00 msdd_models:938] Loading cluster label file from outputs/speaker_outputs/subsegments_scale4_cluster.label
[NeMo I 2024-01-15 19:25:00 collections:761] Filtered duration for loading c


  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  8.89it/s]

[NeMo I 2024-01-15 19:25:00 msdd_models:1403]      [Threshold: 0.7000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 19:25:00 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:25:00 speaker_utils:93] Number of files to diarize: 1



[NeMo W 2024-01-15 19:25:01 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:25:01 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:25:01 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:25:01 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:25:01 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:25:01 msdd_models:1431]   
    
[NeMo I 2024-01-15 19:25:01 msdd_models:1403]      [Threshold: 1.0000] [use_clus_as_main=False] [diar_window=50]
[NeMo I 2024-01-15 19:25:01 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-01-15 19:25:01 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:25:01 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:25:01 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:25:01 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:25:01 speaker_utils:93] Number of files to diarize: 1


[NeMo W 2024-01-15 19:25:01 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2024-01-15 19:25:01 msdd_models:1431]   
    


100%|██████████| 57/57 [39:19<00:00, 41.39s/it]
