In [1]:
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
from funasr import AutoModel
import soundfile as sf
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report
from speechbrain.inference.VAD import VAD
import seaborn as sns
from pyannote.core import Segment
from pyannote.audio import Pipeline

Notice: ffmpeg is not installed. torchaudio is used to load audio
If you want to use ffmpeg backend to load audio, please install it by:
	sudo apt install ffmpeg # ubuntu
	# brew install ffmpeg # mac


In [2]:
# Load the four VAD back-ends used in this notebook:
# silero, pyannote, speechbrain and funasr.

# silero
SAMPLING_RATE = 16000
torch.set_num_threads(1)

# Keep torch.hub downloads inside the project-level model cache.
torch.hub.set_dir('../models/.cache')
# NOTE: force_reload=True makes torch.hub fetch the repository again on every run.
model_silero, utils_silero = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=False)

# Unpack the helper callables shipped alongside the silero model.
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils_silero

# pyannote
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that no credential is stored in the notebook. It is only needed the first
# time the model is downloaded; when the variable is unset, None is passed and
# authentication is left to the library (e.g. a previous
# `huggingface-cli login`) -- TODO confirm for your setup.
HF_TOKEN = os.environ.get("HF_TOKEN")
pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
    cache_dir="../models/.cache",
)

# speechbrain
vad = VAD.from_hparams(
    source="speechbrain/vad-crdnn-libriparty",
    savedir="../models/.cache",  # Save the model in a cache folder
)

# funasr
# Use the GPU only when one is actually available instead of hardcoding "cuda".
FUNASR_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_funasr = AutoModel(model="fsmn-vad", model_revision="v2.0.4", device=FUNASR_DEVICE)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to ../models/.cache/master.zip
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.3.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.3.1. Bad things might happen unless you revert torch to 1.x.


2024-09-17 15:54:37,864 - modelscope - INFO - PyTorch version 2.3.1 Found.
2024-09-17 15:54:37,865 - modelscope - INFO - Loading ast index from /Users/saurabh/.cache/modelscope/ast_indexer
2024-09-17 15:54:37,934 - modelscope - INFO - Loading done! Current index file version is 1.15.0, with md5 270895fc7d76b5c7655183a5b4e2f1dd and a total number of 980 components indexed
2024-09-17 15:54:39,995 - modelscope - INFO - Use user-specified model revision: v2.0.4


In [3]:
# Extend the module search path so the local `helper` package imported below
# can be found (presumably it lives in this directory -- verify on your machine).
import sys
sys.path.append("/Users/saurabh/Documents/projects/Voice-Activity-Detection")

# Per-backend helper functions (pyannote, funasr, silero, speechbrain).
from helper import (
    vad_inference_pyannote,
    print_timestamps_pyannote,
    run_vad_on_noisy_audio_pyannote,
    visualize_metrics_vs_SNR_pyannote,
)
from helper import (
    vad_inference_funasr,
    convert_to_timestamps_funasr,
    run_vad_on_noisy_audio_funasr,
    visualize_metrics_vs_SNR_funasr,
)
from helper import (
    vad_inference_silero,
    print_timestamps_silero,
    run_vad_on_noisy_audio_silero,
    visualize_metrics_vs_SNR_silero,
)
from helper import (
    vad_inference_speechbrain,
    print_timestamps_speechbrain,
    run_vad_on_noisy_audio_speechbrain,
    visualize_metrics_vs_SNR_speechbrain,
)

# Shared utilities from helper.vad.
# NOTE: `save_audio` imported here rebinds the `save_audio` name that was
# unpacked from `utils_silero` earlier in the notebook.
from helper.vad import (
    parse_annotations_file_bh,
    evaluate_vad,
    add_noise,
    save_audio,
    plot_SNR,
    extract_metrics,
    visualize_all_metrics,
    evaluate_vad_cmatrix,
    plot_confusion_matrices,
    get_file_paths,
    read_path,
    parse_annotations_file,
    average_metrics,
    show_vad_matrix_bh,
    save_results_to_csv,
    extract_speech_segments,
    count_continuous_zeros_after_start_segments,
    count_continuous_ones_after_end_segments,
    calculate_fec,
    calculate_msc,
    calculate_over,
    calculate_nds,
    save_results_to_csv1,
    show_vad_metrics_matrix1,
    save_multiple_speech_segments_as_text,
    extract_filenames_as_string,
)

In [4]:
# Locations of the audio recordings and of their annotation (label) files.
wav_directory = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/data/vani_dataset/audios"
label_directory = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/data/vani_dataset/label"

# Gather the paths, then sort each list in place.
# NOTE(review): the lists are sorted independently; this assumes audio and
# label files sort into the same order so index i refers to the same
# recording in both -- TODO confirm.
audio_paths, label_paths = read_path(wav_directory, label_directory)
for path_list in (audio_paths, label_paths):
    path_list.sort()

# One parsed annotation per label file, in the same order as `label_paths`.
annotated_segments = list(map(parse_annotations_file_bh, label_paths))

In [5]:
# Per-backend accumulators for predictions; each name gets its own empty list.
res_pyannote, res_silero, res_speechbrain, res_funasr = [], [], [], []

# Indices of recordings that are skipped during inference.
rr = list()

In [6]:
# Run every VAD back-end on every recording and collect the predictions.
#
# The accumulators are (re)initialised in this cell so that re-running it
# starts from a clean state instead of appending duplicate predictions and
# duplicate skip indices to lists created in an earlier cell.
res_pyannote, res_silero, res_speechbrain, res_funasr = [], [], [], []
rr = []  # indices of recordings for which at least one back-end returned []

for i, audio_path in enumerate(audio_paths):
    # Raw model outputs (same call order as before: pyannote, funasr, silero, speechbrain).
    raw_pyannote = vad_inference_pyannote(pipeline, audio_path)
    raw_funasr = vad_inference_funasr(audio_path, model_funasr)
    raw_silero = vad_inference_silero(audio_path, model_silero, utils_silero, sampling_rate=SAMPLING_RATE)
    raw_speechbrain = vad_inference_speechbrain(audio_path, vad)

    # Convert each raw output with its back-end specific helper
    # (presumably into a list of speech timestamps -- see the helper package).
    pyannote = print_timestamps_pyannote(raw_pyannote)
    funasr = convert_to_timestamps_funasr(raw_funasr)
    silero = print_timestamps_silero(raw_silero)
    speechbrain = print_timestamps_speechbrain(raw_speechbrain)

    # Keep a recording only if all four back-ends produced a non-empty result,
    # so the four result lists stay index-aligned with each other.
    if any(segments == [] for segments in (pyannote, funasr, silero, speechbrain)):
        rr.append(i)
        continue

    res_pyannote.append(pyannote)
    res_silero.append(silero)
    res_speechbrain.append(speechbrain)
    res_funasr.append(funasr)

rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.004: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.004: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.004: 100%|[34m█[0m[0m
rtf_avg: 0.004: 100%|[34m█[0m[0m
rtf_avg: 0.006: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.006: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.005: 100%|[34m█[0m[0m
rtf_avg: 0.004: 100%|[34m█

In [9]:
# Remove the recordings that were skipped during inference (indices in `rr`)
# so that audio_paths / label_paths / annotated_segments line up with the
# per-backend result lists.
#
# Popping in place is not idempotent: running this cell a second time would
# remove the wrong recordings. The length check below applies the removal only
# while the lists still contain the skipped entries, and fails loudly when the
# notebook state is inconsistent (e.g. cells were run out of order).
rr.sort(reverse=True)  # highest index first so earlier pops do not shift later ones

n_predictions = len(res_pyannote)
if len(audio_paths) == n_predictions + len(rr):
    # Skipped entries are still present (or there is nothing to skip): drop them.
    for index in rr:
        audio_paths.pop(index)
        label_paths.pop(index)
        annotated_segments.pop(index)
elif len(audio_paths) != n_predictions:
    # Neither the "before removal" nor the "after removal" state.
    raise RuntimeError(
        f"Inconsistent notebook state: {len(audio_paths)} audio paths, "
        f"{n_predictions} predictions and {len(rr)} skipped indices. "
        "Re-run the notebook from the data-loading cell."
    )
# Otherwise the skipped entries were already removed by a previous run: nothing to do.

In [8]:
# Write the predictions of every back-end to its own sub-directory of
# `predicted_label`, one entry per remaining label file.
file_names = extract_filenames_as_string(label_paths)

predicted_label_dir = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/data/vani_dataset/predicted_label"
predictions_by_model = {
    "pyannote": res_pyannote,
    "silero": res_silero,
    "speechbrain": res_speechbrain,
    "funasr": res_funasr,
}

# Predictions are paired with label files by position. If the skipped
# recordings have not been removed from `label_paths` yet (cells run out of
# order), the counts differ and results would be written under the wrong file
# names, so verify every back-end before writing anything.
for model_name, segments in predictions_by_model.items():
    if len(segments) != len(label_paths):
        raise ValueError(
            f"{model_name}: {len(segments)} predictions for {len(label_paths)} label files; "
            "remove the skipped recordings from label_paths before saving."
        )

for model_name, segments in predictions_by_model.items():
    save_multiple_speech_segments_as_text(segments, file_names, f"{predicted_label_dir}/{model_name}")